In [1]:
import pandas as pd
import numpy as np
import random

In [2]:
# Rule format is (k, (condition))
class DecisionTree():
    def __init__(self, rootRule):
        self.rootRule = rootRule
        self.leftBranch = None
        self.rightBranch = None
class Branch():
    def __init__(self, rule):
        self.rule = rule
        self.leftBranch = None
        self.rightBranch = None

def evaluateFeature(df, label, feature):
    focusDf = df.loc[:,[feature, label]] # For now label name is hard coded, but we could have label feature parameter
    minVal = focusDf[feature].min()
    maxVal = focusDf[feature].max()
    bestGini = 1
    bestSplitVal = 0
    step = (maxVal - minVal)/15 # Evaluate 15 different values for split condition
    for i in range(1, 15): # We dont care about min and max split, in my opinion
        currentSplit = minVal + step * i
        leftSplit = focusDf.loc[(focusDf[feature] <= currentSplit)]
        rightSplit = focusDf.loc[(focusDf[feature] > currentSplit)]
        giniIndexLeft = 1
        giniIndexRight = 1
        #for left split
        for value in leftSplit[label].unique():
            #print(leftSplit.loc[(leftSplit["species"] == value)]["species"])
            classCount = leftSplit.loc[(leftSplit[label] == value)][label].count()
            totalCount = len(leftSplit)
            giniIndexLeft -= pow(classCount/totalCount,2) # gini calculation for left
        #for right split
        for value in rightSplit[label].unique():
            classCount = rightSplit.loc[(rightSplit[label] == value)][label].count()
            totalCount = len(rightSplit)
            giniIndexRight -= pow(classCount/totalCount,2) # gini calculation for right
        giniIndex = (giniIndexLeft + giniIndexRight)/2 # Get average of gini index
        #print(giniIndex)
        if giniIndex < bestGini:
            bestSplitVal = currentSplit
            bestGini = giniIndex
    #print(f"Gini: {bestGini} feature: {feature} split: {bestSplitVal}")
    return (feature, bestSplitVal, bestGini)

def branch(df, YLabel, k, n, verbose = True):
    if verbose:
        print(f"Branch on level {k} has df of size: {len(df)}")
    if k == n:
        # Reached bottom level, is not going to branch further
        #print(len(df))
        #print(df[YLabel].value_counts())
        label = df[YLabel].value_counts().index[0] # First index is gonna be the one with highest count in df
        thisBranch = Branch((k, ('END', label)))
        return thisBranch
    giniIndexMain = 1
    for value in df[YLabel].unique():
        classCount = df.loc[(df[YLabel] == value)][YLabel].count()
        totalCount = len(df)
        giniIndexMain -= pow(classCount/totalCount,2) # gini calculation for left
    if giniIndexMain == 0:
        # Branch is pure, no need for further branching
        label = df[YLabel].value_counts().index[0] # First index is gonna be the one with highest count in df
        thisBranch = Branch((k, ('END', label)))
        return thisBranch
    bestFeature = evaluateFeature(df, YLabel, df.columns[0])
    for col in df.columns[1:-1]: # Exclude first feature and last feature which is label
        result = evaluateFeature(df, YLabel, col)
        if result[2] < bestFeature[2]:
            bestFeature = result
    new_df1 = df.loc[(df[bestFeature[0]] <= bestFeature[1])]
    new_df2 = df.loc[(df[bestFeature[0]] > bestFeature[1])]
    thisBranch = Branch((k,bestFeature))
    if len(new_df1) == 0 or len(new_df2) == 0: # Check if new branch isnt empty 
        # Probably not gonna find better condition so we should end branching
        label = df[YLabel].value_counts().index[0] # First index is gonna be the one with highest count in df
        thisBranch = Branch((k, ('END', label)))
        return thisBranch
    thisBranch.leftBranch = branch(new_df1, YLabel, k + 1, n, verbose=verbose)  
    thisBranch.rightBranch = branch(new_df2, YLabel, k + 1, n, verbose=verbose)
    return thisBranch

def DecisionTree_train(df, YLabel, n_levels = 3, verbose = True):
    # Get first feature and score
    k = 0
    bestFeature = evaluateFeature(df, YLabel, df.columns[0])
    for col in df.columns[1:-1]: # Exclude first feature and last feature which is label
        result = evaluateFeature(df, YLabel, col)
        if result[2] < bestFeature[2]:
            bestFeature = result
    #Split based on best feature into 2 new branches
    new_df1 = df.loc[(df[bestFeature[0]] <= bestFeature[1])]
    new_df2 = df.loc[(df[bestFeature[0]] > bestFeature[1])]
    tree = DecisionTree((k,bestFeature))
    #print(f"Gini: {bestFeature[2]} feature: {bestFeature[0]} split: {bestFeature[1]}")
    tree.leftBranch = branch(new_df1, YLabel, k + 1, n_levels, verbose=verbose)
    tree.rightBranch = branch(new_df2, YLabel, k + 1, n_levels, verbose=verbose)
    return tree
# Recursive function, takes df, iterates over all rows, returns list of predicted labels
def DecisionTree_predict(df, model : DecisionTree):
    firstRule = model.rootRule[1] # Get first condition 
    #print(firstRule)
    predictions = []
    for row in df.iterrows(): # Iterrows returns pairs of (index, Series)
        prediction = -1
        if row[1][firstRule[0]] <= firstRule[1]:
            prediction = branch_predict(row, model.leftBranch) # Go to left branch
        else:
            prediction = branch_predict(row, model.rightBranch)# Go to right branch
        predictions.append(prediction)
    return predictions
def branch_predict(row, model : Branch):
    rule = model.rule[1]
    if rule[0] == 'END':
        return rule[1]
    if row[1][rule[0]] <= rule[1]:
        prediction = branch_predict(row, model.leftBranch) # Go to left branch
    else:
        prediction = branch_predict(row, model.rightBranch) # Go to right branch
    return prediction

In [3]:
class RandomForestClassifier():
    def __init__(self, n_estimators = 100, max_depth = 3):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.estimators = []
    def train(self, df, labelName):
        self.estimators = [] # Empty the estimators array in case model was trained before
        n_samples = int(len(df) * 0.8) # 80% of dataset
        for i in range(self.n_estimators):
            sample = df.sample(n=n_samples) # Create different sample for each decision tree
            estimator = DecisionTree_train(sample, labelName, n_levels= self.max_depth) # Train decision tree
            self.estimators.append(estimator)
    def predict(self, df, scoring = 'accuracy'):
        forestPredictions = []
        for model in self.estimators:
            treePredictions = DecisionTree_predict(df, model)
            forestPredictions.append(treePredictions)
        finalPredictions = []
        for i in range(len(df)): # for each row of predictions 
            helperDict = {}
            for j in range(len(forestPredictions)): # take a look at each prediction at row i and pick the one that has the highest occcurences
                currentValue = forestPredictions[j][i]
                if currentValue not in helperDict.keys():
                    helperDict[currentValue] = 1
                else:
                    helperDict[currentValue] += 1
            bestKey = -1
            bestValue = -1
            for key, value in helperDict.items(): # Loop through keys and they values which represent how many votes did the key get
                if value >= bestValue:
                    bestKey = key
                    bestValue = value
            finalPredictions.append(bestKey) # Pick key with highest votes
        return finalPredictions


In [4]:
df = pd.read_csv('./iris.csv', sep=';')
#Split df
train_X = df.sample(frac = 0.8)
test_X = df.drop(train_X.index)
test_X

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
7,5.0,3.4,1.5,0.2,0
10,5.4,3.7,1.5,0.2,0
14,5.8,4.0,1.2,0.2,0
24,4.8,3.4,1.9,0.2,0
44,5.1,3.8,1.9,0.4,0
47,4.6,3.2,1.4,0.2,0
55,5.7,2.8,4.5,1.3,1
56,6.3,3.3,4.7,1.6,1


In [5]:
randomForest = RandomForestClassifier(n_estimators=100, max_depth= 5)
randomForest.train(train_X, "species")

Branch on level 1 has df of size: 33
Branch on level 1 has df of size: 63
Branch on level 2 has df of size: 34
Branch on level 3 has df of size: 32
Branch on level 3 has df of size: 2
Branch on level 2 has df of size: 29
Branch on level 3 has df of size: 1
Branch on level 3 has df of size: 28
Branch on level 4 has df of size: 27
Branch on level 5 has df of size: 26
Branch on level 5 has df of size: 1
Branch on level 4 has df of size: 1
Branch on level 1 has df of size: 37
Branch on level 1 has df of size: 59
Branch on level 2 has df of size: 34
Branch on level 3 has df of size: 33
Branch on level 3 has df of size: 1
Branch on level 2 has df of size: 25
Branch on level 3 has df of size: 1
Branch on level 3 has df of size: 24
Branch on level 4 has df of size: 1
Branch on level 4 has df of size: 23
Branch on level 5 has df of size: 22
Branch on level 5 has df of size: 1
Branch on level 1 has df of size: 33
Branch on level 1 has df of size: 63
Branch on level 2 has df of size: 37
Branch on

In [6]:
predictions = randomForest.predict(test_X)

In [7]:
import sklearn.metrics
sklearn.metrics.accuracy_score(test_X.species, predictions)

0.8666666666666667

In [8]:
import sklearn.ensemble
train_X_sep, train_y_sep = train_X.loc[:, train_X.columns[:-1]], train_X.loc[:,train_X.columns[-1]]
rf = sklearn.ensemble.RandomForestClassifier(n_estimators=100, max_depth=5)
rf.fit(train_X_sep, train_y_sep)
test_X_sep, test_y_sep = test_X.loc[:, test_X.columns[:-1]], test_X.loc[:,test_X.columns[-1]]
predictions = rf.predict(test_X_sep)
sklearn.metrics.accuracy_score(test_y_sep, predictions)

0.9333333333333333

In [9]:
df = pd.read_csv('./titanic_preprocessed.csv', sep=",", index_col='PassengerId')
df = df[[c for c in df if c not in ['Survived']] + ['Survived']] # Stick label as last column
#Split df
train_X = df.sample(frac = 0.8)
test_X = df.drop(train_X.index)
test_X

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked_C,Embarked_Q,Embarked_S,Survived
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
5,3,0.0,35.000000,0,0,8.0500,7.0,0,0,1,0
9,3,1.0,27.000000,0,2,11.1333,7.0,0,0,1,1
10,2,1.0,14.000000,1,0,30.0708,7.0,1,0,0,1
11,3,1.0,4.000000,1,1,16.7000,6.0,0,0,1,1
12,1,1.0,58.000000,0,0,26.5500,2.0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...
878,3,0.0,19.000000,0,0,7.8958,7.0,0,0,1,0
881,2,1.0,25.000000,0,1,26.0000,7.0,0,0,1,1
885,3,0.0,25.000000,0,0,7.0500,7.0,0,0,1,0
887,2,0.0,27.000000,0,0,13.0000,7.0,0,0,1,0


In [15]:
randomForest = RandomForestClassifier(n_estimators=100, max_depth= 5)
randomForest.train(train_X, "Survived")

Branch on level 1 has df of size: 565
Branch on level 2 has df of size: 564
Branch on level 3 has df of size: 563
Branch on level 4 has df of size: 560
Branch on level 5 has df of size: 556
Branch on level 5 has df of size: 4
Branch on level 4 has df of size: 3
Branch on level 3 has df of size: 1
Branch on level 2 has df of size: 1
Branch on level 1 has df of size: 3
Branch on level 1 has df of size: 564
Branch on level 2 has df of size: 563
Branch on level 3 has df of size: 560
Branch on level 4 has df of size: 557
Branch on level 5 has df of size: 553
Branch on level 5 has df of size: 4
Branch on level 4 has df of size: 3
Branch on level 3 has df of size: 3
Branch on level 2 has df of size: 1
Branch on level 1 has df of size: 4
Branch on level 1 has df of size: 566
Branch on level 2 has df of size: 565
Branch on level 3 has df of size: 564
Branch on level 4 has df of size: 560
Branch on level 5 has df of size: 556
Branch on level 5 has df of size: 4
Branch on level 4 has df of size: 

In [16]:
predictions = randomForest.predict(test_X)

In [17]:
sklearn.metrics.accuracy_score(test_X.Survived, predictions)

0.5730337078651685

In [20]:
train_X_sep, train_y_sep = train_X.loc[:, train_X.columns[:-1]], train_X.loc[:,train_X.columns[-1]]
rf = sklearn.ensemble.RandomForestClassifier(n_estimators=100, max_depth=5)
rf.fit(train_X_sep, train_y_sep)
test_X_sep, test_y_sep = test_X.loc[:, test_X.columns[:-1]], test_X.loc[:,test_X.columns[-1]]
predictions = rf.predict(test_X_sep)
sklearn.metrics.accuracy_score(test_y_sep, predictions)

0.8258426966292135

In [14]:
df.Survived.value_counts()

Survived
0    549
1    340
Name: count, dtype: int64

# Porovnání výsledků
## Iris dataset
### Moje implementace
- Čas: 17.6s
- Accuracy: 0.867
### Sklearn implementace
- Čas: 0.2s
- Accuracy: 0.933
  
## Titanic dataset
### Moje implementace
- Čas: 33.2s
- Accuracy: 0.573
### Sklearn implementace
- Čas: 0.0s
- Accuracy: 0.826

# Možné zlepšení
- Převést pracování s listy na numpy arrays
- Chytřejší výběr hodnoty podle které se data rozdělí