In [29]:
import pandas as pd
import numpy as np
from collections import deque
pd.set_option('display.max_rows', None)

In [2]:
class DecisionNode:
    def __init__(self, num_samples):
        self.num_samples = num_samples
        
        self.split_feature = None
        self.split_value = None
        self.gini = None
        self.left_child = None 
        self.right_child = None
        self.class_label = None 
        self.is_leaf = False

    def __str__(self):
        return f"NumSamples: {self.num_samples}" \
        f"\nLabel: {self.class_label}" \
        f"\nGini: {self.gini}" \
        f"\nColumnToSplit: {self.split_feature}" \
        f"\nSplitValue: {self.split_value}" \
        f"\nLeaf: {self.is_leaf}\n"

In [3]:
class DecisionTree:
    def __init__(self, max_depth):
        self.root_node = None
        self.num_nodes = 0
        self.max_depth = max_depth
        self.node_count = 0

        self.train = None
        self.labels = None
        
    def _gini(self, df):
        """Returns gini score of df"""
        t = len(df)
        p = df.groupby(self.target_feature).size() / t
        return 1 - sum(p**2)

    def fit(self, X):
        """
        Builds the decision tree from the given training samples and their 
        respective labels

        : array X: A dataframe of training samples
        : array y: An series representing the labels for each sample

        """
        self.train = X 
        self.root = DecisionNode(len(X))

        # Recursive step
        self._split_node(self.root, 0)

    def _split_node(self, node, current_lvl):
        self.node_count += 1
        
        # Find the CURRENT gini-impurity of samples within this node.
        node.gini = self._gini(df)

        # Find the most common LABEL of the current samples within this node.
        node.class_label = df[self.target_feature].mode()[0]
        
        # Decide whether to keep growing the Decision Tree 
        if(current_level < self.max_depth):
            current_level += 1

            node.is_leaf = False
            # What this node's best option to split on is.
            node.split_feature = feature
            
            # Given the feature to split on, what is the best value WITHIN that 
            # column to possible split around.
            node.split_value = self._split_value(df, feature)

            left_df = df[df[feature] == node.split_value]  # left == true
            right_df = df[df[feature] != node.split_value]  # right == false
            
            if(len(left_df) > 0):
                node.left_child = DecisionNode(len(left_df))
                self._splitNode(left_df, node.left_child, current_level)
            if(len(right_df) > 0):
                node.right_child = DecisionNode(len(right_df))
                self._splitNode(right_df, node.right_child, current_level)
            return
        else:
            node.is_leaf = True
            return

## Testing on Iris Dataset

In [2]:
import sklearn
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split

In [82]:
iris = sklearn.datasets.load_iris()

cols = iris['feature_names']
X = iris['data'] # feature data
y = iris['target'] # What flower those features belong to

data = pd.DataFrame(X, columns=cols)
data['label'] = y
data

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),label
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
5,5.4,3.9,1.7,0.4,0
6,4.6,3.4,1.4,0.3,0
7,5.0,3.4,1.5,0.2,0
8,4.4,2.9,1.4,0.2,0
9,4.9,3.1,1.5,0.1,0


In [99]:
test = data.iloc[:, 0] >6
#test = sample_space.iloc[:, col] < avg
test

0      False
1      False
2      False
3      False
4      False
5      False
6      False
7      False
8      False
9      False
10     False
11     False
12     False
13     False
14     False
15     False
16     False
17     False
18     False
19     False
20     False
21     False
22     False
23     False
24     False
25     False
26     False
27     False
28     False
29     False
30     False
31     False
32     False
33     False
34     False
35     False
36     False
37     False
38     False
39     False
40     False
41     False
42     False
43     False
44     False
45     False
46     False
47     False
48     False
49     False
50      True
51      True
52      True
53     False
54      True
55     False
56      True
57     False
58      True
59     False
60     False
61     False
62     False
63      True
64     False
65      True
66     False
67     False
68      True
69     False
70     False
71      True
72      True
73      True
74      True
75      True
76      True

In [102]:
labels = data.iloc[test.values, -1]
labels

50     1
51     1
52     1
54     1
56     1
58     1
63     1
65     1
68     1
71     1
72     1
73     1
74     1
75     1
76     1
77     1
86     1
87     1
91     1
97     1
100    2
102    2
103    2
104    2
105    2
107    2
108    2
109    2
110    2
111    2
112    2
115    2
116    2
117    2
118    2
120    2
122    2
123    2
124    2
125    2
126    2
127    2
128    2
129    2
130    2
131    2
132    2
133    2
134    2
135    2
136    2
137    2
139    2
140    2
141    2
143    2
144    2
145    2
146    2
147    2
148    2
Name: label, dtype: int32

In [103]:
test.values

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False,  True,  True,  True, False,
        True, False,  True, False,  True, False, False, False, False,
        True, False,  True, False, False,  True, False, False,  True,
        True,  True,  True,  True,  True,  True, False, False, False,
       False, False, False, False, False,  True,  True, False, False,
       False,  True, False, False, False, False, False,  True, False,
       False,  True, False,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True, False, False,  True,  True,
        True,  True, False,  True, False,  True,  True,  True,  True,
        True,  True,

In [104]:
test

0      False
1      False
2      False
3      False
4      False
5      False
6      False
7      False
8      False
9      False
10     False
11     False
12     False
13     False
14     False
15     False
16     False
17     False
18     False
19     False
20     False
21     False
22     False
23     False
24     False
25     False
26     False
27     False
28     False
29     False
30     False
31     False
32     False
33     False
34     False
35     False
36     False
37     False
38     False
39     False
40     False
41     False
42     False
43     False
44     False
45     False
46     False
47     False
48     False
49     False
50      True
51      True
52      True
53     False
54      True
55     False
56      True
57     False
58      True
59     False
60     False
61     False
62     False
63      True
64     False
65      True
66     False
67     False
68      True
69     False
70     False
71      True
72      True
73      True
74      True
75      True
76      True

In [90]:
bigger5 = data['sepal length (cm)'] > 6
bigger5

0      False
1      False
2      False
3      False
4      False
5      False
6      False
7      False
8      False
9      False
10     False
11     False
12     False
13     False
14     False
15     False
16     False
17     False
18     False
19     False
20     False
21     False
22     False
23     False
24     False
25     False
26     False
27     False
28     False
29     False
30     False
31     False
32     False
33     False
34     False
35     False
36     False
37     False
38     False
39     False
40     False
41     False
42     False
43     False
44     False
45     False
46     False
47     False
48     False
49     False
50      True
51      True
52      True
53     False
54      True
55     False
56      True
57     False
58      True
59     False
60     False
61     False
62     False
63      True
64     False
65      True
66     False
67     False
68      True
69     False
70     False
71      True
72      True
73      True
74      True
75      True
76      True

In [97]:
biglabel = data.iloc[bigger5, -1]
#df_low = df.loc[df["salary"]<6000,"salary"]

NotImplementedError: iLocation based boolean indexing on an integer type is not available

In [87]:
feature_values = data.iloc[:, 0].to_numpy()
feature_values = np.unique(feature_values)
feature_values

array([4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5. , 5.1, 5.2, 5.3, 5.4, 5.5,
       5.6, 5.7, 5.8, 5.9, 6. , 6.1, 6.2, 6.3, 6.4, 6.5, 6.6, 6.7, 6.8,
       6.9, 7. , 7.1, 7.2, 7.3, 7.4, 7.6, 7.7, 7.9])

In [23]:
print(type(bigger5))
print(len(bigger5))
print(len(data))

<class 'pandas.core.series.Series'>
150
150


In [26]:
rebirth = data.loc[bigger5]

In [31]:
data

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),label
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
5,5.4,3.9,1.7,0.4,0
6,4.6,3.4,1.4,0.3,0
7,5.0,3.4,1.5,0.2,0
8,4.4,2.9,1.4,0.2,0
9,4.9,3.1,1.5,0.1,0


In [27]:
len(rebirth)

61

In [35]:
allrows = pd.Series([True for i in range(len(bigger5))])

In [36]:
print(allrows)

0      True
1      True
2      True
3      True
4      True
5      True
6      True
7      True
8      True
9      True
10     True
11     True
12     True
13     True
14     True
15     True
16     True
17     True
18     True
19     True
20     True
21     True
22     True
23     True
24     True
25     True
26     True
27     True
28     True
29     True
30     True
31     True
32     True
33     True
34     True
35     True
36     True
37     True
38     True
39     True
40     True
41     True
42     True
43     True
44     True
45     True
46     True
47     True
48     True
49     True
50     True
51     True
52     True
53     True
54     True
55     True
56     True
57     True
58     True
59     True
60     True
61     True
62     True
63     True
64     True
65     True
66     True
67     True
68     True
69     True
70     True
71     True
72     True
73     True
74     True
75     True
76     True
77     True
78     True
79     True
80     True
81     True
82     True
83  

In [None]:
df_low = df.loc[df["salary"]<6000,"salary"]

In [None]:
cols = voting.columns
mask = voting[cols[1]] == 'n'

print(type(mask))

def impurity_of_feature(self, feature):
    """
    Find the best value within a feature vector to split on.
    """
    # Sort values in ascending order
    in_order = feature.sort_values()
    
    # Compute the gini impurity for each in between average
    for i in range(1, len(feature)):
        avg = in_order + in_order[i - 1] / 2
        
        # get gini score of the values below and values ABOVE the average. 
        # need y_labels that correspond to the x-vector values.
        
        # score below avg:
        mask = self.train[feature] < avg
        y_below
        #gini_impurity(self, y):