In [1]:
import numpy as np
from collections import Counter

class Node:
  """One vertex of a decision tree.

  An internal vertex carries a split rule (``feature`` index and
  ``threshold``) together with its ``left`` and ``right`` subtrees.
  A terminal vertex carries only ``value``, the prediction stored on it.
  """

  def __init__(self, feature=None, threshold=None,left=None, right=None,*,value=None):
    # `value` is keyword-only so a leaf is always built as Node(value=...).
    self.value = value
    self.feature, self.threshold = feature, threshold
    self.left, self.right = left, right

  def is_leaf_node(self):
    """Return True when this vertex stores a prediction (``value`` is not None)."""
    if self.value is None:
      return False
    return True

class DecisionTree:
  """Binary classification tree grown greedily by maximising information gain.

  Parameters
  ----------
  min_samples_split : int
      A node holding fewer samples than this becomes a leaf.
  max_depth : int
      Maximum number of split levels; the root sits at depth 0, so
      ``max_depth=0`` yields a single leaf.
  n_features : int or None
      Number of feature columns sampled (without replacement) as split
      candidates at every node.  A falsy value means "use all columns".
      ``fit`` overwrites this attribute with the value clamped to the number
      of columns in the training matrix.
  root : Node or None
      Root of an already built tree; replaced by ``fit``.

  Labels are passed to ``np.bincount``, so they must be non-negative integers.
  """

  def __init__(self,min_samples_split=2, max_depth=100, n_features = None, root=None):
    self.min_samples_split = min_samples_split
    self.max_depth = max_depth
    self.n_features = n_features
    self.root = root

  def fit(self, X, y):
    """Grow the tree from a 2-D feature array ``X`` and a label array ``y``."""
    n_columns = X.shape[1]
    # Never ask for more candidate features than the data actually has.
    self.n_features = n_columns if not self.n_features else min(n_columns, self.n_features)
    self.root = self._grow_tree(X, y)

  def predict(self, X):
    """Return an array with the predicted label for every row of ``X``."""
    return np.array([self._traverse_tree(x, self.root) for x in X])

  def _traverse_tree(self,x,node):
    """Walk from ``node`` down to a leaf for the sample ``x`` and return the leaf value."""
    # Iterative descent: values <= threshold go left, the rest go right.
    while not node.is_leaf_node():
      node = node.left if x[node.feature] <= node.threshold else node.right
    return node.value

  def _grow_tree(self,X,y,depth=0):
    """Recursively build the subtree for the samples ``X`` / ``y`` at ``depth``."""
    n_samples, n_feats = X.shape
    n_labels = len(np.unique(y))

    # Stopping criteria.  `>=` (not `>`) so that max_depth really bounds the
    # number of split levels.
    if (depth >= self.max_depth or n_labels == 1 or n_samples < self.min_samples_split):
      return Node(value=self._most_common_label(y))

    feat_idxs = np.random.choice(n_feats, self.n_features, replace=False)

    best_feature, best_thr = self._best_split(X, y, feat_idxs)
    if best_feature is None:
      # Every sampled column is constant on this node: nothing can be
      # separated, so stop here instead of recursing into an empty child.
      return Node(value=self._most_common_label(y))

    left_idxs, right_idxs = self._split(X[:, best_feature], best_thr)
    left = self._grow_tree(X[left_idxs, :], y[left_idxs], depth + 1)
    right = self._grow_tree(X[right_idxs, :], y[right_idxs], depth + 1)
    return Node(best_feature, best_thr, left, right)

  def _most_common_label(self,y):
    """Return the most frequent label in ``y`` (first seen wins a tie)."""
    return Counter(y).most_common(1)[0][0]

  def _best_split(self, X, y, feat_idxs):
    """Find the (feature index, threshold) pair with the highest information gain.

    Returns ``(None, None)`` when none of the columns in ``feat_idxs`` offers
    a threshold that puts at least one sample on each side.
    """
    best_gain = -1
    split_idx, split_threshold = None, None

    for feat_idx in feat_idxs:
      X_column = X[:, feat_idx]
      # The largest value is not a candidate: `<= max` keeps every sample on
      # the left and would produce an empty right child.
      thresholds = np.unique(X_column)[:-1]

      for thr in thresholds:
        gain = self._information_gain(y, X_column, thr)

        if gain > best_gain:
          best_gain = gain
          split_idx = feat_idx
          split_threshold = thr

    return split_idx, split_threshold

  def _information_gain(self,y,X_column, thr):
    """Entropy reduction obtained by splitting ``y`` on ``X_column <= thr``.

    Returns 0 when the split leaves one side empty.
    """
    left_idxs, right_idxs = self._split(X_column, thr)
    n_l, n_r = len(left_idxs), len(right_idxs)
    if n_l == 0 or n_r == 0:
      return 0

    n_total = len(y)
    parent_entropy = self._entropy(y)
    e_l, e_r = self._entropy(y[left_idxs]), self._entropy(y[right_idxs])
    # Children entropies weighted by the share of samples each side receives.
    wt_child_entropy = (n_l / n_total) * e_l + (n_r / n_total) * e_r
    return parent_entropy - wt_child_entropy

  def _entropy(self, y):
    """Shannon entropy (natural log) of the label distribution in ``y``."""
    ps = np.bincount(y) / len(y)
    ps = ps[ps > 0]  # classes that do not occur contribute nothing
    return -np.sum(ps * np.log(ps))

  def _split(self,X_column, thr):
    """Return the sample indices with value ``<= thr`` and those with value ``> thr``."""
    left_idxs = np.flatnonzero(X_column <= thr)
    right_idxs = np.flatnonzero(X_column > thr)
    return left_idxs, right_idxs


In [8]:
import numpy as np
from collections import Counter

class RandomForest:
  """Ensemble of DecisionTree classifiers trained on bootstrap samples.

  Each of the ``n_trees`` trees is fitted on a resample (with replacement) of
  the training data; predictions are combined by majority vote.
  ``max_depth``, ``n_features`` and ``min_sample_splits`` are forwarded to
  every tree (the last one as ``min_samples_split``).
  """

  def __init__(self, max_depth=10, n_features=None, min_sample_splits=2, n_trees=10):
    self.n_trees = n_trees
    self.max_depth = max_depth
    self.min_sample_splits = min_sample_splits
    self.n_features = n_features
    self.trees = []

  def fit(self,X,y):
    """Discard any previous trees and train ``n_trees`` new ones on ``X`` / ``y``."""
    self.trees = []
    tree_kwargs = dict(max_depth=self.max_depth,
                       n_features=self.n_features,
                       min_samples_split=self.min_sample_splits)
    for _ in range(self.n_trees):
      member = DecisionTree(**tree_kwargs)
      # Every tree sees its own resample of the training set.
      member.fit(*self._bootstrap_samples(X, y))
      self.trees.append(member)

  def _most_common_label(self,y):
    """Return the label occurring most often in ``y``."""
    ranked = Counter(y).most_common(1)
    return ranked[0][0]

  def _bootstrap_samples(self,X,y):
    """Draw as many rows as ``X`` has, with replacement, keeping ``X`` and ``y`` aligned."""
    n_rows = X.shape[0]
    picked = np.random.choice(n_rows, size=n_rows, replace=True)
    return X[picked], y[picked]

  def predict(self,X):
    """Return the majority-vote label of all trees for every row of ``X``."""
    # Shape (n_trees, n_samples) -> (n_samples, n_trees): one row of votes per sample.
    votes_by_tree = np.array([member.predict(X) for member in self.trees])
    votes_by_sample = np.swapaxes(votes_by_tree, 0, 1)
    winners = [self._most_common_label(votes) for votes in votes_by_sample]
    return np.array(winners)

In [6]:
class Hyperparameter:
  """Exhaustive grid search over DecisionTree hyper-parameters.

  Parameters
  ----------
  max_depth : sequence of int
      Candidate values for the tree's ``max_depth``.
  min_samples_split : sequence of int
      Candidate values for the tree's ``min_samples_split``.
  n_features : sequence of int
      Candidate values for the tree's ``n_features``.  When empty, the
      parameter is not searched and the tree's own default is used.
  best_params : dict or None
      Initial value of the ``best_params`` attribute; ``tune`` replaces it
      with the best configuration it finds.
  """

  def __init__(self,max_depth = [10], min_samples_split=[2], n_features=[], best_params=None):
    # Copy the grids so instances never alias the shared default lists
    # (or a list the caller keeps mutating).
    self.max_depth = list(max_depth)
    self.min_samples_split = list(min_samples_split)
    self.n_features = list(n_features)
    self.best_params = best_params

  def tune(self, X, y):
    """Evaluate every grid combination on a hold-out split of ``X`` / ``y``.

    Returns ``(best_params, best_score)``: the configuration with the highest
    hold-out accuracy (the first one evaluated wins a tie) and that accuracy.
    ``best_params`` is also stored on ``self.best_params``.  If the grid is
    empty, ``(None, 0)`` is returned and ``self.best_params`` is left untouched.
    """
    X_train, X_test, y_train, y_test = self._splitting(X, y)

    search_features = len(self.n_features) > 0
    # With no n_features grid, run each (depth, mss) pair once with the tree default.
    feature_grid = self.n_features if search_features else [None]

    best_score = 0
    best_params = None
    for depth in self.max_depth:
      for mss in self.min_samples_split:
        for n_feats in feature_grid:
          tree = DecisionTree(max_depth=depth, min_samples_split=mss, n_features=n_feats)
          tree.fit(X_train, y_train)
          score = self._accuracy(y_test, tree.predict(X_test))

          # Always keep the first candidate so a result exists even if
          # every configuration scores 0.
          if best_params is None or score > best_score:
            best_score = score
            best_params = {"max_depth" : depth, "min_samples_split" : mss}
            if search_features:
              best_params["n_features"] = n_feats

    if best_params is not None:
      self.best_params = best_params
    return best_params, best_score

  def _splitting(self,X, y):
    """Return ``(X_train, X_test, y_train, y_test)`` from a fixed 80/20 split (seed 42)."""
    # Imported here so the class does not depend on which notebook cell ran first.
    from sklearn.model_selection import train_test_split
    return tuple(train_test_split(X, y, test_size=0.2, random_state=42))

  def _accuracy(self,y_test, y_pred):
    """Fraction of positions where ``y_pred`` equals ``y_test``."""
    # asarray makes the comparison element-wise even for plain Python lists.
    hits = np.asarray(y_test) == np.asarray(y_pred)
    return np.sum(hits) / len(y_test)

In [3]:
from sklearn import datasets
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [4]:
# Load scikit-learn's bundled breast-cancer dataset and pull out the
# feature matrix and the label vector.
data = datasets.load_breast_cancer()
X = data.data
y = data.target

In [5]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Sanity check: dimensions of the training feature matrix (rows, columns).
print(X_train.shape)

(455, 30)


In [8]:
# Search grid: tree depths 10..15, minimum split sizes 2..6 and
# 26..30 candidate features per split.
depth = list(range(10, 16))
splits = list(range(2, 7))
n_f = list(range(26, 31))
clf = Hyperparameter(
    max_depth=depth,
    min_samples_split=splits,
    n_features=n_f,
)

In [9]:
# Run the grid search on the full data set; tune() performs its own 80/20
# hold-out split internally and the cell displays the returned
# (best_params, best_score) tuple.
clf.tune(X,y)

({'max_depth': 13, 'min_samples_split': 5, 'n_features': 26},
 0.9736842105263158)