Step 1

Building a custom decision tree with information gain

In [1]:
import numpy as np

class CustomDecisionTree:
  def __init__(self, max_depth=None):
    self.max_depth = max_depth
    self.tree = None

  def fit(self, X, y):
    self.tree = self._build_tree(X, y)

  def _build_tree(self, X, y, depth=0):
    num_samples, num_features = X.shape
    unique_classes = np.unique(y)

    if len(unique_classes) == 1:
      return {'class': unique_classes[0]}

    if num_samples == 0 or (self.max_depth and depth >= self.max_depth):
      return {'class': np.bincount(y).argmax()}

    best_info_gain = -float('inf')
    best_split = None

    for feature_idx in range(num_features):
      thresholds = np.unique(X[:, feature_idx])

      for threshold in thresholds:
        left_mask = X[:, feature_idx] <= threshold
        right_mask = ~left_mask
        left_y = y[left_mask]
        right_y = y[right_mask]

        info_gain = self._information_gain(y, left_y, right_y)

        if info_gain > best_info_gain:
          best_info_gain = info_gain
          best_split = {
            'feature_idx': feature_idx,
            'threshold': threshold,
            'left_y': left_y,
            'right_y': right_y,
          }

      if best_split is None:
        return {'class': np.bincount(y).argmax()}

    left_tree = self._build_tree(
      X[best_split['left_y']],
      best_split['left_y'],
      depth + 1
    )

    right_tree = self._build_tree(
      X[best_split['right_y']],
      best_split['right_y'],
      depth + 1
    )

    return {
      'feature_idx': best_split['feature_idx'],
      'threshold': best_split['threshold'],
      'left_tree': left_tree,
      'right_tree': right_tree
    }

  def _information_gain(self, parent, left, right):
    parent_entropy = self._entropy(parent)
    left_entropy = self._entropy(left)
    right_entropy = self._entropy(right)

    weighted_avg_entropy = (
      (len(left) / len(parent)) * left_entropy +
      (len(right) / len(parent)) * right_entropy
    )

    return parent_entropy - weighted_avg_entropy

  def _entropy(self, y):
    class_probs = np.bincount(y) / len(y)
    return -np.sum(class_probs * np.log2(class_probs + 1e-9))

  def predict(self, X):
    return [self._predict_single(x, self.tree) for x in X]

  def _predict_single(self, x, tree):
    if 'class' in tree:
      return tree['class']
    feature_val = x[tree['feature_idx']]
    if feature_val<=tree['threshold']:
      return self._predict_single(x, tree['left_tree'])
    else:
      return self._predict_single(x, tree['right_tree'])



Step 2

Load and split iris dataset

In [2]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
data = load_iris()
X = data.data
y = data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


Step 3

Train and Evaluate a custom decision tree

In [3]:
custom_tree = CustomDecisionTree(max_depth=3)
custom_tree.fit(X_train, y_train)
y_pred_custom = custom_tree.predict(X_test)
accuracy_custom = accuracy_score(y_test, y_pred_custom)

Step 4

Train and Evaluate a Scikit learn decision tree

In [4]:
sklearn_tree = DecisionTreeClassifier(max_depth=3, random_state=42)
sklearn_tree.fit(X_train, y_train)
y_pred_sklearn = sklearn_tree.predict(X_test)
accuracy_sklearn = accuracy_score(y_test, y_pred_sklearn)
print(f"Scikit-learn Decision Tree Accuracy: {accuracy_sklearn:.4f}")


Scikit-learn Decision Tree Accuracy: 1.0000


Result comparision

In [5]:
print(f"Accuracy comparision")
print(f"Custom Decision Tree Accuracy: {accuracy_custom:.4f}")
print(f"Scikit-learn Decision Tree Accuracy: {accuracy_sklearn:.4f}")

Accuracy comparision
Custom Decision Tree Accuracy: 0.8000
Scikit-learn Decision Tree Accuracy: 1.0000


 3. Exercise:

Ensemble Methods and Hyperparameter Tuning

Load the wine dataset

In [6]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
data = load_wine()
X = data.data
y = data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Step 1

Implement Classification Models

In [7]:
custom_forest = RandomForestClassifier(max_depth=3)
custom_forest.fit(X_train, y_train)
y_pred_custom = custom_forest.predict(X_test)
f1_custom = f1_score(y_test, y_pred_custom, average='weighted')
custom_tree2 = DecisionTreeClassifier(max_depth=3)
custom_tree2.fit(X_train, y_train)
y_pred_custom2 = custom_tree2.predict(X_test)
f1_custom2 = f1_score(y_test, y_pred_custom2, average='weighted')
print(f"Custom Random Forest F1 Score: {f1_custom:.4f}")
print(f"Custom Decision Tree F1 Score: {f1_custom2:.4f}")


Custom Random Forest F1 Score: 1.0000
Custom Decision Tree F1 Score: 0.9449


Hyperparameter tuning

In [8]:
from sklearn.model_selection import GridSearchCV
param_grid = {
   'n_estimators': [100, 200],
   'max_depth': [None, 10, 20],
   'min_samples_split': [2, 5],
   'min_samples_leaf': [1, 2],
   'bootstrap': [True, False]
}
grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
grid_search.fit(X_train, y_train)
print("Best Params:", grid_search.best_params_)

Best Params: {'bootstrap': True, 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}


Implementing regression model

In [9]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(X, y)
y_pred_dr = regressor.predict(X_test)
random_forest_regressor = RandomForestRegressor(n_estimators=100, random_state=0)
random_forest_regressor.fit(X, y)
y_pred_rfr = random_forest_regressor.predict(X_test)
#hyperparameter tuning
param_grid2= {
'n_estimators': [100, 200],
'max_depth': [None, 10, 20],
'min_samples_split': [2, 5],
'min_samples_leaf': [1, 2],
'bootstrap': [True, False]
}
random_search = RandomizedSearchCV(RandomForestClassifier(), param_grid, cv=5)
random_search.fit(X_train, y_train)
print("Best Params:", random_search.best_params_)


Best Params: {'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 10, 'bootstrap': True}
