# Project 1 – Decision Trees and Random Forests

In [21]:
# Reload all modules without having to restart the kernel
# Useful for development if you have edited any of the external code files.
%load_ext autoreload
%autoreload 2

# Imports
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# ... add more imports as needed

# My implementations
from decision_tree import DecisionTree
from random_forest import RandomForest
from sklearn.model_selection import train_test_split, KFold
from itertools import product
from sklearn import metrics

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Dataset

Do data loading, exploration and preprocessing as you see fit.

Here is some code to load the dataset to get you started.

In [22]:
data = np.genfromtxt("datasets/wine_dataset_small.csv", delimiter=",", dtype=float, names=True)

feature_names = list(data.dtype.names[:-1])
target_name = data.dtype.names[-1]

X = np.array([data[feature] for feature in feature_names]).T
y = data[target_name].astype(int)

print(f"Feature columns names: {feature_names}")
print(f"Target column name: {target_name}")
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")
print(X)

Feature columns names: ['citric_acid', 'residual_sugar', 'pH', 'sulphates', 'alcohol']
Target column name: type
X shape: (500, 5)
y shape: (500,)
[[ 0.13  1.6   3.34  0.59  9.2 ]
 [ 0.1   2.8   3.6   0.66 10.2 ]
 [ 0.32  1.9   3.2   0.55  9.5 ]
 ...
 [ 0.36  7.    3.04  0.32 11.  ]
 [ 0.42  2.1   3.11  0.78 12.4 ]
 [ 0.15  9.7   3.05  0.3   9.1 ]]


In [23]:
seed = 0
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, test_size=0.3, shuffle=True, random_state=seed)

In [24]:
max_depths = [3, 4, 5, 6, 7, 8, 9, 10, 25]
criterions = ["entropy", "gini"]
n_estimators = [1, 5, 10, 20]
max_features = ["log2", "sqrt", None]
hyperparameter_list = list(product(max_depths, criterions, n_estimators, max_features))

kf = KFold(n_splits=5, shuffle=True, random_state=seed)
min_mse = float("inf")
counter = 0
N = len(hyperparameter_list)
for max_depth, criterion, n_estimator, max_feature in hyperparameter_list:
    if counter % 5 == 0:
        print(f"{counter/N*100}%")
        print("Estimators: ", n_estimator)
    counter += 1
    val_scores = []
    forest = RandomForest(n_estimators=n_estimator, max_depth=max_depth, criterion=criterion, max_features=max_feature)
    for train_index, val_index in kf.split(X_train):
        X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
        y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]
        forest.fit(X_train_fold, y_train_fold)
        val_score = metrics.mean_squared_error(y_val_fold, forest.predict(X_val_fold))
        val_scores.append(val_score)
    cv_score = np.mean(val_scores)
    if cv_score < min_mse:
        min_mse = cv_score
        best_max_depth = max_depth
        best_criterion = criterion
        best_n_estimator = n_estimator
        best_max_feature = max_feature

print(best_max_depth, best_criterion, best_n_estimator, best_max_feature)

0.0%
Estimators:  1
2.314814814814815%
Estimators:  5
4.62962962962963%
Estimators:  20
6.944444444444445%
Estimators:  5
9.25925925925926%
Estimators:  10
11.574074074074074%
Estimators:  1
13.88888888888889%
Estimators:  10
16.203703703703702%
Estimators:  20
18.51851851851852%
Estimators:  5
20.833333333333336%
Estimators:  20
23.14814814814815%
Estimators:  1
25.462962962962965%
Estimators:  10
27.77777777777778%
Estimators:  1
30.09259259259259%
Estimators:  5
32.407407407407405%
Estimators:  20
34.72222222222222%
Estimators:  5
37.03703703703704%
Estimators:  10
39.351851851851855%
Estimators:  1
41.66666666666667%
Estimators:  10
43.98148148148148%
Estimators:  20
46.2962962962963%
Estimators:  5
48.61111111111111%
Estimators:  20
50.92592592592593%
Estimators:  1
53.24074074074075%
Estimators:  10
55.55555555555556%
Estimators:  1
57.870370370370374%
Estimators:  5
60.18518518518518%
Estimators:  20
62.5%
Estimators:  5
64.81481481481481%
Estimators:  10
67.12962962962963%
Esti

In [25]:
forest = RandomForest(n_estimators=best_max_depth, max_depth=best_max_depth, criterion=best_criterion, max_features=best_max_feature)
forest.fit(X_train, y_train)
print(f"Training accuracy: {metrics.accuracy_score(y_train, forest.predict(X_train))}")
print(f"Validation accuracy: {metrics.accuracy_score(y_test, forest.predict(X_test))}")

Training accuracy: 0.9914285714285714
Validation accuracy: 0.8333333333333334


In [26]:
max_depths = [3, 4, 5, 6, 7, 8, 9, 10, 25]
criterions = ["entropy", "gini"]
n_estimators = [1, 5, 10, 20]
max_features = ["log2", "sqrt", None]
hyperparameter_list = list(product(max_depths, criterions, n_estimators, max_features))

kf = KFold(n_splits=5, shuffle=True, random_state=seed)
min_mse = float("inf")
counter = 0
N = len(hyperparameter_list)
for max_depth, criterion, n_estimator, max_feature in hyperparameter_list:
    if counter % 5 == 0:
        print(f"{counter/N*100}%")
        print("Estimators: ", n_estimator)
    counter += 1
    val_scores = []
    forest = RandomForestClassifier(n_estimators=n_estimator, max_depth=max_depth, criterion=criterion, max_features=max_feature)
    for train_index, val_index in kf.split(X_train):
        X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
        y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]
        forest.fit(X_train_fold, y_train_fold)
        val_score = metrics.mean_squared_error(y_val_fold, forest.predict(X_val_fold))
        val_scores.append(val_score)
    cv_score = np.mean(val_scores)
    if cv_score < min_mse:
        min_mse = cv_score
        best_max_depth = max_depth
        best_criterion = criterion
        best_n_estimator = n_estimator
        best_max_feature = max_feature

print(best_max_depth, best_criterion, best_n_estimator, best_max_feature)

0.0%
Estimators:  1
2.314814814814815%
Estimators:  5
4.62962962962963%
Estimators:  20
6.944444444444445%
Estimators:  5
9.25925925925926%
Estimators:  10
11.574074074074074%
Estimators:  1
13.88888888888889%
Estimators:  10
16.203703703703702%
Estimators:  20
18.51851851851852%
Estimators:  5
20.833333333333336%
Estimators:  20
23.14814814814815%
Estimators:  1
25.462962962962965%
Estimators:  10
27.77777777777778%
Estimators:  1
30.09259259259259%
Estimators:  5
32.407407407407405%
Estimators:  20
34.72222222222222%
Estimators:  5
37.03703703703704%
Estimators:  10
39.351851851851855%
Estimators:  1
41.66666666666667%
Estimators:  10
43.98148148148148%
Estimators:  20
46.2962962962963%
Estimators:  5
48.61111111111111%
Estimators:  20
50.92592592592593%
Estimators:  1
53.24074074074075%
Estimators:  10
55.55555555555556%
Estimators:  1
57.870370370370374%
Estimators:  5
60.18518518518518%
Estimators:  20
62.5%
Estimators:  5
64.81481481481481%
Estimators:  10
67.12962962962963%
Esti

In [27]:
forest = RandomForestClassifier(n_estimators=best_max_depth, max_depth=best_max_depth, criterion=best_criterion, max_features=best_max_feature)
forest.fit(X_train, y_train)
print(f"Training accuracy: {metrics.accuracy_score(y_train, forest.predict(X_train))}")
print(f"Validation accuracy: {metrics.accuracy_score(y_test, forest.predict(X_test))}")

Training accuracy: 0.98
Validation accuracy: 0.8733333333333333
