# Project 1 – Decision Trees and Random Forests

In [65]:
# Reload all modules without having to restart the kernel
# Useful for development if you have edited any of the external code files.
%load_ext autoreload
%autoreload 2

# Imports
import numpy as np
from sklearn.ensemble import RandomForestClassifier

# My implementations
from decision_tree import DecisionTree
from random_forest import RandomForest
from sklearn.model_selection import train_test_split, KFold
from itertools import product
from sklearn import metrics
from typing import Any

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Dataset

Do data loading, exploration and preprocessing as you see fit.

Here is some code to load the dataset to get you started.

In [66]:
# Read the CSV into a structured array; names=True takes the field names
# from the header line.
data = np.genfromtxt(
    "datasets/wine_dataset_small.csv",
    delimiter=",",
    dtype=float,
    names=True,
)

# Every column but the last is a feature; the last column is the target.
*feature_names, target_name = data.dtype.names

# One row per feature column, transposed so samples run along axis 0.
X = np.transpose(np.array([data[name] for name in feature_names]))
y = data[target_name].astype(int)

# Quick look at what was loaded.
print(f"Feature columns names: {feature_names}")
print(f"Target column name: {target_name}")
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")
print(X)

Feature columns names: ['citric_acid', 'residual_sugar', 'pH', 'sulphates', 'alcohol']
Target column name: type
X shape: (500, 5)
y shape: (500,)
[[ 0.13  1.6   3.34  0.59  9.2 ]
 [ 0.1   2.8   3.6   0.66 10.2 ]
 [ 0.32  1.9   3.2   0.55  9.5 ]
 ...
 [ 0.36  7.    3.04  0.32 11.  ]
 [ 0.42  2.1   3.11  0.78 12.4 ]
 [ 0.15  9.7   3.05  0.3   9.1 ]]


In [67]:
# A single seed drives NumPy's global RNG and the split below.
seed = 4206969
np.random.seed(seed)

# Shuffled 70/30 split of the loaded data into training and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    test_size=0.3,
    shuffle=True,
    random_state=seed,
)

In [68]:
# Candidate values for each forest hyperparameter.
max_depths = [*range(3, 11), 25]
criterions = ["entropy", "gini"]
n_estimators = [5, 10, 20, 50]
max_features = ["log2", "sqrt", None]

# Full grid: one (max_depth, criterion, n_estimators, max_features) tuple per
# combination, with the right-most value varying fastest.
hyperparameter_list = [
    (depth, split_criterion, tree_count, feature_rule)
    for depth in max_depths
    for split_criterion in criterions
    for tree_count in n_estimators
    for feature_rule in max_features
]
def tune_hyperparameters(hyperparameter_list: list[Any], classifier: type, X=None, y=None) -> tuple[int, str, int, str]:
    """Return the hyperparameter combination with the lowest 5-fold CV error.

    Each combination is scored by fitting a fresh ``classifier`` on every
    training fold and averaging ``metrics.mean_squared_error`` between the
    validation labels and the predicted labels. Progress is printed as a
    percentage every fifth combination.

    Args:
        hyperparameter_list: Non-empty list of
            ``(max_depth, criterion, n_estimators, max_features)`` tuples.
        classifier: Estimator class. It is called with the keyword arguments
            ``n_estimators``, ``max_depth``, ``criterion``, ``max_features``
            and ``random_state``, and the instance must provide
            ``fit(X, y)`` and ``predict(X)``.
        X: Feature matrix to cross-validate on. Defaults to the
            notebook-level ``X_train``.
        y: Labels aligned with ``X``. Defaults to the notebook-level
            ``y_train``.

    Returns:
        ``(max_depth, criterion, n_estimators, max_features)`` of the
        combination with the lowest mean validation error. The earliest
        combination wins ties. The last element is ``None`` when the winning
        grid entry has ``max_features=None``.

    Raises:
        ValueError: If ``hyperparameter_list`` is empty.
    """
    N = len(hyperparameter_list)
    if N == 0:
        # Without this guard the function would reach the return statement
        # with no best_* value ever assigned.
        raise ValueError("hyperparameter_list must contain at least one combination")

    # Fall back to the notebook-level training split when no data is passed.
    features = X_train if X is None else X
    labels = y_train if y is None else y

    # `seed` is only read here, so no `global` declaration is needed.
    kf = KFold(n_splits=5, shuffle=True, random_state=seed)

    best_params = None
    min_mse = float("inf")
    for counter, (max_depth, criterion, n_estimator, max_feature) in enumerate(hyperparameter_list):
        if counter % 5 == 0:
            print(f"{round(counter/N*100, 2)}%")

        val_scores = []
        for train_index, val_index in kf.split(features):
            # A new estimator per fold, so no fitted state can carry over
            # from one fold to the next.
            forest = classifier(
                n_estimators=n_estimator,
                max_depth=max_depth,
                criterion=criterion,
                max_features=max_feature,
                random_state=seed,
            )
            forest.fit(features[train_index], labels[train_index])
            # NOTE(review): on 0/1 labels this equals the misclassification
            # rate; with more than two classes it would weight mistakes by
            # label distance - confirm the targets are binary.
            val_scores.append(
                metrics.mean_squared_error(labels[val_index], forest.predict(features[val_index]))
            )

        cv_score = np.mean(val_scores)
        # The first combination is always recorded so a result exists even if
        # every score is NaN; afterwards only strict improvements replace it.
        if best_params is None or cv_score < min_mse:
            min_mse = cv_score
            best_params = (max_depth, criterion, n_estimator, max_feature)

    return best_params

## Wine dataset

In [70]:
print(seed)

# Cross-validated grid search using the custom RandomForest implementation.
best_max_depth, best_criterion, best_n_estimator, best_max_feature = tune_hyperparameters(
    hyperparameter_list, RandomForest
)
print(best_max_depth, best_criterion, best_n_estimator, best_max_feature)

# Refit on the whole training split with the selected settings.
forest = RandomForest(
    n_estimators=best_n_estimator,
    max_depth=best_max_depth,
    criterion=best_criterion,
    max_features=best_max_feature,
    random_state=seed,
)
forest.fit(X_train, y_train)

# Accuracy on the training split, then on the held-out X_test / y_test split.
train_accuracy = metrics.accuracy_score(y_train, forest.predict(X_train))
print(f"Training accuracy: {train_accuracy}")
heldout_accuracy = metrics.accuracy_score(y_test, forest.predict(X_test))
print(f"Validation accuracy: {heldout_accuracy}")

4206969
0.0%
2.31%
4.63%
6.94%
9.26%
11.57%
13.89%
16.2%
18.52%
20.83%
23.15%
25.46%
27.78%
30.09%
32.41%
34.72%
37.04%
39.35%
41.67%
43.98%
46.3%
48.61%
50.93%
53.24%
55.56%
57.87%
60.19%
62.5%
64.81%
67.13%
69.44%
71.76%
74.07%
76.39%
78.7%
81.02%
83.33%
85.65%
87.96%
90.28%
92.59%
94.91%
97.22%
99.54%
10 gini 5 log2
Training accuracy: 0.9057142857142857
Validation accuracy: 0.62


In [31]:
# Same grid search, this time with scikit-learn's RandomForestClassifier.
best_max_depth, best_criterion, best_n_estimator, best_max_feature = tune_hyperparameters(
    hyperparameter_list, RandomForestClassifier
)

# Refit on the whole training split with the selected settings.
forest = RandomForestClassifier(
    n_estimators=best_n_estimator,
    max_depth=best_max_depth,
    criterion=best_criterion,
    max_features=best_max_feature,
    random_state=seed,
)
forest.fit(X_train, y_train)

# Accuracy on the training split, then on the held-out X_test / y_test split.
train_accuracy = metrics.accuracy_score(y_train, forest.predict(X_train))
print(f"Training accuracy: {train_accuracy}")
heldout_accuracy = metrics.accuracy_score(y_test, forest.predict(X_test))
print(f"Validation accuracy: {heldout_accuracy}")

0.0%
2.31%
4.63%
6.94%
9.26%
11.57%
13.89%
16.2%
18.52%
20.83%
23.15%
25.46%
27.78%
30.09%
32.41%
34.72%
37.04%
39.35%
41.67%
43.98%
46.3%
48.61%
50.93%
53.24%
55.56%
57.87%
60.19%
62.5%
64.81%
67.13%
69.44%
71.76%
74.07%
76.39%
78.7%
81.02%
83.33%
85.65%
87.96%
90.28%
92.59%
94.91%
97.22%
99.54%
Training accuracy: 0.9485714285714286
Validation accuracy: 0.8533333333333334


## Coffee dataset

In [None]:
# NOTE(review): this cell sits under the "Coffee dataset" heading but reads
# the wine CSV, so the cells below re-run the wine experiment. Confirm the
# intended file path before trusting any "coffee" results.
data = np.genfromtxt(
    "datasets/wine_dataset_small.csv",
    delimiter=",",
    dtype=float,
    names=True,
)

# Every column but the last is a feature; the last column is the target.
*feature_names, target_name = data.dtype.names

# One row per feature column, transposed so samples run along axis 0.
X = np.transpose(np.array([data[name] for name in feature_names]))
y = data[target_name].astype(int)

# Shuffled 70/30 split, reusing the notebook-level seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    test_size=0.3,
    shuffle=True,
    random_state=seed,
)

In [None]:
# Cross-validated grid search using the custom RandomForest implementation.
best_max_depth, best_criterion, best_n_estimator, best_max_feature = tune_hyperparameters(hyperparameter_list, RandomForest)
print(best_max_depth, best_criterion, best_n_estimator, best_max_feature)

# Fit the same class that was tuned. The cell previously built a
# RandomForestClassifier here, so the accuracies it printed belonged to a
# different model than the one the hyperparameters were selected for.
forest = RandomForest(n_estimators=best_n_estimator, max_depth=best_max_depth, criterion=best_criterion, max_features=best_max_feature, random_state=seed)
forest.fit(X_train, y_train)

# Accuracy on the training split, then on the held-out X_test / y_test split.
print(f"Training accuracy: {metrics.accuracy_score(y_train, forest.predict(X_train))}")
print(f"Validation accuracy: {metrics.accuracy_score(y_test, forest.predict(X_test))}")

In [None]:
# Grid search with scikit-learn's RandomForestClassifier.
best_max_depth, best_criterion, best_n_estimator, best_max_feature = tune_hyperparameters(
    hyperparameter_list, RandomForestClassifier
)

# Refit on the whole training split with the selected settings.
forest = RandomForestClassifier(
    n_estimators=best_n_estimator,
    max_depth=best_max_depth,
    criterion=best_criterion,
    max_features=best_max_feature,
    random_state=seed,
)
forest.fit(X_train, y_train)

# Accuracy on the training split, then on the held-out X_test / y_test split.
train_accuracy = metrics.accuracy_score(y_train, forest.predict(X_train))
print(f"Training accuracy: {train_accuracy}")
heldout_accuracy = metrics.accuracy_score(y_test, forest.predict(X_test))
print(f"Validation accuracy: {heldout_accuracy}")