# Project 1 – Decision Trees and Random Forests

In [81]:
# Reload all modules without having to restart the kernel
# Useful for development if you have edited any of the external code files.
%load_ext autoreload
%autoreload 2

# Imports
import numpy as np
from sklearn.ensemble import RandomForestClassifier

# My implementations
from decision_tree import DecisionTree
from random_forest import RandomForest
from sklearn.model_selection import train_test_split, KFold
from itertools import product
from sklearn import metrics
from typing import Any

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Wine dataset

Do data loading, exploration and preprocessing as you see fit.

Here is some code to load the dataset to get you started.

In [82]:
# Parse the wine CSV into a structured array; the header row names the fields.
data = np.genfromtxt(
    "datasets/wine_dataset_small.csv",
    delimiter=",",
    dtype=float,
    names=True,
)

# The final column is the target; every column before it is a feature.
*feature_names, target_name = data.dtype.names

# One row per feature, then transpose so each row of X is one sample.
X = np.asarray([data[column] for column in feature_names]).T
y = data[target_name].astype(int)

print(f"Feature columns names: {feature_names}")
print(f"Target column name: {target_name}")
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

Feature columns names: ['citric_acid', 'residual_sugar', 'pH', 'sulphates', 'alcohol']
Target column name: type
X shape: (500, 5)
y shape: (500,)


In [83]:
# Single seed reused for NumPy's global RNG, the split, CV and the estimators.
seed = 0
np.random.seed(seed)

# Shuffled 70/30 hold-out split of the currently loaded X / y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    train_size=0.7,
    random_state=seed,
    shuffle=True,
)

In [104]:
# Candidate values for each forest hyperparameter.
max_depths = [7, 8, 9, 10, 12]
criterions = ["entropy", "gini"]
n_estimators = [5, 6, 7, 8, 9, 10]
max_features = ["log2", "sqrt", None]

# Full Cartesian grid of (max_depth, criterion, n_estimators, max_features)
# tuples; the right-most option varies fastest.
hyperparameter_list = [
    (depth, criterion, n_trees, feature_rule)
    for depth in max_depths
    for criterion in criterions
    for n_trees in n_estimators
    for feature_rule in max_features
]

# Shuffled 5-fold splitter used for cross-validation.
kf = KFold(n_splits=5, shuffle=True, random_state=seed)
def tune_hyperparameters(hyperparameter_list: list[Any], classifier: type) -> tuple[int, str, int, str | None]:
    mx_score = 0
    counter = 0
    N = len(hyperparameter_list)
    for max_depth, criterion, n_estimator, max_feature in hyperparameter_list:
        if counter % 5 == 0:
            print(f"{round(counter/N*100, 1)}%")
        counter += 1
        val_scores = []
        forest = classifier(n_estimators=n_estimator, max_depth=max_depth, criterion=criterion, max_features=max_feature, random_state=seed)
        for train_index, val_index in kf.split(X_train):
            X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
            y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]
            forest.fit(X_train_fold, y_train_fold)
            val_score = metrics.accuracy_score(y_val_fold, forest.predict(X_val_fold))
            val_scores.append(val_score)
        cv_score = np.mean(val_scores)
        if cv_score > mx_score:
            mx_score = cv_score
            best_max_depth = max_depth
            best_criterion = criterion
            best_n_estimator = n_estimator
            best_max_feature = max_feature
        # print(cv_score)
    return best_max_depth, best_criterion, best_n_estimator, best_max_feature


In [73]:
# Grid-search the custom RandomForest on the current training split.
(
    best_max_depth,
    best_criterion,
    best_n_estimator,
    best_max_feature,
) = tune_hyperparameters(hyperparameter_list, RandomForest)
print(best_max_depth, best_criterion, best_n_estimator, best_max_feature)

# Refit on the whole training split using the selected settings.
forest = RandomForest(
    n_estimators=best_n_estimator,
    max_depth=best_max_depth,
    criterion=best_criterion,
    max_features=best_max_feature,
    random_state=seed,
)
forest.fit(X_train, y_train)

# Accuracy on the training split, then on the held-out X_test / y_test split.
print(f"Training accuracy: {metrics.accuracy_score(y_train, forest.predict(X_train))}")
print(f"Validation accuracy: {metrics.accuracy_score(y_test, forest.predict(X_test))}")

0.0%
2.3%
4.6%
6.9%
9.3%
11.6%
13.9%
16.2%
18.5%
20.8%
23.1%
25.5%
27.8%
30.1%
32.4%
34.7%
37.0%
39.4%
41.7%
44.0%
46.3%
48.6%
50.9%
53.2%
55.6%
57.9%
60.2%
62.5%
64.8%
67.1%
69.4%
71.8%
74.1%
76.4%
78.7%
81.0%
83.3%
85.6%
88.0%
90.3%
92.6%
94.9%
97.2%
99.5%
10 entropy 50 log2
Training accuracy: 0.9971428571428571
Validation accuracy: 0.8466666666666667


In [74]:
# Same grid search, this time with scikit-learn's RandomForestClassifier.
(
    best_max_depth,
    best_criterion,
    best_n_estimator,
    best_max_feature,
) = tune_hyperparameters(hyperparameter_list, RandomForestClassifier)
print(best_max_depth, best_criterion, best_n_estimator, best_max_feature)

# Refit on the whole training split using the selected settings.
forest = RandomForestClassifier(
    n_estimators=best_n_estimator,
    max_depth=best_max_depth,
    criterion=best_criterion,
    max_features=best_max_feature,
    random_state=seed,
)
forest.fit(X_train, y_train)

# Accuracy on the training split, then on the held-out X_test / y_test split.
print(f"Training accuracy: {metrics.accuracy_score(y_train, forest.predict(X_train))}")
print(f"Validation accuracy: {metrics.accuracy_score(y_test, forest.predict(X_test))}")

0.0%
2.3%
4.6%
6.9%
9.3%
11.6%
13.9%
16.2%
18.5%
20.8%
23.1%
25.5%
27.8%
30.1%
32.4%
34.7%
37.0%
39.4%
41.7%
44.0%
46.3%
48.6%
50.9%
53.2%
55.6%
57.9%
60.2%
62.5%
64.8%
67.1%
69.4%
71.8%
74.1%
76.4%
78.7%
81.0%
83.3%
85.6%
88.0%
90.3%
92.6%
94.9%
97.2%
99.5%
6 entropy 20 log2
Training accuracy: 0.9514285714285714
Validation accuracy: 0.86


## Coffee dataset

In [75]:
# Parse the coffee CSV into a structured array; the header row names the fields.
# NOTE: this rebinds data, X, y and the train/test split used by the cells below.
data = np.genfromtxt(
    "datasets/coffee_data.csv",
    delimiter=",",
    dtype=float,
    names=True,
)

# The final column is the target; every column before it is a feature.
*feature_names, target_name = data.dtype.names

# One row per feature, then transpose so each row of X is one sample.
X = np.asarray([data[column] for column in feature_names]).T
y = data[target_name].astype(int)

print(f"Feature columns names: {feature_names}")
print(f"Target column name: {target_name}")
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

# Shuffled 70/30 hold-out split, seeded like the earlier one.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    train_size=0.7,
    random_state=seed,
    shuffle=True,
)

Feature columns names: ['Aroma', 'Flavor', 'Aftertaste', 'Acidity', 'Body', 'Balance', 'Uniformity', 'Sweetness']
Target column name: CountryofOrigin
X shape: (419, 8)
y shape: (419,)


In [105]:
# Grid-search the custom RandomForest on the current training split.
(
    best_max_depth,
    best_criterion,
    best_n_estimator,
    best_max_feature,
) = tune_hyperparameters(hyperparameter_list, RandomForest)
print(best_max_depth, best_criterion, best_n_estimator, best_max_feature)

# Refit on the whole training split using the selected settings.
forest = RandomForest(
    n_estimators=best_n_estimator,
    max_depth=best_max_depth,
    criterion=best_criterion,
    max_features=best_max_feature,
    random_state=seed,
)
forest.fit(X_train, y_train)

# Accuracy on the training split, then on the held-out X_test / y_test split.
print(f"Training accuracy: {metrics.accuracy_score(y_train, forest.predict(X_train))}")
print(f"Validation accuracy: {metrics.accuracy_score(y_test, forest.predict(X_test))}")


0.0%
2.8%
5.6%
8.3%
11.1%
13.9%


KeyboardInterrupt: 

In [102]:
# Grid-search scikit-learn's RandomForestClassifier on the current training split.
best_max_depth, best_criterion, best_n_estimator, best_max_feature = tune_hyperparameters(hyperparameter_list, RandomForestClassifier)
print(best_max_depth, best_criterion, best_n_estimator, best_max_feature)

# Refit on the whole training split using the selected settings.
forest = RandomForestClassifier(n_estimators=best_n_estimator, max_depth=best_max_depth, criterion=best_criterion, max_features=best_max_feature, random_state=seed)
forest.fit(X_train, y_train)

# Accuracy on the training split, then on the held-out X_test / y_test split.
print(f"Training accuracy: {metrics.accuracy_score(y_train, forest.predict(X_train))}")
print(f"Validation accuracy: {metrics.accuracy_score(y_test, forest.predict(X_test))}")

# Figures noted down previously (presumably from an earlier run — confirm).
# Kept as comments: a bare string literal at the end of a cell is an
# expression, so it was being echoed back as the cell's output value.
#   12 entropy 8 log2
#   Training accuracy: 0.9857142857142858
#   Validation accuracy: 0.8533333333333334

0.0%
2.8%
5.6%
8.3%
11.1%
13.9%
16.7%
19.4%
22.2%
25.0%
27.8%
30.6%
33.3%
36.1%
38.9%
41.7%
44.4%
47.2%
50.0%
52.8%
55.6%
58.3%
61.1%
63.9%
66.7%
69.4%
72.2%
75.0%
77.8%
80.6%
83.3%
86.1%
88.9%
91.7%
94.4%
97.2%
9 entropy 10 None
Training accuracy: 0.9685714285714285
Validation accuracy: 0.86


'\n12 entropy 8 log2\nTraining accuracy: 0.9857142857142858\nValidation accuracy: 0.8533333333333334\n'