# Project 1 – Decision Trees and Random Forests

In [None]:
# Reload all modules without having to restart the kernel
# Useful for development if you have edited any of the external code files.
%load_ext autoreload
%autoreload 2
 
# Imports
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

# My implementations
from decision_tree import DecisionTree
from random_forest import RandomForest
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

## Wine dataset

In [None]:
# Read the wine CSV into a structured array; names=True takes the field names
# from the header row.
data = np.genfromtxt(
    "datasets/wine_dataset_small.csv",
    delimiter=",",
    dtype=float,
    names=True,
)

# Every column except the last is a feature; the last column is the target.
*feature_names, target_name = data.dtype.names

# Stack the per-feature columns and transpose so that features run along the
# second axis of X.
feature_columns = [data[name] for name in feature_names]
X = np.transpose(np.array(feature_columns))
y = data[target_name].astype(int)

print(
    f"Feature columns names: {feature_names}",
    f"Target column name: {target_name}",
    f"X shape: {X.shape}",
    f"y shape: {y.shape}",
    sep="\n",
)

In [21]:
# One seed for NumPy's legacy global RNG and for every random_state argument
# used further down.
seed = 42
np.random.seed(seed)

# Shuffled 70/30 hold-out split of the wine data.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    test_size=0.3,
    shuffle=True,
    random_state=seed,
)

### Decision tree

In [22]:
# Hyperparameter grid for the decision-tree searches:
# depths 1..10 plus 12 and unlimited (None), both split criteria,
# and three feature-subsampling strategies.
param_grid = dict(
    max_depth=[*range(1, 11), 12, None],
    criterion=["entropy", "gini"],
    max_features=["log2", "sqrt", None],
)

Our implementation

In [None]:
# Exhaustive 5-fold cross-validated grid search over our own DecisionTree,
# scored on accuracy and run on all available cores (n_jobs=-1).
grid_search = GridSearchCV(
    estimator=DecisionTree(random_state=seed),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
    scoring="accuracy",
)
grid_search.fit(X_train, y_train)

# The winning estimator is a single tree, so it is named `tree` here
# (it was previously bound to the copy-pasted name `forest`).
tree = grid_search.best_estimator_
print(tree.get_params())

train_accuracy = metrics.accuracy_score(y_train, tree.predict(X_train))
print(f"Training accuracy: {train_accuracy}")

# NOTE(review): the label says "Validation", but this score is computed on the
# held-out test split (X_test / y_test) produced by train_test_split.
test_accuracy = metrics.accuracy_score(y_test, tree.predict(X_test))
print(f"Validation accuracy: {test_accuracy}")

Sklearn's implementation

In [None]:
# Same 5-fold grid search as for our implementation, but over scikit-learn's
# DecisionTreeClassifier, to serve as the reference result.
grid_search = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=seed),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
    scoring="accuracy",
)
grid_search.fit(X_train, y_train)

# The winning estimator is a single tree, so it is named `tree` here
# (it was previously bound to the copy-pasted name `forest`).
tree = grid_search.best_estimator_
print(tree.get_params())

train_accuracy = metrics.accuracy_score(y_train, tree.predict(X_train))
print(f"Training accuracy: {train_accuracy}")

# NOTE(review): the label says "Validation", but this score is computed on the
# held-out test split (X_test / y_test) produced by train_test_split.
test_accuracy = metrics.accuracy_score(y_test, tree.predict(X_test))
print(f"Validation accuracy: {test_accuracy}")

### Random forest

In [25]:
# Hyperparameter grid for the random-forest searches: the decision-tree
# settings plus the number of trees in the ensemble.
param_grid = dict(
    n_estimators=[*range(5, 11), 25, 30, 50, 75, 100],
    max_depth=[*range(1, 11), 12, None],
    criterion=["entropy", "gini"],
    max_features=["log2", "sqrt", None],
)

Our implementation

In [None]:
# Exhaustive 5-fold cross-validated grid search over our own RandomForest,
# scored on accuracy and run on all available cores (n_jobs=-1).
grid_search = GridSearchCV(
    estimator=RandomForest(random_state=seed),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
    scoring="accuracy",
)
grid_search.fit(X_train, y_train)

# Keep the best configuration found by the search and report its settings.
forest = grid_search.best_estimator_
print(forest.get_params())

train_accuracy = metrics.accuracy_score(y_train, forest.predict(X_train))
print(f"Training accuracy: {train_accuracy}")

# NOTE(review): the label says "Validation", but this score is computed on the
# held-out test split (X_test / y_test) produced by train_test_split.
test_accuracy = metrics.accuracy_score(y_test, forest.predict(X_test))
print(f"Validation accuracy: {test_accuracy}")

Sklearn's implementation

In [None]:
# Same 5-fold grid search as for our implementation, but over scikit-learn's
# RandomForestClassifier, to serve as the reference result.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=seed),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
    scoring="accuracy",
)
grid_search.fit(X_train, y_train)

# Keep the best configuration found by the search and report its settings.
forest = grid_search.best_estimator_
print(forest.get_params())

train_accuracy = metrics.accuracy_score(y_train, forest.predict(X_train))
print(f"Training accuracy: {train_accuracy}")

# NOTE(review): the label says "Validation", but this score is computed on the
# held-out test split (X_test / y_test) produced by train_test_split.
test_accuracy = metrics.accuracy_score(y_test, forest.predict(X_test))
print(f"Validation accuracy: {test_accuracy}")

## Coffee dataset

In [None]:
# Read the coffee CSV into a structured array; names=True takes the field names
# from the header row.
data = np.genfromtxt(
    "datasets/coffee_data.csv",
    delimiter=",",
    dtype=float,
    names=True,
)

# Every column except the last is a feature; the last column is the target.
*feature_names, target_name = data.dtype.names

# Stack the per-feature columns and transpose so that features run along the
# second axis of X.
feature_columns = [data[name] for name in feature_names]
X = np.transpose(np.array(feature_columns))
y = data[target_name].astype(int)

print(
    f"Feature columns names: {feature_names}",
    f"Target column name: {target_name}",
    f"X shape: {X.shape}",
    f"y shape: {y.shape}",
    sep="\n",
)

# Shuffled 70/30 hold-out split; `seed` is the value set in the wine section.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    test_size=0.3,
    shuffle=True,
    random_state=seed,
)

### Decision tree

In [29]:
# Hyperparameter grid for the decision-tree searches on the coffee data:
# depths 1..10 plus 12 and unlimited (None), both split criteria,
# and three feature-subsampling strategies.
param_grid = dict(
    max_depth=[*range(1, 11), 12, None],
    criterion=["entropy", "gini"],
    max_features=["log2", "sqrt", None],
)

Our implementation

In [None]:
# Exhaustive 5-fold cross-validated grid search over our own DecisionTree,
# scored on accuracy and run on all available cores (n_jobs=-1).
grid_search = GridSearchCV(
    estimator=DecisionTree(random_state=seed),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
    scoring="accuracy",
)
grid_search.fit(X_train, y_train)

# The winning estimator is a single tree, so it is named `tree` here
# (it was previously bound to the copy-pasted name `forest`).
tree = grid_search.best_estimator_
print(tree.get_params())

train_accuracy = metrics.accuracy_score(y_train, tree.predict(X_train))
print(f"Training accuracy: {train_accuracy}")

# NOTE(review): the label says "Validation", but this score is computed on the
# held-out test split (X_test / y_test) produced by train_test_split.
test_accuracy = metrics.accuracy_score(y_test, tree.predict(X_test))
print(f"Validation accuracy: {test_accuracy}")

Sklearn's implementation

In [None]:
# Same 5-fold grid search as for our implementation, but over scikit-learn's
# DecisionTreeClassifier, to serve as the reference result.
grid_search = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=seed),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
    scoring="accuracy",
)
grid_search.fit(X_train, y_train)

# The winning estimator is a single tree, so it is named `tree` here
# (it was previously bound to the copy-pasted name `forest`).
tree = grid_search.best_estimator_
print(tree.get_params())

train_accuracy = metrics.accuracy_score(y_train, tree.predict(X_train))
print(f"Training accuracy: {train_accuracy}")

# NOTE(review): the label says "Validation", but this score is computed on the
# held-out test split (X_test / y_test) produced by train_test_split.
test_accuracy = metrics.accuracy_score(y_test, tree.predict(X_test))
print(f"Validation accuracy: {test_accuracy}")

### Random forest

In [32]:
# Hyperparameter grid for the random-forest searches on the coffee data: the
# decision-tree settings plus the number of trees in the ensemble.
param_grid = dict(
    n_estimators=[*range(5, 11), 25, 30, 50, 75, 100],
    max_depth=[*range(1, 11), 12, None],
    criterion=["entropy", "gini"],
    max_features=["log2", "sqrt", None],
)

Our implementation

In [None]:
# Exhaustive 5-fold cross-validated grid search over our own RandomForest,
# scored on accuracy and run on all available cores (n_jobs=-1).
grid_search = GridSearchCV(
    estimator=RandomForest(random_state=seed),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
    scoring="accuracy",
)
grid_search.fit(X_train, y_train)

# Keep the best configuration found by the search and report its settings.
forest = grid_search.best_estimator_
print(forest.get_params())

train_accuracy = metrics.accuracy_score(y_train, forest.predict(X_train))
print(f"Training accuracy: {train_accuracy}")

# NOTE(review): the label says "Validation", but this score is computed on the
# held-out test split (X_test / y_test) produced by train_test_split.
test_accuracy = metrics.accuracy_score(y_test, forest.predict(X_test))
print(f"Validation accuracy: {test_accuracy}")

Sklearn's implementation

In [None]:
# Same 5-fold grid search as for our implementation, but over scikit-learn's
# RandomForestClassifier, to serve as the reference result.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=seed),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
    scoring="accuracy",
)
grid_search.fit(X_train, y_train)

# Keep the best configuration found by the search and report its settings.
forest = grid_search.best_estimator_
print(forest.get_params())

train_accuracy = metrics.accuracy_score(y_train, forest.predict(X_train))
print(f"Training accuracy: {train_accuracy}")

# NOTE(review): the label says "Validation", but this score is computed on the
# held-out test split (X_test / y_test) produced by train_test_split.
test_accuracy = metrics.accuracy_score(y_test, forest.predict(X_test))
print(f"Validation accuracy: {test_accuracy}")