# Imports 

In [1]:
# conda env create -f env.yaml 

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import os

# Data setup

In [2]:
np.random.seed(0)
os.environ["OMP_NUM_THREADS"] = "5"

df3 = pd.read_csv("Data/data3.csv")
df3

Unnamed: 0,failures,higher,studytime,Medu,Fedu,Dalc,age,reason_reputation,school,address,internet,G3
0,0,1,2,4,4,1,18,0,0,0,0,6
1,0,1,2,1,1,1,17,0,0,0,1,6
2,3,1,2,1,1,2,15,0,0,0,1,10
3,0,1,3,4,2,1,15,0,0,0,1,15
4,0,1,2,3,3,1,16,0,0,0,0,10
...,...,...,...,...,...,...,...,...,...,...,...,...
1039,1,1,3,2,3,1,19,0,1,1,1,10
1040,0,1,2,3,1,1,18,0,1,0,1,16
1041,0,1,2,1,1,1,18,0,1,0,0,9
1042,0,1,1,3,1,3,17,0,1,0,1,10


In [3]:
df3_copy = df3.copy()
y = df3_copy.pop("G3")
y_binary = [0 if y_i > np.mean(y) else 1 for y_i in y] #making the target class into binary (over/under average score)
X = df3_copy

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y_binary, test_size=0.2, random_state=42)
X_train.shape

(835, 11)

Note: No need to scale, trees are not affected by different scales.

# Tree 1

In [8]:
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from scipy.stats import randint
from sklearn import tree

In [9]:
parameters = {"max_depth": [3, None],
              "max_features": randint(1, X.shape[1]),
              "min_samples_leaf": randint(1, 9),
              "criterion": ["gini", "entropy"]}

clf = tree.DecisionTreeClassifier()
tree_cv = RandomizedSearchCV(clf, parameters, cv=10, random_state = 10)
tree_cv.fit(X_train,y_train)

RandomizedSearchCV(cv=10, estimator=DecisionTreeClassifier(),
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': [3, None],
                                        'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000017FC18B3430>,
                                        'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000017FC18B35B0>},
                   random_state=10)

In [None]:
#takes a few seconds and hard to see but just for sanity check
tree.plot_tree(tree_cv.best_estimator_)

[Text(0.6949633379888268, 0.9615384615384616, 'X[3] <= 3.5\nentropy = 1.0\nsamples = 835\nvalue = [413, 422]'),
 Text(0.510038407821229, 0.8846153846153846, 'X[0] <= 0.5\nentropy = 0.984\nsamples = 586\nvalue = [250, 336]'),
 Text(0.360858938547486, 0.8076923076923077, 'X[8] <= 0.5\nentropy = 0.999\nsamples = 466\nvalue = [240, 226]'),
 Text(0.21613128491620112, 0.7307692307692307, 'X[9] <= 0.5\nentropy = 0.989\nsamples = 339\nvalue = [190, 149]'),
 Text(0.0914804469273743, 0.6538461538461539, 'X[10] <= 0.5\nentropy = 0.974\nsamples = 269\nvalue = [160, 109]'),
 Text(0.0223463687150838, 0.5769230769230769, 'X[6] <= 15.5\nentropy = 0.974\nsamples = 37\nvalue = [15, 22]'),
 Text(0.0111731843575419, 0.5, 'entropy = 0.0\nsamples = 7\nvalue = [0, 7]'),
 Text(0.0335195530726257, 0.5, 'X[5] <= 1.5\nentropy = 1.0\nsamples = 30\nvalue = [15, 15]'),
 Text(0.0223463687150838, 0.4230769230769231, 'X[3] <= 2.5\nentropy = 0.988\nsamples = 23\nvalue = [10, 13]'),
 Text(0.0111731843575419, 0.346153846

# Evaluation

#### Training results

In [10]:
print("Best Decision Tree Parameters: {}".format(tree_cv.best_params_))
print("Best score is {}".format(tree_cv.best_score_))

Best Decision Tree Parameters: {'criterion': 'entropy', 'max_depth': None, 'max_features': 7, 'min_samples_leaf': 7}
Best score is 0.6685312679288582


In [11]:
tree_cv.cv_results_

{'mean_fit_time': array([0.00249982, 0.00238814, 0.00190055, 0.00200238, 0.00200028,
        0.00189998, 0.0017972 , 0.00179915, 0.00210016, 0.00229933]),
 'std_fit_time': array([7.97743642e-04, 4.77971780e-04, 3.00289601e-04, 4.52696240e-04,
        1.64422413e-06, 2.99303176e-04, 3.98775526e-04, 3.99616392e-04,
        2.99796191e-04, 4.57773239e-04]),
 'mean_score_time': array([0.00180745, 0.00160854, 0.00139949, 0.00139771, 0.00139973,
        0.00129991, 0.00120261, 0.00140369, 0.00129974, 0.00129771]),
 'std_score_time': array([0.00060872, 0.00049671, 0.00049019, 0.00048697, 0.00048927,
        0.0004577 , 0.00039895, 0.00048776, 0.00045807, 0.00045369]),
 'param_criterion': masked_array(data=['entropy', 'gini', 'gini', 'gini', 'gini', 'gini',
                    'entropy', 'gini', 'gini', 'entropy'],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value='?',
             dtype=object),
 'param_max_depth'

In [12]:
print(f"Mean test scores: ", tree_cv.cv_results_["mean_test_score"])

Mean test scores:  [0.65283993 0.6528973  0.63486804 0.64674412 0.63237235 0.64079174
 0.62155766 0.61069994 0.64566839 0.66853127]


#### Test set results

In [13]:
#cross validation

CV_test = cross_val_score(tree_cv, X_test, y_test, cv=10)
print(CV_test)
print("Mean test accuracy: ",np.mean(CV_test))

[0.57142857 0.71428571 0.71428571 0.61904762 0.61904762 0.42857143
 0.71428571 0.61904762 0.66666667 0.55      ]
Mean test accuracy:  0.6216666666666667


In [14]:
#overall test
print("Training accuracy: ",tree_cv.score(X_train, y_train))
print("Test accuracy: ", tree_cv.score(X_test, y_test))

Training accuracy:  0.7401197604790419
Test accuracy:  0.6267942583732058


# Tree 2

TODO

# Tree 3

TODO