# Lab 8

## TODO:

- [x] Split into training and test set (stratified)
- [x] Decision tree on dataset
- [ ] Add option to run on the smile dataset (optional for more consistent results)
- [ ] Make table (plot accuracy vs some hyperparams)
  - [ ] 10-fold CV
  - [ ] Accuracy
  - [ ] TP rate
  - [ ] FP rate
  - [ ] precision
  - [ ] recall
  - [ ] F measure
  - [ ] ROC area
  - [ ] Graph table
- [x] Random search hyperparameters
- [ ] 3 trees with different train/test sets (optional, exact details unclear)

## Notes:
- When doing k-fold validation make sure that the folds are stratified

## Imports 

In [30]:
# conda env create -f env.yaml 

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import os

## Data setup

In [31]:
np.random.seed(0)
os.environ["OMP_NUM_THREADS"] = "5"

df3 = pd.read_csv("Data/data3.csv")
df3

Unnamed: 0,failures,higher,studytime,Medu,Fedu,Dalc,age,reason_reputation,school,address,internet,G3
0,0,1,2,4,4,1,18,0,0,0,0,6
1,0,1,2,1,1,1,17,0,0,0,1,6
2,3,1,2,1,1,2,15,0,0,0,1,10
3,0,1,3,4,2,1,15,0,0,0,1,15
4,0,1,2,3,3,1,16,0,0,0,0,10
...,...,...,...,...,...,...,...,...,...,...,...,...
1039,1,1,3,2,3,1,19,0,1,1,1,10
1040,0,1,2,3,1,1,18,0,1,0,1,16
1041,0,1,2,1,1,1,18,0,1,0,0,9
1042,0,1,1,3,1,3,17,0,1,0,1,10


In [32]:
df3_copy = df3.copy()
y = df3_copy.pop("G3")
y_binary = [0 if y_i > np.mean(y) else 1 for y_i in y] #making the target class into binary (over/under average score)
X = df3_copy
# display(X)
# print(y_binary)

In [33]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y_binary, test_size=0.2, stratify=y_binary, random_state=42,)
X_train.shape

(835, 11)

Note: No need to scale, trees are not affected by different scales.

## Tree 1

In [34]:
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from scipy.stats import randint
from sklearn import tree

In [35]:
parameters = {"max_depth": [3, None],
              "max_features": randint(1, X.shape[1]),
              "min_samples_leaf": randint(1, 9),
              "criterion": ["gini", "entropy"]}

clf = tree.DecisionTreeClassifier()
tree_cv = RandomizedSearchCV(clf, parameters, cv=10, random_state = 10)
tree_cv.fit(X_train,y_train)

In [36]:
#takes a few seconds and hard to see but just for sanity check
# tree.plot_tree(tree_cv.best_estimator_)

## Evaluation

### Training results

In [37]:
print("Best Decision Tree Parameters: {}".format(tree_cv.best_params_))
print("Best score is {}".format(tree_cv.best_score_))

Best Decision Tree Parameters: {'criterion': 'entropy', 'max_depth': None, 'max_features': 7, 'min_samples_leaf': 7}
Best score is 0.6633820998278829


In [38]:
tree_cv.cv_results_

{'mean_fit_time': array([0.00110202, 0.00145452, 0.00111938, 0.0011169 , 0.00121758,
        0.00104852, 0.00100079, 0.00096245, 0.00120804, 0.0012512 ]),
 'std_fit_time': array([5.37394513e-04, 4.84099529e-04, 3.76303710e-04, 2.98383194e-04,
        4.48861098e-04, 4.21594607e-04, 4.37184528e-06, 7.91920457e-04,
        6.65884486e-04, 4.03842908e-04]),
 'mean_score_time': array([0.00079687, 0.00053179, 0.00048356, 0.00059123, 0.00076258,
        0.00089195, 0.00100195, 0.00065501, 0.00084229, 0.00095692]),
 'std_score_time': array([5.85727005e-04, 4.86107293e-04, 4.86531779e-04, 6.14994546e-04,
        5.27531871e-04, 5.05701301e-04, 6.53132021e-06, 5.51216712e-04,
        6.76291283e-04, 3.60801809e-04]),
 'param_criterion': masked_array(data=['entropy', 'gini', 'gini', 'gini', 'gini', 'gini',
                    'entropy', 'gini', 'gini', 'entropy'],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value='?'

In [43]:
table = pd.DataFrame(tree_cv.cv_results_["params"])
table["test_method"] = "10CV"
table["accuracy"] = tree_cv.cv_results_["mean_test_score"]
table = table.sort_values(by=["accuracy"], ascending=False)
table

Unnamed: 0,criterion,max_depth,max_features,min_samples_leaf,test_method,accuracy
9,entropy,,7,7,10CV,0.663382
0,entropy,,5,8,10CV,0.650158
1,gini,,10,6,10CV,0.639443
5,gini,3.0,7,1,10CV,0.631096
3,gini,,1,3,10CV,0.626291
4,gini,3.0,5,4,10CV,0.620324
6,entropy,3.0,2,1,10CV,0.620209
8,gini,,6,6,10CV,0.617814
7,gini,3.0,2,4,10CV,0.608405
2,gini,,2,3,10CV,0.605938


#### Test set results

In [40]:
#cross validation

CV_test = cross_val_score(tree_cv, X_test, y_test, cv=10)
print(CV_test)
print("Mean test accuracy: ",np.mean(CV_test))

[0.52380952 0.47619048 0.57142857 0.57142857 0.57142857 0.61904762
 0.66666667 0.66666667 0.61904762 0.65      ]
Mean test accuracy:  0.5935714285714285


In [41]:
#overall test
print("Training accuracy: ",tree_cv.score(X_train, y_train))
print("Test accuracy: ", tree_cv.score(X_test, y_test))

Training accuracy:  0.7365269461077845
Test accuracy:  0.6220095693779905


## Tree 2

TODO

## Tree 3

TODO