# Lab 8

## TODO:

- [x] Split into training and test set (stratified)
- [x] Decision tree on dataset
- [ ] Add option to run on the smile dataset (optional for more consistent results)
- [x] Make table (plot accuracy vs some hyperparams)
  - [x] 10-fold CV
  - [x] Accuracy
  - [-] TP rate
  - [-] FP rate
  - [-] precision
  - [-] recall
  - [x] F measure
  - [ ] ROC area
  - [ ] Graph table
- [x] Random search hyperparameters
- [x] 3 trees with different train/test sets (optional, exact details unclear)

## Notes:
- When doing k-fold validation make sure that the folds are stratified

## Imports 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import os

## Data setup

In [2]:
np.random.seed(0)
os.environ["OMP_NUM_THREADS"] = "5"

df3 = pd.read_csv("Data/data3.csv")
df3

Unnamed: 0,failures,higher,studytime,Medu,Fedu,Dalc,age,reason_reputation,school,address,internet,G3
0,0,1,2,4,4,1,18,0,0,0,0,6
1,0,1,2,1,1,1,17,0,0,0,1,6
2,3,1,2,1,1,2,15,0,0,0,1,10
3,0,1,3,4,2,1,15,0,0,0,1,15
4,0,1,2,3,3,1,16,0,0,0,0,10
...,...,...,...,...,...,...,...,...,...,...,...,...
1039,1,1,3,2,3,1,19,0,1,1,1,10
1040,0,1,2,3,1,1,18,0,1,0,1,16
1041,0,1,2,1,1,1,18,0,1,0,0,9
1042,0,1,1,3,1,3,17,0,1,0,1,10


In [3]:
df3_copy = df3.copy()
y = df3_copy.pop("G3")
y_binary = [0 if y_i > np.mean(y) else 1 for y_i in y] #making the target class into binary (over/under average score)
X = df3_copy
# display(X)
# print(y_binary)

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y_binary, test_size=0.2, stratify=y_binary, random_state=42,)
X_train.shape

(835, 11)

Note: No need to scale, trees are not affected by different scales.

## Tree 1

In [5]:
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from scipy.stats import randint
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

In [6]:
parameters = {"max_depth": [3, None],
              "max_features": randint(1, X.shape[1]),
              "min_samples_leaf": randint(1, 9),
              "criterion": ["gini", "entropy"]}

clf = DecisionTreeClassifier()
tree_cv = RandomizedSearchCV(clf, parameters, cv=10, random_state = 10)
tree_cv.fit(X_train,y_train)


In [7]:
#takes a few seconds and hard to see but just for sanity check
# tree.plot_tree(tree_cv.best_estimator_)

## Evaluation

### Training results

In [8]:
print("Best Decision Tree Parameters: {}".format(tree_cv.best_params_))
print("Best score is {}".format(tree_cv.best_score_))

Best Decision Tree Parameters: {'criterion': 'entropy', 'max_depth': None, 'max_features': 7, 'min_samples_leaf': 7}
Best score is 0.6633820998278829


In [9]:
# tree_cv.cv_results_

In [10]:
table = pd.DataFrame(tree_cv.cv_results_["params"])
table["classifier"] = "DecisionTreeClassifier"
table["test_method"] = "10CV"
table["accuracy"] = tree_cv.cv_results_["mean_test_score"]
table = table.sort_values(by=["accuracy"], ascending=False)

#### Test set results

In [11]:
#cross validation

CV_test = cross_val_score(tree_cv, X_test, y_test, cv=10)
print(CV_test)
print("Mean test accuracy: ",np.mean(CV_test))

[0.52380952 0.47619048 0.57142857 0.57142857 0.57142857 0.61904762
 0.66666667 0.66666667 0.61904762 0.65      ]
Mean test accuracy:  0.5935714285714285


In [12]:
#overall test
print("Training accuracy: ",tree_cv.score(X_train, y_train))
print("Test accuracy: ", tree_cv.score(X_test, y_test))

Training accuracy:  0.7365269461077845
Test accuracy:  0.6220095693779905


## Tree 1

In [13]:
best_tree = tree_cv.best_estimator_

test_accs = []
train_accs = []
fscores = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(X, y_binary, test_size=0.2, stratify=y_binary, random_state=42,)
    best_tree.fit(X_train,y_train)
    test_accs.append(best_tree.score(X_test, y_test))
    train_accs.append(best_tree.score(X_train, y_train))
    y_pred = tree_cv.best_estimator_.predict(X_test)
    fscores.append(f1_score(y_test, y_pred))

test_acc = sum(test_accs) / len(test_accs)
train_acc = sum(train_accs) / len(train_accs)
fscore = sum(fscores)/len(fscores)

row = pd.Series({k: best_tree.get_params()[k] for k in ["criterion","max_depth","max_features","min_samples_leaf"]} | {"classifier": "DecisionTreeClassifier", "test_method": "20% Test Split", "accuracy": test_acc, "train_accuracy": train_acc, "test_fscore": fscore})
table = pd.concat([table, row.to_frame().T], ignore_index=True).sort_values(by=["accuracy"], ascending=False)

## Tree 2

In [14]:
test_accs = []
train_accs = []
fscores = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(X, y_binary, test_size=0.5, stratify=y_binary, random_state=42,)
    best_tree.fit(X_train,y_train)
    test_accs.append(best_tree.score(X_test, y_test))
    train_accs.append(best_tree.score(X_train, y_train))
    y_pred = tree_cv.best_estimator_.predict(X_test)
    fscores.append(f1_score(y_test, y_pred))

test_acc = sum(test_accs) / len(test_accs)
train_acc = sum(train_accs) / len(train_accs)
fscore = sum(fscores)/len(fscores)

row = pd.Series({k: best_tree.get_params()[k] for k in ["criterion","max_depth","max_features","min_samples_leaf"]} | {"classifier": "DecisionTreeClassifier", "test_method": "50% Test Split", "accuracy": test_acc, "train_accuracy": train_acc, "test_fscore": fscore})
table = pd.concat([table, row.to_frame().T], ignore_index=True).sort_values(by=["accuracy"], ascending=False)

## Tree 3

In [15]:
test_accs = []
train_accs = []
fscores = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(X, y_binary, test_size=0.8, stratify=y_binary, random_state=42,)
    best_tree.fit(X_train,y_train)
    test_accs.append(best_tree.score(X_test, y_test))
    train_accs.append(best_tree.score(X_train, y_train))
    y_pred = tree_cv.best_estimator_.predict(X_test)
    fscores.append(f1_score(y_test, y_pred))

test_acc = sum(test_accs) / len(test_accs)
train_acc = sum(train_accs) / len(train_accs)
fscore = sum(fscores)/len(fscores)

row = pd.Series({k: best_tree.get_params()[k] for k in ["criterion","max_depth","max_features","min_samples_leaf"]} | {"classifier": "DecisionTreeClassifier", "test_method": "80% Test Split", "accuracy": test_acc, "train_accuracy": train_acc, "test_fscore": fscore})
table = pd.concat([table, row.to_frame().T], ignore_index=True).sort_values(by=["accuracy"], ascending=False)

## Forrest

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y_binary, test_size=0.2, stratify=y_binary, random_state=42,)

parameters = {"max_depth": [3, None],
              "max_features": randint(1, X.shape[1]),
              "min_samples_leaf": randint(1, 9),
              "criterion": ["gini", "entropy"]}

clf = RandomForestClassifier()
forrest_cv = RandomizedSearchCV(clf, parameters, cv=10, random_state = 10)

forrest_cv.fit(X_train,y_train)

In [17]:
tabel_temp = pd.DataFrame(forrest_cv.cv_results_["params"])
tabel_temp["classifier"] = "RandomForrestClassifier"
tabel_temp["test_method"] = "10CV"
tabel_temp["accuracy"] = forrest_cv.cv_results_["mean_test_score"]
table = pd.concat([table, tabel_temp], ignore_index=True)
table = table.sort_values(by=["accuracy"], ascending=False)

In [18]:
best_tree = forrest_cv.best_estimator_

test_accs = []
train_accs = []
fscores = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(X, y_binary, test_size=0.2, stratify=y_binary, random_state=42,)
    best_tree.fit(X_train,y_train)
    test_accs.append(best_tree.score(X_test, y_test))
    train_accs.append(best_tree.score(X_train, y_train))
    y_pred = tree_cv.best_estimator_.predict(X_test)
    fscores.append(f1_score(y_test, y_pred))

test_acc = sum(test_accs) / len(test_accs)
train_acc = sum(train_accs) / len(train_accs)
fscore = sum(fscores)/len(fscores)

row = pd.Series({k: best_tree.get_params()[k] for k in ["criterion","max_depth","max_features","min_samples_leaf"]} | {"classifier": "RandomForrestClassifier", "test_method": "20% Test Split", "accuracy": test_acc, "train_accuracy": train_acc, "test_fscore": fscore})
table = pd.concat([table, row.to_frame().T], ignore_index=True).sort_values(by=["accuracy"], ascending=False)

In [19]:
test_accs = []
train_accs = []
fscores = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(X, y_binary, test_size=0.5, stratify=y_binary, random_state=42,)
    best_tree.fit(X_train,y_train)
    test_accs.append(best_tree.score(X_test, y_test))
    train_accs.append(best_tree.score(X_train, y_train))
    y_pred = tree_cv.best_estimator_.predict(X_test)
    fscores.append(f1_score(y_test, y_pred))

test_acc = sum(test_accs) / len(test_accs)
train_acc = sum(train_accs) / len(train_accs)
fscore = sum(fscores)/len(fscores)

row = pd.Series({k: best_tree.get_params()[k] for k in ["criterion","max_depth","max_features","min_samples_leaf"]} | {"classifier": "RandomForrestClassifier", "test_method": "50% Test Split", "accuracy": test_acc, "train_accuracy": train_acc, "test_fscore": fscore})
table = pd.concat([table, row.to_frame().T], ignore_index=True).sort_values(by=["accuracy"], ascending=False)

In [20]:
test_accs = []
train_accs = []
fscores = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(X, y_binary, test_size=0.8, stratify=y_binary, random_state=42,)
    best_tree.fit(X_train,y_train)
    test_accs.append(best_tree.score(X_test, y_test))
    train_accs.append(best_tree.score(X_train, y_train))
    y_pred = tree_cv.best_estimator_.predict(X_test)
    fscores.append(f1_score(y_test, y_pred))

test_acc = sum(test_accs) / len(test_accs)
train_acc = sum(train_accs) / len(train_accs)
fscore = sum(fscores)/len(fscores)

row = pd.Series({k: best_tree.get_params()[k] for k in ["criterion","max_depth","max_features","min_samples_leaf"]} | {"classifier": "RandomForrestClassifier", "test_method": "80% Test Split", "accuracy": test_acc, "train_accuracy": train_acc, "test_fscore": fscore})
table = pd.concat([table, row.to_frame().T], ignore_index=True).sort_values(by=["accuracy"], ascending=False)

## Table

In [21]:
print("Binary Class split: ", sum(y_binary)/len(y_binary))
table

Binary Class split:  0.5114942528735632


Unnamed: 0,criterion,max_depth,max_features,min_samples_leaf,classifier,test_method,accuracy,train_accuracy,test_fscore
0,gini,3.0,2,4,RandomForrestClassifier,20% Test Split,0.695215,0.69509,0.64
1,gini,3.0,2,4,RandomForrestClassifier,50% Test Split,0.681226,0.698851,0.623423
2,gini,3.0,2,4,RandomForrestClassifier,10CV,0.671801,,
3,gini,3.0,5,4,RandomForrestClassifier,10CV,0.669478,,
4,entropy,3.0,2,1,RandomForrestClassifier,10CV,0.66714,,
5,entropy,,7,7,DecisionTreeClassifier,10CV,0.663382,,
6,gini,,2,3,RandomForrestClassifier,10CV,0.659854,,
7,gini,,1,3,RandomForrestClassifier,10CV,0.658649,,
8,gini,3.0,7,1,RandomForrestClassifier,10CV,0.657487,,
9,entropy,,5,8,RandomForrestClassifier,10CV,0.65743,,
