## Decision tree for classification 

### Chapter 1: Classification and Regression Tree (CART)
* Sequence of if-else questions about individual features 
* Objective: infer class labels 
* Able to capture non-linear relationships between features and labels 
* Features do not need to be standardized

In [None]:
# Classification tree in scikit-learn: fit a shallow tree and score it on a
# held-out test set.
# NOTE: X (features) and y (labels) are expected to be defined earlier in the
# notebook.
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# 80% train, 20% test; stratify on y so both splits keep the class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=1
)
dt = DecisionTreeClassifier(max_depth=2, random_state=1)
dt.fit(X_train, y_train)
# The predictions must be bound to the same name that is scored below.
y_pred = dt.predict(X_test)
accuracy_score(y_test, y_pred)

### Building blocks of a decision tree
* Decision tree: data structure consisting of a hierarchy of nodes
* Node: question or prediction 

### Information gain (IG)

Criteria to measure the impurity of a node I:
* gini index,
* entropy 

### Classification tree learning 
* Nodes are grown recursively 
* At each node, split the data on the feature and split point that maximize the information gain (IG)
* If IG = 0, declare the node a leaf

In [None]:
# Classification tree that uses the Gini index as its impurity criterion.
dt = DecisionTreeClassifier(random_state=1, criterion='gini')

### Decision tree for Regression

#### Auto-mpg Dataset

In [None]:
# Regression tree in scikit-learn: fit on the training split and report the
# test-set RMSE.
# NOTE: X and y are expected to be defined earlier in the notebook.
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE

# 80% train, 20% test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=3
)
# A float min_samples_leaf is a fraction: each leaf must hold at least 10% of
# the training samples.
dt = DecisionTreeRegressor(max_depth=4, min_samples_leaf=0.1, random_state=3)
dt.fit(X_train, y_train)
# The predictions must be bound to the same name that is scored below.
y_pred = dt.predict(X_test)
mse_dt = MSE(y_test, y_pred)
rmse_dt = mse_dt ** 0.5

### Generalization Error 
* Low bias: Accurate
* Low variance: Precise 
* As the complexity of f̂  increases, the bias term decreases while the variance term increases.

In [None]:
# k-fold CV in sklearn on the auto dataset
# NOTE: X and y are expected to be defined earlier in the notebook.
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE
from sklearn.model_selection import cross_val_score

SEED = 123
# 70% train, 30% test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=SEED
)
dt = DecisionTreeRegressor(
    max_depth=4,
    min_samples_leaf=0.14,
    random_state=SEED,
)
# 10-fold CV on the training split. The scorer returns negated MSE, so the
# sign is flipped to get one positive MSE value per fold.
MSE_CV = -cross_val_score(
    dt,
    X_train,
    y_train,
    cv=10,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
# Refit on the full training split and predict on both splits.
dt.fit(X_train, y_train)
y_predict_train = dt.predict(X_train)
y_predict_test = dt.predict(X_test)

### Ensemble learning 
* Train different models on the same dataset
* Let each model make its predictions 
* Meta-model: aggregates predictions of individual models
* Final prediction: more robust and less prone to errors

In [None]:
# Voting Classifier in sklearn
# NOTE: X and y are expected to be defined earlier in the notebook.
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.ensemble import VotingClassifier

SEED = 1
# 70% train, 30% test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=SEED
)
lr = LogisticRegression(random_state=SEED)
knn = KNN()
dt = DecisionTreeClassifier(random_state=SEED)
classifiers = [
    ('LogisticRegression', lr),
    ('K Nearest Neighbor', knn),
    ('Classification Tree', dt),
]

# Fit each individual model and record its test accuracy, keyed by name.
accuracies = {}
for clf_name, clf in classifiers:
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracies[clf_name] = accuracy_score(y_test, y_pred)

# Meta-model: aggregate the individual classifiers' predictions by voting.
vc = VotingClassifier(estimators=classifiers)
vc.fit(X_train, y_train)
vc_pred = vc.predict(X_test)
vc_accuracy = accuracy_score(y_test, vc_pred)
    

### Bagging 
* Bootstrap Aggregation 
* Reduces the variance of individual models in the ensemble

In [None]:
# Bagging classifier in sklearn
# NOTE: X and y are expected to be defined earlier in the notebook.
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

SEED = 1
# 70% train, 30% test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=SEED
)
# The base model must be a classifier for BaggingClassifier.
dt = DecisionTreeClassifier(
    max_depth=4,
    min_samples_leaf=0.13,
    random_state=SEED,
)
# The base estimator is passed positionally because its keyword name differs
# across scikit-learn releases (base_estimator in older, estimator in newer).
bc = BaggingClassifier(dt, n_estimators=300, n_jobs=-1)

bc.fit(X_train, y_train)
y_pred = bc.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

### Out of bag evaluation (OOB evaluation)

In [None]:
# OOB evaluation in sklearn
# NOTE: X and y are expected to be defined earlier in the notebook.
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

SEED = 1
# 70% train, 30% test, stratified on y
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=SEED
)
# The base model must be a classifier for BaggingClassifier.
dt = DecisionTreeClassifier(
    max_depth=4,
    min_samples_leaf=0.16,
    random_state=SEED,
)
# The base estimator is passed positionally because its keyword name differs
# across scikit-learn releases (base_estimator in older, estimator in newer).
# oob_score=True makes the fitted model expose oob_score_.
bc = BaggingClassifier(dt, n_estimators=300, oob_score=True, n_jobs=-1)
bc.fit(X_train, y_train)
y_pred = bc.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
oob_accuracy = bc.oob_score_

### Random forests


In [None]:
# Random Forest Regressor in sklearn
# NOTE: X and y are expected to be defined earlier in the notebook.
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE

SEED = 1
# 70% train, 30% test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=SEED
)
rf = RandomForestRegressor(
    n_estimators=400,
    min_samples_leaf=0.12,
    random_state=SEED,
)
rf.fit(X_train, y_train)
# Keep the predictions: the RMSE below must score this model's output, not a
# y_pred left over from an earlier cell.
y_pred = rf.predict(X_test)
rmse_test = MSE(y_test, y_pred) ** 0.5

In [None]:
# Feature importance: horizontal bar chart of the forest's importances,
# sorted in ascending order.
# NOTE: rf is expected to be an already-fitted random forest and X an object
# with a .columns attribute (e.g. a DataFrame) from earlier cells.
import pandas as pd
import matplotlib.pyplot as plt

# feature_importances_ is an attribute of the fitted model.
importances_rf = pd.Series(rf.feature_importances_, index=X.columns)
sorted_importances_rf = importances_rf.sort_values()
sorted_importances_rf.plot(kind='barh', color='lightgreen')
plt.show()

### AdaBoost 
* Stands for Adaptive Boosting
* Achieved by changing the weights of the training instances

In [None]:
# AdaBoost classification in sklearn
# NOTE: X and y are expected to be defined earlier in the notebook.
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

SEED = 1
# 70% train, 30% test, stratified on y
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=SEED
)
# Depth-1 tree used as the base learner.
dt = DecisionTreeClassifier(max_depth=1, random_state=SEED)
# The base estimator is passed positionally because its keyword name differs
# across scikit-learn releases (base_estimator in older, estimator in newer).
adb_clf = AdaBoostClassifier(dt, n_estimators=100)
adb_clf.fit(X_train, y_train)
# Column 1 of predict_proba is passed to roc_auc_score as the score.
# NOTE(review): this assumes a binary target, confirm against the dataset.
y_pred_proba = adb_clf.predict_proba(X_test)[:, 1]
adb_clf_roc_auc_score = roc_auc_score(y_test, y_pred_proba)

### Gradient Boosting (GB)
* Gradient Boosted Trees 

In [None]:
# Gradient Boosting in sklearn: fit boosted depth-1 trees and compute the
# test-set RMSE.
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE

SEED = 1
# 70% train, 30% test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=SEED
)
gbt = GradientBoostingRegressor(
    random_state=SEED,
    max_depth=1,
    n_estimators=300,
)
gbt.fit(X_train, y_train)
y_pred = gbt.predict(X_test)
# RMSE is the square root of the test-set MSE.
rmse_test = MSE(y_test, y_pred) ** 0.5

### Stochastic Gradient Boosting

In [None]:
# Stochastic Gradient Boosting in sklearn
# NOTE: X and y are expected to be defined earlier in the notebook.
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE

SEED = 1
# 70% train, 30% test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=SEED
)
# subsample < 1.0 and max_features < 1.0 are what make the boosting
# stochastic: each tree sees 80% of the rows and 20% of the features.
sgbt = GradientBoostingRegressor(
    max_depth=1,
    subsample=0.8,
    max_features=0.2,
    n_estimators=300,
    random_state=SEED,
)
sgbt.fit(X_train, y_train)
sgbt_pred = sgbt.predict(X_test)
# Score this model's own predictions, not a y_pred left over from an earlier
# cell.
rmse_test = MSE(y_test, sgbt_pred) ** 0.5

### Tuning a CART's hyperparameters

In [None]:
# Inspecting the hyperparameters of a CART in sklearn
from sklearn.tree import DecisionTreeClassifier

SEED = 1
dt = DecisionTreeClassifier(random_state=SEED)
# Left as the cell's last expression so the notebook displays the result.
dt.get_params()

In [None]:
# Grid search with 10-fold CV over the CART's hyperparameters.
# NOTE: dt, X_train, y_train, X_test and y_test are expected to come from
# earlier cells.
from sklearn.model_selection import GridSearchCV

# Candidate values; the float entries are fractions rather than absolute
# counts.
params_dt = {
    'max_depth': [3, 4, 5, 6],
    'min_samples_leaf': [0.04, 0.06, 0.08],
    'max_features': [0.2, 0.4, 0.6, 0.8],
}
grid_dt = GridSearchCV(
    estimator=dt,
    param_grid=params_dt,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
)
# The search must be fitted before its best_* attributes are read.
grid_dt.fit(X_train, y_train)
best_hyperparams = grid_dt.best_params_
best_CV_score = grid_dt.best_score_
best_model = grid_dt.best_estimator_
test_acc = best_model.score(X_test, y_test)

### Tuning an RF's Hyperparameters

In [None]:
# Inspecting RF hyperparameters in sklearn
from sklearn.ensemble import RandomForestRegressor

SEED = 1
rf = RandomForestRegressor(random_state=SEED)
# Left as the cell's last expression so the notebook displays the result.
rf.get_params()

In [None]:
# Grid search with 3-fold CV over the random forest's hyperparameters, then
# the test-set RMSE of the best model.
# NOTE: rf, X_train, y_train, X_test and y_test are expected to come from
# earlier cells.
from sklearn.metrics import mean_squared_error as MSE
from sklearn.model_selection import GridSearchCV

params_rf = {
    'n_estimators': [300, 400, 500],
    'max_depth': [4, 6, 8],
    'min_samples_leaf': [0.1, 0.2],
    'max_features': ['log2', 'sqrt'],
}
# The scorer is negated MSE, so a higher score means a lower error.
grid_rf = GridSearchCV(
    estimator=rf,
    param_grid=params_rf,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=1,
    n_jobs=-1,
)
grid_rf.fit(X_train, y_train)
best_hyperparams = grid_rf.best_params_
best_model = grid_rf.best_estimator_
y_pred = best_model.predict(X_test)
rmse_test = MSE(y_test, y_pred) ** 0.5