# Model Evaluation 2 - Learning Curves

---

## Setup

In [1]:
import re
import os
import sys
import time
import joblib 

import numpy as np
import pandas as pd
import scipy.sparse as sp
import matplotlib.pyplot as plt

from datetime import datetime
from sklearn.model_selection import train_test_split, \
    ShuffleSplit, StratifiedKFold, learning_curve
from sklearn.metrics import make_scorer, accuracy_score, \
    recall_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, \
    RandomForestClassifier, GradientBoostingClassifier, \
    VotingClassifier

import custom.evaluate_models as E

# Print NumPy arrays in full instead of summarising long ones with "...".
np.set_printoptions(threshold=sys.maxsize)

# Stamp the notebook with the local date (and time) of this run.
dt_object = datetime.now()
day = dt_object.date().isoformat()      # YYYY-MM-DD
T = dt_object.strftime('%H:%M:%S')      # wall-clock time, seconds resolution
print('Revised on: ' + day)

Revised on: 2021-02-21


## Load Test Target

In [2]:
def load_y(data):
    """Load a target vector from ``data/1_raw/<data>.csv``.

    Parameters
    ----------
    data : str
        File name without the ``.csv`` extension.

    Returns
    -------
    numpy.ndarray
        One-dimensional array holding the first column of the CSV file.
    """
    raw_path = os.path.join("data", "1_raw")
    filename = ''.join([data, ".csv"])
    out_dfm = pd.read_csv(os.path.join(raw_path, filename))
    # Series.ravel() is deprecated since pandas 2.2; to_numpy() is the
    # supported accessor. copy=True keeps the result independent of the
    # DataFrame, matching the copy that np.array(...) used to make.
    out_arr = out_dfm.iloc[:, 0].to_numpy(copy=True)
    return out_arr

# Raw test labels, read from data/1_raw/y_test.csv (first column only).
y_test = load_y("y_test") 

def make_int(y_array):
    """Encode 'ham'/'spam' labels as the integers 0/1.

    Works on a copy, so ``y_array`` itself is left untouched. Entries
    equal to 'ham' become 0 and entries equal to 'spam' become 1; the
    result is then cast to an integer dtype.
    """
    encoded = y_array.copy()
    # Apply the replacements in a fixed order: ham first, then spam.
    for label, code in (('ham', 0), ('spam', 1)):
        encoded[encoded == label] = code
    return encoded.astype('int')

# Integer-encoded test labels (ham -> 0, spam -> 1) used by the cells below.
y = make_int(y_test)

## Load Preprocessed Test Set

In [3]:
def load_X(filename):
    """Load a SciPy sparse matrix from ``data/2_processed/<filename>.npz``.

    ``filename`` is given without the ``.npz`` extension.
    """
    npz_path = os.path.join("data", "2_processed", filename + '.npz')
    return sp.load_npz(npz_path)

# Sparse feature matrix read from data/2_processed/X_test_processed.npz.
# NOTE(review): this is the *test* split and it is what gets passed to
# E.compare_two_classifiers below -- confirm that is intended.
X = load_X('X_test_processed')

## Instantiate Candidate Models

In [9]:
# --- Random forests: identical except for max_features (150 vs. 300) ---
rnd_clf1 = RandomForestClassifier(
    n_estimators=100,
    max_features=150,
    max_depth=8,
    min_samples_split=3,
    warm_start=True,
    n_jobs=1,
    random_state=42,
)

rnd_clf2 = RandomForestClassifier(
    n_estimators=100,
    max_features=300,
    max_depth=8,
    min_samples_split=3,
    warm_start=True,
    n_jobs=1,
    random_state=42,
)

# --- AdaBoost: 10 estimators with a very small learning rate ---
ada_clf1 = AdaBoostClassifier(
    n_estimators=10,
    learning_rate=0.001,
    random_state=42,
)

# --- Gradient boosting: 50 depth-1 trees, no cap on features per split ---
gbc1a = GradientBoostingClassifier(
    n_estimators=50,
    max_features=None,
    max_depth=1,
    min_samples_split=2,
    random_state=42,
)

# --- Gradient boosting: 100 trees of depth 8, 300 features per split ---
gbc2a = GradientBoostingClassifier(
    n_estimators=100,
    max_features=300,
    max_depth=8,
    min_samples_split=5,
    random_state=42,
)

# --- Gradient boosting: 50 trees of depth 3, 300 features per split ---
gbc2c = GradientBoostingClassifier(
    n_estimators=50,
    max_features=300,
    max_depth=3,
    min_samples_split=5,
    random_state=42,
)

## Learning Curve Plots (10 Random Shuffle-Split Cross-Validation Iterations)

In [10]:
# Cross-validation splitter: 10 independent random shuffles, each holding
# out 25% of the rows for validation. Note this is repeated random
# splitting (ShuffleSplit), not K-fold: validation sets may overlap
# between iterations.
cv = ShuffleSplit(n_splits=10, test_size=0.25, random_state=42)

In [14]:
# Compare the two random forests on (X, y) using the shared `cv` splitter.
# E.compare_two_classifiers lives in custom.evaluate_models (not shown here);
# presumably it draws the learning curves for both models -- TODO confirm.
E.compare_two_classifiers(X, y, rnd_clf1, rnd_clf2, "rnd_clf1", "rnd_clf2", cv)

In [None]:
# Compare AdaBoost against the depth-1 gradient boosting model on (X, y),
# using the shared `cv` splitter. The helper is defined in
# custom.evaluate_models (not shown here).
E.compare_two_classifiers(X, y, ada_clf1, gbc1a, "ada_clf1", "gbc1a", cv)

In [None]:
# Compare the deep (depth 8, 100 trees) and shallow (depth 3, 50 trees)
# gradient boosting models on (X, y), using the shared `cv` splitter.
# The helper is defined in custom.evaluate_models (not shown here).
E.compare_two_classifiers(X, y, gbc2a, gbc2c, "gbc2a", "gbc2c", cv)