In [3]:
# General imports
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle


# Modelling Imports
import sklearn
import sklearn.tree
import sklearn.metrics
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
import sklearn.ensemble
import sklearn.preprocessing
import xgboost
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV

# Useful relative paths
data_directory = './data/NELA'
model_directory ='./models'

# 1. – Modelling! 

Run this if runtime lost X and y

In [4]:
X = pd.read_csv('{}/complete_processed.csv'.format(data_directory))
y = X['label']
X.drop('label',axis=1)
;

''

## 1.0 – Vectorize input with Term Frequency Identification (TFID)

In [5]:
feature_pipeline = Pipeline([
                             ('vect', TfidfVectorizer()), 
])
X_vectorized = feature_pipeline.fit_transform(X['full_preprocessed'])

In [6]:
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X_vectorized, y, test_size=0.2, random_state=0)

## 1.1 – Random Forest 

### 1.1.2 – Random Forest Hyperparameter Search

In [31]:
param_grid = {'n_estimators': np.arange(100,1100,100),
               'max_features': ['auto', 'sqrt'],
               'max_depth': np.arange(5,105,10),
               'min_samples_split': [2, 5, 10],
               'min_samples_leaf': [1, 2, 4]}
random_search = RandomizedSearchCV(RandomForestClassifier(), param_distributions=param_grid, n_jobs=-1, cv=5, 
                                   verbose=3, random_state=0 )

In [32]:
%%time
random_search.fit(X_val,y_val)
best_forest=random_search.best_estimator_
y_pred = best_forest.predict(X_test)
print('New Model:')
print(classification_report(y_test, y_pred))
print(random_search.best_params_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed: 15.5min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 24.0min finished


New Model:
              precision    recall  f1-score   support

           0       0.79      0.85      0.82      4422
           1       0.84      0.77      0.80      4374

    accuracy                           0.81      8796
   macro avg       0.81      0.81      0.81      8796
weighted avg       0.81      0.81      0.81      8796

{'n_estimators': 800, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 95}
CPU times: user 2min 6s, sys: 2.71 s, total: 2min 9s
Wall time: 26min 15s


#### Previous best result: Decide if we save this new model

In [34]:
filename = '{}/sklearn/best_random_forest.pkl'.format(model_directory)
prev_best = pickle.load(open(filename, 'rb'))
y_pred =prev_best.predict(X_test)
print('Previous Model:')
print(classification_report(y_test, y_pred))
print(prev_best)

Previous Model:
              precision    recall  f1-score   support

           0       0.79      0.85      0.82      4422
           1       0.84      0.77      0.80      4374

    accuracy                           0.81      8796
   macro avg       0.81      0.81      0.81      8796
weighted avg       0.81      0.81      0.81      8796

RandomForestClassifier(max_depth=95, max_features='sqrt', min_samples_leaf=2,
                       n_estimators=800)


#### ! Danger, uncomment to save the new model best_forest

In [33]:
#filename = '{}/sklearn/best_random_forest.pkl'.format(model_directory)
#pickle.dump(best_forest, open(filename, 'wb'))

### 1.1.2 – Display Current Best Random Forest

In [48]:
%%time
filename = '{}/sklearn/best_random_forest.pkl'.format(model_directory)
random_forest_classifier = pickle.load(open(filename, 'rb'))
y_pred =random_forest_classifier.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.79      0.85      0.82      4422
           1       0.84      0.77      0.80      4374

    accuracy                           0.81      8796
   macro avg       0.81      0.81      0.81      8796
weighted avg       0.81      0.81      0.81      8796

CPU times: user 9.14 s, sys: 1.35 s, total: 10.5 s
Wall time: 18.6 s


## 1.2 XGBoost Classification 

### 1.2.1 – Hyperparameter Search

In [35]:
params = {
        'learning_rate':[0.02,0.06,0.1], 
        'n_estimators':[600,800,1000],
        'min_child_weight': [1, 5],
        'gamma': [ 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [4,8,12]
        }
random_search = RandomizedSearchCV(xgboost.XGBClassifier(), param_distributions=params, n_jobs=-1, cv=4, 
                                   verbose=3, random_state=0 )

In [38]:
%%time
random_search.fit(X_val,y_val)
best_xgb=random_search.best_estimator_
y_pred = best_xgb.predict(X_test)
print(classification_report(y_test, y_pred))
print(random_search.best_params_)

Fitting 4 folds for each of 10 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed: 63.1min
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed: 99.3min finished


              precision    recall  f1-score   support

           0       0.83      0.84      0.83      4422
           1       0.83      0.82      0.83      4374

    accuracy                           0.83      8796
   macro avg       0.83      0.83      0.83      8796
weighted avg       0.83      0.83      0.83      8796

{'subsample': 0.8, 'n_estimators': 1000, 'min_child_weight': 1, 'max_depth': 4, 'learning_rate': 0.1, 'gamma': 1, 'colsample_bytree': 0.8}
CPU times: user 12min 58s, sys: 9.63 s, total: 13min 8s
Wall time: 1h 42min 49s


#### Previous best result: Decide if we save this new model

In [None]:
filename = '{}/sklearn/best_xgb.pkl'.format(model_directory)
prev_best = pickle.load(open(filename, 'rb'))
y_pred =prev_best.predict(X_test)
print(classification_report(y_test, y_pred))
print(prev_best)

#### ! Danger, uncomment to save the new model best_xgb

In [39]:
#filename = '{}/sklearn/best_xgb.pkl'.format(model_directory)
#pickle.dump(best_xgb, open(filename, 'wb'))

### 1.2.2 – Display current best XGB Boost

In [43]:
%%time
filename = '{}/sklearn/best_xgb.pkl'.format(model_directory)
prev_best = pickle.load(open(filename, 'rb'))
y_pred =prev_best.predict(X_test)
print(classification_report(y_test, y_pred))
print(prev_best)

              precision    recall  f1-score   support

           0       0.83      0.84      0.83      4422
           1       0.83      0.82      0.83      4374

    accuracy                           0.83      8796
   macro avg       0.83      0.83      0.83      8796
weighted avg       0.83      0.83      0.83      8796

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, gamma=1, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=4,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=1000, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.8,
              tree_method='exact', validate_parameters=1, verbosity=None)
CPU times: user 1.02 s, sys: 68.2 ms, total: 1.08 s
Wall time: 787 ms
