In [1]:
from ipynb.fs.full.prepare_data import *
from sklearn.model_selection import train_test_split
from imblearn.pipeline import make_pipeline
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import recall_score, precision_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE, ADASYN
import pickle

Loading data...
Attempting to connect to the database
Connected!
Fetching query...
Fetched!
Loading data in dataframe
Data loaded into dataframe!
Class imbalance check for label: title
Counter({'stay_at_home': 46928, 'workplace_closure': 26044, 'contact_tracing': 15082, 'school_closure': 14812, 'restrictions_on_gatherings': 7130, 'intern_travel_control': 2356})
Started data transformation...
Started preprocessing...
Extracting labels and features...
Labels and features extracted!
Encoding categorical features...
Normalizing numerical features...
Index(['gender', 'age_group', 'outbreak_related', 'month_name', 'is_holiday',
       'season', 'is_weekend', 'phu_name', 'retail_and_recreation',
       'grocery_and_pharmacy', 'num_resolved', 'num_unresolved', 'num_fatal'],
      dtype='object')
Preprocessing done!
Data transformation done! Data is ready for training.
Started feature selection...
Selecting top 8 features
Feature selection done!


Tuning tree-specific paramters

1. Tune max_depth and num_samples_split

In [2]:
# Oversample
X_train, y_train = SMOTE().fit_resample(X_train, y_train)

In [5]:
param_test1 = {'n_estimators':range(80,200,10)}
gsearch1 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1, min_samples_split=500,min_samples_leaf=50,max_depth=5,max_features='sqrt',subsample=0.8,random_state=10),  param_grid = param_test1, scoring='recall_micro',n_jobs=4,cv=5)
gsearch1.fit(X_train, y_train)
gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_

({'mean_fit_time': array([ 68.41130009,  77.07702699,  85.2432476 ,  91.73504782,
         102.53785481, 111.44201546, 118.05423236, 122.69532971,
         132.82974067, 136.01971183, 145.35739193, 149.13567576]),
  'std_fit_time': array([0.36030499, 0.50220789, 0.33092729, 0.71551028, 0.43739983,
         0.5372518 , 0.71860983, 1.65780004, 2.64564162, 2.91077705,
         1.21896757, 4.66540061]),
  'mean_score_time': array([0.51822801, 0.53459787, 0.61865759, 0.69551744, 0.69722972,
         0.78824787, 0.81176195, 0.87887006, 0.90944109, 0.98175006,
         1.01802449, 1.05664454]),
  'std_score_time': array([0.01944502, 0.03142645, 0.04836122, 0.06985174, 0.01608683,
         0.02337418, 0.0441148 , 0.0746942 , 0.0507358 , 0.04604504,
         0.03945763, 0.12735801]),
  'param_n_estimators': masked_array(data=[80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180,
                     190],
               mask=[False, False, False, False, False, False, False, False,
              

In [6]:
param_test2 = {'max_depth':range(5,16,2), 'min_samples_split':range(200,1001,200)}
gsearch2 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.05, n_estimators=120, max_features='sqrt', subsample=0.8, random_state=10), 
param_grid = param_test2, scoring='f1_micro',n_jobs=5, cv=5)
gsearch2.fit(X_train,y_train)
gsearch2.cv_results_, gsearch2.best_params_, gsearch2.best_score_

({'mean_fit_time': array([111.57139721, 107.22424612, 111.08159523, 101.15361371,
          99.01841154, 121.15022879, 118.83457346, 118.62732983,
         117.64716229, 118.08160849, 144.6488142 , 138.94898429,
         136.72068319, 136.92698588, 137.02177963, 177.94800501,
         165.50636973, 159.18648462, 155.66931086, 149.68219385,
         213.18007655, 190.07099414, 176.8119761 , 171.41622539,
         163.76983399, 259.10464163, 216.66850886, 196.45765414,
         185.44639721, 172.91945152]),
  'std_fit_time': array([0.58708581, 0.71889445, 0.90323577, 0.3763327 , 0.52005268,
         0.60814002, 0.52216907, 0.87935473, 1.20224119, 1.4960015 ,
         1.16521031, 1.42848134, 1.76412444, 2.36062344, 1.79231206,
         1.0930156 , 2.13424847, 3.50772683, 2.13532318, 0.8877711 ,
         3.68456927, 2.60093575, 1.6522889 , 1.55956329, 1.6961104 ,
         2.172497  , 2.52411463, 1.0236126 , 2.06081142, 2.48141217]),
  'mean_score_time': array([0.74541759, 0.803335  , 0.786

2. Tune min_samples_leaf

In [None]:
param_test3 = {'min_samples_split':range(1000,2100,200), 'min_samples_leaf':range(30,71,10)}
gsearch3 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1, n_estimators=60,max_depth=5,max_features='sqrt', subsample=0.8, random_state=10), 
param_grid = param_test3, scoring='f1_micro',n_jobs=5, cv=5)
gsearch3.fit(X_train,y_train)
gsearch3.cv_results_, gsearch3.best_params_, gsearch3.best_score_

3. Tune max_features

In [None]:
param_test4 = {'max_features':range(2,8,2)}
gsearch4 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1, n_estimators=60,max_depth=5, min_samples_split=1000, min_samples_leaf=30, subsample=0.8, random_state=10),
param_grid = param_test4, scoring='f1_micro',n_jobs=4, cv=5)
gsearch4.fit(X_train,y_train)
gsearch4.cv_results_, gsearch4.best_params_, gsearch4.best_score_

4. Tuning subsamples

In [None]:
param_test5 = {'subsample':[0.6,0.7,0.75,0.8,0.85,0.9]}
gsearch5 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1, n_estimators=60,max_depth=5,min_samples_split=1000, min_samples_leaf=30, max_features=2, random_state=10),
param_grid = param_test5, scoring='f1_micro',n_jobs=5, cv=5)
gsearch5.fit(X_train,y_train)
gsearch5.cv_results_, gsearch5.best_params_, gsearch5.best_score_