# Gridsearch with H2O AutoML

- Implements H20 AutoML package, which makes it relatively (but not that more) easier to implement grid search
- Dataset in this model is aggregated by day with Brandwatch avg daily sentiment as target

# Installs and Imports

In [1]:
!pip install requests
!pip install tabulate
!pip install "colorama>=0.3.8"
!pip install future

!pip install -f http://h2o-release.s3.amazonaws.com/h2o/latest_stable_Py.html h2o

import h2o

h2o.init()

In [1]:
import pandas as pd
import csv
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import seaborn as sns
import re


from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.decomposition import SparsePCA
import spacy
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import SGDClassifier
from xgboost import XGBClassifier
from scipy import sparse
from h2o.automl import H2OAutoML



# Reading in and Shaping data

In [2]:
#reads in text data
text = pd.read_csv('/floyd/home/Capstone/cap_notebooks/data/master_data_set/text_with_tokens_52k.csv')
#convert date to datetime object
text['date'] = pd.to_datetime(text['date'])

#create day groupby object
grouped_text = text.groupby([text['date'].dt.year, text['date'].dt.month, text['date'].dt.day])

#aggregating tokens by day
text_day_grouped = grouped_text['text_token'].agg(lambda column: "".join(column))

#set as df
text_day_grouped = pd.DataFrame(text_day_grouped)

#rename index
text_day_grouped = text_day_grouped.rename_axis(index=['year', 'month', 'day'])

#reset_index
text_day_grouped = text_day_grouped.reset_index()

#converts dates to datetime
text_day_grouped['date_grouped'] = pd.to_datetime(text_day_grouped[['year', 'month', 'day']])

In [3]:
#checking original dataset
text.head()

Unnamed: 0.1,Unnamed: 0,gkgcode,date,link,tone,title,authors,pub_date,text,date_str,Tone_only,polarity,text_token
0,0,20150302100000-674,2015-03-02,http://www.nationalreview.com/article/414611/a...,"0.350631136044881,2.73492286115007,2.384291725...",Is America a ‘Clean Energy’ Laggard?,"['Robert Bryce', 'Victor Davis Hanson', 'Isaac...",2015-03-02 04:00:00+00:00,"The answer is not only “No,” but a resounding ...",20150302,0.350631,5.119215,"['answer', 'resounding', 'myriad', 'claim', 'e..."
1,3,20150302153000-229,2015-03-02,http://www.latimes.com/business/hiltzik/la-fi-...,"-0.952380952380953,3.49206349206349,4.44444444...",Watch ‘Meet the Press’ treat climate change as...,"['Business Columnist', 'Los Angeles Times Colu...",2015-03-01 00:00:00,"As you may have heard, Sen. James Inhofe (R-Ok...",20150302,-0.952381,7.936508,"['hear', 'sen.', 'james', 'inhofe', 'r', 'okla..."
2,6,20150302163000-237,2015-03-02,http://www.usatoday.com/story/news/nation-now/...,"0,1.8140589569161,1.8140589569161,3.6281179138...",,[],2015-03-02 00:00:00,Mary Bowerman USA TODAY Network Visitors sho...,20150302,0.0,3.628118,"['mary', 'bowerman', 'usa', 'today', 'network'..."
3,4,20150302180000-1352,2015-03-02,http://www.nytimes.com/2015/03/03/business/int...,"-1.14754098360656,1.80327868852459,2.950819672...",Russian Energy Deal Comes at Contentious Time,['Stanley Reed'],2015-03-03 00:00:00,But Mr. Fridman has a business track record th...,20150302,-1.147541,4.754098,"['mr.', 'fridman', 'business', 'track', 'recor..."
4,2,20150302203000-163,2015-03-02,http://www.cbsnews.com/news/did-climate-change...,"-8.0545229244114,0.371747211895911,8.426270136...",Did climate change cause the Syrian civil war?,"['Michael Casey', 'Michael Casey Covers The En...",,Climate change sparked a historic drought in S...,20150302,-8.054523,8.798017,"['climate', 'change', 'spark', 'historic', 'dr..."


In [8]:
#reviewing resampled text dataset
text_day_grouped.head()

Unnamed: 0,year,month,day,text_token,date_grouped
0,2015,3,2,"['answer', 'resounding', 'myriad', 'claim', 'e...",2015-03-02
1,2015,3,3,"['scientist', 'center', 'controversy', 'fossil...",2015-03-03
2,2015,3,4,"['scientist', 'step', 'closer', 'understand', ...",2015-03-04
3,2015,3,5,"['high', 'blessed', 'relief', 'finally', 'pres...",2015-03-05
4,2015,3,6,"['california', 'lead', 'nation', 'take', 'acti...",2015-03-06


In [9]:
#reading in Brandwatch twitter sentiment data
sentiment = pd.read_csv('/floyd/home/Capstone/cap_notebooks/data/brandwatch/bw_sentiment_emotion_day/bw_sentiment_2018-2020.csv')

#drop unnecessary col
sentiment.drop('Unnamed: 0', axis=1, inplace=True)

#set date to datetime
sentiment['days'] = pd.to_datetime(sentiment['days'])

In [10]:
#review sentiment data
sentiment.head()

Unnamed: 0,days,sentiment
0,2018-10-05,-1.119873
1,2018-10-06,-0.847089
2,2018-10-07,-1.485399
3,2018-10-08,-0.894346
4,2018-10-09,-0.762045


In [11]:
#binarizing sentiment on -1.48 mean value
sentiment['binary_sentiment'] = np.where(sentiment['sentiment'] >= -1.52, 1, 0)

In [12]:
#join text with sentiment data on date
x_y_complete = sentiment.merge(text_day_grouped, how='inner',  left_on='days', right_on='date_grouped')

In [13]:
#review complete dataset
x_y_complete.head()

Unnamed: 0,days,sentiment,binary_sentiment,year,month,day,text_token,date_grouped
0,2018-10-05,-1.119873,1,2018,10,5,"['kuala', 'lumpur', 'oct', '4', 'thomson', 're...",2018-10-05
1,2018-10-06,-0.847089,1,2018,10,6,"['past', 'couple', 'week', 'see', 'mr.', 'trum...",2018-10-06
2,2018-10-07,-1.485399,1,2018,10,7,"['couple', 'contact', 'december', '2016', 'was...",2018-10-07
3,2018-10-08,-0.894346,1,2018,10,8,"['cheltenham', 'england', 'thomson', 'reuters'...",2018-10-08
4,2018-10-09,-0.762045,1,2018,10,9,"['stockholm', 'reuters', 'americans', 'william...",2018-10-09


In [14]:
#setting X_Y
X = x_y_complete['text_token']
y = x_y_complete['binary_sentiment']

# Preparing Data for H2O

In [15]:
#test train split
X_train, X_test, y_train,y_test = train_test_split(X, y, test_size=.3, stratify=y)
print(f'Split done - X_train shape: {X_train.shape}, X_test shape: {X_test.shape}, y_train shape: {y_train.shape}, y_test shape: {y_test.shape}')

#create tf-idf vectorizer
bagofwords = TfidfVectorizer(min_df=5)
print('vectorizer done')

#fit vectorizer
print('beginng vectorizer fitting')
bagofwords.fit(X_train)
print('vectorizer fitting complete')


#transform X_train
print('beginning transformation')
X_train_transformed = bagofwords.transform(X_train)
print('X_train transformed')

#transform X_test
X_test_transformed = bagofwords.transform(X_test)
print('X_test_transformed')

Split done - X_train shape: (385,), X_test shape: (166,), y_train shape: (385,), y_test shape: (166,)
vectorizer done
beginng vectorizer fitting
vectorizer fitting complete
beginning transformation
X_train transformed
X_test_transformed


In [16]:
#checking transformed X_train
X_train_transformed

<385x33781 sparse matrix of type '<class 'numpy.float64'>'
	with 1587187 stored elements in Compressed Sparse Row format>

In [17]:
#joining target data with sparse matrices, which is what H2o requres
train = sparse.hstack((X_train_transformed, np.array(y_train)[:,None]))
test = sparse.hstack((X_test_transformed, np.array(y_test)[:,None]))

In [18]:
train.shape

(385, 33782)

In [19]:
type(train)

scipy.sparse.coo.coo_matrix

In [20]:
test.shape

(166, 33782)

In [21]:
#convert data to H2O special data structure
train_h2o = h2o.H2OFrame(train)
test_h2o = h2o.H2OFrame(test)

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [23]:
#defining features and target for H2O for train data
y_train_h2o = train_h2o.col_names[-1]
X_train_h2o = train_h2o.col_names[:33686]

In [24]:
#defining features and target for H2O for train data
y_test_h2o = test_h2o.col_names[-1]
X_test_h2o = test_h2o.col_names[:33686]

In [25]:
#converting target to factor for H20
train_h2o[y_train_h2o] = train_h2o[y_train_h2o].asfactor()
test_h2o[y_train_h2o] = test_h2o[y_train_h2o].asfactor()

# Modeling

- Most successful model: Logistic Regression 
- AUC: .65
- Train acc: 65 percent
- Test acc: 66 percent

In [27]:
#define H20 automl object to try up to 50 models, and max runtime of 1000 secs
aml = H2OAutoML(max_models=50, max_runtime_secs=1000, balance_classes=True)

#training 
aml.train(x=X_train_h2o, y=y_train_h2o, training_frame=train_h2o)

AutoML progress: |████████████████████████████████████████████████████████| 100%


In [28]:
#define leaderboard object which contains metrics for each model
lb = aml.leaderboard

In [29]:
#display top models
lb.head(rows=lb.nrows)

model_id,auc,logloss,aucpr,mean_per_class_error,rmse,mse
GLM_1_AutoML_20201013_191349,0.65418,0.661391,0.605217,0.373334,0.483899,0.234158
StackedEnsemble_AllModels_AutoML_20201013_191349,0.636099,0.670212,0.574238,0.375951,0.487927,0.238072
StackedEnsemble_BestOfFamily_AutoML_20201013_191349,0.636099,0.670212,0.574238,0.375951,0.487927,0.238072
XGBoost_2_AutoML_20201013_191349,0.566983,0.694695,0.536393,0.481513,0.500244,0.250244
XGBoost_3_AutoML_20201013_191349,0.518999,0.709269,0.512468,0.5,0.507761,0.257821
GBM_2_AutoML_20201013_191349,0.50927,0.695728,0.50574,0.5,0.501278,0.251279
XGBoost_1_AutoML_20201013_191349,0.489853,0.711803,0.499562,0.5,0.508986,0.259067
GBM_1_AutoML_20201013_191349,0.469248,0.701837,0.474458,0.5,0.504297,0.254315
DRF_1_AutoML_20201013_191349,0.464862,18.4805,0.480471,0.5,0.731481,0.535065




In [32]:
#show best model
aml.leader

Model Details
H2OGeneralizedLinearEstimator :  Generalized Linear Modeling
Model Key:  GLM_1_AutoML_20201013_191349


GLM Model: summary


Unnamed: 0,Unnamed: 1,family,link,regularization,lambda_search,number_of_predictors_total,number_of_active_predictors,number_of_iterations,training_frame
0,,binomial,logit,Ridge ( lambda = 12.305 ),"nlambda = 30, lambda.max = 12.305, lambda.min = 12.305, lambda.1se...",33686,33686,0,automl_training_py_3_sid_afbc




ModelMetricsBinomialGLM: glm
** Reported on train data. **

MSE: 0.040601830807426534
RMSE: 0.20149895981723215
LogLoss: 0.22096190961295054
Null degrees of freedom: 384
Residual degrees of freedom: -33302
Null deviance: 533.6999521712102
Residual deviance: 170.1406704019719
AIC: 67544.14067040198
AUC: 1.0
AUCPR: 1.0
Gini: 1.0

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.6564576804770559: 


Unnamed: 0,Unnamed: 1,0,1,Error,Rate
0,0,191.0,0.0,0.0,(0.0/191.0)
1,1,0.0,194.0,0.0,(0.0/194.0)
2,Total,191.0,194.0,0.0,(0.0/385.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.656458,1.0,193.0
1,max f2,0.656458,1.0,193.0
2,max f0point5,0.656458,1.0,193.0
3,max accuracy,0.656458,1.0,193.0
4,max precision,0.901071,1.0,0.0
5,max recall,0.656458,1.0,193.0
6,max specificity,0.901071,1.0,0.0
7,max absolute_mcc,0.656458,1.0,193.0
8,max min_per_class_accuracy,0.656458,1.0,193.0
9,max mean_per_class_accuracy,0.656458,1.0,193.0



Gains/Lift Table: Avg response rate: 50.39 %, avg score: 50.29 %


Unnamed: 0,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
0,1,0.01039,0.873412,1.984536,1.984536,1.0,0.888342,1.0,0.888342,0.020619,0.020619,98.453608,98.453608,0.020619
1,2,0.020779,0.863906,1.984536,1.984536,1.0,0.868898,1.0,0.87862,0.020619,0.041237,98.453608,98.453608,0.041237
2,3,0.031169,0.858321,1.984536,1.984536,1.0,0.860346,1.0,0.872529,0.020619,0.061856,98.453608,98.453608,0.061856
3,4,0.041558,0.854854,1.984536,1.984536,1.0,0.856656,1.0,0.868561,0.020619,0.082474,98.453608,98.453608,0.082474
4,5,0.051948,0.851473,1.984536,1.984536,1.0,0.852912,1.0,0.865431,0.020619,0.103093,98.453608,98.453608,0.103093
5,6,0.101299,0.836721,1.984536,1.984536,1.0,0.844889,1.0,0.855423,0.097938,0.201031,98.453608,98.453608,0.201031
6,7,0.150649,0.826535,1.984536,1.984536,1.0,0.831652,1.0,0.847636,0.097938,0.298969,98.453608,98.453608,0.298969
7,8,0.2,0.819018,1.984536,1.984536,1.0,0.822109,1.0,0.841337,0.097938,0.396907,98.453608,98.453608,0.396907
8,9,0.301299,0.800327,1.984536,1.984536,1.0,0.809499,1.0,0.830633,0.201031,0.597938,98.453608,98.453608,0.597938
9,10,0.4,0.775411,1.984536,1.984536,1.0,0.789021,1.0,0.820365,0.195876,0.793814,98.453608,98.453608,0.793814




ModelMetricsBinomialGLM: glm
** Reported on cross-validation data. **

MSE: 0.23415807123674667
RMSE: 0.4838988233471401
LogLoss: 0.6613912273786693
Null degrees of freedom: 384
Residual degrees of freedom: -33300
Null deviance: 535.945754700407
Residual deviance: 509.2712450815754
AIC: 67879.27124508157
AUC: 0.6541803853834943
AUCPR: 0.6052170943785665
Gini: 0.30836077076698865

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.4022921684608267: 


Unnamed: 0,Unnamed: 1,0,1,Error,Rate
0,0,73.0,118.0,0.6178,(118.0/191.0)
1,1,25.0,169.0,0.1289,(25.0/194.0)
2,Total,98.0,287.0,0.3714,(143.0/385.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.402292,0.702703,286.0
1,max f2,0.2136,0.836207,383.0
2,max f0point5,0.459789,0.64632,229.0
3,max accuracy,0.459789,0.646753,229.0
4,max precision,0.870085,1.0,0.0
5,max recall,0.2136,1.0,383.0
6,max specificity,0.870085,1.0,0.0
7,max absolute_mcc,0.459789,0.297701,229.0
8,max min_per_class_accuracy,0.495083,0.613402,188.0
9,max mean_per_class_accuracy,0.459789,0.646003,229.0



Gains/Lift Table: Avg response rate: 50.39 %, avg score: 49.19 %


Unnamed: 0,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
0,1,0.01039,0.78365,1.488402,1.488402,0.75,0.811339,0.75,0.811339,0.015464,0.015464,48.840206,48.840206,0.010228
1,2,0.020779,0.750614,0.496134,0.992268,0.25,0.764665,0.5,0.788002,0.005155,0.020619,-50.386598,-0.773196,-0.000324
2,3,0.031169,0.72575,0.496134,0.82689,0.25,0.736885,0.416667,0.770963,0.005155,0.025773,-50.386598,-17.310997,-0.010876
3,4,0.041558,0.707011,0.992268,0.868235,0.5,0.714232,0.4375,0.756781,0.010309,0.036082,-0.773196,-13.176546,-0.011038
4,5,0.051948,0.690719,0.992268,0.893041,0.5,0.695487,0.45,0.744522,0.010309,0.046392,-0.773196,-10.695876,-0.0112
5,6,0.101299,0.651245,1.46229,1.170367,0.736842,0.67305,0.589744,0.709702,0.072165,0.118557,46.228974,17.036743,0.034787
6,7,0.150649,0.622211,1.46229,1.265997,0.736842,0.636393,0.637931,0.685687,0.072165,0.190722,46.228974,26.599716,0.080774
7,8,0.2,0.600577,1.253391,1.262887,0.631579,0.609559,0.636364,0.666902,0.061856,0.252577,25.339121,26.28866,0.10598
8,9,0.301299,0.565328,1.068596,1.197565,0.538462,0.580348,0.603448,0.637802,0.108247,0.360825,6.859635,19.756488,0.119987
9,10,0.4,0.524354,1.514514,1.275773,0.763158,0.543104,0.642857,0.614435,0.149485,0.510309,51.451438,27.57732,0.222351




Cross-Validation Metrics Summary: 


Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
0,accuracy,0.64155847,0.04267968,0.64935064,0.5974026,0.5974026,0.6883117,0.6753247
1,auc,0.65344536,0.06317549,0.62635136,0.6145833,0.58986485,0.74358976,0.6928375
2,aucpr,0.6196122,0.1179298,0.5931685,0.45215598,0.5899048,0.7538463,0.7089854
3,err,0.35844156,0.04267968,0.35064936,0.4025974,0.4025974,0.3116883,0.32467532
4,err_count,27.6,3.2863352,27.0,31.0,31.0,24.0,25.0
5,f0point5,0.6387844,0.055315025,0.65068495,0.5555556,0.6168831,0.6722689,0.6985294
6,f1,0.7143141,0.042380735,0.7378641,0.6436782,0.71028036,0.72727275,0.75247526
7,f2,0.8123159,0.03478985,0.85201794,0.76502734,0.8370044,0.7920792,0.81545067
8,lift_top_group,1.1752632,1.0736606,1.925,0.0,1.925,2.0263157,0.0
9,logloss,0.6613912,0.026552366,0.6686925,0.6799435,0.6881766,0.622428,0.6477155



See the whole table with table.as_data_frame()

Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,iteration,lambda,predictors,deviance_train,deviance_test,deviance_xval,deviance_se,training_rmse,training_logloss,training_r2,training_auc,training_pr_auc,training_lift,training_classification_error
0,,2020-10-13 19:19:19,0.000 sec,4,12.0,33687,0.441924,,0.0,0.0,,,,,,,
1,,2020-10-13 19:19:20,0.322 sec,0,12.0,1,1.386234,,-1.0,-1.0,0.201499,0.220962,0.837583,,,1.98454,0.0
2,,2020-10-13 19:19:20,0.856 sec,0,12.0,1,1.386234,,-1.0,-1.0,,,,,,,
3,,2020-10-13 19:19:20,1.195 sec,0,12.0,1,1.386234,,-1.0,-1.0,,,,,,,
4,,2020-10-13 19:19:21,1.510 sec,0,12.0,1,1.386234,,-1.0,-1.0,,,,,,,
5,,2020-10-13 19:19:21,1.827 sec,0,12.0,1,1.386234,,-1.0,-1.0,,,,,,,




In [30]:
perf = aml.leader.model_performance(test_h2o)
#perf.auc()

In [31]:
perf


ModelMetricsBinomialGLM: glm
** Reported on test data. **

MSE: 0.2226025239059446
RMSE: 0.4718077192097906
LogLoss: 0.635961924940708
Null degrees of freedom: 165
Residual degrees of freedom: -33521
Null deviance: 230.10377406125804
Residual deviance: 211.13935908031507
AIC: 67585.13935908032
AUC: 0.7036875725900116
AUCPR: 0.6976050117686056
Gini: 0.40737514518002316

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.37997606303680626: 


Unnamed: 0,Unnamed: 1,0,1,Error,Rate
0,0,22.0,60.0,0.7317,(60.0/82.0)
1,1,4.0,80.0,0.0476,(4.0/84.0)
2,Total,26.0,140.0,0.3855,(64.0/166.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.379976,0.714286,139.0
1,max f2,0.338433,0.843621,149.0
2,max f0point5,0.547792,0.666667,53.0
3,max accuracy,0.428871,0.662651,107.0
4,max precision,0.844221,1.0,0.0
5,max recall,0.274359,1.0,162.0
6,max specificity,0.844221,1.0,0.0
7,max absolute_mcc,0.428871,0.337363,107.0
8,max min_per_class_accuracy,0.496292,0.654762,82.0
9,max mean_per_class_accuracy,0.428871,0.660859,107.0



Gains/Lift Table: Avg response rate: 50.60 %, avg score: 49.10 %


Unnamed: 0,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
0,1,0.012048,0.776098,1.97619,1.97619,1.0,0.815626,1.0,0.815626,0.02381,0.02381,97.619048,97.619048,0.02381
1,2,0.024096,0.737304,1.97619,1.97619,1.0,0.754571,1.0,0.785099,0.02381,0.047619,97.619048,97.619048,0.047619
2,3,0.03012,0.704997,1.97619,1.97619,1.0,0.733508,1.0,0.774781,0.011905,0.059524,97.619048,97.619048,0.059524
3,4,0.042169,0.699911,0.988095,1.693878,0.5,0.702335,0.857143,0.754082,0.011905,0.071429,-1.190476,69.387755,0.059233
4,5,0.054217,0.694862,0.988095,1.537037,0.5,0.697904,0.777778,0.741598,0.011905,0.083333,-1.190476,53.703704,0.058943
5,6,0.10241,0.63158,1.482143,1.511204,0.75,0.65663,0.764706,0.701613,0.071429,0.154762,48.214286,51.120448,0.105981
6,7,0.150602,0.61221,0.988095,1.34381,0.5,0.623597,0.68,0.676648,0.047619,0.202381,-1.190476,34.380952,0.10482
7,8,0.204819,0.58062,1.537037,1.394958,0.777778,0.597224,0.705882,0.655624,0.083333,0.285714,53.703704,39.495798,0.163763
8,9,0.301205,0.552836,1.482143,1.422857,0.75,0.565244,0.72,0.626702,0.142857,0.428571,48.214286,42.285714,0.25784
9,10,0.403614,0.520997,0.929972,1.297797,0.470588,0.537153,0.656716,0.603981,0.095238,0.52381,-7.002801,29.779673,0.243322





