# Import Tools

In [1]:
###Modules Utilized 
import sklearn
import nltk
import numpy as np
import pandas as pd
from pycaret.classification import *
from pycaret.datasets import get_data
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer

# Load Data

In [2]:
#data used to train the supervised models in milestone 2
data = pd.read_parquet('supervised_learning_data')

In [3]:
#data = data.sample(frac=0.5, replace=True, random_state=1)

#binary classification
data = data[data.sentiment != 1]

data = data.rename(columns={"pre_process": "review",
                             "sentiment": "Flag_1"})
data.head()

Unnamed: 0,review,Flag_1
0,engrossing page turner race class bullying gra...,2
1,interesting debut “ hello niveus high it ’ s t...,2
2,wow ordered book teenage daughter luvs,2
3,definitely ya novel great plot childish charac...,2
4,timely book engrossing story intrigued premise...,2


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 337148 entries, 0 to 362923
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   review  337148 non-null  object
 1   Flag_1  337148 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 7.7+ MB


# Vectorize Data

Sklearn provides facilities to extract numerical features from a text document by tokenizing, counting and normalising. CountVectorizer performs the task of tokenizing and counting, while TfidfTransformer normalizes the data. TfidfVectorizer, on the other hand, performs all three operations, thereby streamlining the process of natural language processing.

### Create Dataframe 

In [5]:
#Set vectorizers
%time v = TfidfVectorizer(max_features=800, ngram_range=(1, 4))
%time V = v.fit_transform(data['review'])

df_tf = pd.DataFrame(V.toarray(), columns=v.vocabulary_)
#display(df_tf.head())

CPU times: user 11 µs, sys: 9 µs, total: 20 µs
Wall time: 23.8 µs
CPU times: user 3min, sys: 11.2 s, total: 3min 11s
Wall time: 3min 12s


In [6]:
#Preparing processed and BoW-TF embedded data for Classification
df_tf_m = pd.concat([data, df_tf], axis = 1)
df_tf_m.drop(columns=['review'], inplace = True)
print(df_tf_m.shape)
#df_tf_m.head()

(361528, 800)


In [7]:
df_tf_m = df_tf_m.dropna()

# Supervised Learning Set-Up

### Initiate Pipeline

In [8]:
#build models using data
%time s = setup(df_tf_m, target = 'Flag_1', feature_selection = True, fix_imbalance=True,train_size = 0.80,session_id = 5)

Unnamed: 0,Description,Value
0,Session id,5
1,Target,Flag_1
2,Target type,Binary
3,Target mapping,"0.0: 0, 2.0: 1"
4,Original data shape,"(312768, 800)"
5,Transformed data shape,"(514786, 160)"
6,Transformed train set shape,"(452232, 160)"
7,Transformed test set shape,"(62554, 160)"
8,Numeric features,799
9,Preprocess,True


CPU times: user 16min 39s, sys: 48.4 s, total: 17min 28s
Wall time: 1min 52s


In [9]:
%time best_models = compare_models(n_select = 3)
save_model(best_models[0] , 'bestmodel1')
save_model(best_models[1] , 'bestmodel2')
save_model(best_models[2] , 'bestmodel3')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.8787,0.5039,0.969,0.9038,0.9352,0.001,0.0012,26.928
rf,Random Forest Classifier,0.8622,0.5003,0.9488,0.9036,0.9256,-0.0014,-0.0014,20.709
lightgbm,Light Gradient Boosting Machine,0.8185,0.507,0.8945,0.9037,0.8991,0.0002,0.0002,18.875
gbc,Gradient Boosting Classifier,0.7382,0.4929,0.7961,0.9026,0.846,-0.0064,-0.0071,48.708
dt,Decision Tree Classifier,0.7319,0.5012,0.7866,0.9043,0.8413,0.0033,0.0037,18.807
ada,Ada Boost Classifier,0.6698,0.4958,0.711,0.903,0.7956,-0.003,-0.0036,22.41
svm,SVM - Linear Kernel,0.6507,0.0,0.6852,0.9053,0.7798,0.0064,0.008,14.146
lr,Logistic Regression,0.641,0.5146,0.673,0.9055,0.7721,0.0067,0.0087,19.56
lda,Linear Discriminant Analysis,0.6383,0.5153,0.6696,0.9056,0.7699,0.007,0.0091,15.118
ridge,Ridge Classifier,0.6382,0.0,0.6692,0.9059,0.7697,0.0079,0.0104,14.4


CPU times: user 36min 38s, sys: 2min 59s, total: 39min 38s
Wall time: 54min 18s
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=FastMemory(location=/tmp/joblib),
          steps=[('label_encoding',
                  TransformerWrapperWithInverse(exclude=None, include=None,
                                                transformer=LabelEncoder())),
                 ('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['page', 'race', 'class', 'ive',
                                              'never', 'fan', 'girl', 'book',
                                              'series', 'enjoyed', 'get', 'isnt',
                                              'favorite', 'however',
                                              'incredibly', 'awesom...
                  LGBMClassifier(boosting_type='gbdt', class_weight=None,
                                 colsample_bytree=1.0, importance_type='split',
                                 learning_rate=0.1, max_depth=-1,
                                 min_child_samples=20, min_child_weight

In [10]:
%time ensemble_et = ensemble_model(best_models[0])
save_model(ensemble_et , 'ensemble_et')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.879,0.5025,0.9694,0.9037,0.9354,0.0007,0.0008
1,0.8792,0.5036,0.9689,0.9043,0.9355,0.01,0.0118
2,0.8792,0.5101,0.9696,0.9037,0.9355,0.001,0.0012
3,0.8807,0.505,0.9707,0.9043,0.9363,0.0104,0.0124
4,0.8781,0.5045,0.968,0.9039,0.9349,0.0035,0.0041
5,0.8787,0.5025,0.9695,0.9033,0.9352,-0.0068,-0.0082
6,0.8803,0.5023,0.9708,0.9039,0.9362,0.0039,0.0047
7,0.8789,0.5078,0.9694,0.9036,0.9354,-0.0016,-0.0019
8,0.8786,0.5135,0.969,0.9037,0.9352,-0.0004,-0.0005
9,0.8792,0.4959,0.9701,0.9033,0.9355,-0.0054,-0.0065


CPU times: user 8min 18s, sys: 1min 8s, total: 9min 27s
Wall time: 26min 14s
Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=FastMemory(location=/tmp/joblib),
          steps=[('label_encoding',
                  TransformerWrapperWithInverse(exclude=None, include=None,
                                                transformer=LabelEncoder())),
                 ('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['page', 'race', 'class', 'ive',
                                              'never', 'fan', 'girl', 'book',
                                              'series', 'enjoyed', 'get', 'isnt',
                                              'favorite', 'however',
                                              'incredibly', 'awesom...
                                                                   max_leaf_nodes=None,
                                                                   max_samples=None,
                                                                   min_impurity_decrease=0.0,
                        

In [11]:
%time ensemble_rf = ensemble_model(best_models[1])
save_model(ensemble_rf , 'ensemble_rf')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8651,0.4943,0.9523,0.9036,0.9273,-0.001,-0.0011
1,0.8647,0.4911,0.9514,0.9039,0.9271,0.0032,0.0034
2,0.8632,0.5009,0.9506,0.9031,0.9262,-0.0081,-0.0086
3,0.8659,0.496,0.9522,0.9044,0.9277,0.0106,0.0113
4,0.8625,0.5008,0.9484,0.9041,0.9257,0.0056,0.0059
5,0.8633,0.5,0.9501,0.9036,0.9263,-0.0017,-0.0018
6,0.8634,0.4985,0.9498,0.9039,0.9263,0.0031,0.0033
7,0.8626,0.5097,0.9489,0.9039,0.9258,0.0025,0.0027
8,0.8626,0.5041,0.9492,0.9036,0.9259,-0.0007,-0.0008
9,0.8622,0.502,0.9498,0.9028,0.9257,-0.0128,-0.0136


CPU times: user 7min 6s, sys: 52.9 s, total: 7min 59s
Wall time: 15min 39s
Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=FastMemory(location=/tmp/joblib),
          steps=[('label_encoding',
                  TransformerWrapperWithInverse(exclude=None, include=None,
                                                transformer=LabelEncoder())),
                 ('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['page', 'race', 'class', 'ive',
                                              'never', 'fan', 'girl', 'book',
                                              'series', 'enjoyed', 'get', 'isnt',
                                              'favorite', 'however',
                                              'incredibly', 'awesom...
                                                                     max_leaf_nodes=None,
                                                                     max_samples=None,
                                                                     min_impurity_decrease=0.0,
                  

In [12]:
%time ensemble_lgbm = ensemble_model(best_models[2])
save_model(ensemble_lgbm , 'ensemble_lgbm')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8203,0.5097,0.8978,0.9028,0.9003,-0.009,-0.009
1,0.8218,0.5092,0.8985,0.9037,0.9011,0.0002,0.0002
2,0.8182,0.5047,0.8936,0.9041,0.8988,0.0038,0.0038
3,0.8235,0.5203,0.8993,0.9048,0.902,0.0114,0.0114
4,0.82,0.5116,0.8952,0.9047,0.8999,0.0093,0.0094
5,0.8157,0.4893,0.892,0.9029,0.8974,-0.0076,-0.0076
6,0.8235,0.5125,0.8987,0.9053,0.902,0.0164,0.0164
7,0.8188,0.5019,0.8959,0.9028,0.8994,-0.0088,-0.0088
8,0.8188,0.5209,0.8952,0.9034,0.8993,-0.0026,-0.0026
9,0.8185,0.4988,0.8958,0.9027,0.8992,-0.0101,-0.0101


CPU times: user 24min 10s, sys: 25.1 s, total: 24min 35s
Wall time: 6min 8s
Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=FastMemory(location=/tmp/joblib),
          steps=[('label_encoding',
                  TransformerWrapperWithInverse(exclude=None, include=None,
                                                transformer=LabelEncoder())),
                 ('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['page', 'race', 'class', 'ive',
                                              'never', 'fan', 'girl', 'book',
                                              'series', 'enjoyed', 'get', 'isnt',
                                              'favorite', 'however',
                                              'incredibly', 'awesom...
                                                             min_child_samples=20,
                                                             min_child_weight=0.001,
                                                             min_split_gain=0.0,
                                          

In [13]:
%time blender_weighted_med = blend_models(best_models, weights = [0.50,0.30,0.20])
save_model(blender_weighted_med , 'blender_weighted_med')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8769,0.5003,0.9665,0.9039,0.9342,0.0036,0.0042
1,0.875,0.5038,0.9648,0.9035,0.9331,-0.0034,-0.0039
2,0.8748,0.5084,0.9642,0.9037,0.933,0.0004,0.0005
3,0.8762,0.5052,0.9652,0.9043,0.9337,0.0093,0.0107
4,0.8735,0.5027,0.9626,0.9037,0.9322,-0.0006,-0.0007
5,0.8737,0.4994,0.9636,0.9031,0.9324,-0.0102,-0.0116
6,0.8763,0.5081,0.9654,0.9042,0.9338,0.0073,0.0084
7,0.8748,0.5095,0.9644,0.9035,0.933,-0.0027,-0.0031
8,0.8743,0.5108,0.9636,0.9037,0.9327,0.0002,0.0003
9,0.8749,0.5004,0.9649,0.9032,0.933,-0.0068,-0.0078


CPU times: user 8min 16s, sys: 1min, total: 9min 17s
Wall time: 11min 13s
Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=FastMemory(location=/tmp/joblib),
          steps=[('label_encoding',
                  TransformerWrapperWithInverse(exclude=None, include=None,
                                                transformer=LabelEncoder())),
                 ('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['page', 'race', 'class', 'ive',
                                              'never', 'fan', 'girl', 'book',
                                              'series', 'enjoyed', 'get', 'isnt',
                                              'favorite', 'however',
                                              'incredibly', 'awesom...
                                                               learning_rate=0.1,
                                                               max_depth=-1,
                                                               min_child_samples=20,
                                               

In [14]:
%time blender_weighted_soft = blend_models(best_models, weights = [0.50,0.30,0.20], method='soft')
save_model(blender_weighted_soft , 'blender_weighted_soft')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8777,0.5018,0.9672,0.9042,0.9346,0.0076,0.0088
1,0.8758,0.4983,0.9653,0.9038,0.9335,0.0019,0.0022
2,0.874,0.5034,0.9637,0.9034,0.9326,-0.0049,-0.0056
3,0.8751,0.5043,0.9636,0.9044,0.9331,0.011,0.0125
4,0.8749,0.5069,0.9632,0.9046,0.933,0.0138,0.0156
5,0.8736,0.4934,0.9633,0.9033,0.9323,-0.0066,-0.0075
6,0.8764,0.5081,0.9659,0.9039,0.9339,0.0034,0.0039
7,0.8734,0.5138,0.9627,0.9035,0.9322,-0.0022,-0.0025
8,0.8741,0.5133,0.9636,0.9035,0.9326,-0.0032,-0.0037
9,0.8748,0.5014,0.9649,0.9032,0.933,-0.0079,-0.0091


CPU times: user 6min 19s, sys: 59 s, total: 7min 18s
Wall time: 10min 59s
Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=FastMemory(location=/tmp/joblib),
          steps=[('label_encoding',
                  TransformerWrapperWithInverse(exclude=None, include=None,
                                                transformer=LabelEncoder())),
                 ('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['page', 'race', 'class', 'ive',
                                              'never', 'fan', 'girl', 'book',
                                              'series', 'enjoyed', 'get', 'isnt',
                                              'favorite', 'however',
                                              'incredibly', 'awesom...
                                                               learning_rate=0.1,
                                                               max_depth=-1,
                                                               min_child_samples=20,
                                               

In [15]:
%time stacker = stack_models(best_models)
save_model(stacker , 'stacker')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8833,0.5121,0.9736,0.9045,0.9378,0.0131,0.0162
1,0.8814,0.5084,0.9721,0.9039,0.9368,0.0036,0.0043
2,0.8803,0.5148,0.9713,0.9035,0.9361,-0.0032,-0.0039
3,0.8818,0.5059,0.9728,0.9038,0.937,0.0014,0.0018
4,0.8828,0.5101,0.9736,0.9041,0.9375,0.0063,0.0078
5,0.8783,0.4955,0.969,0.9034,0.935,-0.0052,-0.0062
6,0.8809,0.5047,0.9717,0.9038,0.9365,0.0016,0.002
7,0.8804,0.5177,0.9708,0.904,0.9362,0.0046,0.0055
8,0.8775,0.5113,0.9681,0.9033,0.9346,-0.0059,-0.007
9,0.8816,0.5094,0.9732,0.9033,0.9369,-0.0066,-0.0082


CPU times: user 10min 52s, sys: 2min 24s, total: 13min 16s
Wall time: 43min 59s
Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=FastMemory(location=/tmp/joblib),
          steps=[('label_encoding',
                  TransformerWrapperWithInverse(exclude=None, include=None,
                                                transformer=LabelEncoder())),
                 ('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['page', 'race', 'class', 'ive',
                                              'never', 'fan', 'girl', 'book',
                                              'series', 'enjoyed', 'get', 'isnt',
                                              'favorite', 'however',
                                              'incredibly', 'awesom...
                                                                 subsample_for_bin=200000,
                                                                 subsample_freq=0))],
                                     final_estimator=LogisticRegression(C=1.0,
                                   

In [18]:
%time stacker_et = stack_models(best_models, meta_model = best_models[0])
save_model(stacker , 'stacker_et')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8854,0.4994,0.9768,0.9041,0.939,0.0064,0.0082
1,0.8843,0.495,0.9763,0.9035,0.9385,-0.0032,-0.0041
2,0.8845,0.4938,0.9763,0.9036,0.9385,-0.0019,-0.0024
3,0.8859,0.5049,0.9775,0.904,0.9393,0.0061,0.008
4,0.883,0.5041,0.9743,0.9038,0.9377,0.0007,0.0009
5,0.8839,0.4834,0.9759,0.9033,0.9382,-0.0069,-0.0088
6,0.8852,0.4932,0.9767,0.9039,0.9389,0.0043,0.0056
7,0.8847,0.5005,0.9765,0.9037,0.9387,-0.0004,-0.0006
8,0.8836,0.5017,0.9754,0.9035,0.9381,-0.0027,-0.0035
9,0.8838,0.5004,0.976,0.9032,0.9382,-0.0074,-0.0096


CPU times: user 20min, sys: 1min 35s, total: 21min 35s
Wall time: 45min 10s
Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=FastMemory(location=/tmp/joblib),
          steps=[('label_encoding',
                  TransformerWrapperWithInverse(exclude=None, include=None,
                                                transformer=LabelEncoder())),
                 ('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['page', 'race', 'class', 'ive',
                                              'never', 'fan', 'girl', 'book',
                                              'series', 'enjoyed', 'get', 'isnt',
                                              'favorite', 'however',
                                              'incredibly', 'awesom...
                                                                 subsample_for_bin=200000,
                                                                 subsample_freq=0))],
                                     final_estimator=LogisticRegression(C=1.0,
                                   

In [19]:
%time blender_weighted_hard = blend_models(best_models, weights = [0.50,0.30,0.20], method='hard')
save_model(blender_weighted_hard , 'blender_weighted_hard')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8686,0.0,0.9561,0.904,0.9293,0.004,0.0043
1,0.8702,0.0,0.9583,0.9039,0.9303,0.003,0.0033
2,0.8647,0.0,0.9523,0.9033,0.9271,-0.0059,-0.0063
3,0.8715,0.0,0.9589,0.9046,0.931,0.0133,0.0146
4,0.8656,0.0,0.9526,0.9039,0.9276,0.002,0.0022
5,0.8664,0.0,0.9542,0.9034,0.9281,-0.0045,-0.0049
6,0.8677,0.0,0.9552,0.9039,0.9288,0.0027,0.003
7,0.8666,0.0,0.954,0.9037,0.9282,0.0006,0.0007
8,0.8655,0.0,0.9533,0.9032,0.9276,-0.0073,-0.0078
9,0.8658,0.0,0.9536,0.9033,0.9278,-0.0054,-0.0058


CPU times: user 4min 54s, sys: 1min 33s, total: 6min 28s
Wall time: 11min 48s
Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=FastMemory(location=/tmp/joblib),
          steps=[('label_encoding',
                  TransformerWrapperWithInverse(exclude=None, include=None,
                                                transformer=LabelEncoder())),
                 ('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['page', 'race', 'class', 'ive',
                                              'never', 'fan', 'girl', 'book',
                                              'series', 'enjoyed', 'get', 'isnt',
                                              'favorite', 'however',
                                              'incredibly', 'awesom...
                                                               learning_rate=0.1,
                                                               max_depth=-1,
                                                               min_child_samples=20,
                                               