In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

import matplotlib.pyplot as plt
import re
import pandas as pd
import joblib
import numpy as np
import xgboost as xgb

xgb.set_config(verbosity=2, use_rmm=True)


In [2]:

# dataset_dir = 'sentiment140'
dataset_dir = 'imdb'
# dataset_dir = 'coronaNLP'

# n_gram = (1, 1)
n_gram = (1, 2)
# n_gram = (2, 2)

# importing the processed dataframe
df = joblib.load(f'./dataframes/df_{dataset_dir}.pkl')

df.head()


Unnamed: 0,review,sentiment
0,one review mention watch 1 oz episod youll hoo...,1
1,wonder littl product br br film techniqu veri ...,1
2,thought wonder way spend time hot summer weeke...,1
3,basic famili littl boy jake think zombi closet...,0
4,petter mattei love time money visual stun film...,1


In [3]:


X = df.iloc[:, 0]


y = df.iloc[:, 1]

X, y


(0        one review mention watch 1 oz episod youll hoo...
 1        wonder littl product br br film techniqu veri ...
 2        thought wonder way spend time hot summer weeke...
 3        basic famili littl boy jake think zombi closet...
 4        petter mattei love time money visual stun film...
                                ...                        
 49995    thought movi right good job wasnt creativ orig...
 49996    bad plot bad dialogu bad act idiot direct anno...
 49997    cathol taught parochi elementari school nun ta...
 49998    im go disagre previous comment side maltin one...
 49999    one expect star trek movi high art fan expect ...
 Name: review, Length: 49582, dtype: object,
 0        1
 1        1
 2        1
 3        0
 4        1
         ..
 49995    1
 49996    0
 49997    0
 49998    0
 49999    0
 Name: sentiment, Length: 49582, dtype: int32)

In [4]:

tfidf = joblib.load(
    f"./vectors/vectorizer_{dataset_dir}_{n_gram}.pkl")
tfidf


TfidfVectorizer(max_features=143417, ngram_range=(1, 2),
                tokenizer=<bound method RegexpTokenizer.tokenize of RegexpTokenizer(pattern='[a-zA-Z0-9]+', gaps=False, discard_empty=True, flags=<RegexFlag.UNICODE|DOTALL|MULTILINE: 56>)>)

In [5]:
tfidf.vocabulary_, tfidf.idf_


({'one': 89749,
  'review': 104217,
  'mention': 79983,
  'watch': 135960,
  '1': 16,
  'oz': 92662,
  'episod': 39341,
  'youll': 142899,
  'hook': 60315,
  'right': 104679,
  'exact': 41624,
  'happen': 57545,
  'mebr': 79599,
  'br': 16098,
  'first': 48342,
  'thing': 125575,
  'struck': 121416,
  'brutal': 18456,
  'unflinch': 131668,
  'scene': 107707,
  'violenc': 134465,
  'set': 112169,
  'word': 140232,
  'go': 53878,
  'trust': 130141,
  'show': 113772,
  'faint': 43439,
  'heart': 58491,
  'timid': 128044,
  'pull': 99789,
  'punch': 99847,
  'regard': 103006,
  'drug': 35424,
  'sex': 112555,
  'hardcor': 57935,
  'classic': 23240,
  'use': 132526,
  'wordbr': 140309,
  'call': 19126,
  'nicknam': 87185,
  'given': 53664,
  'oswald': 92254,
  'maximum': 78956,
  'secur': 109667,
  'state': 119430,
  'focus': 49341,
  'main': 76667,
  'emerald': 37637,
  'citi': 23087,
  'experiment': 42502,
  'section': 109658,
  'prison': 98806,
  'cell': 20839,
  'glass': 53787,
  'front

In [6]:
X = tfidf.transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

X_train.shape, y_train.shape


((39665, 143417), (39665,))

In [7]:
# creating our pipeline that will return an estimator
pipeline = Pipeline([('clf', xgb.XGBClassifier(use_label_encoder=False))])

parameters = {
    'clf__booster': ('gbtree', 'gblinear', 'dart'),
    'clf__eta': (0.5, 0.75, 1),
    }

clf = GridSearchCV(pipeline, param_grid=parameters, scoring='accuracy', cv=5, verbose=1)
clf


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('clf',
                                        XGBClassifier(base_score=None,
                                                      booster=None,
                                                      colsample_bylevel=None,
                                                      colsample_bynode=None,
                                                      colsample_bytree=None,
                                                      enable_categorical=False,
                                                      gamma=None, gpu_id=None,
                                                      importance_type=None,
                                                      interaction_constraints=None,
                                                      learning_rate=None,
                                                      max_delta_step=None,
                                                      max_depth=None,
                         

In [8]:

clf = clf.fit(X_train, y_train)


Fitting 5 folds for each of 9 candidates, totalling 45 fits
[19:45:26] INFO: D:\Build\xgboost\xgboost-1.5.1.git\src\tree\updater_prune.cc:101: tree pruning end, 96 extra nodes, 0 pruned nodes, max_depth=6
[19:45:26] INFO: D:\Build\xgboost\xgboost-1.5.1.git\src\tree\updater_prune.cc:101: tree pruning end, 94 extra nodes, 0 pruned nodes, max_depth=6
[19:45:27] INFO: D:\Build\xgboost\xgboost-1.5.1.git\src\tree\updater_prune.cc:101: tree pruning end, 88 extra nodes, 0 pruned nodes, max_depth=6
[19:45:27] INFO: D:\Build\xgboost\xgboost-1.5.1.git\src\tree\updater_prune.cc:101: tree pruning end, 94 extra nodes, 0 pruned nodes, max_depth=6
[19:45:28] INFO: D:\Build\xgboost\xgboost-1.5.1.git\src\tree\updater_prune.cc:101: tree pruning end, 78 extra nodes, 0 pruned nodes, max_depth=6
[19:45:28] INFO: D:\Build\xgboost\xgboost-1.5.1.git\src\tree\updater_prune.cc:101: tree pruning end, 74 extra nodes, 0 pruned nodes, max_depth=6
[19:45:29] INFO: D:\Build\xgboost\xgboost-1.5.1.git\src\tree\updater_p

In [9]:


y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))



              precision    recall  f1-score   support

           0       0.89      0.87      0.88      4939
           1       0.87      0.89      0.88      4978

    accuracy                           0.88      9917
   macro avg       0.88      0.88      0.88      9917
weighted avg       0.88      0.88      0.88      9917



In [10]:


print("Best: %f using %s" % (clf.best_score_, 
    clf.best_params_))
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
params = clf.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))


Best: 0.877146 using {'clf__booster': 'gblinear', 'clf__eta': 0.5}
0.860330 (0.001457) with: {'clf__booster': 'gbtree', 'clf__eta': 0.5}
0.855288 (0.003174) with: {'clf__booster': 'gbtree', 'clf__eta': 0.75}
0.848229 (0.002438) with: {'clf__booster': 'gbtree', 'clf__eta': 1}
0.877146 (0.006396) with: {'clf__booster': 'gblinear', 'clf__eta': 0.5}
0.842052 (0.005125) with: {'clf__booster': 'gblinear', 'clf__eta': 0.75}
0.811597 (0.006099) with: {'clf__booster': 'gblinear', 'clf__eta': 1}
0.860330 (0.001457) with: {'clf__booster': 'dart', 'clf__eta': 0.5}
0.855288 (0.003174) with: {'clf__booster': 'dart', 'clf__eta': 0.75}
0.848229 (0.002438) with: {'clf__booster': 'dart', 'clf__eta': 1}


In [11]:




from sklearn.metrics import confusion_matrix

print(confusion_matrix(y_test, y_pred))

acc = int(accuracy_score(y_test, y_pred)*100)



[[4289  650]
 [ 550 4428]]


In [12]:


test_tweet = "groceri store"
vector = tfidf.transform([test_tweet])

print(clf.predict(vector))



[1]


In [13]:



# exporting the pipeline
joblib.dump(clf.best_estimator_, f'./models/xgb_{dataset_dir}_{acc}_{n_gram}.pkl')




['./models/xgb_imdb_87_(1, 2).pkl']