In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

import matplotlib.pyplot as plt
import re
import pandas as pd
import joblib
import numpy as np
import xgboost as xgb

# Global XGBoost settings for every estimator created in this notebook.
# NOTE(review): use_rmm presumably only has an effect when XGBoost was built
# with the RMM plugin -- confirm this matches the runtime environment.
xgb.set_config(
    verbosity=1,
    use_rmm=True,
)

In [None]:

# Corpus selector: this name is interpolated into the dataframe, vectorizer
# and exported-model file names used throughout the notebook.
# dataset_dir = 'sentiment140'
dataset_dir = 'imdb'
# dataset_dir = 'coronaNLP'

# N-gram setting: only used here to pick the matching vectorizer file and to
# tag the exported model file name.
n_gram = (1, 1)
# n_gram = (1, 2)
# n_gram = (2, 2)

# Load the preprocessed dataframe for the selected corpus.
# NOTE(review): joblib.load unpickles its input -- only load trusted files.
df_path = f'../dataframes/df_{dataset_dir}.pkl'
df = joblib.load(df_path)

df.head()


In [None]:


# Column 0 is what gets vectorized later; column 1 is used as the target.
X, y = df.iloc[:, 0], df.iloc[:, 1]

X, y


In [None]:

# Load the previously fitted vectorizer that matches the chosen corpus and
# n-gram setting.
vectorizer_path = f"../vectors/vectorizer_{dataset_dir}_{n_gram}.pkl"
tfidf = joblib.load(vectorizer_path)
tfidf


In [None]:
# Inspect the fitted vectorizer without dumping the whole vocabulary into the
# cell output: show its size, a handful of term -> index entries, and the
# first few idf weights.
vocab_size = len(tfidf.vocabulary_)
vocab_preview = dict(list(tfidf.vocabulary_.items())[:10])
vocab_size, vocab_preview, tfidf.idf_[:10]


In [None]:
# Vectorize into a NEW name so the raw text in `X` is left intact; overwriting
# `X` made this cell fail on a second run because the already-transformed
# matrix was fed back into the vectorizer.
X_tfidf = tfidf.transform(X)

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, y, test_size=0.2, random_state=42)

X_train.shape, y_train.shape


In [None]:
# Single-step pipeline wrapping the XGBoost classifier. The step name 'clf'
# is the prefix used by the 'clf__*' keys of the search grid.
xgb_step = ('clf', xgb.XGBClassifier(max_depth=50, use_label_encoder=False))
pipeline = Pipeline([xgb_step])

# Bare-estimator alternative (used by the commented-out randomized search):
# xg = xgb.XGBClassifier(use_label_encoder=False)
# xg

In [None]:

# Hyper-parameter grid for the search. Keys carry the 'clf__' prefix so they
# are routed to the pipeline step named 'clf'.
# The original tuple contained `0,25` (a comma instead of a decimal point),
# which made the grid try eta=0 and eta=25 instead of the intended 0.25.
parameters = {
    'clf__booster': ('gbtree', 'gblinear', 'dart'),
    'clf__eta': (0.1, 0.25, 0.4, 0.5),
    }

# Wider search space, kept for the randomized-search variant further down:
# params = {
#           'booster': ['gbtree', 'gblinear', 'dart'],
#           'max_depth': [3, 6, 10, 15],
#           'eta': [0.01, 0.1, 0.2, 0.3, 0.4, 0.5],
#           'subsample': np.arange(0.5, 1.0, 0.1),
#           'colsample_bytree': np.arange(0.5, 1.0, 0.1),
#           'colsample_bylevel': np.arange(0.5, 1.0, 0.1),
#           'n_estimators': [100, 250, 500, 750],
#           }

# Exhaustive 5-fold search over `parameters`, scored by accuracy.
clf = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    scoring='accuracy',
    cv=5,
    verbose=1,
)

# Randomized-search alternative over the wider space above:
# clf = RandomizedSearchCV(xg, param_distributions=params,
#                          scoring='accuracy',
#                          n_iter=25,
#                          verbose=1)
clf


In [None]:

# Run the hyper-parameter search on the training split.
clf = clf.fit(X=X_train, y=y_train)


In [None]:


# Predict on the held-out split and print the per-class metrics.
y_pred = clf.predict(X_test)

report = classification_report(y_test, y_pred)
print(report)



In [None]:


# Headline result of the search: best mean CV score and the settings behind it.
best_summary = (clf.best_score_, clf.best_params_)
print("Best: %f using %s" % best_summary)

# One line per candidate: mean CV score, its standard deviation, and the
# parameter combination that produced them.
cv_results = clf.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
params = cv_results['params']
for mean, stdev, param in zip(means, stds, params):
    candidate_row = (mean, stdev, param)
    print("%f (%f) with: %r" % candidate_row)


In [None]:




# Confusion matrix of the held-out predictions. `confusion_matrix` is already
# imported in the first cell, so the redundant mid-notebook import was removed.
print(confusion_matrix(y_test, y_pred))

# Test accuracy as a whole-number percentage. int() truncates rather than
# rounds; the value is embedded in the exported model's file name.
acc = int(accuracy_score(y_test, y_pred)*100)



In [None]:


# Smoke test: push one hand-written sample through the same vectorizer and
# the fitted search object.
# NOTE(review): the sample looks pre-stemmed ("groceri") -- presumably to
# match the preprocessing applied to the training text; confirm.
test_tweet = "groceri store"
vector = tfidf.transform([test_tweet])

prediction = clf.predict(vector)
print(prediction)



In [None]:



# Persist the best pipeline found by the search; the file name records the
# corpus, the truncated test accuracy and the n-gram setting.
model_path = f'../models/xgb_{dataset_dir}_{acc}_{n_gram}.pkl'
joblib.dump(clf.best_estimator_, model_path)


