In [10]:
from nltk.tokenize import RegexpTokenizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

import re
import pandas as pd
import joblib

# Dataset selector: keep exactly one of these assignments active.
dataset_dir = "imdb"
# dataset_dir = "sentiment140"
# dataset_dir = "coronaNLP"

# n-gram range tuple; it is interpolated into the vectorizer file name
# when the vectorizer is loaded.
# n_gram = (1, 1)
n_gram = (1, 2)
# n_gram = (2, 2)

# Load the processed dataframe pickled for the selected dataset.
dataframe_path = f'../dataframes/df_{dataset_dir}.pkl'
df = joblib.load(dataframe_path)


In [11]:

# Load the saved vectorizer for the selected dataset and n-gram range.
# n_gram is a tuple, so its repr (e.g. "(1, 2)") becomes part of the file name.
vectorizer_path = f'../vectors/vectorizer_{dataset_dir}_{n_gram}.pkl'
tfidf = joblib.load(vectorizer_path)


In [12]:


# The first dataframe column feeds the vectorizer; the second is the target.
# NOTE(review): presumably column 0 is the text and column 1 the label — confirm.
documents = df.iloc[:, 0]
X = tfidf.transform(documents)
y = df.iloc[:, 1]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [17]:
# Show the repr of the vectorized feature matrix as a quick sanity check
# (the captured output reports its shape, dtype and stored-element count).
X

<49582x143417 sparse matrix of type '<class 'numpy.float64'>'
	with 7196211 stored elements in Compressed Sparse Row format>

In [13]:

# Single-step pipeline wrapping the Bernoulli naive Bayes classifier; the
# 'clf' step name is what the 'clf__' prefixes in the grid refer to.
pipeline = Pipeline(steps=[('clf', BernoulliNB())])

# Hyper-parameter grid: 2 fit_prior values x 4 alpha values = 8 candidates.
parameters = {
    'clf__fit_prior': (False, True),
    # 'clf__binarize': (0.25, 0.5, 1.0),
    'clf__alpha': (1, 0.1, 0.01, 0.001),
}

# Exhaustive search over the grid with 5-fold cross-validation.
clf = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    cv=5,
    verbose=1,
)


In [14]:

# Run the grid search on the training split only. With 8 candidates and
# cv=5 this performs 40 fits, as the verbose output reports.
clf.fit(X_train, y_train)


Fitting 5 folds for each of 8 candidates, totalling 40 fits


GridSearchCV(cv=5, estimator=Pipeline(steps=[('clf', BernoulliNB())]),
             param_grid={'clf__alpha': (1, 0.1, 0.01, 0.001),
                         'clf__fit_prior': (False, True)},
             verbose=1)

In [15]:

# Predict labels for the held-out test split with the fitted grid search.
y_pred = clf.predict(X_test)

# Per-class precision / recall / F1 on the test split.
report = classification_report(y_test, y_pred)
print(report)

# Confusion matrix computed from the same test predictions.
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

# Cross-validation summary: the winning configuration first, then one line
# (mean score, standard deviation, parameters) per candidate.
print("Best: %f using %s" % (clf.best_score_, clf.best_params_))
cv_results = clf.cv_results_
for idx, candidate in enumerate(cv_results['params']):
    row = (
        cv_results['mean_test_score'][idx],
        cv_results['std_test_score'][idx],
        candidate,
    )
    print("%f (%f) with: %r" % row)

# Test accuracy as a whole percentage (int() truncates rather than rounds);
# this value is embedded in the exported model's file name.
acc = int(accuracy_score(y_test, y_pred) * 100)


              precision    recall  f1-score   support

           0       0.89      0.86      0.88      4939
           1       0.87      0.90      0.88      4978

    accuracy                           0.88      9917
   macro avg       0.88      0.88      0.88      9917
weighted avg       0.88      0.88      0.88      9917

[[4258  681]
 [ 503 4475]]
Best: 0.884886 using {'clf__alpha': 1, 'clf__fit_prior': True}
0.884886 (0.001249) with: {'clf__alpha': 1, 'clf__fit_prior': False}
0.884886 (0.001239) with: {'clf__alpha': 1, 'clf__fit_prior': True}
0.883499 (0.002523) with: {'clf__alpha': 0.1, 'clf__fit_prior': False}
0.883499 (0.002523) with: {'clf__alpha': 0.1, 'clf__fit_prior': True}
0.879289 (0.002287) with: {'clf__alpha': 0.01, 'clf__fit_prior': False}
0.879289 (0.002225) with: {'clf__alpha': 0.01, 'clf__fit_prior': True}
0.872432 (0.002031) with: {'clf__alpha': 0.001, 'clf__fit_prior': False}
0.872432 (0.002031) with: {'clf__alpha': 0.001, 'clf__fit_prior': True}


In [16]:

# Export the best estimator found by the grid search. The file name encodes
# the dataset and the truncated test-accuracy percentage.
# The file is opened in a `with` block so the handle is flushed and closed
# deterministically; passing a bare open(...) to joblib.dump left it open
# until garbage collection.
with open(f'../models/bnb_{dataset_dir}_{acc}', 'wb') as model_file:
    joblib.dump(clf.best_estimator_, model_file)
