In [15]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score
from wordcloud import WordCloud

import seaborn as sns
import matplotlib.pyplot as plt
import re
import pandas as pd
import joblib
import numpy as np
import xgboost as xgb


In [16]:
# dataset_dir = 'sentiment140'
dataset_dir = 'imdb'
# dataset_dir = 'coronaNLP'

n_gram = (1, 2)

# importing the processed dataframe
df = joblib.load(f'../dataframes/df_{dataset_dir}.pkl')

df.head()


Unnamed: 0,review,sentiment
0,one review mention watch 1 oz episod youll hoo...,1
1,wonder littl product br br film techniqu veri ...,1
2,thought wonder way spend time hot summer weeke...,1
3,basic famili littl boy jake think zombi closet...,0
4,petter mattei love time money visual stun film...,1


In [17]:

X = df.iloc[:, 0]


y = df.iloc[:, 1]

X, y


(0        one review mention watch 1 oz episod youll hoo...
 1        wonder littl product br br film techniqu veri ...
 2        thought wonder way spend time hot summer weeke...
 3        basic famili littl boy jake think zombi closet...
 4        petter mattei love time money visual stun film...
                                ...                        
 49995    thought movi right good job wasnt creativ orig...
 49996    bad plot bad dialogu bad act idiot direct anno...
 49997    cathol taught parochi elementari school nun ta...
 49998    im go disagre previous comment side maltin one...
 49999    one expect star trek movi high art fan expect ...
 Name: review, Length: 49582, dtype: object,
 0        1
 1        1
 2        1
 3        0
 4        1
         ..
 49995    1
 49996    0
 49997    0
 49998    0
 49999    0
 Name: sentiment, Length: 49582, dtype: int32)

In [18]:

tfidf = joblib.load(
    f"../vectors/vectorizer_{dataset_dir}_{n_gram}.pkl")
tfidf


TfidfVectorizer(max_features=50000, ngram_range=(1, 2), stop_words='english',
                tokenizer=<bound method RegexpTokenizer.tokenize of RegexpTokenizer(pattern='[a-zA-Z0-9]+', gaps=False, discard_empty=True, flags=<RegexFlag.UNICODE|DOTALL|MULTILINE: 56>)>)

In [19]:

X = tfidf.transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42)

X_train.shape, y_train.shape


((34707, 50000), (34707,))

In [20]:
estimators = []
estimators.append(('MNB', MultinomialNB()))
estimators.append(('CNB', ComplementNB()))
estimators.append(('XGB',
                  xgb.XGBClassifier(random_state=42, max_depth=50, use_label_encoder=False, learning_rate=0.01)))
estimators.append(('SVC', SVC(probability=True)))
estimators.append(('LRG', LogisticRegression()))


In [21]:
models = [
    MultinomialNB(),
    ComplementNB(),
    xgb.XGBClassifier(max_depth=50, use_label_encoder=False),
    SVC(probability=True),
    LogisticRegression(),
    VotingClassifier(estimators = estimators, voting ='hard')
]

model_to_use = 1

model_idx = model_to_use


In [22]:
params = [
    {
        'fit_prior': (False, True),
        'alpha': (1, 0.1, 0.01, 0.001)
    }, 
    {
        'fit_prior': (False, True),
        'norm': (False, True),
        'alpha': (1, 0.1, 0.01, 0.001)
    },
    {
        'booster': ('gbtree', 'gblinear', 'dart'),
        'eta': (0.1, 0, 25, 0.4, 0.5), 
    },
    {
        'C': ('1', '0.5', '0.25'),
        'kernel': ('rfb', 'linear', 'poly', 'sigmoid'),
    },
    {
        'penalty': ('l2', 'none'),
        'C': np.logspace(-4, 4, 10),
        'solver': ('sag', 'saga', 'newton-cg'),
        'max_iter': (100, 1000, 2500, 5000)
    }
]

In [23]:
clf = models[model_idx]

parameters = params[model_idx]

# clf = GridSearchCV(clf, param_grid=parameters, scoring='accuracy', cv=5, verbose=True) if model_idx != -1 else clf
# clf


In [24]:

clf = clf.fit(X_train, y_train)
# clf.best_estimator_


In [25]:

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.88      0.87      0.88      7410
           1       0.87      0.89      0.88      7465

    accuracy                           0.88     14875
   macro avg       0.88      0.88      0.88     14875
weighted avg       0.88      0.88      0.88     14875



In [26]:

# print("Best: %f using %s" % (clf.best_score_,
#                              clf.best_params_))
# means = clf.cv_results_['mean_test_score']
# stds = clf.cv_results_['std_test_score']
# params = clf.cv_results_['params']
# for mean, stdev, param in zip(means, stds, params):
#     print("%f (%f) with: %r" % (mean, stdev, param))


In [27]:

acc = int(accuracy_score(y_test, y_pred)*100)


In [28]:
test_tweet = "groceri store"
vector = tfidf.transform([test_tweet])

print(clf.predict(vector))


[0]


In [29]:

# exporting the pipeline
joblib.dump(clf.best_estimator_,
            f'../models/mnb_{dataset_dir}_{acc}_{n_gram}.pkl')


AttributeError: 'ComplementNB' object has no attribute 'best_estimator_'