<a href="https://colab.research.google.com/github/BradenAnderson/Twitter-Sentiment-Analysis/blob/main/04_Multinomial_Naive_Bayes_Modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## This notebook contains the code to perform hyperparameter tuning on Multinomial Naive Bayes Models. 

## Displaying and reviewing the search results is done in the 04_Modeling_Analysis notebook.

In [None]:
# Attach Google Drive to the Colab runtime so that files stored under
# /content/drive can be opened by the cells that follow.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
import pickle
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV, train_test_split, cross_validate, cross_val_predict
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer
from sklearn.compose import ColumnTransformer

from sklearn.metrics import precision_score, recall_score, accuracy_score, SCORERS, multilabel_confusion_matrix, make_scorer, roc_curve, roc_auc_score, f1_score

In [None]:
# CSV holding the original tweet, the cleaned tweet, the label and the
# sentence-level sentiment score columns.
filepath = "/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/intermediate_output_files/vader_full_preprocessing_model_droppedlt3.csv"

# Load the whole file into a DataFrame.
tweet_df = pd.read_csv(filepath_or_buffer=filepath)

# Show the first five rows as a quick sanity check of the loaded columns.
tweet_df.head(n=5)

Unnamed: 0,label,tweet,Clean_Tweet,Sentence_Level_pos_Score,Sentence_Level_neg_Score,Sentence_Level_neu_Score,Sentence_Level_compound_Score
0,0,@user when a father is dysfunctional and is s...,father dysfunctional significant selfish pron ...,0.0,0.211,0.789,0.5852
1,0,@user @user thanks for #lyft credit i can't us...,thank #lyft credit use cause pron offer wheelc...,0.157,0.0,0.843,1.33525
2,0,bihday your majesty,bihday pron majesty,0.0,0.0,1.0,1.0
3,0,#model i love u take with u all the time in ...,#model love pron pron time pron happy love hap...,0.194,0.0,0.806,1.36245
4,0,factsguide: society now #motivation,factsguide society #motivation,0.0,0.0,1.0,1.0


In [None]:
# Regex pattern to split the tweets into tokens. A token is either:
#   * \b\w\w+\b     -- a run of two or more word characters, or
#   * (?<!\w)#\w+   -- a hashtag: '#' followed by one or more word characters,
#                      where the '#' is not directly preceded by a word character.
pattern=r'\b\w\w+\b|(?<!\w)#\w+'

In [None]:
# DISABLED CELL: everything below is wrapped in a triple-quoted string, so it is
# evaluated as a plain string literal and the grid search does not execute.
#
# What the quoted code describes:
#   * Features: the 'Clean_Tweet' column only; target: the 'label' column.
#   * Pipeline: a TfidfVectorizer (using the custom token pattern) inside a
#     ColumnTransformer, followed by MultinomialNB.
#   * Grid: analyzer, stop_words, ngram_range, max_df, min_df and fit_prior.
#   * Scoring: seven metrics, with the best estimator refit on 'f1_score'.
#   * Output: the fitted GridSearchCV object is pickled to naive_bayes_gs1.pkl.
#
# NOTE(review): 'ROC_AUC_Score' is built with make_scorer(roc_auc_score) and no
# extra arguments, so it presumably scores hard class predictions, unlike the
# built-in 'roc_auc' string scorer used for 'AUC_ROC' -- confirm this is intended.
'''
X = tweet_df.loc[:, ['Clean_Tweet']]
y = tweet_df.loc[:, 'label'].to_numpy().ravel()

preprocess = ColumnTransformer(transformers=[("Tfidf_Vect", TfidfVectorizer(token_pattern=pattern), 'Clean_Tweet')],
                               remainder='passthrough')

naive_baye = MultinomialNB() 

model_pipeline = Pipeline([("textPreprocess", preprocess),
                           ('Baye', naive_baye)])

parameter_grid = [{'textPreprocess__Tfidf_Vect__analyzer' : ['word', 'char', 'char_wb'],
                   'textPreprocess__Tfidf_Vect__stop_words' : ['english', None],
                   'textPreprocess__Tfidf_Vect__ngram_range' : [(1,1), (1,2)],
                   'textPreprocess__Tfidf_Vect__max_df' : [0.7, 0.8, 0.9, 0.95, 1.0], 
                   'textPreprocess__Tfidf_Vect__min_df' : [1, 5, 10, 20],
                   'Baye__fit_prior' : [True, False]}]

score_types = {'f1_score' : make_scorer(f1_score), 'sensitivity' : make_scorer(recall_score), 'specificity' : make_scorer(recall_score, pos_label=0),
               'AUC_ROC' : 'roc_auc', 'ROC_AUC_Score' : make_scorer(roc_auc_score), 'accuracy' : 'accuracy', 'precision' : make_scorer(precision_score)}

gs = GridSearchCV(estimator=model_pipeline, param_grid=parameter_grid, scoring=score_types, refit='f1_score', n_jobs=-1)

gs.fit(X,y)

PATH = '/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/pickle_gridsearch/naive_bayes_gs1.pkl'

with open(PATH, 'wb') as file:
  pickle.dump(gs, file)
'''

In [None]:
# Imblearn pipeline is the same as sklearn pipeline with added functionality to support over sampling.
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler

In [None]:
# DISABLED CELL: everything below is wrapped in a triple-quoted string, so it is
# evaluated as a plain string literal and the grid search does not execute.
#
# What the quoted code describes:
#   * Features: the 'Clean_Tweet' column only; target: the 'label' column.
#   * Pipeline: TfidfVectorizer (inside a ColumnTransformer) -> RandomOverSampler
#     -> MultinomialNB.
#   * Grid: analyzer, stop_words, ngram_range, max_df, min_df, the over-sampler's
#     sampling_strategy ('auto', 0.6, 0.4) and fit_prior.
#   * Scoring: seven metrics, with the best estimator refit on 'f1_score'.
#   * Output: the fitted GridSearchCV object is pickled to naive_bayes_ros_gs1.pkl.
#
# NOTE(review): RandomOverSampler() is created without a random_state, so the
# resampled data (and therefore the scores) may differ between runs -- confirm.
'''
X = tweet_df.loc[:, ['Clean_Tweet']]
y = tweet_df.loc[:, 'label'].to_numpy().ravel()

preprocess = ColumnTransformer(transformers=[("Tfidf_Vect", TfidfVectorizer(token_pattern=pattern), 'Clean_Tweet')],
                               remainder='passthrough')

naive_baye = MultinomialNB() 

random_os = RandomOverSampler()

model_pipeline = Pipeline([("textPreprocess", preprocess),
                           ('overSampler', random_os),
                           ('Baye', naive_baye)])

parameter_grid = [{'textPreprocess__Tfidf_Vect__analyzer' : ['word', 'char', 'char_wb'],
                   'textPreprocess__Tfidf_Vect__stop_words' : ['english', None],
                   'textPreprocess__Tfidf_Vect__ngram_range' : [(1,1), (1,2)],
                   'textPreprocess__Tfidf_Vect__max_df' : [0.7, 0.8, 0.9, 0.95, 1.0], 
                   'textPreprocess__Tfidf_Vect__min_df' : [1, 5, 10, 20],
                   'overSampler__sampling_strategy' : ['auto', 0.6, 0.4],
                   'Baye__fit_prior' : [True, False]}]

score_types = {'f1_score' : make_scorer(f1_score), 'sensitivity' : make_scorer(recall_score), 'specificity' : make_scorer(recall_score, pos_label=0),
               'AUC_ROC' : 'roc_auc', 'ROC_AUC_Score' : make_scorer(roc_auc_score), 'accuracy' : 'accuracy', 'precision' : make_scorer(precision_score)}

gs = GridSearchCV(estimator=model_pipeline, param_grid=parameter_grid, scoring=score_types, refit='f1_score', n_jobs=-1)

gs.fit(X,y)

PATH = '/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/pickle_gridsearch/naive_bayes_ros_gs1.pkl'

with open(PATH, 'wb') as file:
  pickle.dump(gs, file)
'''

In [None]:
# DISABLED CELL: everything below is wrapped in a triple-quoted string, so it is
# evaluated as a plain string literal and the grid search does not execute.
#
# What the quoted code describes:
#   * Features: 'Clean_Tweet' plus the sentence-level pos, neg and compound score
#     columns (the neu score column is not selected); target: 'label'.
#   * Pipeline: a ColumnTransformer that vectorizes 'Clean_Tweet' with
#     TfidfVectorizer and passes the remaining columns through unchanged
#     (remainder='passthrough') -> RandomOverSampler -> MultinomialNB.
#   * Grid: analyzer, stop_words, ngram_range, max_df, min_df, the over-sampler's
#     sampling_strategy and fit_prior.
#   * Scoring: seven metrics, with the best estimator refit on 'f1_score'.
#   * Output: the fitted GridSearchCV object is pickled to naive_bayes_ros_v_gs1.pkl.
#
# NOTE(review): MultinomialNB presumably needs the passed-through score columns
# to be non-negative -- confirm against how those columns were produced.
'''
X = tweet_df.loc[:, ['Clean_Tweet', 'Sentence_Level_pos_Score', 'Sentence_Level_neg_Score', 'Sentence_Level_compound_Score']]
y = tweet_df.loc[:, 'label'].to_numpy().ravel()

preprocess = ColumnTransformer(transformers=[("Tfidf_Vect", TfidfVectorizer(token_pattern=pattern), 'Clean_Tweet')],
                               remainder='passthrough')

naive_baye = MultinomialNB() 

random_os = RandomOverSampler()

model_pipeline = Pipeline([("textPreprocess", preprocess),
                           ('overSampler', random_os),
                           ('Baye', naive_baye)])

parameter_grid = [{'textPreprocess__Tfidf_Vect__analyzer' : ['word', 'char', 'char_wb'],
                   'textPreprocess__Tfidf_Vect__stop_words' : ['english', None],
                   'textPreprocess__Tfidf_Vect__ngram_range' : [(1,1), (1,2)],
                   'textPreprocess__Tfidf_Vect__max_df' : [0.7, 0.8, 0.9, 0.95, 1.0], 
                   'textPreprocess__Tfidf_Vect__min_df' : [1, 5, 10, 20],
                   'overSampler__sampling_strategy' : ['auto', 0.6, 0.4],
                   'Baye__fit_prior' : [True, False]}]

score_types = {'f1_score' : make_scorer(f1_score), 'sensitivity' : make_scorer(recall_score), 'specificity' : make_scorer(recall_score, pos_label=0),
               'AUC_ROC' : 'roc_auc', 'ROC_AUC_Score' : make_scorer(roc_auc_score), 'accuracy' : 'accuracy', 'precision' : make_scorer(precision_score)}

gs = GridSearchCV(estimator=model_pipeline, param_grid=parameter_grid, scoring=score_types, refit='f1_score', n_jobs=-1)

gs.fit(X,y)

PATH = '/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/pickle_gridsearch/naive_bayes_ros_v_gs1.pkl'

with open(PATH, 'wb') as file:
  pickle.dump(gs, file)
'''

In [None]:
# DISABLED CELL: everything below is wrapped in a triple-quoted string, so it is
# evaluated as a plain string literal and the grid search does not execute.
#
# What the quoted code describes:
#   * Features: the 'Clean_Tweet' column only; target: the 'label' column.
#   * Pipeline: TfidfVectorizer (inside a ColumnTransformer) -> RandomOverSampler
#     -> MultinomialNB.
#   * Grid: word analyzer only, stop_words, ngram_range, max_df (0.85-1.0),
#     min_df (1-3), sampling_strategy (0.3-0.5), fit_prior fixed to True and
#     the Naive Bayes alpha (0.0-2.0).
#   * Scoring: seven metrics, with the best estimator refit on 'f1_score'.
#   * Output: the fitted GridSearchCV object is pickled to naive_bayes_ros_gs2.pkl.
#
# NOTE(review): the alpha grid includes 0.0, which presumably means no
# smoothing at all -- confirm that value is intended.
'''
X = tweet_df.loc[:, ['Clean_Tweet']]
y = tweet_df.loc[:, 'label'].to_numpy().ravel()

preprocess = ColumnTransformer(transformers=[("Tfidf_Vect", TfidfVectorizer(token_pattern=pattern), 'Clean_Tweet')],
                               remainder='passthrough')

naive_baye = MultinomialNB() 

random_os = RandomOverSampler()

model_pipeline = Pipeline([("textPreprocess", preprocess),
                           ('overSampler', random_os),
                           ('Baye', naive_baye)])

parameter_grid = [{'textPreprocess__Tfidf_Vect__analyzer' : ['word'],
                   'textPreprocess__Tfidf_Vect__stop_words' : ['english', None],
                   'textPreprocess__Tfidf_Vect__ngram_range' : [(1,1), (1,2)],
                   'textPreprocess__Tfidf_Vect__max_df' : [0.85, 0.9, 0.95, 1.0], 
                   'textPreprocess__Tfidf_Vect__min_df' : [1, 2, 3],
                   'overSampler__sampling_strategy' : [0.3, 0.4, 0.5],
                   'Baye__fit_prior' : [True],
                   'Baye__alpha' : [0.0, 0.5, 1.0, 1.5, 2.0]}]

score_types = {'f1_score' : make_scorer(f1_score), 'sensitivity' : make_scorer(recall_score), 'specificity' : make_scorer(recall_score, pos_label=0),
               'AUC_ROC' : 'roc_auc', 'ROC_AUC_Score' : make_scorer(roc_auc_score), 'accuracy' : 'accuracy', 'precision' : make_scorer(precision_score)}

gs = GridSearchCV(estimator=model_pipeline, param_grid=parameter_grid, scoring=score_types, refit='f1_score', n_jobs=-1)

gs.fit(X,y)

PATH = '/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/pickle_gridsearch/naive_bayes_ros_gs2.pkl'

with open(PATH, 'wb') as file:
  pickle.dump(gs, file)
'''

In [None]:
# Import synthetic minority over sampling.
from imblearn.over_sampling import SMOTE

In [None]:
# DISABLED CELL: everything below is wrapped in a triple-quoted string, so it is
# evaluated as a plain string literal and the grid search does not execute.
#
# What the quoted code describes:
#   * Features: the 'Clean_Tweet' column only; target: the 'label' column.
#   * Pipeline: TfidfVectorizer (inside a ColumnTransformer) -> SMOTE
#     -> MultinomialNB.
#   * Grid: word analyzer with English stop words, ngram_range up to (1,3),
#     max_df (0.85, 1.0), min_df 1, SMOTE sampling_strategy and k_neighbors,
#     fit_prior fixed to True and alpha (0.5, 1.0, 1.5).
#   * Scoring: seven metrics, with the best estimator refit on 'f1_score';
#     the search is configured with n_jobs=1.
#   * Output: the fitted GridSearchCV object is pickled to naive_bayes_smote_gs1.pkl.
#
# NOTE(review): SMOTE() is created without a random_state, so the synthetic
# samples (and therefore the scores) may differ between runs -- confirm.
'''
X = tweet_df.loc[:, ['Clean_Tweet']]
y = tweet_df.loc[:, 'label'].to_numpy().ravel()

preprocess = ColumnTransformer(transformers=[("Tfidf_Vect", TfidfVectorizer(token_pattern=pattern), 'Clean_Tweet')],
                               remainder='passthrough')

naive_baye = MultinomialNB() 

smote_os = SMOTE()

model_pipeline = Pipeline([("textPreprocess", preprocess),
                           ('overSampler', smote_os),
                           ('Baye', naive_baye)])

parameter_grid = [{'textPreprocess__Tfidf_Vect__analyzer' : ['word'],
                   'textPreprocess__Tfidf_Vect__stop_words' : ['english'],
                   'textPreprocess__Tfidf_Vect__ngram_range' : [(1,1), (1,2), (1,3)],
                   'textPreprocess__Tfidf_Vect__max_df' : [0.85, 1.0], 
                   'textPreprocess__Tfidf_Vect__min_df' : [1],
                   'overSampler__sampling_strategy' : ['auto', 0.4, 0.5, 0.6, 0.7],
                   'overSampler__k_neighbors' : [1, 3, 5],
                   'Baye__fit_prior' : [True],
                   'Baye__alpha' : [0.5, 1.0, 1.5]}]

score_types = {'f1_score' : make_scorer(f1_score), 'sensitivity' : make_scorer(recall_score), 'specificity' : make_scorer(recall_score, pos_label=0),
               'AUC_ROC' : 'roc_auc', 'ROC_AUC_Score' : make_scorer(roc_auc_score), 'accuracy' : 'accuracy', 'precision' : make_scorer(precision_score)}

gs = GridSearchCV(estimator=model_pipeline, param_grid=parameter_grid, scoring=score_types, refit='f1_score', n_jobs=1)

gs.fit(X,y)

PATH = '/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/pickle_gridsearch/naive_bayes_smote_gs1.pkl'

with open(PATH, 'wb') as file:
  pickle.dump(gs, file)
'''

In [None]:
# DISABLED CELL: everything below is wrapped in a triple-quoted string, so it is
# evaluated as a plain string literal and the grid search does not execute.
#
# What the quoted code describes:
#   * Features: the 'Clean_Tweet' column only; target: the 'label' column.
#   * Pipeline: TfidfVectorizer (inside a ColumnTransformer) -> SMOTE
#     -> MultinomialNB.
#   * Grid: vectorizer settings held at single values (word analyzer, English
#     stop words, ngram_range (1,2), max_df 1.0, min_df 1); only the SMOTE
#     sampling_strategy (0.3, 0.4), k_neighbors (1, 3, 5, 7) and the Naive
#     Bayes alpha (0.4, 0.5, 0.6) vary.
#   * Scoring: seven metrics, with the best estimator refit on 'f1_score';
#     the search is configured with n_jobs=1.
#   * Output: the fitted GridSearchCV object is pickled to naive_bayes_smote_gs2.pkl.
#
# NOTE(review): SMOTE() is created without a random_state, so the synthetic
# samples (and therefore the scores) may differ between runs -- confirm.
'''
X = tweet_df.loc[:, ['Clean_Tweet']]
y = tweet_df.loc[:, 'label'].to_numpy().ravel()

preprocess = ColumnTransformer(transformers=[("Tfidf_Vect", TfidfVectorizer(token_pattern=pattern), 'Clean_Tweet')],
                               remainder='passthrough')

naive_baye = MultinomialNB() 

smote_os = SMOTE()

model_pipeline = Pipeline([("textPreprocess", preprocess),
                           ('overSampler', smote_os),
                           ('Baye', naive_baye)])

parameter_grid = [{'textPreprocess__Tfidf_Vect__analyzer' : ['word'],
                   'textPreprocess__Tfidf_Vect__stop_words' : ['english'],
                   'textPreprocess__Tfidf_Vect__ngram_range' : [(1,2)],
                   'textPreprocess__Tfidf_Vect__max_df' : [1.0], 
                   'textPreprocess__Tfidf_Vect__min_df' : [1],
                   'overSampler__sampling_strategy' : [0.3, 0.4],
                   'overSampler__k_neighbors' : [1, 3, 5, 7],
                   'Baye__fit_prior' : [True],
                   'Baye__alpha' : [0.4, 0.5, 0.6]}]

score_types = {'f1_score' : make_scorer(f1_score), 'sensitivity' : make_scorer(recall_score), 'specificity' : make_scorer(recall_score, pos_label=0),
               'AUC_ROC' : 'roc_auc', 'ROC_AUC_Score' : make_scorer(roc_auc_score), 'accuracy' : 'accuracy', 'precision' : make_scorer(precision_score)}

gs = GridSearchCV(estimator=model_pipeline, param_grid=parameter_grid, scoring=score_types, refit='f1_score', n_jobs=1)

gs.fit(X,y)

PATH = '/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/pickle_gridsearch/naive_bayes_smote_gs2.pkl'

with open(PATH, 'wb') as file:
  pickle.dump(gs, file)
'''

In [None]:
# DISABLED CELL: everything below is wrapped in a triple-quoted string, so it is
# evaluated as a plain string literal and the grid search does not execute.
#
# What the quoted code describes:
#   * Features: the 'Clean_Tweet' column only; target: the 'label' column.
#   * Pipeline: TfidfVectorizer (inside a ColumnTransformer) -> SMOTE
#     -> MultinomialNB.
#   * Grid: vectorizer settings held at single values (word analyzer, English
#     stop words, ngram_range (1,2), max_df 1.0, min_df 1); only the SMOTE
#     sampling_strategy (0.3, 0.4), k_neighbors (7, 9) and the Naive Bayes
#     alpha (0.3, 0.4) vary.
#   * Scoring: seven metrics, with the best estimator refit on 'f1_score';
#     the search is configured with n_jobs=1.
#   * Output: the fitted GridSearchCV object is pickled to naive_bayes_smote_gs3.pkl.
#
# NOTE(review): SMOTE() is created without a random_state, so the synthetic
# samples (and therefore the scores) may differ between runs -- confirm.
'''
X = tweet_df.loc[:, ['Clean_Tweet']]
y = tweet_df.loc[:, 'label'].to_numpy().ravel()

preprocess = ColumnTransformer(transformers=[("Tfidf_Vect", TfidfVectorizer(token_pattern=pattern), 'Clean_Tweet')],
                               remainder='passthrough')

naive_baye = MultinomialNB() 

smote_os = SMOTE()

model_pipeline = Pipeline([("textPreprocess", preprocess),
                           ('overSampler', smote_os),
                           ('Baye', naive_baye)])

parameter_grid = [{'textPreprocess__Tfidf_Vect__analyzer' : ['word'],
                   'textPreprocess__Tfidf_Vect__stop_words' : ['english'],
                   'textPreprocess__Tfidf_Vect__ngram_range' : [(1,2)],
                   'textPreprocess__Tfidf_Vect__max_df' : [1.0], 
                   'textPreprocess__Tfidf_Vect__min_df' : [1],
                   'overSampler__sampling_strategy' : [0.3, 0.4],
                   'overSampler__k_neighbors' : [7, 9],
                   'Baye__fit_prior' : [True],
                   'Baye__alpha' : [0.3, 0.4]}]

score_types = {'f1_score' : make_scorer(f1_score), 'sensitivity' : make_scorer(recall_score), 'specificity' : make_scorer(recall_score, pos_label=0),
               'AUC_ROC' : 'roc_auc', 'ROC_AUC_Score' : make_scorer(roc_auc_score), 'accuracy' : 'accuracy', 'precision' : make_scorer(precision_score)}

gs = GridSearchCV(estimator=model_pipeline, param_grid=parameter_grid, scoring=score_types, refit='f1_score', n_jobs=1)

gs.fit(X,y)

PATH = '/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/pickle_gridsearch/naive_bayes_smote_gs3.pkl'

with open(PATH, 'wb') as file:
  pickle.dump(gs, file)
'''