<a href="https://colab.research.google.com/github/BradenAnderson/Twitter-Sentiment-Analysis/blob/main/06_LR_Models_MEV_Ft.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### This notebook contains the code to perform hyperparameter tuning on logistic regression models that use fastText mean-embedded word vectors as input.

### Displaying and reviewing the search results is done in the 06_fastText_MEV_Modeling_Analysis notebook.

In [None]:
# Mount Google Drive at /content/drive; the data and model paths used by this notebook live under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Clone the fastText source repository into the current working directory.
! git clone https://github.com/facebookresearch/fastText.git

In [None]:
# Install the fastText package from the local checkout at /content/fastText.
! pip install /content/fastText

In [None]:

import dill as pickle

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import fasttext
from fasttext.FastText import load_model

from collections import defaultdict

from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV, train_test_split, cross_validate, cross_val_predict
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, accuracy_score, SCORERS, multilabel_confusion_matrix, make_scorer, roc_curve, roc_auc_score, f1_score


pd.set_option('display.max_rows', 1000)

In [None]:
# Location on Google Drive of the preprocessed tweet dataset (CSV).
filepath= "/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/intermediate_output_files/vader_full_preprocessing_model_droppedlt3.csv"

# Read the dataset into a DataFrame.
tweet_df = pd.read_csv(filepath)

# Preview the first rows to confirm the file loaded with the expected columns.
tweet_df.head()

Unnamed: 0,label,tweet,Clean_Tweet,Sentence_Level_pos_Score,Sentence_Level_neg_Score,Sentence_Level_neu_Score,Sentence_Level_compound_Score
0,0,@user when a father is dysfunctional and is s...,father dysfunctional significant selfish pron ...,0.0,0.211,0.789,0.5852
1,0,@user @user thanks for #lyft credit i can't us...,thank #lyft credit use cause pron offer wheelc...,0.157,0.0,0.843,1.33525
2,0,bihday your majesty,bihday pron majesty,0.0,0.0,1.0,1.0
3,0,#model i love u take with u all the time in ...,#model love pron pron time pron happy love hap...,0.194,0.0,0.806,1.36245
4,0,factsguide: society now #motivation,factsguide society #motivation,0.0,0.0,1.0,1.0


In [None]:
# Tokenize each cleaned tweet by splitting on single spaces; the resulting word lists are the model input.
tweet_df['Clean_Word_Lists'] = tweet_df['Clean_Tweet'].map(lambda cleaned_text: cleaned_text.split(' '))

# Set up fastText

In [None]:
# Load the csv file whose first row holds one long string containing every unique word found in the tweets.
unique_df = pd.read_csv("/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/intermediate_output_files/unique_words.csv")

# Take that string from row 0 and split it at single spaces to obtain the list of unique words.
unique_words = (
    unique_df.loc[unique_df.index == 0, 'Unique_Words']
    .to_numpy()[0]
    .split(" ")
)

In [None]:
# Where the trained fastText word vector model is stored on Drive.
word_vector_model_filepath = r"/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/fastText_Models/wv_model_dlt3.bin"

# Restore the fastText model from that file.
ft_model = load_model(word_vector_model_filepath)

# Look up the fastText vector of every unique word and keep the (word, vector) pairs in a dictionary.
word_vector_dictionary = dict(
    (token, ft_model.get_word_vector(token)) for token in unique_words
)

In [None]:
class MeanEmbeddingVectorizer(object):
    """Turn tokenized tweets into fixed-length vectors by averaging word vectors.

    Each sample (an iterable of word tokens) is mapped to the element-wise mean
    of the vectors of its words. A word that is missing from the lookup
    contributes a zero vector, and a sample with no tokens maps to the zero
    vector.

    Parameters
    ----------
    ft_wv : mapping of str -> numpy.ndarray
        Lookup from word to its word vector. The length of the first vector in
        the mapping is stored as ``self.dim`` (0 when the mapping is empty).
    """

    def __init__(self, ft_wv):
        self.ft_wv = ft_wv
        if len(ft_wv) > 0:
            # Read the dimensionality from the mapping that was passed in, not
            # from a notebook-level global, so the class is self-contained.
            first_word = next(iter(ft_wv))
            self.dim = ft_wv[first_word].shape[0]
        else:
            self.dim = 0

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the transformer has nothing to learn.

        ``y`` is accepted (and ignored) so the object can be used as a pipeline step.
        """
        return self

    def transform(self, X):
        """Return one mean word vector per sample in ``X``.

        Parameters
        ----------
        X : iterable of iterables of str
            One token list per tweet.

        Returns
        -------
        numpy.ndarray
            Array with one row per sample and ``self.dim`` columns.
        """
        # Stand-in vector for words without an entry in the lookup. It is only
        # read, never modified, so a single instance can be shared.
        zero_vector = np.zeros(self.dim)

        mean_of_word_vectors = []

        # For each tweet (already converted to a list of clean word tokens).
        for clean_tweet_token_list in X:

            # Collect the word vector of every token, falling back to the zero vector.
            this_tweet_word_vectors = [
                self.ft_wv[word] if word in self.ft_wv else zero_vector
                for word in clean_tweet_token_list
            ]

            if this_tweet_word_vectors:
                # Element-wise mean of all word vectors that make up this tweet.
                mean_vector = np.mean(this_tweet_word_vectors, axis=0)
            else:
                # np.mean of an empty list is a scalar NaN, which would make the
                # output ragged; represent a token-less tweet by the zero vector.
                mean_vector = zero_vector

            mean_of_word_vectors.append(mean_vector)

        if not mean_of_word_vectors:
            # Keep the output two-dimensional even when there are no samples.
            return np.zeros((0, self.dim))

        return np.array(mean_of_word_vectors)

# Perform GridSearch

In [None]:
# Disabled cell: the grid search below is wrapped in a triple-quoted string, so none of it executes
# when the notebook runs. Remove the surrounding quotes to re-run the search.
# When enabled, it tunes a pipeline (mean fastText embedding -> logistic regression) with GridSearchCV,
# refits on the f1 score and pickles the fitted search object to PATH.
# The estimator is bound to `logistic_reg` rather than rebinding the name `LogisticRegression`, which
# would shadow the class and make any later `LogisticRegression()` call fail.
'''
X = tweet_df.loc[:, ['Clean_Word_Lists']].to_numpy().ravel()
y = tweet_df.loc[:, 'label'].to_numpy().ravel()

logistic_reg = LogisticRegression()

model_pipeline = Pipeline([("ft_word_vectorizer", MeanEmbeddingVectorizer(word_vector_dictionary)),
                           ('Logistic_Reg', logistic_reg)])

parameter_grid = [{'Logistic_Reg__penalty': ['l1', 'l2'], 
                   'Logistic_Reg__C' : np.logspace(start=-2,stop=2, base=10, num=25), 
                   'Logistic_Reg__solver': ['liblinear'],
                   'Logistic_Reg__fit_intercept' : [True, False],
                   'Logistic_Reg__class_weight' : [None, {0 : 1, 1: 5}]}]

score_types = {'f1_score' : make_scorer(f1_score), 'sensitivity' : make_scorer(recall_score), 'specificity' : make_scorer(recall_score, pos_label=0),
               'AUC_ROC' : 'roc_auc', 'ROC_AUC_Score' : make_scorer(roc_auc_score), 'accuracy' : 'accuracy', 'precision' : make_scorer(precision_score)}

gs = GridSearchCV(estimator=model_pipeline, param_grid=parameter_grid, scoring=score_types, refit='f1_score', n_jobs=-1)

gs.fit(X,y)

PATH = "/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/pickle_gridsearch_ft/gs_lr_mev_ft.pkl"

with open(PATH, 'wb') as file:
  pickle.dump(gs, file)
'''

In [None]:
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler

In [None]:
# Disabled cell: the grid search below is wrapped in a triple-quoted string, so none of it executes
# when the notebook runs. Remove the surrounding quotes to re-run the search.
# When enabled, it tunes a pipeline (mean fastText embedding -> RandomOverSampler -> logistic regression)
# with GridSearchCV, refits on the f1 score and pickles the fitted search object to PATH.
# The estimator is bound to `logistic_reg` rather than rebinding the name `LogisticRegression`, which
# would shadow the class and make any later `LogisticRegression()` call fail.
'''
X = tweet_df.loc[:, ['Clean_Word_Lists']].to_numpy().ravel()
y = tweet_df.loc[:, 'label'].to_numpy().ravel()

logistic_reg = LogisticRegression()

random_os = RandomOverSampler()

model_pipeline = Pipeline([("ft_word_vectorizer", MeanEmbeddingVectorizer(word_vector_dictionary)),
                           ('overSampler', random_os),
                           ('Logistic_Reg', logistic_reg)])

parameter_grid = [{'Logistic_Reg__penalty': ['l1', 'l2'], 
                   'Logistic_Reg__C' : np.logspace(start=-1,stop=1, base=10, num=10), 
                   'Logistic_Reg__solver': ['liblinear'],
                   'Logistic_Reg__fit_intercept' : [True, False],
                   'Logistic_Reg__class_weight' : [{0 : 1, 1: 8}, {0 : 1, 1: 5}],
                   'overSampler__sampling_strategy' : ['auto', 0.6]}]

score_types = {'f1_score' : make_scorer(f1_score), 'sensitivity' : make_scorer(recall_score), 'specificity' : make_scorer(recall_score, pos_label=0),
               'AUC_ROC' : 'roc_auc', 'ROC_AUC_Score' : make_scorer(roc_auc_score), 'accuracy' : 'accuracy', 'precision' : make_scorer(precision_score)}

gs = GridSearchCV(estimator=model_pipeline, param_grid=parameter_grid, scoring=score_types, refit='f1_score', n_jobs=-1)

gs.fit(X,y)

PATH = "/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/pickle_gridsearch_ft/gs_lr_ros_mev_ft.pkl"

with open(PATH, 'wb') as file:
  pickle.dump(gs, file)
'''

In [None]:
# Disabled cell: the grid search below is wrapped in a triple-quoted string, so none of it executes
# when the notebook runs. Remove the surrounding quotes to re-run the search.
# When enabled, it tunes a pipeline (mean fastText embedding -> RandomOverSampler -> logistic regression)
# over an l1-only grid with GridSearchCV, refits on the f1 score and pickles the fitted search object to PATH.
# The estimator is bound to `logistic_reg` rather than rebinding the name `LogisticRegression`, which
# would shadow the class and make any later `LogisticRegression()` call fail.
'''
X = tweet_df.loc[:, ['Clean_Word_Lists']].to_numpy().ravel()
y = tweet_df.loc[:, 'label'].to_numpy().ravel()

logistic_reg = LogisticRegression()

random_os = RandomOverSampler()

model_pipeline = Pipeline([("ft_word_vectorizer", MeanEmbeddingVectorizer(word_vector_dictionary)),
                           ('overSampler', random_os),
                           ('Logistic_Reg', logistic_reg)])

parameter_grid = [{'Logistic_Reg__penalty': ['l1'], 
                   'Logistic_Reg__C' : [0.5, 1.0, 2.0, 5.0, 10.0], 
                   'Logistic_Reg__solver': ['liblinear'],
                   'Logistic_Reg__fit_intercept' : [True, False],
                   'Logistic_Reg__class_weight' : [None, {0 : 1, 1: 2}],
                   'overSampler__sampling_strategy' : ['auto', 0.3, 0.4]}]

score_types = {'f1_score' : make_scorer(f1_score), 'sensitivity' : make_scorer(recall_score), 'specificity' : make_scorer(recall_score, pos_label=0),
               'AUC_ROC' : 'roc_auc', 'ROC_AUC_Score' : make_scorer(roc_auc_score), 'accuracy' : 'accuracy', 'precision' : make_scorer(precision_score)}

gs = GridSearchCV(estimator=model_pipeline, param_grid=parameter_grid, scoring=score_types, refit='f1_score', n_jobs=-1)

gs.fit(X,y)

PATH = "/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/pickle_gridsearch_ft/gs_lr_ros2_mev_ft.pkl"

with open(PATH, 'wb') as file:
  pickle.dump(gs, file)
'''