<a href="https://colab.research.google.com/github/BradenAnderson/Twitter-Sentiment-Analysis/blob/main/07_MLP_Models_TFIDF_Ft.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### This notebook contains the code to perform hyperparameter tuning on Multilayer Perceptron Models that utilize custom TFIDF weighted fastText word vectors as their inputs.

### Displaying and reviewing the search results is done in the 07_fastText_TFIDF_Modeling_Analysis notebook.

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive; every data and
# model path used later in this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Fetch the fastText sources from GitHub. The following cell installs the Python
# bindings from /content/fastText, so this assumes the working directory is /content.
! git clone https://github.com/facebookresearch/fastText.git

In [None]:
! pip install /content/fastText

In [None]:
import dill as pickle
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import fasttext
from fasttext.FastText import load_model

from collections import defaultdict

from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV, train_test_split, cross_validate, cross_val_predict
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_score, recall_score, accuracy_score, SCORERS, multilabel_confusion_matrix, make_scorer, roc_curve, roc_auc_score, f1_score

# Allow pandas to print up to 1000 rows before truncating displayed frames.
pd.set_option('display.max_rows', 1000)

In [None]:
# Location (on the mounted Drive) of the preprocessed tweet CSV.
filepath= "/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/intermediate_output_files/vader_full_preprocessing_model_droppedlt3.csv"

# Load the tweets; the `Clean_Tweet` column is tokenised in the next cell.
tweet_df = pd.read_csv(filepath)

# Show the first rows as the cell output for a quick look at the columns.
tweet_df.head()

Unnamed: 0,label,tweet,Clean_Tweet,Sentence_Level_pos_Score,Sentence_Level_neg_Score,Sentence_Level_neu_Score,Sentence_Level_compound_Score
0,0,@user when a father is dysfunctional and is s...,father dysfunctional significant selfish pron ...,0.0,0.211,0.789,0.5852
1,0,@user @user thanks for #lyft credit i can't us...,thank #lyft credit use cause pron offer wheelc...,0.157,0.0,0.843,1.33525
2,0,bihday your majesty,bihday pron majesty,0.0,0.0,1.0,1.0
3,0,#model i love u take with u all the time in ...,#model love pron pron time pron happy love hap...,0.194,0.0,0.806,1.36245
4,0,factsguide: society now #motivation,factsguide society #motivation,0.0,0.0,1.0,1.0


In [None]:
# Tokenise every cleaned tweet on single spaces; the resulting token lists are the
# documents handed to the vectorizer further down.
tweet_df['Clean_Word_Lists'] = tweet_df['Clean_Tweet'].map(lambda cleaned_text: cleaned_text.split(' '))

# Set up fastText

In [None]:
# The unique-words CSV stores every unique word found in the tweets as a single
# space-separated string in its `Unique_Words` column.
unique_df = pd.read_csv("/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/intermediate_output_files/unique_words.csv")

# Take that string from the first row and split it on spaces to get the list of unique words.
unique_words = unique_df['Unique_Words'].iloc[0].split(" ")

In [None]:
# Location (on the mounted Drive) of the saved fastText word-vector model.
word_vector_model_filepath = r"/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/fastText_Models/wv_model_dlt3.bin"

# Load the saved fastText model from disk.
ft_model = load_model(word_vector_model_filepath)

# Pair every unique word with the vector fastText returns for it.
word_vector_dictionary = dict(zip(unique_words, map(ft_model.get_word_vector, unique_words)))

In [None]:
class TfidfEmbeddingVectorizer(object):
    """Represent each tokenised document as an IDF-weighted mean of word vectors.

    A document is a sequence of tokens. Every token contributes
    ``word_vector * idf(token)``; the document vector is the mean of those
    contributions. The class exposes ``fit`` / ``transform`` so that it can be
    used as a step in a pipeline.

    Parameters
    ----------
    ft_wv : mapping of str -> numpy.ndarray
        Word -> word-vector lookup. The vector length is read from the first
        entry; all vectors are assumed to share that length.

    Attributes
    ----------
    ft_wv : mapping
        The lookup passed to the constructor.
    word2weight : collections.defaultdict or None
        Word -> IDF weight, populated by ``fit``. Words that were not seen during
        ``fit`` receive the largest IDF observed in the training documents.
    dim : int
        Length of a single word vector (0 when ``ft_wv`` is empty).
    """

    def __init__(self, ft_wv):

        self.ft_wv = ft_wv
        self.word2weight = None

        # Take the dimensionality from the mapping that was passed in, so the class
        # does not depend on any variable defined elsewhere in the notebook.
        if len(ft_wv) > 0:
            self.dim = next(iter(ft_wv.values())).shape[0]
        else:
            self.dim = 0

    def fit(self, X, y=None):
        """Learn one IDF weight per token from the tokenised documents in ``X``.

        Parameters
        ----------
        X : iterable of sequences of str
            Tokenised documents (one token list per tweet).
        y : ignored
            Accepted only so the signature matches the usual ``fit(X, y)`` convention.

        Returns
        -------
        self
        """
        # The documents are already tokenised, so the analyzer simply hands the
        # token list back to the vectorizer unchanged.
        tfidf = TfidfVectorizer(analyzer=lambda tokens: tokens)
        tfidf.fit(X)

        #-------------------------------------------------------------------------------------------------------------------------------------
        # The .idf_ attribute is a vector that contains the inverse document frequency values for each word in the vocabulary.
        #
        # The .vocabulary_ attribute is a dictionary mapping each word to the index in .idf_ at which the inverse document
        # frequency value for that word is stored.
        #
        # As the number of documents a particular word shows up in increases, its idf(t) (.idf_) value will decrease.
        # idf(t) = log( (1+n) / (1 + df(t)) ) + 1, where n = number of documents (tweets) and df(t) = number of documents that contain word t.
        # --------------------------------------------------------------------------------------------------------------------------------------

        # The largest idf belongs to the word found in the fewest documents. It is used as the weight of any word that
        # was not in the training set, on the assumption that such a word is at least as uncommon as the rarest training word.
        max_idf = max(tfidf.idf_)

        # Map each word to its idf value. Because this is a defaultdict, looking up a word that was not in the
        # training set returns max_idf instead of raising a KeyError.
        self.word2weight = defaultdict(
            lambda: max_idf,
            [(word, tfidf.idf_[idf_index]) for word, idf_index in tfidf.vocabulary_.items()])

        return self

    def transform(self, X):
        """Convert tokenised documents into IDF-weighted mean word vectors.

        Parameters
        ----------
        X : iterable of sequences of str
            Tokenised documents (one token list per tweet).

        Returns
        -------
        numpy.ndarray of shape (n_documents, dim)
            One row per document. A word without a vector in ``ft_wv`` contributes
            a zero vector; a document with no tokens yields an all-zero row.
        """
        # Shared zero vector: used for words without a word vector and for empty documents.
        zero_vector = np.zeros(self.dim)

        # One mean tfidf word vector per tweet.
        mean_tfidf_vector_for_each_tweet = []

        for clean_tweet_token_list in X:

            # The idf-weighted word vector of every word in this tweet.
            this_tweet_tfidf_vectors = []

            for word in clean_tweet_token_list:
                if word in self.ft_wv:
                    # Scale the word vector by the word's idf weight.
                    this_tweet_tfidf_vectors.append(self.ft_wv[word] * self.word2weight[word])
                else:
                    # No vector for this word: it contributes zeros but still counts
                    # towards the mean, exactly like a zero vector times any weight.
                    this_tweet_tfidf_vectors.append(zero_vector)

            if this_tweet_tfidf_vectors:
                # A single vector for the whole tweet: the mean over all of its words.
                mean_tfidf_vector = np.mean(this_tweet_tfidf_vectors, axis=0)
            else:
                # np.mean of an empty list is NaN (a scalar), which would produce a
                # ragged output array; an empty tweet is represented by zeros instead.
                mean_tfidf_vector = zero_vector

            mean_tfidf_vector_for_each_tweet.append(mean_tfidf_vector)

        # Keep the output two-dimensional even when there are no documents at all.
        if not mean_tfidf_vector_for_each_tweet:
            return np.empty((0, self.dim))

        return np.array(mean_tfidf_vector_for_each_tweet)


# Perform GridSearch

In [None]:
# The grid search below fits every parameter combination with cross-validation, so it
# is opt-in. Leave the flag False to skip it when running the notebook top to bottom;
# set it to True to (re)run the search and overwrite the pickled results.
RUN_MLP_GRID_SEARCH = False

if RUN_MLP_GRID_SEARCH:

    # Documents are the token lists; the target is the tweet label.
    X = tweet_df.loc[:, ['Clean_Word_Lists']].to_numpy().ravel()
    y = tweet_df.loc[:, 'label'].to_numpy().ravel()

    multi_layer_perceptron = MLPClassifier()

    # Token lists -> IDF-weighted mean fastText vectors -> MLP classifier.
    model_pipeline = Pipeline([("ft_word_vectorizer", TfidfEmbeddingVectorizer(word_vector_dictionary)),
                               ('MLP', multi_layer_perceptron)])

    # First grid: MLPClassifier's default solver. Second grid: SGD with an adaptive learning rate.
    parameter_grid = [{'MLP__hidden_layer_sizes' : [100, 200, 300],
                       'MLP__activation' : ['relu', 'logistic'],
                       'MLP__alpha' : [0.0001, 0.0005]},
                      {'MLP__hidden_layer_sizes' : [100, 200, 300],
                       'MLP__activation' : ['relu', 'logistic'],
                       'MLP__solver' : ['sgd'],
                       'MLP__learning_rate' : ['adaptive'],
                       'MLP__alpha' : [0.0001, 0.0005]}]

    # 'specificity' is recall computed with class 0 as the positive label.
    # NOTE(review): 'ROC_AUC_Score' wraps roc_auc_score with a plain make_scorer, which
    # presumably scores the hard predictions, unlike the built-in 'roc_auc' scorer
    # used for 'AUC_ROC' - confirm before comparing the two columns.
    score_types = {'f1_score' : make_scorer(f1_score), 'sensitivity' : make_scorer(recall_score), 'specificity' : make_scorer(recall_score, pos_label=0),
                   'AUC_ROC' : 'roc_auc', 'ROC_AUC_Score' : make_scorer(roc_auc_score), 'accuracy' : 'accuracy', 'precision' : make_scorer(precision_score)}

    # All metrics are recorded; the final estimator is refit using the best f1 score.
    gs = GridSearchCV(estimator=model_pipeline, param_grid=parameter_grid, scoring=score_types, refit='f1_score', n_jobs=-1)

    gs.fit(X,y)

    PATH = "/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/pickle_gridsearch_ft/gs_mlp_tfidf_ft.pkl"

    # `pickle` is dill in this notebook (see the imports cell).
    with open(PATH, 'wb') as file:
        pickle.dump(gs, file)

In [None]:
# NOTE: this rebinds the name `Pipeline`, which the imports cell at the top of the
# notebook took from sklearn.pipeline. Every cell executed after this one builds
# imblearn pipelines, which is what allows a sampler step in the search below.
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler

In [None]:
# The grid search below fits every parameter combination with cross-validation, so it
# is opt-in. Leave the flag False to skip it when running the notebook top to bottom;
# set it to True to (re)run the search and overwrite the pickled results.
RUN_MLP_ROS_GRID_SEARCH = False

if RUN_MLP_ROS_GRID_SEARCH:

    # Documents are the token lists; the target is the tweet label.
    X = tweet_df.loc[:, ['Clean_Word_Lists']].to_numpy().ravel()
    y = tweet_df.loc[:, 'label'].to_numpy().ravel()

    multi_layer_perceptron = MLPClassifier()

    random_os = RandomOverSampler()

    # Token lists -> IDF-weighted mean fastText vectors -> random over-sampling -> MLP classifier.
    model_pipeline = Pipeline([("ft_word_vectorizer", TfidfEmbeddingVectorizer(word_vector_dictionary)),
                               ('overSampler', random_os),
                               ('MLP', multi_layer_perceptron)])

    # Search over the MLP size and regularisation together with the over-sampling strategy.
    parameter_grid = [{'MLP__hidden_layer_sizes' : [300, 350, 400],
                       'MLP__activation' : ['logistic'],
                       'MLP__alpha' : [0.0004, 0.0005, 0.0006],
                       'overSampler__sampling_strategy' : ['auto', 0.6, 0.4]}]

    # 'specificity' is recall computed with class 0 as the positive label.
    # NOTE(review): 'ROC_AUC_Score' wraps roc_auc_score with a plain make_scorer, which
    # presumably scores the hard predictions, unlike the built-in 'roc_auc' scorer
    # used for 'AUC_ROC' - confirm before comparing the two columns.
    score_types = {'f1_score' : make_scorer(f1_score), 'sensitivity' : make_scorer(recall_score), 'specificity' : make_scorer(recall_score, pos_label=0),
                   'AUC_ROC' : 'roc_auc', 'ROC_AUC_Score' : make_scorer(roc_auc_score), 'accuracy' : 'accuracy', 'precision' : make_scorer(precision_score)}

    # All metrics are recorded; the final estimator is refit using the best f1 score.
    gs = GridSearchCV(estimator=model_pipeline, param_grid=parameter_grid, scoring=score_types, refit='f1_score', n_jobs=-1)

    gs.fit(X,y)

    PATH = "/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/pickle_gridsearch_ft/gs_mlp_ros_tfidf_ft.pkl"

    # `pickle` is dill in this notebook (see the imports cell).
    with open(PATH, 'wb') as file:
        pickle.dump(gs, file)