<a href="https://colab.research.google.com/github/BradenAnderson/Twitter-Sentiment-Analysis/blob/main/07_fastText_TFIDF_Modeling_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## This notebook displays the grid-search results for all of the models in the 07 series.

- 07 series models all utilize fastText word vectors and the custom TFIDF embedding vectorizer class to create the document vector that is used as the supervised learning model input.

In [1]:
# Mount Google Drive into the Colab VM at /content/drive; the pickle files read by the
# result cells in this notebook are addressed by paths under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
from collections import defaultdict

import dill as pickle
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV, train_test_split, cross_validate, cross_val_predict
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer
from sklearn.compose import ColumnTransformer

from sklearn.metrics import precision_score, recall_score, accuracy_score, SCORERS, multilabel_confusion_matrix, make_scorer, roc_curve, roc_auc_score, f1_score

# Raise pandas' row-display limit to 1000 so long result tables are not truncated when shown.
pd.set_option('display.max_rows', 1000)

In [3]:
# This helper function streamlines converting a grid-search output into a pandas DataFrame
# with the columns formatted the way I like them.
def gs_to_clean_df(search_results, keep_split = False, keep_std = False, keep_time = False, keep_params = False, sort_by=None):
  """Convert a ``cv_results_``-style mapping into a trimmed, renamed DataFrame.

  Columns are classified by name, in this order of precedence:

  * names starting with ``split``  -> kept only if ``keep_split``
  * names containing ``time``      -> kept only if ``keep_time``
    (this check runs before the ``std`` check, so ``std_fit_time`` is
    governed by ``keep_time``, not ``keep_std``)
  * names starting with ``std``    -> kept only if ``keep_std``
  * the ``params`` column          -> kept only if ``keep_params``
  * everything else                -> always kept

  Kept columns are then renamed:

  * ``param_<step>__<name>`` becomes ``<name>`` (everything after the first
    ``__``, with further ``__`` separators turned into ``_``);
  * ``param_<name>`` with no ``__`` becomes ``<name>``;
  * any ``_test`` substring is removed (``mean_test_f1`` -> ``mean_f1``);
  * every other column keeps its original name.

  Note that two parameters sharing the same final name under different
  pipeline steps will end up with the same column label.

  Parameters
  ----------
  search_results : mapping or anything accepted by ``pd.DataFrame``
      Typically the ``cv_results_`` attribute of a fitted search object.
  keep_split, keep_std, keep_time, keep_params : bool, default False
      Whether to retain the corresponding group of columns.
  sort_by : str or None, default None
      Renamed column to sort by (ascending). Only columns whose renamed
      label starts with ``rank`` or ``mean`` are accepted; any other value
      is ignored and the original row order is returned.

  Returns
  -------
  pandas.DataFrame
      A new frame; the input is not modified.
  """

  gs_results_df = pd.DataFrame(search_results)

  def _is_kept(column_name):
    # The order of these checks is significant (see docstring).
    if column_name.startswith('split'):
      return bool(keep_split)
    if 'time' in column_name:
      return bool(keep_time)
    if column_name.startswith('std'):
      return bool(keep_std)
    if column_name == 'params':
      return bool(keep_params)
    return True

  throw_away_columns = [c for c in gs_results_df.columns if not _is_kept(c)]
  columns_to_keep = [c for c in gs_results_df.columns if _is_kept(c)]

  renaming_dict = {}
  valid_metrics = []

  for column_name in columns_to_keep:

    if column_name.startswith('param') and column_name != 'params':
      # Pipeline parameters look like 'param_<step>__<name>'; keep what follows the step.
      name_components = column_name.split('__')[1:]
      if name_components:
        name = '_'.join(name_components).lstrip('_')
      elif column_name.startswith('param_'):
        # Parameter of a bare estimator (no '__'): just strip the 'param_' prefix.
        name = column_name[len('param_'):]
      else:
        name = column_name

    elif '_test' in column_name:
      name = column_name.replace('_test', '')

    else:
      # Nothing to clean up (e.g. 'params', 'mean_fit_time'): keep the label as is.
      name = column_name

    # Never produce a blank column label.
    name = name or column_name
    renaming_dict[column_name] = name

    if name.startswith('rank') or name.startswith('mean'):
      valid_metrics.append(name)

  gs_results_df = gs_results_df.drop(columns=throw_away_columns).rename(columns=renaming_dict)

  if sort_by in valid_metrics:
    gs_results_df = gs_results_df.sort_values(by=sort_by, ignore_index=True)

  return gs_results_df


In [4]:
class TfidfEmbeddingVectorizer(object):
    """Represent each tokenized document as the mean of its IDF-weighted word vectors.

    Parameters
    ----------
    ft_wv : mapping-like
        Word-vector lookup. It must support ``len()``, ``word in ft_wv`` and
        ``ft_wv[word]`` (returning a 1-D vector). A plain ``dict`` works;
        NOTE(review): presumably a gensim ``KeyedVectors`` is also passed here --
        if the object exposes ``vector_size`` that value is used for ``dim``.

    Attributes
    ----------
    ft_wv : the lookup passed in.
    word2weight : ``None`` until ``fit`` is called, then a ``defaultdict`` mapping
        word -> IDF weight (unseen words get the largest IDF seen during fit).
    dim : length of one word vector, or 0 when ``ft_wv`` is empty.
    """

    def __init__(self, ft_wv):

        self.ft_wv = ft_wv
        self.word2weight = None
        self.dim = self._infer_dim(ft_wv)

    @staticmethod
    def _infer_dim(ft_wv):
        """Return the length of one word vector in ``ft_wv`` (0 if it is empty)."""
        if len(ft_wv) == 0:
            return 0

        vector_size = getattr(ft_wv, 'vector_size', None)
        if vector_size is not None:
            return int(vector_size)

        # Fall back to measuring the first stored vector. This replaces a lookup that
        # depended on a global `unique_words` which is not defined in this notebook.
        first_word = next(iter(ft_wv))
        return np.asarray(ft_wv[first_word]).shape[0]

    def fit(self, X, y=None):
        """Learn an IDF weight for every token in ``X``.

        ``X`` is an iterable of documents, each already split into tokens.
        ``y`` is accepted for pipeline compatibility and ignored. Returns ``self``.
        """

        # The documents are already token lists, so the analyzer just hands them back.
        tfidf = TfidfVectorizer(analyzer=lambda tokens: tokens)
        tfidf.fit(X)

        # `idf_` holds one inverse-document-frequency value per vocabulary term and
        # `vocabulary_` is a dict mapping term -> index into `idf_`.
        # With sklearn's default smoothing, idf(t) = log((1 + n) / (1 + df(t))) + 1, where
        # n is the number of documents and df(t) the number of documents containing t,
        # so rarer terms receive larger weights.

        # The largest IDF belongs to the rarest training term. It is used as the weight of
        # any word not seen during fit, on the assumption that an unseen word is at least
        # as uncommon as the rarest seen one.
        max_idf = tfidf.idf_.max()

        # defaultdict => looking up an unseen word yields `max_idf` instead of a KeyError.
        self.word2weight = defaultdict(
            lambda: max_idf,
            ((word, tfidf.idf_[idf_index]) for word, idf_index in tfidf.vocabulary_.items()),
        )

        return self

    def transform(self, X):
        """Return one row per document: the mean of ``vector(word) * idf(word)``.

        Words missing from ``ft_wv`` contribute a zero vector (and still count toward
        the mean). A document with no tokens maps to an all-zero vector rather than NaN,
        so the result keeps a consistent row length.
        """

        mean_tfidf_vector_for_each_tweet = []

        for clean_tweet_token_list in X:

            this_tweet_tfidf_vectors = []

            for word in clean_tweet_token_list:
                if word in self.ft_wv:
                    fastText_vector = self.ft_wv[word]
                else:
                    fastText_vector = np.zeros(self.dim)

                # Scale the word vector by the word's IDF weight.
                this_tweet_tfidf_vectors.append(fastText_vector * self.word2weight[word])

            if this_tweet_tfidf_vectors:
                mean_tfidf_vector = np.mean(this_tweet_tfidf_vectors, axis=0)
            else:
                # np.mean([]) would give a scalar NaN and make the output ragged.
                mean_tfidf_vector = np.zeros(self.dim)

            mean_tfidf_vector_for_each_tweet.append(mean_tfidf_vector)

        return np.array(mean_tfidf_vector_for_each_tweet)


# Gradient Boosted Random Forest with fastText TFIDF vectors

In [5]:

# Load the saved search object for this section's model. `pickle` is dill here
# (see `import dill as pickle` in the imports cell).
# NOTE(review): unpickling can execute arbitrary code -- only load files you created or trust.
full_path = "/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/pickle_gridsearch_ft/gs_bf_tfidf_ft.pkl"
with open(full_path, 'rb') as file:
  gs_results = pickle.load(file)

# Pull out the pieces of the search object. These four names (and `gs_results`) are
# reassigned by every results cell in this notebook, so they always describe the most
# recently loaded search.
search_results = gs_results.cv_results_
top_estimator = gs_results.best_estimator_
top_score = gs_results.best_score_
top_parameters = gs_results.best_params_

# Tidy the results table and order it by F1 rank (ascending), then preview the first rows.
boosted_forest_ft_df = gs_to_clean_df(search_results, sort_by='rank_f1_score')

boosted_forest_ft_df.head()


Unnamed: 0,colsample_bytree,learning_rate,max_depth,n_estimators,subsample,mean_f1_score,rank_f1_score,mean_sensitivity,rank_sensitivity,mean_specificity,rank_specificity,mean_AUC_ROC,rank_AUC_ROC,mean_ROC_AUC_Score,rank_ROC_AUC_Score,mean_accuracy,rank_accuracy,mean_precision,rank_precision
0,0.8,0.05,-1,600,0.7,0.647724,1,0.52944,1,0.992049,65,0.945002,9,0.760745,1,0.959561,3,0.834308,45
1,0.8,0.05,-1,600,1.0,0.647724,1,0.52944,1,0.992049,65,0.945002,9,0.760745,1,0.959561,3,0.834308,45
2,0.8,0.07,-1,600,0.7,0.645282,3,0.526326,9,0.992083,55,0.943485,57,0.759205,7,0.959374,9,0.834333,43
3,0.8,0.07,-1,600,1.0,0.645282,3,0.526326,9,0.992083,55,0.943485,57,0.759205,7,0.959374,9,0.834333,43
4,0.8,0.07,-1,1200,1.0,0.644599,5,0.521861,49,0.992656,27,0.941865,121,0.757258,39,0.959593,1,0.843207,19


In [13]:
# Load the second boosted-forest search (the "ros" file). Its results table carries an extra
# `sampling_strategy` column.
# NOTE(review): "ros" presumably stands for random over-sampling -- confirm against the training notebook.
# NOTE(review): unpickling can execute arbitrary code -- only load files you created or trust.
full_path = "/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/pickle_gridsearch_ft/gs_bf_ros_tfidf_ft.pkl"
with open(full_path, 'rb') as file:
  gs_results = pickle.load(file)

# Overwrites the values extracted from the previously loaded search.
search_results = gs_results.cv_results_
top_estimator = gs_results.best_estimator_
top_score = gs_results.best_score_
top_parameters = gs_results.best_params_

# Tidy the results table and order it by F1 rank (ascending), then preview the first rows.
boosted_forest_ros_ft_df = gs_to_clean_df(search_results, sort_by='rank_f1_score')

boosted_forest_ros_ft_df.head()

Unnamed: 0,colsample_bytree,learning_rate,max_depth,n_estimators,subsample,sampling_strategy,mean_f1_score,rank_f1_score,mean_sensitivity,rank_sensitivity,mean_specificity,rank_specificity,mean_AUC_ROC,rank_AUC_ROC,mean_ROC_AUC_Score,rank_ROC_AUC_Score,mean_accuracy,rank_accuracy,mean_precision,rank_precision
0,0.75,0.05,-1,700,0.8,0.6,0.667631,1,0.621327,167,0.981875,20,0.944278,4,0.801601,158,0.956554,5,0.721653,17
1,0.75,0.05,-1,600,0.7,0.6,0.663599,2,0.634263,126,0.979045,64,0.942297,176,0.806654,117,0.954832,46,0.69601,59
2,0.75,0.05,-1,650,0.8,0.6,0.663534,3,0.626228,151,0.980258,39,0.942969,100,0.803243,148,0.955395,30,0.705625,36
3,0.85,0.05,-1,650,0.8,0.6,0.663368,4,0.626674,148,0.980157,45,0.942923,107,0.803416,145,0.955333,32,0.704672,38
4,0.85,0.06,-1,700,0.8,0.6,0.663251,5,0.605258,203,0.983391,3,0.943273,51,0.794324,202,0.956836,1,0.733638,3


In [7]:
# Results did not improve from above, no need to continue tuning.
# (This is the follow-up "ros2" search; compare its top mean_f1_score with the "ros" table above.)
# NOTE(review): unpickling can execute arbitrary code -- only load files you created or trust.

full_path = "/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/pickle_gridsearch_ft/gs_bf_ros2_tfidf_ft.pkl"
with open(full_path, 'rb') as file:
  gs_results = pickle.load(file)

# Overwrites the values extracted from the previously loaded search.
search_results = gs_results.cv_results_
top_estimator = gs_results.best_estimator_
top_score = gs_results.best_score_
top_parameters = gs_results.best_params_

# Tidy the results table and order it by F1 rank (ascending), then preview the first rows.
boosted_forest_ros2_ft_df = gs_to_clean_df(search_results, sort_by='rank_f1_score')

boosted_forest_ros2_ft_df.head()

Unnamed: 0,colsample_bytree,learning_rate,max_depth,n_estimators,subsample,sampling_strategy,mean_f1_score,rank_f1_score,mean_sensitivity,rank_sensitivity,mean_specificity,rank_specificity,mean_AUC_ROC,rank_AUC_ROC,mean_ROC_AUC_Score,rank_ROC_AUC_Score,mean_accuracy,rank_accuracy,mean_precision,rank_precision
0,0.75,0.05,-1,700,0.85,0.5,0.666859,1,0.611954,33,0.983122,19,0.943035,41,0.797538,28,0.957056,7,0.73276,14
1,0.8,0.05,-1,750,0.75,0.5,0.666429,2,0.605717,49,0.983964,7,0.943898,8,0.794841,47,0.9574,4,0.741082,5
2,0.75,0.05,-1,750,0.75,0.6,0.665044,3,0.61374,29,0.982482,37,0.942365,76,0.798111,24,0.956586,14,0.725843,30
3,0.8,0.05,-1,800,0.8,0.5,0.664641,4,0.599466,72,0.98457,3,0.943948,5,0.792018,68,0.957525,2,0.745851,3
4,0.8,0.05,-1,800,0.85,0.5,0.664409,5,0.597689,74,0.984772,1,0.94314,37,0.791231,74,0.957588,1,0.748139,1


# Extra Random Forest with fastText TFIDF vectors

In [8]:

# Load the saved search object for the extra random forest model. `pickle` is dill here
# (see `import dill as pickle` in the imports cell).
# NOTE(review): unpickling can execute arbitrary code -- only load files you created or trust.
full_path = "/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/pickle_gridsearch_ft/gs_erf_tfidf_ft.pkl"
with open(full_path, 'rb') as file:
  gs_results = pickle.load(file)

# Overwrites the values extracted from the previously loaded search.
search_results = gs_results.cv_results_
top_estimator = gs_results.best_estimator_
top_score = gs_results.best_score_
top_parameters = gs_results.best_params_

# Store the table under its own name: reusing `boosted_forest_ft_df` here would silently
# replace the gradient-boosted forest results loaded earlier in the notebook.
extra_forest_ft_df = gs_to_clean_df(search_results, sort_by='rank_f1_score')

extra_forest_ft_df.head()


Unnamed: 0,class_weight,max_depth,min_samples_leaf,min_samples_split,n_estimators,mean_f1_score,rank_f1_score,mean_sensitivity,rank_sensitivity,mean_specificity,rank_specificity,mean_AUC_ROC,rank_AUC_ROC,mean_ROC_AUC_Score,rank_ROC_AUC_Score,mean_accuracy,rank_accuracy,mean_precision,rank_precision
0,"{0: 1, 1: 5}",9,1,2,200,0.558618,1,0.606167,4,0.957417,62,0.930926,10,0.781792,4,0.932749,62,0.518104,62
1,"{0: 1, 1: 5}",9,2,2,200,0.558478,2,0.608843,1,0.956844,77,0.930613,15,0.782844,1,0.932404,63,0.515946,63
2,"{0: 1, 1: 5}",9,2,2,1000,0.557903,3,0.607947,2,0.956844,75,0.93099,4,0.782396,2,0.932341,68,0.515586,68
3,"{0: 1, 1: 5}",9,1,2,300,0.55782,4,0.607497,3,0.956911,72,0.931101,2,0.782204,3,0.932373,65,0.515682,67
4,"{0: 1, 1: 5}",9,2,3,1000,0.556957,5,0.606163,6,0.956911,71,0.930853,12,0.781537,5,0.932279,69,0.515257,69


# Stochastic Gradient Descent with fastText TFIDF vectors

In [9]:
# Load the saved search object for the stochastic gradient descent model (the "sgd" file).
# NOTE(review): unpickling can execute arbitrary code -- only load files you created or trust.
full_path =  "/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/pickle_gridsearch_ft/gs_sgd_tfidf_ft.pkl"
with open(full_path, 'rb') as file:
  gs_results = pickle.load(file)

# Overwrites the values extracted from the previously loaded search.
search_results = gs_results.cv_results_
top_estimator = gs_results.best_estimator_
top_score = gs_results.best_score_
top_parameters = gs_results.best_params_

# Tidy the results table and order it by F1 rank (ascending), then preview the first rows.
# NOTE(review): the frame is named `sda_...` although the file and heading say SGD.
sda_ft_df = gs_to_clean_df(search_results, sort_by='rank_f1_score')

sda_ft_df.head()

Unnamed: 0,alpha,class_weight,learning_rate,loss,eta0,mean_f1_score,rank_f1_score,mean_sensitivity,rank_sensitivity,mean_specificity,rank_specificity,mean_AUC_ROC,rank_AUC_ROC,mean_ROC_AUC_Score,rank_ROC_AUC_Score,mean_accuracy,rank_accuracy,mean_precision,rank_precision
0,0.0001,"{0: 1, 1: 5}",adaptive,hinge,0.01,0.544898,1,0.670387,56,0.940302,41,0.91428,23,0.805345,20,0.921347,40,0.459044,42
1,0.001,"{0: 1, 1: 5}",adaptive,modified_huber,0.001,0.543243,2,0.656116,62,0.942627,36,0.917228,6,0.799371,34,0.922506,35,0.463562,38
2,0.0001,"{0: 1, 1: 5}",adaptive,modified_huber,0.001,0.541879,3,0.656565,60,0.942054,39,0.916962,7,0.79931,35,0.922005,37,0.461445,39
3,0.005,"{0: 1, 1: 5}",adaptive,modified_huber,0.001,0.54038,4,0.652107,64,0.942492,37,0.917388,2,0.7973,39,0.922099,36,0.461352,40
4,0.005,"{0: 1, 1: 5}",adaptive,modified_huber,0.01,0.539251,5,0.650317,68,0.942459,38,0.917275,4,0.796388,41,0.921942,38,0.460707,41


# Multilayer Perceptron with fastText TFIDF vectors

In [10]:
# Load the saved search object for the multilayer perceptron model (the "mlp" file).
# NOTE(review): unpickling can execute arbitrary code -- only load files you created or trust.
full_path =  "/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/pickle_gridsearch_ft/gs_mlp_tfidf_ft.pkl"
with open(full_path, 'rb') as file:
  gs_results = pickle.load(file)

# Overwrites the values extracted from the previously loaded search.
search_results = gs_results.cv_results_
top_estimator = gs_results.best_estimator_
top_score = gs_results.best_score_
top_parameters = gs_results.best_params_

# Tidy the results table and order it by F1 rank (ascending).
mlp_ft_df = gs_to_clean_df(search_results, sort_by='rank_f1_score')

# Show the ten best-ranked configurations (the other result cells show five).
mlp_ft_df.head(10)

Unnamed: 0,activation,alpha,hidden_layer_sizes,learning_rate,solver,mean_f1_score,rank_f1_score,mean_sensitivity,rank_sensitivity,mean_specificity,rank_specificity,mean_AUC_ROC,rank_AUC_ROC,mean_ROC_AUC_Score,rank_ROC_AUC_Score,mean_accuracy,rank_accuracy,mean_precision,rank_precision
0,logistic,0.0005,300,,,0.626835,1,0.564195,4,0.982178,17,0.934128,9,0.773187,4,0.952827,4,0.715318,11
1,logistic,0.0005,100,,,0.619798,2,0.562029,6,0.981101,18,0.935715,1,0.771565,5,0.951668,10,0.701035,16
2,relu,0.0005,300,,,0.619659,3,0.586088,2,0.976957,21,0.923709,16,0.781522,2,0.949507,15,0.657702,21
3,logistic,0.0001,300,,,0.617268,4,0.522313,12,0.987333,13,0.93134,11,0.754823,12,0.954675,1,0.75849,5
4,relu,0.0005,200,,,0.613905,5,0.603475,1,0.972442,24,0.925159,15,0.787958,1,0.946531,17,0.628814,23
5,logistic,0.0001,200,,,0.612367,6,0.524096,11,0.985817,14,0.930879,12,0.754957,11,0.953391,2,0.738315,8
6,logistic,0.0005,200,,,0.61137,7,0.52946,10,0.984874,15,0.931777,10,0.757167,10,0.95289,3,0.729165,9
7,logistic,0.0001,100,,,0.609671,8,0.534764,9,0.983391,16,0.935357,3,0.759077,9,0.951887,8,0.713823,12
8,relu,0.0001,100,,,0.609451,9,0.549993,8,0.980898,19,0.927961,14,0.765446,8,0.950634,13,0.687533,19
9,relu,0.0001,300,,,0.608838,10,0.557094,7,0.979517,20,0.923375,17,0.768306,7,0.949851,14,0.675872,20


In [11]:
# Load the second multilayer perceptron search (the "ros" file). Its results table carries a
# `sampling_strategy` column.
# NOTE(review): "ros" presumably stands for random over-sampling -- confirm against the training notebook.
# NOTE(review): unpickling can execute arbitrary code -- only load files you created or trust.
full_path =  "/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/pickle_gridsearch_ft/gs_mlp_ros_tfidf_ft.pkl"
with open(full_path, 'rb') as file:
  gs_results = pickle.load(file)

# Overwrites the values extracted from the previously loaded search.
search_results = gs_results.cv_results_
top_estimator = gs_results.best_estimator_
top_score = gs_results.best_score_
top_parameters = gs_results.best_params_

# Tidy the results table and order it by F1 rank (ascending), then preview the first rows.
mlp_ros_ft_df = gs_to_clean_df(search_results, sort_by='rank_f1_score')

mlp_ros_ft_df.head()

Unnamed: 0,activation,alpha,hidden_layer_sizes,sampling_strategy,mean_f1_score,rank_f1_score,mean_sensitivity,rank_sensitivity,mean_specificity,rank_specificity,mean_AUC_ROC,rank_AUC_ROC,mean_ROC_AUC_Score,rank_ROC_AUC_Score,mean_accuracy,rank_accuracy,mean_precision,rank_precision
0,logistic,0.0005,400,0.6,0.614956,1,0.64004,12,0.966614,6,0.926267,9,0.803327,10,0.943681,3,0.592495,4
1,logistic,0.0004,400,0.4,0.610361,2,0.633364,18,0.966513,7,0.925087,15,0.799938,17,0.943117,5,0.589897,6
2,logistic,0.0006,300,0.6,0.609494,3,0.635151,15,0.966075,9,0.926097,12,0.800613,15,0.942835,8,0.586283,10
3,logistic,0.0006,350,0.6,0.609462,4,0.634692,16,0.966176,8,0.927353,6,0.800434,16,0.942897,6,0.586899,9
4,logistic,0.0005,300,auto,0.608561,5,0.653416,4,0.962672,23,0.926407,8,0.808044,4,0.940955,20,0.570597,22


# Logistic Regression

In [12]:
# Load the saved search object for the logistic regression model (the "lr" file).
# NOTE(review): unpickling can execute arbitrary code -- only load files you created or trust.
full_path = "/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/pickle_gridsearch_ft/gs_lr_tfidf_ft.pkl"
with open(full_path, 'rb') as file:
  gs_results = pickle.load(file)

# Overwrites the values extracted from the previously loaded search.
search_results = gs_results.cv_results_
top_estimator = gs_results.best_estimator_
top_score = gs_results.best_score_
top_parameters = gs_results.best_params_

# Tidy the results table and order it by F1 rank (ascending), then preview the first rows.
lr_ft_df = gs_to_clean_df(search_results, sort_by='rank_f1_score')

lr_ft_df.head()

Unnamed: 0,C,class_weight,fit_intercept,penalty,solver,mean_f1_score,rank_f1_score,mean_sensitivity,rank_sensitivity,mean_specificity,rank_specificity,mean_AUC_ROC,rank_AUC_ROC,mean_ROC_AUC_Score,rank_ROC_AUC_Score,mean_accuracy,rank_accuracy,mean_precision,rank_precision
0,10.0,"{0: 1, 1: 5}",True,l2,liblinear,0.554149,1,0.686889,8,0.940168,113,0.922585,3,0.813528,8,0.922381,89,0.464434,86
1,3.79269,"{0: 1, 1: 5}",False,l1,liblinear,0.554064,2,0.687778,6,0.939966,132,0.922225,32,0.813872,4,0.922255,93,0.463902,91
2,7.8476,"{0: 1, 1: 5}",False,l2,liblinear,0.55393,3,0.686888,10,0.9401,120,0.9224,14,0.813494,9,0.922318,90,0.464129,90
3,10.0,"{0: 1, 1: 5}",False,l2,liblinear,0.553911,4,0.687334,7,0.939999,130,0.922398,16,0.813667,6,0.922255,91,0.4639,92
4,7.8476,"{0: 1, 1: 5}",False,l1,liblinear,0.553717,5,0.688228,2,0.939764,142,0.922297,23,0.813996,2,0.922099,104,0.463217,97
