<a href="https://colab.research.google.com/github/BradenAnderson/Twitter-Sentiment-Analysis/blob/main/06_fastText_MEV_Modeling_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## This notebook displays the gridsearch results for all of the models created in the 06 series.

- All 06-series models take fastText word vectors as input. The custom mean embedding vectorizer class converts the word vectors into a single document vector per tweet before they are passed to the supervised models.

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
# The pickle files loaded further down are all addressed through this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import dill as pickle
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV, train_test_split, cross_validate, cross_val_predict
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer
from sklearn.compose import ColumnTransformer

from sklearn.metrics import precision_score, recall_score, accuracy_score, SCORERS, multilabel_confusion_matrix, make_scorer, roc_curve, roc_auc_score, f1_score

# Raise pandas' row display limit to 1000 so large result tables are not truncated when rendered.
pd.set_option('display.max_rows', 1000)

In [None]:
# This is a helper function that streamlines the process of converting a gridsearch output to a pandas dataframe with the
# columns formatted the way I like them.
def gs_to_clean_df(search_results, keep_split = False, keep_std = False, keep_time = False, keep_params = False, sort_by=None):
  """Convert grid-search results into a DataFrame with filtered, shortened column names.

  Parameters
  ----------
  search_results : mapping or DataFrame
      Anything ``pd.DataFrame`` accepts; in this notebook it is the ``cv_results_``
      attribute of a fitted search object.
  keep_split : bool, default False
      Keep the per-fold columns (names starting with ``'split'``).
  keep_std : bool, default False
      Keep the columns whose name starts with ``'std'``. Timing columns such as
      ``'std_fit_time'`` are controlled by ``keep_time`` because the ``'time'``
      check runs first.
  keep_time : bool, default False
      Keep every column whose name contains ``'time'``.
  keep_params : bool, default False
      Keep the raw ``'params'`` column.
  sort_by : str or None, default None
      Name of a *renamed* column starting with ``'rank'`` or ``'mean'``
      (e.g. ``'rank_f1_score'``). The frame is sorted ascending by it. Any other
      value is ignored and the original row order is returned.

  Returns
  -------
  pandas.DataFrame
      The kept columns, renamed as follows:
      ``param_<step>__<name>`` -> ``<name>`` (nested ``__`` parts joined with ``_``),
      ``param_<name>`` -> ``<name>``,
      ``*_test_*`` -> the same name without ``_test``,
      everything else keeps its original name.
  """
  gs_results_df = pd.DataFrame(search_results)

  throw_away_columns = []
  columns_to_keep = []

  # Decide, column by column, whether it survives. Branch order matters: 'std_fit_time'
  # is a timing column first and a std column second.
  for column_name in gs_results_df.columns:
    if column_name.startswith('split'):
      keep = keep_split
    elif 'time' in column_name:
      keep = keep_time
    elif column_name.startswith('std'):
      keep = keep_std
    elif column_name == 'params':
      keep = keep_params
    else:
      keep = True

    if keep:
      columns_to_keep.append(column_name)
    else:
      throw_away_columns.append(column_name)

  # Non-inplace drop so the function never mutates an object it did not create.
  gs_results_df = gs_results_df.drop(columns=throw_away_columns)

  renaming_dict = {}
  valid_metrics = []

  for column_name in columns_to_keep:
    # Default to the original name. Previously this default was "", which renamed kept
    # timing columns and the 'params' column to an empty string.
    name = column_name

    if column_name.startswith('param') and column_name != 'params':
      name_components = column_name.split('__')

      if len(name_components) > 1:
        # 'param_<step>__a__b' -> 'a_b': drop the pipeline-step prefix.
        name = '_'.join(name_components[1:]).lstrip('_')
      elif column_name.startswith('param_'):
        # 'param_C' (no pipeline step) -> 'C'.
        name = column_name[len('param_'):]

      # Never produce an empty column label.
      if not name:
        name = column_name

    elif '_test' in column_name:
      name = column_name.replace('_test', '')

    renaming_dict[column_name] = name

    # Only rank/mean metric columns are accepted as sort keys.
    if name.startswith('rank') or name.startswith('mean'):
      valid_metrics.append(name)

  gs_results_df = gs_results_df.rename(columns=renaming_dict)

  if sort_by in valid_metrics:
    # Ascending sort: rank 1 first for 'rank_*' keys (and lowest value first for 'mean_*' keys).
    gs_results_df = gs_results_df.sort_values(by=sort_by, ignore_index=True)

  return gs_results_df


In [None]:
# Custom class to be used in a Scikit-Learn pipeline to allow fastText word vectors to be used as inputs to normal
# Scikit-Learn supervised learning models.
class MeanEmbeddingVectorizer(object):
    """Turn tokenized documents into fixed-length vectors by averaging word vectors.

    Parameters
    ----------
    ft_wv : mapping-like
        Object supporting ``len()``, ``word in ft_wv`` and ``ft_wv[word]`` that returns a
        1-D numpy vector per word. A plain ``dict`` works; gensim-style keyed vectors are
        presumably what the project uses (TODO confirm against the training notebook).

    Attributes
    ----------
    ft_wv : the object passed in.
    dim : int
        Length of one word vector, or 0 when ``ft_wv`` is empty.
    """

    def __init__(self, ft_wv):
        self.ft_wv = ft_wv
        if len(ft_wv) > 0:
            # Prefer an explicit `vector_size` attribute when the container has one
            # (gensim keyed vectors do); otherwise look up the first key. The original
            # code read a global `unique_words` here, which is not defined in this
            # notebook and raised NameError on construction.
            vector_size = getattr(ft_wv, 'vector_size', None)
            if vector_size is None:
                vector_size = ft_wv[next(iter(ft_wv))].shape[0]
            self.dim = vector_size
        else:
            self.dim = 0

    def fit(self, X, y=None):
        """No-op fit; returns ``self``. ``y`` is optional and ignored."""
        return self

    def transform(self, X):
        """Return an array with one mean word vector per document in ``X``.

        ``X`` is an iterable of token lists. Words missing from ``ft_wv`` contribute a
        zero vector of length ``self.dim`` (so they still pull the mean toward zero).
        A document with no tokens maps to an all-zero vector.
        """
        mean_of_word_vectors = []

        # For each tweet (already converted to a list of clean word tokens).
        for clean_tweet_token_list in X:

            # One vector per token: the stored vector when available, zeros otherwise.
            this_tweet_word_vectors = [
                self.ft_wv[word] if word in self.ft_wv else np.zeros(self.dim)
                for word in clean_tweet_token_list
            ]

            if this_tweet_word_vectors:
                # Element-wise mean of all word vectors in this tweet (length self.dim).
                mean_vector = np.mean(this_tweet_word_vectors, axis=0)
            else:
                # np.mean of an empty list is a NaN scalar, which would make the result
                # array ragged; use a zero vector for empty tweets instead.
                mean_vector = np.zeros(self.dim)

            mean_of_word_vectors.append(mean_vector)

        return np.array(mean_of_word_vectors)

## Gradient Boosted Random Forest with fastText mean embedded word vectors used as inputs

In [None]:
# Load the saved grid-search object for the gradient boosted forest run.
# `pickle` is dill here (see the imports cell). Unpickling can execute arbitrary code, so only load files you trust.
full_path = "/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/pickle_gridsearch_ft/gs_bf_ft.pkl"
with open(full_path, 'rb') as file:
  gs_results = pickle.load(file)

# Pull out the per-candidate CV table plus the best estimator, score, and parameters.
# Only search_results is used below; the top_* names are kept for interactive inspection.
search_results = gs_results.cv_results_
top_estimator = gs_results.best_estimator_
top_score = gs_results.best_score_
top_parameters = gs_results.best_params_

# Flatten the CV table with gs_to_clean_df, sorted ascending by F1 rank (rank 1 first).
boosted_forest_ft_df = gs_to_clean_df(search_results, sort_by='rank_f1_score')

# Show the top-ranked rows.
boosted_forest_ft_df.head()

Unnamed: 0,colsample_bytree,learning_rate,max_depth,n_estimators,subsample,mean_f1_score,rank_f1_score,mean_sensitivity,rank_sensitivity,mean_specificity,rank_specificity,mean_AUC_ROC,rank_AUC_ROC,mean_ROC_AUC_Score,rank_ROC_AUC_Score,mean_accuracy,rank_accuracy,mean_precision,rank_precision
0,0.8,0.07,4,1000,1.0,0.646232,1,0.536581,1,0.990634,173,0.941393,169,0.763608,1,0.958747,45,0.812505,109
1,0.8,0.07,4,1000,0.7,0.646232,1,0.536581,1,0.990634,173,0.941393,169,0.763608,1,0.958747,45,0.812505,109
2,0.8,0.07,4,1200,1.0,0.64532,3,0.535688,3,0.990601,181,0.941202,171,0.763145,3,0.958653,53,0.81176,115
3,0.8,0.07,4,1200,0.7,0.64532,3,0.535688,3,0.990601,181,0.941202,171,0.763145,3,0.958653,53,0.81176,115
4,1.0,0.07,-1,1000,0.7,0.644783,5,0.523201,55,0.992487,37,0.944971,41,0.757844,45,0.95953,1,0.840464,23


## - Gradient Boosted Random Forest using mean embedded fastText word vectors as inputs.

## - Random oversampling applied to the minority class.



In [None]:
# Load the saved grid-search object for the gradient boosted forest run with random oversampling.
# `pickle` is dill here (see the imports cell). Unpickling can execute arbitrary code, so only load files you trust.
full_path = "/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/pickle_gridsearch_ft/gs_bf_ros_ft.pkl"
with open(full_path, 'rb') as file:
  gs_results = pickle.load(file)

# Pull out the per-candidate CV table plus the best estimator, score, and parameters.
# Only search_results is used below; the top_* names are kept for interactive inspection.
search_results = gs_results.cv_results_
top_estimator = gs_results.best_estimator_
top_score = gs_results.best_score_
top_parameters = gs_results.best_params_

# Flatten the CV table with gs_to_clean_df, sorted ascending by F1 rank (rank 1 first).
boosted_forest_ros_ft_df = gs_to_clean_df(search_results, sort_by='rank_f1_score')

# Show the top-ranked rows.
boosted_forest_ros_ft_df.head()

Unnamed: 0,colsample_bytree,learning_rate,max_depth,n_estimators,subsample,sampling_strategy,mean_f1_score,rank_f1_score,mean_sensitivity,rank_sensitivity,mean_specificity,rank_specificity,mean_AUC_ROC,rank_AUC_ROC,mean_ROC_AUC_Score,rank_ROC_AUC_Score,mean_accuracy,rank_accuracy,mean_precision,rank_precision
0,0.8,0.03,-1,1250,0.7,0.4,0.666328,1,0.592337,116,0.985985,72,0.945416,3,0.789161,111,0.95834,34,0.76178,68
1,0.8,0.03,-1,1000,1.0,0.4,0.66487,2,0.605265,100,0.983728,99,0.945005,11,0.794496,96,0.95715,75,0.737589,95
2,1.0,0.03,-1,1250,0.7,0.5,0.66451,3,0.606613,95,0.983459,103,0.944685,13,0.795036,94,0.956993,82,0.734766,98
3,0.8,0.03,-1,1250,1.0,0.5,0.664354,4,0.6017,103,0.984166,94,0.945405,4,0.792933,101,0.957306,72,0.741611,91
4,1.0,0.03,-1,1000,0.7,0.4,0.66388,5,0.610626,92,0.982717,109,0.945055,9,0.796672,89,0.956586,92,0.727737,105


## Extra Random Forest with mean embedded fastText word vectors as inputs.

In [None]:
# Load the saved grid-search object for the extra random forest run.
# `pickle` is dill here (see the imports cell). Unpickling can execute arbitrary code, so only load files you trust.
full_path = "/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/pickle_gridsearch_ft/gs_erf_ft.pkl"
with open(full_path, 'rb') as file:
  gs_results = pickle.load(file)

# Pull out the per-candidate CV table plus the best estimator, score, and parameters.
# Only search_results is used below; the top_* names are kept for interactive inspection.
search_results = gs_results.cv_results_
top_estimator = gs_results.best_estimator_
top_score = gs_results.best_score_
top_parameters = gs_results.best_params_

# Store under its own name. This cell previously assigned to boosted_forest_ft_df, which
# overwrote the gradient boosted forest results loaded earlier in the notebook.
extra_forest_ft_df = gs_to_clean_df(search_results, sort_by='rank_f1_score')

# Show the top-ranked rows.
extra_forest_ft_df.head()

Unnamed: 0,class_weight,max_depth,min_samples_leaf,min_samples_split,n_estimators,mean_f1_score,rank_f1_score,mean_sensitivity,rank_sensitivity,mean_specificity,rank_specificity,mean_AUC_ROC,rank_AUC_ROC,mean_ROC_AUC_Score,rank_ROC_AUC_Score,mean_accuracy,rank_accuracy,mean_precision,rank_precision
0,"{0: 1, 1: 5}",9,2,3,300,0.5632,1,0.608843,1,0.958225,69,0.931827,4,0.783534,1,0.933688,63,0.524106,63
1,"{0: 1, 1: 5}",9,1,3,600,0.562466,2,0.606168,7,0.958529,63,0.931827,5,0.782349,5,0.933782,61,0.524804,62
2,"{0: 1, 1: 5}",9,1,3,100,0.562277,3,0.607056,4,0.958326,68,0.931326,19,0.782691,4,0.933657,64,0.523879,64
3,"{0: 1, 1: 5}",9,2,2,1000,0.561833,4,0.607505,3,0.958091,71,0.931846,3,0.782798,3,0.933469,67,0.522756,68
4,"{0: 1, 1: 5}",9,1,2,600,0.561509,5,0.605275,9,0.958428,64,0.93178,7,0.781851,8,0.933626,65,0.523837,65


## Stochastic Gradient Descent with mean embedded fastText word vectors as inputs

In [None]:
# Load the saved grid-search object for the stochastic gradient descent run.
# `pickle` is dill here (see the imports cell). Unpickling can execute arbitrary code, so only load files you trust.
full_path =  "/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/pickle_gridsearch_ft/gs_sgd_ft.pkl"
with open(full_path, 'rb') as file:
  gs_results = pickle.load(file)

# Pull out the per-candidate CV table plus the best estimator, score, and parameters.
# Only search_results is used below; the top_* names are kept for interactive inspection.
search_results = gs_results.cv_results_
top_estimator = gs_results.best_estimator_
top_score = gs_results.best_score_
top_parameters = gs_results.best_params_

# Flatten the CV table with gs_to_clean_df, sorted ascending by F1 rank (rank 1 first).
sga_ft_df = gs_to_clean_df(search_results, sort_by='rank_f1_score')

# Show the top-ranked rows.
sga_ft_df.head()

# Observations:
#
# The mean F1 score is surprisingly low. All of the top models used the adaptive learning rate schedule, and most of
# them also selected the highest eta0 available (the highest initial learning rate). The adaptive schedule
# systematically reduces the learning rate if improvements are not made fast enough. I wonder if the initial value was
# too low and this resulted in a very small learning rate by the end of training. I want to try running this again
# with a higher initial learning rate.

Unnamed: 0,alpha,class_weight,learning_rate,loss,eta0,mean_f1_score,rank_f1_score,mean_sensitivity,rank_sensitivity,mean_specificity,rank_specificity,mean_AUC_ROC,rank_AUC_ROC,mean_ROC_AUC_Score,rank_ROC_AUC_Score,mean_accuracy,rank_accuracy,mean_precision,rank_precision
0,0.0001,"{0: 1, 1: 5}",adaptive,modified_huber,0.05,0.537114,1,0.671729,48,0.937338,41,0.920974,1,0.804533,33,0.918684,39,0.447567,40
1,0.0001,"{0: 1, 1: 5}",adaptive,hinge,0.05,0.530625,2,0.668602,50,0.935654,48,0.917686,8,0.802128,36,0.916899,43,0.43997,43
2,0.001,"{0: 1, 1: 5}",adaptive,hinge,0.01,0.527619,3,0.66682,52,0.934946,51,0.91567,25,0.800883,38,0.916116,47,0.436643,49
3,0.001,"{0: 1, 1: 5}",adaptive,hinge,0.05,0.526707,4,0.665041,53,0.935014,50,0.915856,23,0.800027,39,0.916053,49,0.43616,50
4,0.001,"{0: 1, 1: 5}",adaptive,modified_huber,0.05,0.526533,5,0.659684,56,0.936058,46,0.918358,5,0.797871,42,0.916648,44,0.438272,46


## - Stochastic Gradient Descent with mean embedded fastText word vectors as inputs

## - Random over sampling of the minority class

In [None]:
# Load the saved grid-search object for the SGD run with the adaptive learning rate and random oversampling.
# `pickle` is dill here (see the imports cell). Unpickling can execute arbitrary code, so only load files you trust.
full_path =  "/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/pickle_gridsearch_ft/gs_sgd_adaptive_ros_ft.pkl"
with open(full_path, 'rb') as file:
  gs_results = pickle.load(file)

# Pull out the per-candidate CV table plus the best estimator, score, and parameters.
# Only search_results is used below; the top_* names are kept for interactive inspection.
search_results = gs_results.cv_results_
top_estimator = gs_results.best_estimator_
top_score = gs_results.best_score_
top_parameters = gs_results.best_params_

# Flatten the CV table with gs_to_clean_df, sorted ascending by F1 rank (rank 1 first).
sga_adaptive_ros_ft_df = gs_to_clean_df(search_results, sort_by='rank_f1_score')

# Show the top-ranked rows.
sga_adaptive_ros_ft_df.head()

# Still an unacceptably low F1 score after adjusting the parameters and adding oversampling.

Unnamed: 0,alpha,class_weight,eta0,learning_rate,loss,sampling_strategy,mean_f1_score,rank_f1_score,mean_sensitivity,rank_sensitivity,mean_specificity,rank_specificity,mean_AUC_ROC,rank_AUC_ROC,mean_ROC_AUC_Score,rank_ROC_AUC_Score,mean_accuracy,rank_accuracy,mean_precision,rank_precision
0,0.0001,,0.25,adaptive,modified_huber,0.4,0.538605,1,0.688672,65,0.934373,3,0.921457,10,0.811523,53,0.917118,2,0.442308,1
1,0.0001,,0.4,adaptive,modified_huber,0.4,0.537508,2,0.685547,66,0.934609,2,0.921719,4,0.810078,54,0.917118,2,0.442161,3
2,0.0001,,0.1,adaptive,modified_huber,0.4,0.536917,3,0.68332,67,0.934879,1,0.92155,7,0.809099,55,0.917212,1,0.44224,2
3,0.0001,,0.05,adaptive,modified_huber,0.4,0.533003,4,0.679304,68,0.934272,4,0.921313,16,0.806788,56,0.916366,4,0.438654,4
4,0.001,,0.25,adaptive,modified_huber,0.4,0.524758,5,0.673951,70,0.932386,5,0.918227,37,0.803168,61,0.914236,5,0.429764,5


# Multilayer Perceptron with fastText

In [None]:
# Load the saved grid-search object for the multilayer perceptron run.
# `pickle` is dill here (see the imports cell). Unpickling can execute arbitrary code, so only load files you trust.
full_path =  "/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/pickle_gridsearch_ft/gs_mlp_ft.pkl"
with open(full_path, 'rb') as file:
  gs_results = pickle.load(file)

# Pull out the per-candidate CV table plus the best estimator, score, and parameters.
# Only search_results is used below; the top_* names are kept for interactive inspection.
search_results = gs_results.cv_results_
top_estimator = gs_results.best_estimator_
top_score = gs_results.best_score_
top_parameters = gs_results.best_params_

# Flatten the CV table with gs_to_clean_df, sorted ascending by F1 rank (rank 1 first).
mlp_ft_df = gs_to_clean_df(search_results, sort_by='rank_f1_score')

# Show the top-ranked rows.
mlp_ft_df.head()

Unnamed: 0,activation,alpha,hidden_layer_sizes,learning_rate,solver,mean_f1_score,rank_f1_score,mean_sensitivity,rank_sensitivity,mean_specificity,rank_specificity,mean_AUC_ROC,rank_AUC_ROC,mean_ROC_AUC_Score,rank_ROC_AUC_Score,mean_accuracy,rank_accuracy,mean_precision,rank_precision
0,relu,0.0001,300,,,0.621795,1,0.554427,1,0.982751,24,0.934085,11,0.768589,1,0.95267,5,0.709519,24
1,relu,0.0005,300,,,0.612246,2,0.51427,4,0.987468,21,0.934084,12,0.750869,4,0.954236,1,0.758094,9
2,relu,0.0005,200,,,0.609654,3,0.519177,3,0.986356,22,0.935332,9,0.752766,3,0.953547,3,0.749546,13
3,relu,0.0001,200,,,0.606652,4,0.522763,2,0.984975,23,0.934838,10,0.753869,2,0.952514,6,0.729419,17
4,relu,0.0005,100,,,0.598133,5,0.492439,5,0.988512,20,0.935547,7,0.740475,5,0.953673,2,0.766245,7


In [None]:
# Load the saved grid-search object for the multilayer perceptron run with random oversampling.
# `pickle` is dill here (see the imports cell). Unpickling can execute arbitrary code, so only load files you trust.
full_path =  "/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/pickle_gridsearch_ft/gs_mlp_ros_ft.pkl"
with open(full_path, 'rb') as file:
  gs_results = pickle.load(file)

# Pull out the per-candidate CV table plus the best estimator, score, and parameters.
# Only search_results is used below; the top_* names are kept for interactive inspection.
search_results = gs_results.cv_results_
top_estimator = gs_results.best_estimator_
top_score = gs_results.best_score_
top_parameters = gs_results.best_params_

# Flatten the CV table with gs_to_clean_df, sorted ascending by F1 rank (rank 1 first).
mlp_ros_ft_df = gs_to_clean_df(search_results, sort_by='rank_f1_score')

# Show the top-ranked rows.
mlp_ros_ft_df.head()

# Still an unacceptably low F1 score after adjusting the parameters and adding oversampling.

Unnamed: 0,activation,alpha,hidden_layer_sizes,sampling_strategy,mean_f1_score,rank_f1_score,mean_sensitivity,rank_sensitivity,mean_specificity,rank_specificity,mean_AUC_ROC,rank_AUC_ROC,mean_ROC_AUC_Score,rank_ROC_AUC_Score,mean_accuracy,rank_accuracy,mean_precision,rank_precision
0,relu,0.0001,400,0.4,0.624093,1,0.631167,4,0.970354,1,0.931006,1,0.800761,4,0.946531,1,0.622121,1
1,relu,5e-05,400,0.4,0.61716,2,0.658337,3,0.963784,2,0.930937,2,0.81106,3,0.942334,2,0.584626,2
2,relu,5e-05,400,auto,0.61437,3,0.679317,1,0.959707,3,0.928067,4,0.819512,1,0.940016,3,0.561752,3
3,relu,0.0001,400,auto,0.60144,4,0.678896,2,0.956103,4,0.929418,3,0.817499,2,0.936633,4,0.544603,4


## Logistic Regression with fastText mean embedded word vectors as inputs.

In [None]:
# Load the saved grid-search object for the logistic regression run.
# `pickle` is dill here (see the imports cell). Unpickling can execute arbitrary code, so only load files you trust.
full_path =  "/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/pickle_gridsearch_ft/gs_lr_mev_ft.pkl"
with open(full_path, 'rb') as file:
  gs_results = pickle.load(file)

# Pull out the per-candidate CV table plus the best estimator, score, and parameters.
# Only search_results is used below; the top_* names are kept for interactive inspection.
search_results = gs_results.cv_results_
top_estimator = gs_results.best_estimator_
top_score = gs_results.best_score_
top_parameters = gs_results.best_params_

# Flatten the CV table with gs_to_clean_df, sorted ascending by F1 rank (rank 1 first).
lr_ft_df = gs_to_clean_df(search_results, sort_by='rank_f1_score')

# Show the top-ranked rows.
lr_ft_df.head()

Unnamed: 0,C,class_weight,fit_intercept,penalty,solver,mean_f1_score,rank_f1_score,mean_sensitivity,rank_sensitivity,mean_specificity,rank_specificity,mean_AUC_ROC,rank_AUC_ROC,mean_ROC_AUC_Score,rank_ROC_AUC_Score,mean_accuracy,rank_accuracy,mean_precision,rank_precision
0,68.1292,"{0: 1, 1: 5}",False,l1,liblinear,0.547847,1,0.676637,23,0.940067,128,0.924014,16,0.808352,23,0.921566,110,0.460269,105
1,10.0,"{0: 1, 1: 5}",False,l1,liblinear,0.547365,2,0.673513,31,0.940538,121,0.923873,21,0.807025,27,0.921785,104,0.461054,104
2,21.5443,"{0: 1, 1: 5}",False,l1,liblinear,0.547304,3,0.677081,22,0.939797,134,0.924016,15,0.808439,20,0.921347,112,0.459293,111
3,31.6228,"{0: 1, 1: 5}",False,l1,liblinear,0.547279,4,0.677528,21,0.939696,137,0.923996,18,0.808612,19,0.921284,115,0.459066,113
4,6.81292,"{0: 1, 1: 5}",False,l1,liblinear,0.546764,5,0.673513,30,0.940336,124,0.923677,24,0.806924,28,0.921597,106,0.460221,106


## Logistic Regression with fastText mean embedded word vectors as inputs and random oversampling of the minority class.

In [None]:
# Load the saved grid-search object for the logistic regression run stored as gs_lr_ros_mev_ft.pkl
# (presumably the random-oversampling variant, going by the "ros" in the file name; confirm against the training notebook).
# `pickle` is dill here (see the imports cell). Unpickling can execute arbitrary code, so only load files you trust.
full_path =  "/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/pickle_gridsearch_ft/gs_lr_ros_mev_ft.pkl"
with open(full_path, 'rb') as file:
  gs_results = pickle.load(file)

# Pull out the per-candidate CV table plus the best estimator, score, and parameters.
# Only search_results is used below; the top_* names are kept for interactive inspection.
search_results = gs_results.cv_results_
top_estimator = gs_results.best_estimator_
top_score = gs_results.best_score_
top_parameters = gs_results.best_params_

# Flatten the CV table with gs_to_clean_df, sorted ascending by F1 rank (rank 1 first).
lr_ros_ft_df = gs_to_clean_df(search_results, sort_by='rank_f1_score')

# Show the top-ranked rows.
lr_ros_ft_df.head()

Unnamed: 0,C,class_weight,fit_intercept,penalty,solver,sampling_strategy,mean_f1_score,rank_f1_score,mean_sensitivity,rank_sensitivity,mean_specificity,rank_specificity,mean_AUC_ROC,rank_AUC_ROC,mean_ROC_AUC_Score,rank_ROC_AUC_Score,mean_accuracy,rank_accuracy,mean_precision,rank_precision
0,5.99484,"{0: 1, 1: 5}",False,l1,liblinear,0.6,0.321267,1,0.939792,156,0.704545,1,0.919336,49,0.822168,2,0.721065,1,0.19377,1
1,10.0,"{0: 1, 1: 5}",False,l1,liblinear,0.6,0.320538,2,0.938006,159,0.704242,2,0.919904,34,0.821124,4,0.720658,2,0.193316,2
2,5.99484,"{0: 1, 1: 5}",True,l1,liblinear,0.6,0.320323,3,0.942911,150,0.701951,5,0.92294,1,0.822431,1,0.718872,5,0.192958,3
3,2.15443,"{0: 1, 1: 5}",False,l1,liblinear,0.6,0.320023,4,0.939341,158,0.702894,3,0.918614,56,0.821118,5,0.719499,3,0.192894,4
4,10.0,"{0: 1, 1: 5}",True,l1,liblinear,0.6,0.319593,5,0.939343,157,0.702355,4,0.922512,3,0.820849,6,0.718998,4,0.192579,5


## Logistic Regression with fastText mean embedded word vectors as inputs and random oversampling of the minority class (second grid search).

In [None]:
# Load the saved grid-search object for the logistic regression run stored as gs_lr_ros2_mev_ft.pkl
# (presumably a second random-oversampling search, going by the "ros2" in the file name; confirm against the training notebook).
# `pickle` is dill here (see the imports cell). Unpickling can execute arbitrary code, so only load files you trust.
full_path =  "/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/pickle_gridsearch_ft/gs_lr_ros2_mev_ft.pkl"
with open(full_path, 'rb') as file:
  gs_results = pickle.load(file)

# Pull out the per-candidate CV table plus the best estimator, score, and parameters.
# Only search_results is used below; the top_* names are kept for interactive inspection.
search_results = gs_results.cv_results_
top_estimator = gs_results.best_estimator_
top_score = gs_results.best_score_
top_parameters = gs_results.best_params_

# Flatten the CV table with gs_to_clean_df, sorted ascending by F1 rank (rank 1 first).
lr_ros2_ft_df = gs_to_clean_df(search_results, sort_by='rank_f1_score')

# Show the top-ranked rows.
lr_ros2_ft_df.head()

Unnamed: 0,C,class_weight,fit_intercept,penalty,solver,sampling_strategy,mean_f1_score,rank_f1_score,mean_sensitivity,rank_sensitivity,mean_specificity,rank_specificity,mean_AUC_ROC,rank_AUC_ROC,mean_ROC_AUC_Score,rank_ROC_AUC_Score,mean_accuracy,rank_accuracy,mean_precision,rank_precision
0,10,,False,l1,liblinear,0.3,0.558221,1,0.626233,55,0.953374,4,0.92283,25,0.789803,54,0.930399,2,0.503623,2
1,5,,False,l1,liblinear,0.3,0.558133,2,0.624895,56,0.95361,2,0.923145,19,0.789252,55,0.930525,1,0.504352,1
2,2,,False,l1,liblinear,0.3,0.555209,3,0.619988,58,0.953677,1,0.921328,38,0.786833,57,0.930243,3,0.50283,3
3,10,,True,l1,liblinear,0.3,0.554012,4,0.642735,51,0.948826,7,0.924017,8,0.79578,51,0.92733,6,0.486914,6
4,5,,True,l1,liblinear,0.3,0.55243,5,0.637834,52,0.949298,6,0.923561,14,0.793566,52,0.927424,5,0.487344,5
