<a href="https://colab.research.google.com/github/BradenAnderson/Twitter-Sentiment-Analysis/blob/main/04_Modeling_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## This notebook displays the gridsearch results for all of the models created in the 04 series. 

In [None]:
# Colab only: mount Google Drive at /content/drive so the pickled grid-search
# results referenced by the cells below can be read.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pickle
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV, train_test_split, cross_validate, cross_val_predict
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer
from sklearn.compose import ColumnTransformer

from sklearn.metrics import precision_score, recall_score, accuracy_score, SCORERS, multilabel_confusion_matrix, make_scorer, roc_curve, roc_auc_score, f1_score
pd.set_option('display.max_rows', 1000)

In [None]:
# Helper that streamlines the process of converting a grid-search output into a
# pandas DataFrame with the columns formatted for easy side-by-side comparison.
def gs_to_clean_df(search_results, keep_split = False, keep_std = False, keep_time = False, keep_params = False, sort_by=None):
  """Turn a ``cv_results_``-style mapping into a tidy, optionally sorted DataFrame.

  Columns are filtered and renamed as follows:

  * ``split*`` columns are dropped unless ``keep_split`` is truthy.
  * Any column containing ``time`` is dropped unless ``keep_time`` is truthy
    (this test runs before the ``std`` test, so ``std_fit_time`` is governed
    by ``keep_time``).
  * Remaining ``std*`` columns are dropped unless ``keep_std`` is truthy.
  * The ``params`` column is dropped unless ``keep_params`` is truthy.
  * ``param_<step>__<a>__<b>`` is renamed to ``<a>_<b>`` (the leading pipeline
    step is removed); a plain ``param_<name>`` becomes ``<name>``.
  * ``_test`` is removed from metric columns
    (``mean_test_f1_score`` -> ``mean_f1_score``).
  * Every other kept column retains its original name.

  Args:
    search_results: Mapping (or anything ``pd.DataFrame`` accepts) of column
      name -> values, e.g. the ``cv_results_`` attribute of a fitted search.
    keep_split: Keep the per-fold ``split*`` columns.
    keep_std: Keep the ``std*`` columns.
    keep_time: Keep the fit/score timing columns.
    keep_params: Keep the raw ``params`` dict column.
    sort_by: Name of a *renamed* column starting with ``rank`` or ``mean`` to
      sort by, ascending (so ``rank_*`` puts the best row first). Any other
      value, including ``None``, leaves the row order untouched.

  Returns:
    A new ``pd.DataFrame``; the input is not modified.
  """
  gs_results_df = pd.DataFrame(search_results)

  throw_away_columns = []
  renaming_dict = {}
  valid_metrics = []

  for column_name in gs_results_df.columns:

    # Decide whether the column survives. The order of these tests matters:
    # 'time' is checked before 'std' so that 'std_fit_time' follows keep_time.
    if column_name.startswith('split'):
      keep = keep_split
    elif 'time' in column_name:
      keep = keep_time
    elif column_name.startswith('std'):
      keep = keep_std
    elif column_name == 'params':
      keep = keep_params
    else:
      keep = True

    if not keep:
      throw_away_columns.append(column_name)
      continue

    if column_name.startswith('param') and column_name != 'params':
      # Drop the leading '<param_step>' component and join the rest with '_'.
      name = '_'.join(column_name.split('__')[1:]).lstrip('_')
      if not name and column_name.startswith('param_'):
        # Non-pipeline parameter such as 'param_alpha': there is no '__' to
        # split on, so just strip the 'param_' prefix.
        name = column_name[len('param_'):]
    elif '_test' in column_name:
      name = column_name.replace('_test', '')
    else:
      name = column_name

    # Never rename a column to the empty string; previously kept columns such
    # as 'params' or 'mean_fit_time' all collapsed to '' and became ambiguous.
    if not name:
      name = column_name

    renaming_dict[column_name] = name

    if name.startswith('rank') or name.startswith('mean'):
      valid_metrics.append(name)

  gs_results_df = gs_results_df.drop(labels=throw_away_columns, axis='columns')
  gs_results_df = gs_results_df.rename(columns=renaming_dict)

  if sort_by in valid_metrics:
    gs_results_df = gs_results_df.sort_values(by=sort_by, ignore_index=True)

  return gs_results_df


# Multinomial Naive Bayes 

### TFIDF Vectorizer

### The only input features are the tokenized tweet text (no vader scores).

### No handling of imbalanced classes.

### Note: This will be used as the 'baseline' model.

In [None]:
# Load the pickled search object for the baseline Multinomial Naive Bayes run.
# NOTE: pickle.load can execute arbitrary code while unpickling; only open
# pickles that this project wrote.
full_path = '/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/pickle_gridsearch/naive_bayes_gs1.pkl'
with open(full_path, 'rb') as file:
  gs_results = pickle.load(file)

# Pull out the search attributes. The top_* values are not referenced by the
# cells visible here; presumably they are kept for interactive inspection.
search_results = gs_results.cv_results_
top_estimator = gs_results.best_estimator_
top_score = gs_results.best_score_
top_parameters = gs_results.best_params_

# Flatten cv_results_ into a tidy frame sorted ascending by F1 rank (rank 1 first).
naive_bayes_baseline_df = gs_to_clean_df(search_results, sort_by='rank_f1_score')

# Preview the five top-ranked configurations.
naive_bayes_baseline_df.head()

Unnamed: 0,fit_prior,Tfidf_Vect_analyzer,Tfidf_Vect_max_df,Tfidf_Vect_min_df,Tfidf_Vect_ngram_range,Tfidf_Vect_stop_words,mean_f1_score,rank_f1_score,mean_sensitivity,rank_sensitivity,mean_specificity,rank_specificity,mean_AUC_ROC,rank_AUC_ROC,mean_ROC_AUC_Score,rank_ROC_AUC_Score,mean_accuracy,rank_accuracy,mean_precision,rank_precision
0,False,word,0.9,1,"(1, 1)",english,0.62288,1,0.573159,221,0.97982,256,0.922011,81,0.776489,121,0.951261,31,0.682275,171
1,False,word,1.0,1,"(1, 1)",english,0.62288,1,0.573159,221,0.97982,256,0.922011,81,0.776489,121,0.951261,31,0.682275,171
2,False,word,0.8,1,"(1, 1)",english,0.62288,1,0.573159,221,0.97982,256,0.922011,81,0.776489,121,0.951261,31,0.682275,171
3,False,word,0.95,1,"(1, 1)",english,0.62288,1,0.573159,221,0.97982,256,0.922011,81,0.776489,121,0.951261,31,0.682275,171
4,False,word,0.7,1,"(1, 1)",english,0.62288,1,0.573159,221,0.97982,256,0.922011,81,0.776489,121,0.951261,31,0.682275,171


# Multinomial Naive Bayes 

### TFIDF Vectorizer

### The only input features are the tokenized tweet text (no vader scores).

### Class imbalance addressed by random oversampling the minority class.

### Note: Same as the baseline with the addition of random oversampling.

In [None]:
# Load the pickled search for Multinomial NB with random oversampling.
full_path = '/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/pickle_gridsearch/naive_bayes_ros_gs1.pkl'
with open(full_path, 'rb') as file:
  gs_results = pickle.load(file)

# These names are reassigned by every load cell, so they always describe the
# most recently loaded search.
search_results = gs_results.cv_results_
top_estimator = gs_results.best_estimator_
top_score = gs_results.best_score_
top_parameters = gs_results.best_params_

# Tidy results table, sorted ascending by F1 rank.
naive_bayes_ros_df = gs_to_clean_df(search_results, sort_by='rank_f1_score')

In [None]:
# Preview the five top-ranked configurations (frame is sorted by rank_f1_score).
naive_bayes_ros_df.head()

Unnamed: 0,fit_prior,sampling_strategy,Tfidf_Vect_analyzer,Tfidf_Vect_max_df,Tfidf_Vect_min_df,Tfidf_Vect_ngram_range,Tfidf_Vect_stop_words,mean_f1_score,rank_f1_score,mean_sensitivity,rank_sensitivity,mean_specificity,rank_specificity,mean_AUC_ROC,rank_AUC_ROC,mean_ROC_AUC_Score,rank_ROC_AUC_Score,mean_accuracy,rank_accuracy,mean_precision,rank_precision
0,True,0.4,word,1.0,1,"(1, 1)",english,0.72787,1,0.691345,1007,0.984267,251,0.957343,88,0.837806,310,0.963696,11,0.768601,17
1,True,0.4,word,1.0,1,"(1, 1)",,0.721921,2,0.685544,1008,0.983863,254,0.957289,89,0.834704,333,0.962913,13,0.762604,21
2,True,0.4,word,0.8,1,"(1, 1)",english,0.721914,3,0.685099,1009,0.98393,253,0.957277,90,0.834514,335,0.962944,12,0.763251,20
3,True,0.4,word,0.9,1,"(1, 1)",english,0.720176,4,0.679748,1024,0.984301,250,0.956795,99,0.832024,375,0.962913,13,0.766245,18
4,True,0.4,word,0.9,1,"(1, 1)",,0.719974,5,0.682871,1017,0.983829,255,0.957409,86,0.83335,357,0.962694,15,0.761598,22


# Multinomial Naive Bayes 

### TFIDF Vectorizer

### Inputs include vader positive, negative and compound sentiment scores.

### Class imbalance addressed by random oversampling the minority class.

### Note: Same as above with the addition of vader scores features.

### Note: The compound sentiment score was shifted to fall between 0 - 2 (normally the compound score is between -1 and +1). This was done because Naive Bayes cannot take negative valued inputs.

In [None]:
# Load the pickled search for Multinomial NB with VADER score features.
# NOTE(review): the file name ('ros_v') suggests this run also used random
# oversampling -- confirm against the notebook that produced the pickle.
full_path = '/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/pickle_gridsearch/naive_bayes_ros_v_gs1.pkl'
with open(full_path, 'rb') as file:
  gs_results = pickle.load(file)

# Attributes of the search just loaded (these names are reused by every load cell).
search_results = gs_results.cv_results_
top_estimator = gs_results.best_estimator_
top_score = gs_results.best_score_
top_parameters = gs_results.best_params_

# Tidy results table, sorted ascending by F1 rank.
naive_bayes_ros_v_df = gs_to_clean_df(search_results, sort_by='rank_f1_score')

In [None]:
# Preview the five top-ranked configurations (frame is sorted by rank_f1_score).
naive_bayes_ros_v_df.head()

Unnamed: 0,fit_prior,sampling_strategy,Tfidf_Vect_analyzer,Tfidf_Vect_max_df,Tfidf_Vect_min_df,Tfidf_Vect_ngram_range,Tfidf_Vect_stop_words,mean_f1_score,rank_f1_score,mean_sensitivity,rank_sensitivity,mean_specificity,rank_specificity,mean_AUC_ROC,rank_AUC_ROC,mean_ROC_AUC_Score,rank_ROC_AUC_Score,mean_accuracy,rank_accuracy,mean_precision,rank_precision
0,True,0.6,word,0.8,1,"(1, 2)",,0.722292,1,0.699375,935,0.982077,226,0.95723,69,0.840726,274,0.962224,24,0.746846,32
1,True,0.6,word,0.7,1,"(1, 2)",english,0.721556,2,0.698483,947,0.982044,228,0.957379,66,0.840263,276,0.96213,26,0.746339,33
2,True,0.6,word,0.9,1,"(1, 2)",english,0.720772,3,0.698935,940,0.981808,232,0.957287,68,0.840371,275,0.961942,32,0.744285,36
3,True,0.6,word,0.8,1,"(1, 2)",english,0.720659,4,0.695365,967,0.982279,221,0.957433,63,0.838822,290,0.96213,28,0.747924,31
4,True,0.6,word,0.95,1,"(1, 2)",english,0.720643,5,0.697149,955,0.982044,229,0.957056,74,0.839596,281,0.962036,30,0.745866,34


# Multinomial Naive Bayes

(tuned version of two above)

In [None]:
# Second (tuned) random-oversampling search for Multinomial NB.
# Result only very marginally improved over the first random-oversampling search.

full_path = '/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/pickle_gridsearch/naive_bayes_ros_gs2.pkl'
with open(full_path, 'rb') as file:
  gs_results = pickle.load(file)

# Attributes of the search just loaded (these names are reused by every load cell).
search_results = gs_results.cv_results_
top_estimator = gs_results.best_estimator_
top_score = gs_results.best_score_
top_parameters = gs_results.best_params_

# Tidy results table, sorted ascending by F1 rank.
naive_bayes_ros2_df = gs_to_clean_df(search_results, sort_by='rank_f1_score')

In [None]:
# Preview the five top-ranked configurations (frame is sorted by rank_f1_score).
naive_bayes_ros2_df.head()

Unnamed: 0,alpha,fit_prior,sampling_strategy,Tfidf_Vect_analyzer,Tfidf_Vect_max_df,Tfidf_Vect_min_df,Tfidf_Vect_ngram_range,Tfidf_Vect_stop_words,mean_f1_score,rank_f1_score,mean_sensitivity,rank_sensitivity,mean_specificity,rank_specificity,mean_AUC_ROC,rank_AUC_ROC,mean_ROC_AUC_Score,rank_ROC_AUC_Score,mean_accuracy,rank_accuracy,mean_precision,rank_precision
0,0.5,True,0.4,word,1.0,1,"(1, 2)",english,0.732028,1,0.671721,339,0.987636,144,0.957356,49,0.829678,325,0.96545,22,0.804357,135
1,0.5,True,0.4,word,0.9,1,"(1, 2)",english,0.731407,2,0.668603,345,0.987905,136,0.956863,74,0.828254,331,0.965482,21,0.807659,128
2,0.5,True,0.4,word,0.95,1,"(1, 2)",,0.730081,3,0.663693,369,0.98831,129,0.956733,80,0.826001,354,0.965513,20,0.811518,124
3,1.0,True,0.5,word,0.9,1,"(1, 2)",english,0.729415,4,0.695371,262,0.984031,233,0.956865,72,0.839701,234,0.963759,72,0.767178,198
4,0.5,True,0.4,word,0.85,1,"(1, 2)",english,0.729227,5,0.665926,358,0.987872,137,0.957241,55,0.826899,346,0.965262,26,0.805972,130


# Multinomial Naive Bayes

Now using the synthetic minority oversampling technique (SMOTE) instead of random oversampling.

In [None]:
# Load the pickled search for Multinomial NB that uses SMOTE oversampling.
full_path = '/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/pickle_gridsearch/naive_bayes_smote_gs1.pkl'
with open(full_path, 'rb') as file:
  gs_results = pickle.load(file)

# Attributes of the search just loaded (these names are reused by every load cell).
search_results = gs_results.cv_results_
top_estimator = gs_results.best_estimator_
top_score = gs_results.best_score_
top_parameters = gs_results.best_params_

# Tidy results table, sorted ascending by F1 rank.
naive_bayes_smote_df = gs_to_clean_df(search_results, sort_by='rank_f1_score')

In [None]:
# Preview the five top-ranked configurations (frame is sorted by rank_f1_score).
naive_bayes_smote_df.head()

Unnamed: 0,alpha,fit_prior,k_neighbors,sampling_strategy,Tfidf_Vect_analyzer,Tfidf_Vect_max_df,Tfidf_Vect_min_df,Tfidf_Vect_ngram_range,Tfidf_Vect_stop_words,mean_f1_score,rank_f1_score,mean_sensitivity,rank_sensitivity,mean_specificity,rank_specificity,mean_AUC_ROC,rank_AUC_ROC,mean_ROC_AUC_Score,rank_ROC_AUC_Score,mean_accuracy,rank_accuracy,mean_precision,rank_precision
0,0.5,True,5,0.4,word,1.0,1,"(1, 2)",english,0.728689,1,0.665928,211,0.987737,41,0.957279,164,0.826833,211,0.965137,17,0.804999,40
1,0.5,True,1,0.4,word,1.0,1,"(1, 2)",english,0.728094,2,0.663249,216,0.988007,38,0.956992,174,0.825628,215,0.9652,14,0.807119,37
2,0.5,True,3,0.4,word,1.0,1,"(1, 2)",english,0.726902,3,0.661464,219,0.988007,39,0.957151,168,0.824735,218,0.965074,19,0.807016,38
3,0.5,True,1,0.4,word,0.85,1,"(1, 2)",english,0.726085,4,0.660126,222,0.98804,37,0.957126,170,0.824083,220,0.965012,20,0.806828,39
4,0.5,True,3,0.4,word,0.85,1,"(1, 2)",english,0.725603,5,0.665478,212,0.987232,47,0.957248,167,0.826355,212,0.964636,30,0.797918,44


# Multinomial Naive Bayes

Using SMOTE again (tuned version of above). 

In the previous search the best model used k_neighbors=5 for the over sampling technique, which was the highest value we tried. 

In this search we extend k_neighbors to include 7 as an option, to investigate whether the value should be increased.

In [None]:
# Load the second SMOTE search for Multinomial NB (search space includes k_neighbors = 7).
full_path = '/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/pickle_gridsearch/naive_bayes_smote_gs2.pkl'
with open(full_path, 'rb') as file:
  gs_results = pickle.load(file)

# Attributes of the search just loaded (these names are reused by every load cell).
search_results = gs_results.cv_results_
top_estimator = gs_results.best_estimator_
top_score = gs_results.best_score_
top_parameters = gs_results.best_params_

# Tidy results table, sorted ascending by F1 rank.
naive_bayes_smote_df2 = gs_to_clean_df(search_results, sort_by='rank_f1_score')

In [None]:
# Preview the five top-ranked configurations (frame is sorted by rank_f1_score).
naive_bayes_smote_df2.head()

Unnamed: 0,alpha,fit_prior,k_neighbors,sampling_strategy,Tfidf_Vect_analyzer,Tfidf_Vect_max_df,Tfidf_Vect_min_df,Tfidf_Vect_ngram_range,Tfidf_Vect_stop_words,mean_f1_score,rank_f1_score,mean_sensitivity,rank_sensitivity,mean_specificity,rank_specificity,mean_AUC_ROC,rank_AUC_ROC,mean_ROC_AUC_Score,rank_ROC_AUC_Score,mean_accuracy,rank_accuracy,mean_precision,rank_precision
0,0.5,True,7,0.4,word,1,1,"(1, 2)",english,0.731959,1,0.669495,5,0.987906,18,0.957417,5,0.8287,5,0.965544,10,0.807671,18
1,0.4,True,7,0.4,word,1,1,"(1, 2)",english,0.731679,2,0.683761,1,0.985985,23,0.957914,3,0.834873,1,0.964761,20,0.787194,22
2,0.4,True,1,0.4,word,1,1,"(1, 2)",english,0.729259,3,0.677967,2,0.986288,21,0.957694,4,0.832128,2,0.964636,21,0.78904,21
3,0.5,True,1,0.4,word,1,1,"(1, 2)",english,0.728942,4,0.664142,8,0.98804,17,0.956934,7,0.826091,8,0.965294,13,0.808142,17
4,0.6,True,5,0.4,word,1,1,"(1, 2)",english,0.72782,5,0.654329,9,0.989118,14,0.955938,10,0.821724,9,0.965607,9,0.820351,14


# Multinomial Naive Bayes

Using SMOTE again (tuned version of above). 

Added k_neighbors = 9 to the search space.

In [None]:
# Load the third SMOTE search for Multinomial NB (search space includes k_neighbors = 9).
full_path = '/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/pickle_gridsearch/naive_bayes_smote_gs3.pkl'
with open(full_path, 'rb') as file:
  gs_results = pickle.load(file)

# Attributes of the search just loaded (these names are reused by every load cell).
search_results = gs_results.cv_results_
top_estimator = gs_results.best_estimator_
top_score = gs_results.best_score_
top_parameters = gs_results.best_params_

# Tidy results table, sorted ascending by F1 rank.
naive_bayes_smote_df3 = gs_to_clean_df(search_results, sort_by='rank_f1_score')

In [None]:
# Preview the five top-ranked configurations (frame is sorted by rank_f1_score).
naive_bayes_smote_df3.head()

Unnamed: 0,alpha,fit_prior,k_neighbors,sampling_strategy,Tfidf_Vect_analyzer,Tfidf_Vect_max_df,Tfidf_Vect_min_df,Tfidf_Vect_ngram_range,Tfidf_Vect_stop_words,mean_f1_score,rank_f1_score,mean_sensitivity,rank_sensitivity,mean_specificity,rank_specificity,mean_AUC_ROC,rank_AUC_ROC,mean_ROC_AUC_Score,rank_ROC_AUC_Score,mean_accuracy,rank_accuracy,mean_precision,rank_precision
0,0.3,True,7,0.4,word,1,1,"(1, 2)",english,0.730814,1,0.698933,1,0.983829,8,0.958613,2,0.841381,1,0.963821,8,0.765867,8
1,0.4,True,9,0.4,word,1,1,"(1, 2)",english,0.729426,2,0.681984,3,0.985783,6,0.958086,3,0.833883,3,0.964448,5,0.78415,6
2,0.3,True,9,0.4,word,1,1,"(1, 2)",english,0.729182,3,0.692684,2,0.984334,7,0.959165,1,0.838509,2,0.963853,7,0.769843,7
3,0.3,True,7,0.3,word,1,1,"(1, 2)",english,0.727925,4,0.635588,5,0.991611,4,0.956847,6,0.8136,5,0.966609,2,0.852028,4
4,0.3,True,9,0.3,word,1,1,"(1, 2)",english,0.72788,5,0.634256,6,0.99178,3,0.957179,5,0.813018,6,0.966672,1,0.854259,3


# Complement Naive Bayes.

### TFIDF Vectorizer.

### Only input features are the tokenized tweet text (no vader).

### Class imbalance not addressed.

In [None]:
# Load the pickled search for Complement NB (text features only, no imbalance handling).
full_path = '/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/pickle_gridsearch/comp_bayes_gs1.pkl'
with open(full_path, 'rb') as file:
  gs_results = pickle.load(file)

# Attributes of the search just loaded (these names are reused by every load cell).
search_results = gs_results.cv_results_
top_estimator = gs_results.best_estimator_
top_score = gs_results.best_score_
top_parameters = gs_results.best_params_

# Tidy results table, sorted ascending by F1 rank.
comp_naive_bayes_df = gs_to_clean_df(search_results, sort_by='rank_f1_score')

In [None]:
# Preview the five top-ranked configurations (frame is sorted by rank_f1_score).
comp_naive_bayes_df.head()

Unnamed: 0,alpha,fit_prior,Tfidf_Vect_analyzer,Tfidf_Vect_max_df,Tfidf_Vect_min_df,Tfidf_Vect_ngram_range,Tfidf_Vect_stop_words,mean_f1_score,rank_f1_score,mean_sensitivity,rank_sensitivity,mean_specificity,rank_specificity,mean_AUC_ROC,rank_AUC_ROC,mean_ROC_AUC_Score,rank_ROC_AUC_Score,mean_accuracy,rank_accuracy,mean_precision,rank_precision
0,0.8,True,word,1.0,1,"(1, 1)",english,0.630882,1,0.607944,1321,0.975878,101,0.927536,241,0.791911,553,0.950039,101,0.655916,101
1,0.8,False,word,1.0,1,"(1, 1)",english,0.630882,1,0.607944,1321,0.975878,101,0.927536,241,0.791911,553,0.950039,101,0.655916,101
2,0.8,True,word,0.7,1,"(1, 1)",english,0.630882,1,0.607944,1321,0.975878,101,0.927536,241,0.791911,553,0.950039,101,0.655916,101
3,0.8,True,word,0.95,1,"(1, 1)",english,0.630882,1,0.607944,1321,0.975878,101,0.927536,241,0.791911,553,0.950039,101,0.655916,101
4,0.8,False,word,0.7,1,"(1, 1)",english,0.630882,1,0.607944,1321,0.975878,101,0.927536,241,0.791911,553,0.950039,101,0.655916,101


# Complement Naive Bayes.

### TFIDF Vectorizer.

### Inputs include vader positive, negative and compound sentiment scores.

### Class imbalance not addressed.

### Note: Same as above with the addition of vader scores features.

### Note: The compound sentiment score was shifted to fall between 0 - 2 (normally the compound score is between -1 and +1). 

In [None]:
# Load the pickled search for Complement NB with VADER score features.
full_path = '/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/pickle_gridsearch/comp_bayes_v_gs1.pkl'
with open(full_path, 'rb') as file:
  gs_results = pickle.load(file)

# Attributes of the search just loaded (these names are reused by every load cell).
search_results = gs_results.cv_results_
top_estimator = gs_results.best_estimator_
top_score = gs_results.best_score_
top_parameters = gs_results.best_params_

# Tidy results table, sorted ascending by F1 rank.
comp_naive_bayes_v_df = gs_to_clean_df(search_results, sort_by='rank_f1_score')

In [None]:
# Preview the five top-ranked configurations (frame is sorted by rank_f1_score).
comp_naive_bayes_v_df.head()

Unnamed: 0,alpha,fit_prior,Tfidf_Vect_analyzer,Tfidf_Vect_max_df,Tfidf_Vect_min_df,Tfidf_Vect_ngram_range,Tfidf_Vect_stop_words,mean_f1_score,rank_f1_score,mean_sensitivity,rank_sensitivity,mean_specificity,rank_specificity,mean_AUC_ROC,rank_AUC_ROC,mean_ROC_AUC_Score,rank_ROC_AUC_Score,mean_accuracy,rank_accuracy,mean_precision,rank_precision
0,0.8,False,word,1.0,1,"(1, 1)",english,0.574745,1,0.424179,1321,0.996092,111,0.920872,241,0.710135,841,0.955928,1,0.89118,111
1,0.8,True,word,0.8,1,"(1, 1)",english,0.574745,1,0.424179,1321,0.996092,111,0.920872,241,0.710135,841,0.955928,1,0.89118,111
2,0.8,False,word,0.95,1,"(1, 1)",english,0.574745,1,0.424179,1321,0.996092,111,0.920872,241,0.710135,841,0.955928,1,0.89118,111
3,0.8,True,word,0.7,1,"(1, 1)",english,0.574745,1,0.424179,1321,0.996092,111,0.920872,241,0.710135,841,0.955928,1,0.89118,111
4,0.8,True,word,0.95,1,"(1, 1)",english,0.574745,1,0.424179,1321,0.996092,111,0.920872,241,0.710135,841,0.955928,1,0.89118,111


# Complement Naive Bayes.

### TFIDF Vectorizer.

### The only input features are the tokenized tweet text (no vader scores).

### Class imbalance addressed by random oversampling the minority class.

In [None]:
# Load the pickled search for Complement NB with random oversampling.
full_path = '/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/pickle_gridsearch/comp_bayes_ros_gs1.pkl'
with open(full_path, 'rb') as file:
  gs_results = pickle.load(file)

# Attributes of the search just loaded (these names are reused by every load cell).
search_results = gs_results.cv_results_
top_estimator = gs_results.best_estimator_
top_score = gs_results.best_score_
top_parameters = gs_results.best_params_

# Tidy results table, sorted ascending by F1 rank.
comp_naive_bayes_ros_df = gs_to_clean_df(search_results, sort_by='rank_f1_score')

In [None]:
# Preview the five top-ranked configurations (frame is sorted by rank_f1_score).
comp_naive_bayes_ros_df.head()

Unnamed: 0,alpha,fit_prior,Tfidf_Vect_analyzer,Tfidf_Vect_max_df,Tfidf_Vect_min_df,Tfidf_Vect_ngram_range,Tfidf_Vect_stop_words,mean_f1_score,rank_f1_score,mean_sensitivity,rank_sensitivity,mean_specificity,rank_specificity,mean_AUC_ROC,rank_AUC_ROC,mean_ROC_AUC_Score,rank_ROC_AUC_Score,mean_accuracy,rank_accuracy,mean_precision,rank_precision
0,0.8,True,word,0.7,1,"(1, 2)",english,0.61085,1,0.853259,40,0.928848,2,0.961948,3,0.891054,3,0.92354,1,0.476008,1
1,0.8,False,word,0.8,1,"(1, 2)",english,0.609984,2,0.851475,48,0.928882,1,0.961482,28,0.890178,13,0.923446,2,0.475476,2
2,0.8,False,word,0.9,1,"(1, 2)",english,0.609721,3,0.852813,44,0.928545,6,0.961889,6,0.890679,7,0.923226,3,0.47476,3
3,0.8,False,word,0.7,1,"(1, 2)",english,0.608704,4,0.850137,53,0.92868,3,0.962007,1,0.889409,28,0.923164,4,0.474327,4
4,0.8,True,word,0.9,1,"(1, 2)",english,0.608497,5,0.849685,58,0.928646,4,0.961756,14,0.889165,34,0.923101,5,0.474254,5


# SGD Classifier

### TFIDF Vectorizer

### Inputs include vader positive, negative and compound sentiment scores.

### Class imbalance not addressed. 

In [None]:
# Load the pickled search for the SGD classifier with VADER score features (no imbalance handling).
full_path = '/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/pickle_gridsearch/SGD_v_gs1.pkl'
with open(full_path, 'rb') as file:
  gs_results = pickle.load(file)

# Attributes of the search just loaded (these names are reused by every load cell).
search_results = gs_results.cv_results_
top_estimator = gs_results.best_estimator_
top_score = gs_results.best_score_
top_parameters = gs_results.best_params_

# Tidy results table, sorted ascending by F1 rank.
SGD_clf_v_df = gs_to_clean_df(search_results, sort_by='rank_f1_score')

In [None]:
# Observations: 
#
# 1) All of the top five models used ngram_range = (1,2). (Could consider expanding the search space further to also allow (1,3).)
# 2) All of the top ranking models used the perceptron loss function and the 'word' analyzer type. (Can carry these forward as the only options and reduce future search spaces.)
# 3) The optimal value for max_df (when a word should start getting ignored because it is too frequent) is still unclear.
# 4) The optimal value for min_df (when a word should start getting ignored because it does not occur frequently enough) is likely 1.
# 5) The optimal value for the stochastic gradient descent regularization term (alpha) is also still unclear, but it seems likely the default (0.0001) is not the best.

In [None]:
# Preview the five top-ranked configurations (frame is sorted by rank_f1_score).
SGD_clf_v_df.head()

Unnamed: 0,alpha,loss,Tfidf_Vect_analyzer,Tfidf_Vect_max_df,Tfidf_Vect_min_df,Tfidf_Vect_ngram_range,Tfidf_Vect_stop_words,mean_f1_score,rank_f1_score,mean_sensitivity,rank_sensitivity,mean_specificity,rank_specificity,mean_AUC_ROC,rank_AUC_ROC,mean_ROC_AUC_Score,rank_ROC_AUC_Score,mean_accuracy,rank_accuracy,mean_precision,rank_precision
0,0.0001,perceptron,word,1.0,1,"(1, 2)",,0.696988,1,0.648524,19,0.984132,855,0.946288,2,0.816328,12,0.960564,5,0.759688,280
1,0.001,perceptron,word,0.9,1,"(1, 2)",,0.692771,2,0.624884,45,0.986524,815,0.942896,24,0.805704,22,0.961128,2,0.780095,266
2,0.0001,perceptron,word,0.8,1,"(1, 2)",english,0.691609,3,0.658757,15,0.981201,896,0.945939,5,0.819979,9,0.958559,20,0.747896,293
3,0.001,perceptron,word,0.95,1,"(1, 2)",,0.691287,4,0.647601,21,0.982953,872,0.943202,19,0.815277,13,0.959405,12,0.748016,292
4,0.001,perceptron,word,0.7,1,"(1, 2)",english,0.689822,5,0.667243,12,0.979281,929,0.940731,31,0.823262,7,0.957369,31,0.731564,314


# SGD Classifier

### TFIDF Vectorizer

### Inputs include vader positive, negative and compound sentiment scores.
 
### Class imbalance addressed using random over sampling.

### Note: Same as above with the addition of random oversampling on the minority class.

In [None]:
# Load the pickled search for the SGD classifier with VADER features and random oversampling.
full_path = '/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/pickle_gridsearch/SGD_ros_v_gs1.pkl'
with open(full_path, 'rb') as file:
  gs_results = pickle.load(file)

# Attributes of the search just loaded (these names are reused by every load cell).
search_results = gs_results.cv_results_
top_estimator = gs_results.best_estimator_
top_score = gs_results.best_score_
top_parameters = gs_results.best_params_

# Tidy results table, sorted ascending by F1 rank.
SGD_clf_ros_v_df = gs_to_clean_df(search_results, sort_by='rank_f1_score')

In [None]:
# Preview the five top-ranked configurations (frame is sorted by rank_f1_score).
SGD_clf_ros_v_df.head()
#
# Observations:
# 1) Adding the random oversampler increased the mean f1_score (our metric of interest)
#    relative to the same search without oversampling.
#

Unnamed: 0,alpha,loss,sampling_strategy,Tfidf_Vect_analyzer,Tfidf_Vect_max_df,Tfidf_Vect_min_df,Tfidf_Vect_ngram_range,Tfidf_Vect_stop_words,mean_f1_score,rank_f1_score,mean_sensitivity,rank_sensitivity,mean_specificity,rank_specificity,mean_AUC_ROC,rank_AUC_ROC,mean_ROC_AUC_Score,rank_ROC_AUC_Score,mean_accuracy,rank_accuracy,mean_precision,rank_precision
0,0.0001,perceptron,auto,word,0.8,1,"(1, 2)",english,0.702967,1,0.681976,15,0.980561,10,0.948592,8,0.831269,10,0.959593,7,0.72809,13
1,0.0001,perceptron,0.4,word,0.8,1,"(1, 2)",english,0.7016,2,0.692672,9,0.978439,15,0.950883,1,0.835555,6,0.958371,11,0.720537,17
2,0.0005,perceptron,0.4,word,1.0,1,"(1, 2)",english,0.701492,3,0.659666,32,0.983391,6,0.941741,18,0.821529,28,0.960658,3,0.751482,6
3,0.0001,perceptron,0.6,word,0.7,1,"(1, 2)",english,0.700354,4,0.654762,37,0.983728,5,0.946053,13,0.819245,33,0.960626,4,0.757619,4
4,0.0001,perceptron,0.4,word,0.9,1,"(1, 2)",english,0.698542,5,0.646343,41,0.984672,3,0.950214,2,0.815507,39,0.960908,2,0.766239,3


# SGD Classifier

### TFIDF Vectorizer

### Inputs are only the text. (No vader scores).

### Class imbalance addressed using random over sampling.

### Note: Same as the search above with vader scores removed as features.

In [None]:
# Load the pickled search for the SGD classifier with random oversampling and text-only inputs (no VADER).
full_path = '/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/pickle_gridsearch/SGD_ros_nv_gs1.pkl'
with open(full_path, 'rb') as file:
  gs_results = pickle.load(file)

# Attributes of the search just loaded (these names are reused by every load cell).
search_results = gs_results.cv_results_
top_estimator = gs_results.best_estimator_
top_score = gs_results.best_score_
top_parameters = gs_results.best_params_

# Tidy results table, sorted ascending by F1 rank.
SGD_clf_ros_df = gs_to_clean_df(search_results, sort_by='rank_f1_score')

In [None]:
# Preview the five top-ranked configurations (frame is sorted by rank_f1_score).
SGD_clf_ros_df.head()
#
# Observations:
# 1) The model performs better when not using the vader sentiment scores as inputs; will discontinue using these on future models.
#

Unnamed: 0,alpha,loss,sampling_strategy,Tfidf_Vect_analyzer,Tfidf_Vect_max_df,Tfidf_Vect_min_df,Tfidf_Vect_ngram_range,Tfidf_Vect_stop_words,mean_f1_score,rank_f1_score,mean_sensitivity,rank_sensitivity,mean_specificity,rank_specificity,mean_AUC_ROC,rank_AUC_ROC,mean_ROC_AUC_Score,rank_ROC_AUC_Score,mean_accuracy,rank_accuracy,mean_precision,rank_precision
0,0.0001,perceptron,auto,word,1.0,1,"(1, 2)",english,0.720375,1,0.660121,18,0.986895,2,0.930869,11,0.823508,16,0.963947,1,0.793966,2
1,0.0001,perceptron,0.6,word,0.7,1,"(1, 2)",english,0.717224,2,0.673523,7,0.984469,9,0.929614,16,0.828996,5,0.962631,6,0.769633,8
2,0.0005,perceptron,0.6,word,0.7,1,"(1, 2)",english,0.716914,3,0.654793,28,0.986996,1,0.927773,21,0.820895,20,0.963665,2,0.794627,1
3,0.0005,perceptron,0.4,word,0.9,1,"(1, 2)",english,0.715681,4,0.654331,29,0.98676,4,0.92804,19,0.820546,22,0.963414,3,0.792379,3
4,0.0001,perceptron,0.6,word,1.0,1,"(1, 2)",english,0.714149,5,0.674857,5,0.983661,12,0.929854,15,0.829259,4,0.961973,8,0.762175,12


# SGD Classifier

### TFIDF Vectorizer

### Inputs are only the text. (No vader scores).

### Class imbalance addressed using random over sampling.

### Note: Tuned version of above.

In [None]:
# Load the second (tuned) random-oversampling search for the text-only SGD classifier.
full_path = '/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/pickle_gridsearch/SGD_ros_nv_gs2.pkl'
with open(full_path, 'rb') as file:
  gs_results = pickle.load(file)

# Attributes of the search just loaded (these names are reused by every load cell).
search_results = gs_results.cv_results_
top_estimator = gs_results.best_estimator_
top_score = gs_results.best_score_
top_parameters = gs_results.best_params_

# Tidy results table, sorted ascending by F1 rank.
SGD_clf_ros2_df = gs_to_clean_df(search_results, sort_by='rank_f1_score')

In [None]:
# Preview the five top-ranked configurations (frame is sorted by rank_f1_score).
SGD_clf_ros2_df.head()

Unnamed: 0,alpha,loss,sampling_strategy,Tfidf_Vect_analyzer,Tfidf_Vect_max_df,Tfidf_Vect_min_df,Tfidf_Vect_ngram_range,Tfidf_Vect_stop_words,mean_f1_score,rank_f1_score,mean_sensitivity,rank_sensitivity,mean_specificity,rank_specificity,mean_AUC_ROC,rank_AUC_ROC,mean_ROC_AUC_Score,rank_ROC_AUC_Score,mean_accuracy,rank_accuracy,mean_precision,rank_precision
0,0.0002,perceptron,0.8,word,0.8,1,"(1, 3)",english,0.713142,1,0.695807,5,0.980662,15,0.929069,17,0.838235,3,0.960658,11,0.73233,15
1,0.0002,perceptron,0.8,word,1.0,1,"(1, 3)",english,0.711557,2,0.675302,15,0.982987,7,0.921755,44,0.829145,13,0.961378,4,0.75767,5
2,0.0002,perceptron,0.8,word,0.9,1,"(1, 2)",english,0.711504,3,0.659241,27,0.985278,2,0.929021,18,0.822259,22,0.962381,1,0.774706,2
3,0.0002,perceptron,0.8,word,1.0,1,"(1, 2)",english,0.710137,4,0.675297,16,0.982684,9,0.93206,13,0.82899,14,0.961096,6,0.753416,7
4,0.0001,perceptron,auto,word,0.8,1,"(1, 2)",english,0.709843,5,0.67934,12,0.982179,10,0.933087,6,0.830759,11,0.960908,9,0.749582,9


# SGD Classifier

### TFIDF Vectorizer

### Inputs are only the text. (No vader scores).

### Class imbalance addressed using SMOTE.


In [None]:
# Load the pickled search for the text-only SGD classifier with SMOTE oversampling.
full_path = '/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/pickle_gridsearch/SGD_smote_nv_gs1.pkl'
with open(full_path, 'rb') as file:
  gs_results = pickle.load(file)

# Attributes of the search just loaded (these names are reused by every load cell).
search_results = gs_results.cv_results_
top_estimator = gs_results.best_estimator_
top_score = gs_results.best_score_
top_parameters = gs_results.best_params_

# Tidy results table, sorted ascending by F1 rank.
SGD_clf_smote_df = gs_to_clean_df(search_results, sort_by='rank_f1_score')

In [None]:
# Preview the five top-ranked configurations (frame is sorted by rank_f1_score).
SGD_clf_smote_df.head()

Unnamed: 0,alpha,loss,k_neighbors,sampling_strategy,Tfidf_Vect_analyzer,Tfidf_Vect_max_df,Tfidf_Vect_min_df,Tfidf_Vect_ngram_range,Tfidf_Vect_stop_words,mean_f1_score,rank_f1_score,mean_sensitivity,rank_sensitivity,mean_specificity,rank_specificity,mean_AUC_ROC,rank_AUC_ROC,mean_ROC_AUC_Score,rank_ROC_AUC_Score,mean_accuracy,rank_accuracy,mean_precision,rank_precision
0,0.0002,perceptron,1,0.4,word,0.95,1,"(1, 2)",english,0.706369,1,0.679735,171,0.981403,1,0.921144,143,0.830569,129,0.960219,1,0.737107,2
1,0.0001,perceptron,5,0.5,word,1.0,1,"(1, 2)",english,0.705579,2,0.678411,181,0.981235,3,0.923495,91,0.829823,136,0.959969,2,0.742526,1
2,0.0002,perceptron,1,0.6,word,0.95,1,"(1, 2)",english,0.699807,3,0.676198,198,0.980595,4,0.927198,31,0.828397,150,0.959217,3,0.727943,4
3,0.0001,perceptron,1,0.4,word,0.9,1,"(1, 2)",english,0.698581,4,0.686455,130,0.978944,14,0.924897,66,0.8327,107,0.958403,10,0.711893,13
4,0.0002,perceptron,7,0.6,word,1.0,1,"(1, 2)",english,0.697683,5,0.674841,217,0.980258,6,0.921396,140,0.827549,161,0.95881,4,0.724302,6


# SGD Classifier

### TFIDF Vectorizer

### Inputs are only the text. (No vader scores).

### Class imbalance addressed using ADASYN.

In [None]:
# Load the pickled search for the text-only SGD classifier with ADASYN oversampling.
full_path = '/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/pickle_gridsearch/SGD_adasyn_nv_gs1.pkl'
with open(full_path, 'rb') as file:
  gs_results = pickle.load(file)

# Attributes of the search just loaded (these names are reused by every load cell).
search_results = gs_results.cv_results_
top_estimator = gs_results.best_estimator_
top_score = gs_results.best_score_
top_parameters = gs_results.best_params_

# Tidy results table, sorted ascending by F1 rank.
SGD_clf_adasyn_df = gs_to_clean_df(search_results, sort_by='rank_f1_score')

In [None]:
# Preview the five top-ranked configurations (frame is sorted by rank_f1_score).
SGD_clf_adasyn_df.head()

Unnamed: 0,alpha,loss,n_neighbors,ratio,Tfidf_Vect_analyzer,Tfidf_Vect_max_df,Tfidf_Vect_min_df,Tfidf_Vect_ngram_range,Tfidf_Vect_stop_words,mean_f1_score,rank_f1_score,mean_sensitivity,rank_sensitivity,mean_specificity,rank_specificity,mean_AUC_ROC,rank_AUC_ROC,mean_ROC_AUC_Score,rank_ROC_AUC_Score,mean_accuracy,rank_accuracy,mean_precision,rank_precision
0,0.0002,perceptron,7,minority,word,0.9,1,"(1, 2)",english,0.697685,1,0.673968,58,0.98046,4,0.926264,16,0.827214,38,0.958935,2,0.724241,3
1,0.0001,perceptron,5,minority,word,0.9,1,"(1, 2)",english,0.697048,2,0.661922,84,0.981976,1,0.926559,13,0.821949,51,0.959499,1,0.739726,1
2,0.0001,perceptron,1,minority,word,0.95,1,"(1, 2)",english,0.695248,3,0.682893,40,0.978709,7,0.927507,10,0.830801,31,0.957933,5,0.709849,6
3,0.0002,perceptron,7,minority,word,1.0,1,"(1, 2)",english,0.691582,4,0.67439,57,0.979112,5,0.929384,2,0.826751,41,0.957713,6,0.710456,5
4,0.0001,perceptron,1,minority,word,1.0,1,"(1, 2)",english,0.690961,5,0.674396,56,0.979011,6,0.927042,11,0.826704,42,0.957619,7,0.708883,7


# Gradient Boosted Forest

### TFIDF Vectorizer

### No VADER scores

### No oversampling

In [None]:
# Load the pickled search for the gradient boosted forest (text only, no oversampling).
full_path = "/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/pickle_gridsearch/bf_gs1.pkl"
with open(full_path, 'rb') as file:
  gs_results = pickle.load(file)

# Attributes of the search just loaded (these names are reused by every load cell).
search_results = gs_results.cv_results_
top_estimator = gs_results.best_estimator_
top_score = gs_results.best_score_
top_parameters = gs_results.best_params_

# Tidy results table, sorted ascending by F1 rank.
boosted_forest_df = gs_to_clean_df(search_results, sort_by='rank_f1_score')

# Preview the five top-ranked configurations.
boosted_forest_df.head()

Unnamed: 0,colsample_bytree,learning_rate,max_depth,n_estimators,subsample,Tfidf_Vect_analyzer,Tfidf_Vect_max_df,Tfidf_Vect_min_df,Tfidf_Vect_ngram_range,Tfidf_Vect_stop_words,mean_f1_score,rank_f1_score,mean_sensitivity,rank_sensitivity,mean_specificity,rank_specificity,mean_AUC_ROC,rank_AUC_ROC,mean_ROC_AUC_Score,rank_ROC_AUC_Score,mean_accuracy,rank_accuracy,mean_precision,rank_precision
0,0.8,0.05,-1,1200,0.7,word,1.0,1,"(1, 1)",english,0.591717,1,0.480827,1,0.989118,1141,0.913219,233,0.734973,1,0.953422,33,0.770337,1129
1,0.8,0.05,-1,1200,1.0,word,1.0,1,"(1, 1)",english,0.591717,1,0.480827,1,0.989118,1141,0.913219,233,0.734973,1,0.953422,33,0.770337,1129
2,0.8,0.05,-1,1200,0.7,word,0.9,1,"(1, 1)",english,0.591717,1,0.480827,1,0.989118,1141,0.913219,233,0.734973,1,0.953422,33,0.770337,1129
3,0.8,0.05,-1,1200,1.0,word,0.9,1,"(1, 1)",english,0.591717,1,0.480827,1,0.989118,1141,0.913219,233,0.734973,1,0.953422,33,0.770337,1129
4,1.0,0.05,-1,1200,1.0,word,0.9,5,"(1, 1)",english,0.59001,5,0.478598,9,0.989152,1133,0.913186,241,0.733875,5,0.953297,53,0.769316,1137


# Gradient Boosted Forest

### TFIDF Vectorizer

### No VADER scores

### Random Over Sampler

In [None]:

# Load the pickled search object for the gradient boosted forest trained with a random over sampler.
full_path = "/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/pickle_gridsearch/bf_ros_gs1.pkl"
with open(full_path, mode='rb') as file:
  gs_results = pickle.load(file)

# Raw cross-validation table first, then the winning estimator, its score and its parameters.
search_results = gs_results.cv_results_
top_estimator, top_score, top_parameters = (
    gs_results.best_estimator_,
    gs_results.best_score_,
    gs_results.best_params_,
)

# Convert the CV results to a cleaned DataFrame, requesting a sort on the F1 rank column.
boosted_forest_ros_df = gs_to_clean_df(search_results=search_results, sort_by='rank_f1_score')

# Display the first five rows.
boosted_forest_ros_df.head(n=5)


Unnamed: 0,colsample_bytree,learning_rate,max_depth,n_estimators,subsample,sampling_strategy,Tfidf_Vect_analyzer,Tfidf_Vect_max_df,Tfidf_Vect_min_df,Tfidf_Vect_ngram_range,Tfidf_Vect_stop_words,mean_f1_score,rank_f1_score,mean_sensitivity,rank_sensitivity,mean_specificity,rank_specificity,mean_AUC_ROC,rank_AUC_ROC,mean_ROC_AUC_Score,rank_ROC_AUC_Score,mean_accuracy,rank_accuracy,mean_precision,rank_precision
0,0.8,0.06,-1,1250,0.7,0.6,word,0.95,1,"(1, 1)",english,0.679107,1,0.678416,135,0.975777,69,0.929166,184,0.827097,126,0.954894,4,0.681052,35
1,0.8,0.04,-1,1250,0.7,0.6,word,0.95,1,"(1, 1)",english,0.675838,2,0.68913,85,0.973554,128,0.932954,46,0.831342,72,0.953579,84,0.663737,115
2,0.75,0.06,-1,1250,0.65,0.6,word,0.95,1,"(1, 1)",english,0.675547,3,0.674408,152,0.975643,77,0.928689,197,0.825026,148,0.954487,13,0.677253,56
3,0.75,0.04,-1,1200,0.75,0.4,word,0.95,1,"(1, 1)",english,0.675021,4,0.665932,164,0.976788,28,0.930769,131,0.82136,163,0.954957,3,0.685365,9
4,0.85,0.05,-1,1150,0.7,0.6,word,0.95,1,"(1, 1)",english,0.674914,5,0.685109,107,0.973925,115,0.931319,110,0.829517,100,0.953641,76,0.665698,106


# Extra Random Forest

In [None]:
# Load the pickled search object for the extra random forest.
full_path = "/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/pickle_gridsearch/erf_gs1.pkl"
with open(full_path, mode='rb') as file:
  gs_results = pickle.load(file)

# Raw cross-validation table first, then the winning estimator, its score and its parameters.
search_results = gs_results.cv_results_
top_estimator, top_score, top_parameters = (
    gs_results.best_estimator_,
    gs_results.best_score_,
    gs_results.best_params_,
)

# Convert the CV results to a cleaned DataFrame, requesting a sort on the F1 rank column.
extra_random_forest_df = gs_to_clean_df(search_results=search_results, sort_by='rank_f1_score')

# Display the first five rows.
extra_random_forest_df.head(n=5)

Unnamed: 0,class_weight,max_depth,min_samples_leaf,min_samples_split,n_estimators,Tfidf_Vect_analyzer,Tfidf_Vect_max_df,Tfidf_Vect_min_df,Tfidf_Vect_ngram_range,Tfidf_Vect_stop_words,mean_f1_score,rank_f1_score,mean_sensitivity,rank_sensitivity,mean_specificity,rank_specificity,mean_AUC_ROC,rank_AUC_ROC,mean_ROC_AUC_Score,rank_ROC_AUC_Score,mean_accuracy,rank_accuracy,mean_precision,rank_precision
0,"{0: 1, 1: 5}",9,2,3,200,word,1.0,5,"(1, 2)",english,0.168804,1,0.09233,1,1.0,1,0.900651,311,0.546165,1,0.936257,1,1.0,1
1,"{0: 1, 1: 5}",9,1,2,600,word,0.9,5,"(1, 2)",english,0.168796,2,0.092329,2,1.0,1,0.916191,113,0.546165,2,0.936257,1,1.0,1
2,"{0: 1, 1: 5}",9,1,3,100,word,1.0,5,"(1, 2)",english,0.168796,2,0.092329,2,1.0,1,0.886311,449,0.546165,2,0.936257,1,1.0,1
3,"{0: 1, 1: 5}",9,2,2,300,word,1.0,5,"(1, 2)",english,0.168796,2,0.092329,2,1.0,1,0.907569,222,0.546165,2,0.936257,1,1.0,1
4,"{0: 1, 1: 5}",9,1,3,1000,word,0.9,5,"(1, 2)",english,0.168063,5,0.091883,5,1.0,1,0.916151,115,0.545941,5,0.936226,5,1.0,1


# Logistic Regression

In [None]:
# Initial logistic regression models: load the pickled search object.
full_path = "/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/pickle_gridsearch/lr_gs1.pkl"
with open(full_path, mode='rb') as file:
  gs_results = pickle.load(file)

# Raw cross-validation table first, then the winning estimator, its score and its parameters.
search_results = gs_results.cv_results_
top_estimator, top_score, top_parameters = (
    gs_results.best_estimator_,
    gs_results.best_score_,
    gs_results.best_params_,
)

# Convert the CV results to a cleaned DataFrame, requesting a sort on the F1 rank column.
lr_df = gs_to_clean_df(search_results=search_results, sort_by='rank_f1_score')

# Display the first five rows.
lr_df.head(n=5)

Unnamed: 0,C,class_weight,fit_intercept,penalty,solver,Tfidf_Vect_analyzer,Tfidf_Vect_max_df,Tfidf_Vect_min_df,Tfidf_Vect_ngram_range,Tfidf_Vect_stop_words,mean_f1_score,rank_f1_score,mean_sensitivity,rank_sensitivity,mean_specificity,rank_specificity,mean_AUC_ROC,rank_AUC_ROC,mean_ROC_AUC_Score,rank_ROC_AUC_Score,mean_accuracy,rank_accuracy,mean_precision,rank_precision
0,100.0,"{0: 1, 1: 5}",True,l2,liblinear,word,0.9,1,"(1, 2)",english,0.730468,1,0.669947,368,0.987569,802,0.958404,33,0.828758,284,0.965262,13,0.803116,570
1,100.0,"{0: 1, 1: 5}",True,l2,liblinear,word,1.0,1,"(1, 2)",english,0.730468,1,0.669947,368,0.987569,802,0.958404,33,0.828758,284,0.965262,13,0.803116,570
2,68.1292,"{0: 1, 1: 5}",True,l2,liblinear,word,1.0,1,"(1, 2)",english,0.730318,3,0.669947,368,0.987535,805,0.958556,29,0.828741,286,0.965231,15,0.802802,573
3,68.1292,"{0: 1, 1: 5}",True,l2,liblinear,word,0.9,1,"(1, 2)",english,0.730318,3,0.669947,368,0.987535,805,0.958556,29,0.828741,286,0.965231,15,0.802802,573
4,46.4159,"{0: 1, 1: 5}",True,l2,liblinear,word,1.0,1,"(1, 2)",english,0.730316,5,0.67173,354,0.987299,813,0.9587,25,0.829514,267,0.965137,25,0.800283,579


# Logistic Regression

## Random Over Sampler

In [None]:
# Logistic regression with random over sampling: load the pickled search object.
full_path = '/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/pickle_gridsearch/lr_ros_gs1.pkl'
with open(full_path, mode='rb') as file:
  gs_results = pickle.load(file)

# Raw cross-validation table first, then the winning estimator, its score and its parameters.
search_results = gs_results.cv_results_
top_estimator, top_score, top_parameters = (
    gs_results.best_estimator_,
    gs_results.best_score_,
    gs_results.best_params_,
)

# Convert the CV results to a cleaned DataFrame, requesting a sort on the F1 rank column.
lr_ros_df = gs_to_clean_df(search_results=search_results, sort_by='rank_f1_score')

# Display the first five rows.
lr_ros_df.head(n=5)

Unnamed: 0,C,class_weight,fit_intercept,penalty,solver,sampling_strategy,Tfidf_Vect_analyzer,Tfidf_Vect_max_df,Tfidf_Vect_min_df,Tfidf_Vect_ngram_range,Tfidf_Vect_stop_words,mean_f1_score,rank_f1_score,mean_sensitivity,rank_sensitivity,mean_specificity,rank_specificity,mean_AUC_ROC,rank_AUC_ROC,mean_ROC_AUC_Score,rank_ROC_AUC_Score,mean_accuracy,rank_accuracy,mean_precision,rank_precision
0,100.0,,True,l2,liblinear,0.4,word,1.0,1,"(1, 2)",english,0.732832,1,0.671283,172,0.987838,1,0.958439,32,0.829561,163,0.965607,1,0.806942,1
1,21.5443,,True,l2,liblinear,0.6,word,0.8,1,"(1, 2)",english,0.732597,2,0.68689,129,0.985749,22,0.958776,17,0.836319,123,0.964761,9,0.785186,16
2,10.0,,True,l2,liblinear,0.6,word,0.8,1,"(1, 2)",english,0.732301,3,0.693583,118,0.984806,32,0.958882,8,0.839195,111,0.964354,17,0.776176,28
3,68.1292,,True,l2,liblinear,0.4,word,1.0,1,"(1, 2)",english,0.730926,4,0.670838,175,0.987535,3,0.958181,47,0.829186,167,0.965294,2,0.802979,3
4,68.1292,,True,l2,liblinear,0.5,word,0.8,1,"(1, 2)",english,0.730891,5,0.676637,158,0.98676,10,0.958513,25,0.831698,149,0.96498,5,0.794874,8


# Logistic Regression

## Over sampling with SMOTE

In [None]:
# Logistic regression with SMOTE oversampling: load the pickled search object.
full_path = '/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/pickle_gridsearch/lr_smote_gs1.pkl'
with open(full_path, mode='rb') as file:
  gs_results = pickle.load(file)

# Raw cross-validation table first, then the winning estimator, its score and its parameters.
search_results = gs_results.cv_results_
top_estimator, top_score, top_parameters = (
    gs_results.best_estimator_,
    gs_results.best_score_,
    gs_results.best_params_,
)

# Convert the CV results to a cleaned DataFrame, requesting a sort on the F1 rank column.
lr_smote_df = gs_to_clean_df(search_results=search_results, sort_by='rank_f1_score')

# Display the first five rows.
lr_smote_df.head(n=5)

Unnamed: 0,C,class_weight,fit_intercept,penalty,solver,k_neighbors,sampling_strategy,Tfidf_Vect_analyzer,Tfidf_Vect_max_df,Tfidf_Vect_min_df,Tfidf_Vect_ngram_range,Tfidf_Vect_stop_words,mean_f1_score,rank_f1_score,mean_sensitivity,rank_sensitivity,mean_specificity,rank_specificity,mean_AUC_ROC,rank_AUC_ROC,mean_ROC_AUC_Score,rank_ROC_AUC_Score,mean_accuracy,rank_accuracy,mean_precision,rank_precision
0,68.1292,"{0: 1, 1: 2}",True,l2,liblinear,7,0.6,word,0.8,1,"(1, 2)",english,0.721965,1,0.702949,316,0.981505,245,0.957217,198,0.842227,306,0.961942,180,0.742808,234
1,21.5443,"{0: 1, 1: 2}",True,l2,liblinear,5,0.6,word,1.0,1,"(1, 2)",english,0.72163,2,0.711419,242,0.980292,290,0.957287,158,0.845855,227,0.96141,244,0.732947,278
2,68.1292,"{0: 1, 1: 2}",True,l2,liblinear,5,0.6,word,1.0,1,"(1, 2)",english,0.721503,3,0.70072,328,0.981707,240,0.95716,240,0.841213,323,0.961973,175,0.744244,222
3,68.1292,"{0: 1, 1: 5}",True,l2,liblinear,7,0.3,word,0.8,1,"(1, 2)",english,0.721485,4,0.705177,294,0.9811,254,0.957414,91,0.843139,284,0.961723,209,0.739394,241
4,68.1292,"{0: 1, 1: 5}",True,l2,liblinear,3,0.3,word,1.0,1,"(1, 2)",english,0.721479,5,0.706515,287,0.980932,259,0.957221,196,0.843724,276,0.96166,215,0.737779,249


# Multilayer Perceptron

The model did not converge and needs further investigation. The adaptive learning rate may have started from an initial value that was too low, or adaptive learning may simply not be a good choice for this data set.

In [None]:
# MLP with adaptive learning rate.
# No predicted samples. This ran for 24 hours; I wonder whether SGD didn't converge,
# or got stuck in a local minimum?
# Adaptive learning only ever decreases the learning rate.
full_path = '/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/pickle_gridsearch/mlp_gs1.pkl'
with open(full_path, mode='rb') as file:
  gs_results = pickle.load(file)

# Raw cross-validation table first, then the winning estimator, its score and its parameters.
search_results = gs_results.cv_results_
top_estimator, top_score, top_parameters = (
    gs_results.best_estimator_,
    gs_results.best_score_,
    gs_results.best_params_,
)

# Convert the CV results to a cleaned DataFrame, requesting a sort on the F1 rank column.
mlp_adaptive_df = gs_to_clean_df(search_results=search_results, sort_by='rank_f1_score')

In [None]:
# Display the first five rows of the cleaned adaptive-learning-rate MLP results.
mlp_adaptive_df.head(n=5)

Unnamed: 0,activation,alpha,hidden_layer_sizes,learning_rate,max_iter,solver,Tfidf_Vect_analyzer,Tfidf_Vect_max_df,Tfidf_Vect_min_df,Tfidf_Vect_ngram_range,Tfidf_Vect_stop_words,mean_f1_score,rank_f1_score,mean_sensitivity,rank_sensitivity,mean_specificity,rank_specificity,mean_AUC_ROC,rank_AUC_ROC,mean_ROC_AUC_Score,rank_ROC_AUC_Score,mean_accuracy,rank_accuracy,mean_precision,rank_precision
0,relu,0.0001,"(100,)",adaptive,500,sgd,word,1,1,"(1, 2)",english,0.0,1,0.0,1,1.0,1,0.8307,3,0.5,1,0.929773,1,0.0,1
1,relu,0.0001,"(200,)",adaptive,500,sgd,word,1,1,"(1, 2)",english,0.0,1,0.0,1,1.0,1,0.827691,4,0.5,1,0.929773,1,0.0,1
2,relu,0.0005,"(100,)",adaptive,500,sgd,word,1,1,"(1, 2)",english,0.0,1,0.0,1,1.0,1,0.834988,1,0.5,1,0.929773,1,0.0,1
3,relu,0.0005,"(200,)",adaptive,500,sgd,word,1,1,"(1, 2)",english,0.0,1,0.0,1,1.0,1,0.830979,2,0.5,1,0.929773,1,0.0,1


## Multilayer Perceptron

In [None]:
# MLP with constant learning rate: load the pickled search object.
full_path = '/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/pickle_gridsearch/mlp_gs1_sm.pkl'
with open(full_path, mode='rb') as file:
  gs_results = pickle.load(file)

# Raw cross-validation table first, then the winning estimator, its score and its parameters.
search_results = gs_results.cv_results_
top_estimator, top_score, top_parameters = (
    gs_results.best_estimator_,
    gs_results.best_score_,
    gs_results.best_params_,
)

# Convert the CV results to a cleaned DataFrame, requesting a sort on the F1 rank column.
mlp_constant_learning_rate_df = gs_to_clean_df(search_results=search_results, sort_by='rank_f1_score')

In [None]:
# Display the first five rows of the cleaned constant-learning-rate MLP results.
mlp_constant_learning_rate_df.head(n=5)

Unnamed: 0,activation,alpha,hidden_layer_sizes,Tfidf_Vect_analyzer,Tfidf_Vect_max_df,Tfidf_Vect_min_df,Tfidf_Vect_ngram_range,Tfidf_Vect_stop_words,mean_f1_score,rank_f1_score,mean_sensitivity,rank_sensitivity,mean_specificity,rank_specificity,mean_AUC_ROC,rank_AUC_ROC,mean_ROC_AUC_Score,rank_ROC_AUC_Score,mean_accuracy,rank_accuracy,mean_precision,rank_precision
0,relu,0.0005,"(100,)",word,1,1,"(1, 2)",english,0.734613,1,0.650318,2,0.990938,3,0.95679,1,0.820628,2,0.967016,2,0.844406,3
1,relu,0.0005,"(200,)",word,1,1,"(1, 2)",english,0.73442,2,0.654784,1,0.990298,4,0.956091,2,0.822541,1,0.966735,3,0.836662,4
2,relu,0.0001,"(200,)",word,1,1,"(1, 2)",english,0.729499,3,0.631587,3,0.992454,2,0.955335,3,0.81202,3,0.96711,1,0.863533,2
3,relu,0.0001,"(100,)",word,1,1,"(1, 2)",english,0.724437,4,0.622669,4,0.992723,1,0.95514,4,0.807696,4,0.966735,3,0.866192,1
