<a href="https://colab.research.google.com/github/BradenAnderson/Twitter-Sentiment-Analysis/blob/main/04_Generate_Soft_Voting_Ensemble_Predictions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## This notebook generates soft voting ensembles for all combinations of the best models of the following types:

### 1. Naive Bayes
### 2. Gradient Boosted Forest
### 3. Multilayer Perceptron
### 4. Logistic Regression

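Altogether, 15 ensembles are built and tested: every combination of one to four of the models above (4 single-model runs plus 11 combinations of two or more models). The test predictions for each are written to their own CSV file.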

In [1]:
# Mount Google Drive at /content/drive (Colab only); every data, pickle and
# output path used later in this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pickle
import pandas as pd
import numpy as np

import itertools as it

from sklearn.ensemble import VotingClassifier
from mlxtend.classifier import EnsembleVoteClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB



In [3]:
# Read the preprocessed training tweets and keep only the target column and the cleaned text.
filepath = "/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/intermediate_output_files/vader_full_preprocessing_model_droppedlt3.csv"

train_tweet_df = pd.read_csv(filepath)[['label', 'Clean_Tweet']]

# Preview the first few rows.
train_tweet_df.head()

Unnamed: 0,label,Clean_Tweet
0,0,father dysfunctional significant selfish pron ...
1,0,thank #lyft credit use cause pron offer wheelc...
2,0,bihday pron majesty
3,0,#model love pron pron time pron happy love hap...
4,0,factsguide society #motivation


In [4]:
# Read the previously unseen test tweets and keep only the tweet id and the cleaned text.
test_tweet_filepath = "/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/train_test_data/test_tweets_clean.csv"

test_tweet_df = pd.read_csv(test_tweet_filepath)[['id', 'Clean_Tweet']]

# Preview the first few rows.
test_tweet_df.head()

Unnamed: 0,id,Clean_Tweet
0,31963,#studiolife #aislife #requires #passion #dedic...
1,31964,#white #supremacists want new #birds #movie
2,31965,safe way heal pron #acne #altwaystoheal #healt...
3,31966,hp curse child book reservation yes happy love...
4,31967,3rd #bihday pron amazing hilarious #nephew eli...


In [5]:
# Training inputs stay a one-column DataFrame (cleaned text); the targets become a flat 1-D array.
X_train = train_tweet_df[['Clean_Tweet']]
y_train = train_tweet_df['label'].to_numpy().ravel()

# Unseen test inputs: the same single cleaned-text column, again as a one-column DataFrame.
X_test = test_tweet_df[['Clean_Tweet']]

In [6]:
# Locations where I saved the pickle files for the best performing models.
best_nb_filename = "/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/pickle_gridsearch/naive_bayes_ros_gs2.pkl"
best_gbf_filename = "/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/pickle_gridsearch/bf_ros_gs1.pkl"
best_lr_filename = "/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/pickle_gridsearch/lr_ros_gs1.pkl"
best_mlp_filename = "/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/pickle_gridsearch/mlp_gs1_sm.pkl"

In [7]:
# Load the .pkl gridsearch result files for the best performing model of each type.
with open(best_nb_filename, 'rb') as file:
  best_nb_model = pickle.load(file)

with open(best_lr_filename, 'rb') as file:
  best_lr_model = pickle.load(file)

with open(best_gbf_filename, 'rb') as file:
  best_gbf_model = pickle.load(file)

with open(best_mlp_filename, 'rb') as file: 
  best_mlp_model = pickle.load(file)



In [8]:
# The top estimators for each model type.
best_estimators = [('nb', best_nb_model.best_estimator_),
                   ('lr', best_lr_model.best_estimator_),
                   ('gbf', best_gbf_model.best_estimator_),
                   ('mlp', best_mlp_model.best_estimator_)]

In [9]:
def ensemble_all_combinations_of_best_models(best_model_list, X_train, y_train, X_test, test_tweet_df, base_filename=None):
  """Fit a soft voting ensemble for every combination of the given models and save its predictions.

  For each combination size from 1 up to len(best_model_list), every combination of that
  size is wrapped in a VotingClassifier(voting='soft'), fit on the training data, and used
  to predict the test data. Each ensemble's predictions are written to their own .csv file
  (columns 'id' and 'label') named

      <base_filename>ensembleSize_<size>_combo_<k>_<name1>_<name2>_..._.csv

  where <k> counts the combinations of that size starting at 1.

  Parameters
  ----------
  best_model_list : list of (name, estimator) tuples
      The candidate models. `name` is used both as the estimator name inside the
      VotingClassifier and as part of the output file name.
  X_train, y_train
      Training features and labels, passed unchanged to VotingClassifier.fit.
  X_test
      Test features, passed unchanged to VotingClassifier.predict.
  test_tweet_df : pandas.DataFrame
      Must contain an 'id' column with one row per row of X_test; it is copied, never modified.
  base_filename : str, optional
      Path prefix (directory plus leading file-name text) for every output file.
      Defaults to the project's soft-voting predictions folder on Google Drive.

  Returns
  -------
  None
  """
  if base_filename is None:
    base_filename = "/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/04_generate_predictions/automated_predictions/soft_voting/soft_"

  for size in range(1, len(best_model_list) + 1):

    print("---------creating ensembles of size ", size, "---------")

    # All combinations of models of the given size, numbered from 1 for the file name.
    for combo_num, combination in enumerate(it.combinations(best_model_list, size), start=1):

      models = list(combination)

      # Each model name is followed by "_", so the finished name ends in "_.csv".
      # That trailing underscore is kept so existing output file names do not change.
      model_names = "".join(str(model[0]) + "_" for model in models)
      specific_filename = base_filename + "ensembleSize_" + str(size) + "_" + "combo_" + str(combo_num) + "_" + model_names

      # Create the ensemble classifier using this combination of best models.
      ensemble_vote = VotingClassifier(estimators = models, voting='soft')

      print("fitting: ", specific_filename)
      # Fit the ensemble on the training data.
      ensemble_vote.fit(X_train, y_train)
      print("fit complete!\n")

      print("Making predictions.")
      # Use the ensemble to make predictions on the test data.
      predictions = ensemble_vote.predict(X_test)

      # Start the output frame from a copy of the ids so the caller's frame is left untouched.
      predictions_df = test_tweet_df.loc[: , ['id']].copy(deep=True)

      print("Finishing predictions dataframe.")
      predictions_df['label'] = predictions

      # Finish the filename by adding the .csv extension.
      specific_filename = specific_filename + ".csv"

      print("Saving predictions file to:", specific_filename)
      # Save the predictions made by this ensemble out to a .csv file.
      predictions_df.to_csv(path_or_buf=specific_filename, index=False)
      print("")

In [None]:
# Fit a soft voting ensemble for every combination of the best models that support soft
# voting (MLP, LogReg, Naive Bayes, and Boosted Forest), predict on the unseen test data,
# and write each combination's predictions to its own descriptively named .csv file.
ensemble_all_combinations_of_best_models(best_model_list=best_estimators,
                                         X_train=X_train,
                                         y_train=y_train,
                                         X_test=X_test,
                                         test_tweet_df=test_tweet_df)