<a href="https://colab.research.google.com/github/BradenAnderson/Twitter-Sentiment-Analysis/blob/main/05_fastText.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Clone the fastText github repository
! git clone https://github.com/facebookresearch/fastText.git

Cloning into 'fastText'...
remote: Enumerating objects: 3854, done.[K
remote: Total 3854 (delta 0), reused 0 (delta 0), pack-reused 3854[K
Receiving objects: 100% (3854/3854), 8.22 MiB | 15.17 MiB/s, done.
Resolving deltas: 100% (2417/2417), done.


In [None]:
# Install the in a local folder in my google drive.
! pip install /content/fastText

Processing ./fastText
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp37-cp37m-linux_x86_64.whl size=3085775 sha256=9cfddf039f0cdb76c7b7d9f19b3aa605e1c3e3e75792e4f00082d7cc061b7b06
  Stored in directory: /tmp/pip-ephem-wheel-cache-0prnoz44/wheels/a1/9f/52/696ce6c5c46325e840c76614ee5051458c0df10306987e7443
Successfully built fasttext
Installing collected packages: fasttext
Successfully installed fasttext-0.9.2


In [None]:
import pickle
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import fasttext
from fasttext.FastText import load_model
from lightgbm import LGBMClassifier

from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV, train_test_split, cross_validate, cross_val_predict
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_score, recall_score, accuracy_score, SCORERS, multilabel_confusion_matrix, make_scorer, roc_curve, roc_auc_score, f1_score

pd.set_option('display.max_rows', 1000)

In [None]:
# Read in the tweet data set.
filepath= "/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/intermediate_output_files/vader_full_preprocessing_model.csv"

tweet_df = pd.read_csv(filepath)

# Make a copy of the tweet dataset that only includes clean tweets and the target labels.
fastText_df = tweet_df.loc[:, ['label', 'Clean_Tweet']].copy(deep=True)

ft_df = fastText_df.copy(deep=True)

fastText_df.head()

Unnamed: 0,label,Clean_Tweet
0,0,father dysfunctional significant selfish pron ...
1,0,thank #lyft credit use cause pron offer wheelc...
2,0,bihday pron majesty
3,0,#model love pron pron time pron happy love hap...
4,0,factsguide society #motivation


# Setting up train and test files for supervised learning with fastText

In [None]:
# Filepath to where I am saving the fastText formatted supervised learning file with all of the tweet data.
all_data_filepath = r'/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/fastText_Models/all_data.txt'

In [None]:
# Save locations where we will store the training and testing data files once they are created.
# If someone else ever uses this notebook, this should help make it easier so they have less links to change.

train_data_filepath = r'/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/fastText_Models/train_data.txt'
test_data_filepath = r'/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/fastText_Models/test_data.txt'

In [None]:
# Function used to format labels the way fastText needs them for supervised learning. 
def format_label(current_label):
  prefix = "__label__"
  formatted_label = prefix + str(current_label)
  return formatted_label 

In [None]:
# Change the label column from simple 0's and 1's to __label__0 or __label__1
fastText_df['fastText_Label'] = fastText_df['label'].apply(format_label)

# make the label and tweet text one big string (e.g. __label__0tweet_text_here) - this is how fastText needs it for training.
fastText_df['labeled_tweets'] = fastText_df['fastText_Label'] + fastText_df['Clean_Tweet']

In [None]:
# Save a file containing all of the data. We will need this to retrain our final model once the optimal hyperparameters are found. 
np.savetxt(all_data_filepath, fastText_df['labeled_tweets'], fmt='%s')

In [None]:
# Create a copy of the dataframe with the correctly formatted column.
fastText_formatted_df = fastText_df.copy(deep=True)

In [None]:
# Remove all the columns that fastText won't need.
fastText_formatted_df.drop(columns=['label', 'Clean_Tweet', 'fastText_Label'], inplace=True)

In [None]:
# Use scikit-learns train_test_split to split the properly formatted labeled data into a training and testing set.
X = fastText_df['labeled_tweets'].to_numpy()
y = fastText_df['label'].to_numpy()

# Note: Since fastText trains on data where the label and data are one big string, the X_train and X_test are all we need for training
# and testing respectively. The y_train and y_test are just going to be thrown away.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, stratify=y)

In [None]:
# Save X_train and X_test into training and testing data frames.
training_df = pd.DataFrame(X_train, columns=['labeled_tweets'])
testing_df = pd.DataFrame(X_test, columns=['labeled_tweets'])

# Write the contents of the training and testing dataframes out to a text file, which can later be read by fastText.
np.savetxt(train_data_filepath, training_df['labeled_tweets'], fmt='%s')
np.savetxt(test_data_filepath, testing_df['labeled_tweets'], fmt='%s')

# Explore using fastText as a classifier (supervised learning).

In [None]:
# The fastText test function returns the precision and recall classification metrics. 
# This function uses precision and recall to calculate the F1-score. 
def calculate_f1_score(model_results):
  num_samples, precision, recall = model_results
  f1_score = 2 * ( (precision  * recall)/(precision + recall))
  return f1_score

In [None]:
# Location where we will save the trained model.
first_model_filepath = "/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/fastText_Models/ft_first_model.bin"

# Commented out because the trained model has been saved, therefore training does not need to be repeated each time. (Commented out because training has been accomplished).
# first_model = fasttext.train_supervised(input=train_data_filepath, epoch=25)

# Save the model so we don't have to retrain the next time we run this notebook. (Commented out because training/saving has been accomplished).
# first_model.save_model(first_model_filepath)

# Load the trained model.
first_model = load_model(first_model_filepath)

# Test the model on the training data file.
first_model_test_result = first_model.test(test_data_filepath)

first_model_test_result

(6184, 0.30206985769728334, 0.30206985769728334)

In [None]:
# Calculate the f1 score for the first model.
first_model_f1 = calculate_f1_score(first_model_test_result)
first_model_f1

0.30206985769728334

In [None]:
# Creating a second model, limit the training time to 10 minutes (1200 seconds), and use the autotuneValidate parameter to automatically tune the models
# hyperparameters using the test file.

# Location to save the trained model.
second_model_filepath = "/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/fastText_Models/autoTuned_model.bin"

# Training the model and auto tuning hyperparameters. (Commented out because training has been accomplished).
# tuned_model = fasttext.train_supervised(input=train_data_filepath, autotuneValidationFile=test_data_filepath, autotuneDuration=1200)

# Save the trained model so we don't have to repeat training next time we run the notebook. (Commented out because training/saving has been accomplished).
# tuned_model.save_model(second_model_filepath)

# Loading the trained model.
tuned_model = load_model(second_model_filepath)

# Testing the model on the test .txt file.
tuned_model_test_results = tuned_model.test(test_data_filepath)

tuned_model_test_results

(6160, 0.1775974025974026, 0.1775974025974026)

In [None]:
# Calculate the f1 score for the auto tuned model.
# Note: Limiting the autotune time to 10 minutes decreased performance as compared to the manually selected hyperparameters used by our first model.
tuned_model_f1 = calculate_f1_score(tuned_model_test_results)
tuned_model_f1

0.1775974025974026

In [None]:
# Creating another model, increasing the training time by a lot (30k seconds = 8.333 hours) and again autoTuning the hyper parameters.

# Location to save the trained model.
long_train_filepath = "/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/fastText_Models/autoTuned_longTrain_model.bin"

# Training the model.(Commented out because training has been accomplished).
# long_tune_model = fasttext.train_supervised(input=train_data_filepath, autotuneValidationFile=test_data_filepath, autotuneDuration=30000)

# Save the trained model so we don't have to repeat training next time we run the notebook. (Commented out because training/saving has been accomplished).
# long_tune_model.save_model(long_train_filepath)

# Load the trained model.
long_tune_model = load_model(long_train_filepath)

# Test the model using the test set.
long_tune_test_results = long_tune_model.test(test_data_filepath)

# Letting fastText autoTune the hyperparameters for 8+ hours greatly increased our classification metrics. 
long_tune_test_results

(6184, 0.8279430789133247, 0.8279430789133247)

In [None]:
# Creating another model, increasing the training even more (82.8k seconds = 23 hours) and again autoTuning the hyper parameters.

# Location to save the trained model.
very_long_train_filepath = "/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/fastText_Models/23hr_model.bin"

# Training the model. (commented out because training has already been accomplished).
# very_long_model = fasttext.train_supervised(input=train_data_filepath, autotuneValidationFile=test_data_filepath, autotuneDuration=82800)

# Save the trained model so we don't have to repeat training next time we run the notebook. (commented out because saving has already been accomplished).
# very_long_model.save_model(very_long_train_filepath)

# Load the trained model.
long_model = load_model(very_long_train_filepath)

# Test the model using the test set.
very_long_trained_model_test_results = long_model.test(test_data_filepath)

# Allowing the additional training time (23 hours compared to 8.33) did result in further improvements to the precision and recall.
very_long_trained_model_test_results

(6176, 0.8492551813471503, 0.8492551813471503)

# Explore using fastText to create vector representations of words (unsupervised learning).

Note: The word vector representations created by fastText can be used as inputs to a supervised learning classifier, I will explore this concept in the next section. 

In [None]:
# Location to save the text file that is properly formatted for fastText unsupervised learning (creating word vector representations).
unlabeled_tweet_filepath = r'/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/fastText_Models/unlabeled_data.txt'

# Location to save the file of learned word vector representations
word_vector_model_filepath = r"/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/fastText_Models/unsupervised_model.bin"

In [None]:
# Dataframe that has a column of the clean (unlabeled) tweet text.
ft_df.head()

Unnamed: 0,label,Clean_Tweet
0,0,father dysfunctional significant selfish pron ...
1,0,thank #lyft credit use cause pron offer wheelc...
2,0,bihday pron majesty
3,0,#model love pron pron time pron happy love hap...
4,0,factsguide society #motivation


In [None]:
# Save the unlabeled tweet data to a text file that fastText can read and create word vectors from.
np.savetxt(unlabeled_tweet_filepath, ft_df['Clean_Tweet'], fmt='%s')

In [None]:
# Use a skipgram model to create word vector representations of every word in the tweets.
# Note: This does not need to be commented out because it runs fast! (i.e. fastText :) ) 
unsupervised_model = fasttext.train_unsupervised(unlabeled_tweet_filepath, model='skipgram')

# Save the learned word vector representations.
unsupervised_model.save_model(word_vector_model_filepath)

In [None]:
# Print a list of words that fastText learned word vectors for.
print(unsupervised_model.words)

['</s>', 'pron', 'happy', 'love', 'day', 'significant', 'amp', '#love', 'good', 'time', 'sad', 'like', 'today', 'flirt', 'new', 'joy', '#positive', 'want', 'people', 'life', 'thank', 'come', 'look', 'need', 'father', 'wait', 'feel', 'work', '2', 'bihday', '#smile', '#healthy', 'week', 'know', 'think', 'find', 'cool', 'great', 'year', 'bull', '#thankful', 'confident', 'thing', 'summer', '#fun', 'thankful', 'watch', 'tomorrow', '3', '#life', 'friend', 'right', 'live', 'funny', 'world', 'morning', '1', '#i', '#summer', 'night', '#model', '#cute', '#affirmation', 'excitement', 'weekend', '#blog', 'girl', 'man', '#fathersday', 'play', '#me', '4', 'let', 'leave', 'angry', 'home', 'oh', 'finally', 'celebrate', 'family', 'silly', 'game', 'god', 'tonight', '#gold', 'gt', '#silver', 'friday', '#altwaystoheal', 'old', 'dad', '#beautiful', 'guy', 'ready', 'bear', '#bihday', '#family', 'power', '#music', 'joke', 'sta', 'little', 'other', '#selfie', '#friends', 'wish', '#weekend', 'try', '#forex', '

In [None]:
# Print the word vector (100-dimensional) representation of a particular word.
print(unsupervised_model['love'])

[-0.30345777 -0.02664436 -0.48690012 -0.12322477  0.20298533  0.6469242
 -0.12125882 -0.06383161  0.32793236  0.44392973  0.08475346  0.2642347
  0.31127265  0.3674754  -0.5968394  -0.23701833  0.2754445   0.28434968
 -0.11251639 -0.30411988  0.07977807  0.04600069  0.42844358  0.01662834
  0.31693837 -0.43812928  0.22705     0.11845665 -0.4396209   0.59830695
  0.01588855  0.05112264 -0.8107919  -0.10927011 -0.40397915 -0.330086
 -0.4688317  -0.8980728  -0.0783285   0.34643972  0.20025347  0.6191598
  0.5537351   0.40460432 -0.17218041  0.14738113 -0.13200775  0.20507558
 -0.37343642  0.16191173 -0.07221972 -0.07308648  0.09036505 -0.09267928
 -0.16544695  0.04552644  0.02728527  0.5201551   0.18873724 -0.00295098
 -0.01188896  0.42409924  0.6710028  -0.07485198 -0.01827253  0.05718816
 -0.37601697  0.06879427  0.15553632 -0.2989822  -0.6240012  -0.38962668
 -0.19891742 -0.06294543  0.2601991   0.00143086 -0.03951265 -0.12503795
  0.5739185  -0.47947788 -0.14917994  0.12212523  0.2419

In [None]:
# Find the word vectors that are "closest" to a given word.
print(list(unsupervised_model.get_nearest_neighbors('Trump')))

[(0.980888307094574, 'trump'), (0.9533953070640564, 'obamas'), (0.9478312730789185, 'obama'), (0.9466777443885803, '#neverump'), (0.9454813599586487, 'paladino'), (0.9449848532676697, 'fascist'), (0.9396200776100159, 'republican'), (0.9323604702949524, 'republicans'), (0.9296003580093384, 'campaign'), (0.9294282793998718, '#dumptrump')]


# Explore using fastText in a Scikit-Learn pipeline

In [None]:
# Read in a csv file that contains a string with every unqiue word found in the set of tweets.
unique_df = pd.read_csv("/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/intermediate_output_files/unique_words.csv")

# Grab the long string of unqiue words from the dataframe.
unique_words = unique_df.loc[unique_df.index == 0, 'Unique_Words'].to_numpy()[0]

# Split the string at white spaces to get a list of unique words.
unique_words = unique_words.split(" ")

In [None]:
# Since fastText generates 100-dimensional vectors for each word, we may not be able to use each words entire vector representation when training supervised models.
# The goal is to come up with a single vector that represents the entire tweet. One option would be to represent the tweet vector as the mean of the word vectors
# for all the words it contains. 

# The MeanEmbeddingVectorizer class below can be used in a scikit-learn pipeline. When applied to a set of tweet strings, this class will take tweets one at a time and
# use a dictionary that maps each unique word to its learned fastText word vector representation to create word vectors for each word in the tweet. Then, all word vectors
# for the tweet are averaged to create a single "tweet vector", which is then returned.
 
# Note: This class was inspired by a book written by Joydeep Bhattacharjee. I made some custom modifications to the class, specifically expanding out the code to make it
# easier to see what operations are being performed.

# For more context, reference his book, which can be found here: https://subscription.packtpub.com/book/big_data_and_business_intelligence/9781789130997

class MeanEmbeddingVectorizer(object):

    def __init__(self, ft_wv):
        self.ft_wv = ft_wv
        if len(ft_wv)>0:
            self.dim = ft_wv[next(iter(unique_words))].shape[0] 
        else:
            self.dim=0
            
    def fit(self, X, y):
        return self 

    def transform(self, X):

      mean_of_word_vectors = []

      # For each tweet in our training set (that we have converted to a list of clean word tokens).
      for clean_tweet_token_list in X: 
        
        # Create a list to hold the FastText word vectors for each word in the tweet.
        this_tweet_word_vectors = []

        # For every word in a particular tweet.
        for word in clean_tweet_token_list:
          
          # Initialize the word vector for this word to be a 100-dimension vector of all zeros.
          this_words_vector = np.zeros(self.dim)
          
          # If we have a valid word vector for this word, use the valid word vector instead of the zero vector.
          if word in self.ft_wv: 
            this_words_vector = self.ft_wv[word]
          
          # Append this word vector to the list of word vectors that make up this tweet.
          this_tweet_word_vectors.append(this_words_vector)

        # Create a single 100 dimensional vector that is the (element wise) mean of all word vectors that make up this tweet.
        mean_vector = np.mean(this_tweet_word_vectors, axis=0)

        # Append this mean vector to the list of mean vectors (this list will have one mean vector for every tweet in our dataset).
        mean_of_word_vectors.append(mean_vector)
      
      return np.array(mean_of_word_vectors)

In [None]:
# Load the model of word vector representations that was trained in the previous section.
ft_model = load_model(word_vector_model_filepath)

# Create a dictionary mapping each unique word to its fastText word vector.
word_vector_dictionary = {word : ft_model.get_word_vector(word) for word in unique_words}

In [None]:
# Create the feature and target vectors.
X = ft_df['Clean_Tweet'].to_numpy()
y = ft_df['label'].to_numpy()

In [None]:
# Create a pipeline using our fastText mean embedding vectorizer and a stochastic gradient decent classifier.
sgd_model_pipeline = Pipeline([("ft_word_vectorizer", MeanEmbeddingVectorizer(word_vector_dictionary)),
                               ("SGD_Clf", SGDClassifier())])

# Use cross validation to calculate the f1_score of the pipeline above.
sgd_clf_score = cross_validate(sgd_model_pipeline, X, y, scoring='f1_macro')

sgd_clf_score

{'fit_time': array([3.03938246, 4.57707596, 4.3450129 , 3.01535845, 3.03911781]),
 'score_time': array([0.76280284, 1.0842104 , 0.70577145, 0.72756815, 0.73754811]),
 'test_score': array([0.48182998, 0.48182998, 0.48182998, 0.48178795, 0.48178795])}

In [None]:
# Create a pipeline using our fastText mean embedding vectorizer and a stochastic extra random forest classifier.
extra_tree_clf_pipeline = Pipeline([("ft_word_vectorizer", MeanEmbeddingVectorizer(word_vector_dictionary)),
                                    ("Extra_Tree_Clf", ExtraTreesClassifier(n_estimators=200))])

# Use cross validation to calculate the f1_score of the pipeline above.
extraTree_clf_score = cross_validate(extra_tree_clf_pipeline, X, y, scoring='f1_macro')

extraTree_clf_score

{'fit_time': array([11.76157546, 12.0076077 , 11.87757182, 11.78357434, 11.85364008]),
 'score_time': array([1.05904984, 1.02256656, 1.02594709, 1.04065871, 1.02431607]),
 'test_score': array([0.68917414, 0.6694044 , 0.67584249, 0.67206223, 0.68398458])}

In [None]:
# Create a pipeline using our fastText mean embedding vectorizer and a multi-layer perceptron classifier.
mlp_pipeline = Pipeline([("ft_word_vectorizer", MeanEmbeddingVectorizer(word_vector_dictionary)),
                         ("mlp_clf", MLPClassifier())])

# Use cross validation to calculate the f1_score of the pipeline above.
mlp_clf_score = cross_validate(mlp_pipeline, X, y, scoring='f1_macro')

mlp_clf_score



{'fit_time': array([ 63.17506194, 151.14886594, 101.47165012, 108.33660197,
         58.32573652]),
 'score_time': array([0.95613837, 1.05994725, 1.05627894, 1.05085969, 1.02306342]),
 'test_score': array([0.48182998, 0.52879689, 0.51821786, 0.48178795, 0.48178795])}

In [None]:
# Create a pipeline using our fastText mean embedding vectorizer and a multi-layer perceptron classifier.
boost_pipeline = Pipeline([("ft_word_vectorizer", MeanEmbeddingVectorizer(word_vector_dictionary)),
                           ("lgbm_clf", LGBMClassifier(n_estimators=200))])

# Use cross validation to calculate the f1_score of the pipeline above.
lbgm_clf_score = cross_validate(boost_pipeline, X, y, scoring='f1_macro')

lbgm_clf_score

{'fit_time': array([8.84438562, 8.76633883, 8.81909418, 8.79916406, 8.78158092]),
 'score_time': array([0.78064919, 0.76951694, 0.76265335, 0.78558946, 0.7532897 ]),
 'test_score': array([0.66144233, 0.66000512, 0.65596638, 0.64262222, 0.65883131])}

### In the section above we used fastText word vectors created using the 'skipgram' unsupervised modeling technique in a Scikit-learn pipeline as an input other supervised models. 

### In this section we recreate the fastText word vectors, this time using the 'cbow' unsupervised modeling technique, and again use them as inputs to the same supervised models. 

In [None]:
# Location to save the file of learned word vector representations
word_vector_model_cbow_filepath = r"/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/fastText_Models/unsupervised_model_cbow.bin"

In [None]:
# Use a cbow model to create word vector representations of every word in the tweets.
unsupervised_model_cbow = fasttext.train_unsupervised(unlabeled_tweet_filepath, model='cbow')

# Save the learned word vector representations.
unsupervised_model_cbow.save_model(word_vector_model_cbow_filepath)

In [None]:
# Load the cbow fastText word vector model.
ft_model_cbow = load_model(word_vector_model_cbow_filepath)

In [None]:
# Create a dictionary mapping each unique word to its fastText word vector.
word_vector_dictionary_cbow = {word : ft_model_cbow.get_word_vector(word) for word in unique_words}

In [None]:
# Create a pipeline using our fastText mean embedding vectorizer and a stochastic gradient decent classifier.
sgd_model_pipeline_cbow = Pipeline([("ft_word_vectorizer", MeanEmbeddingVectorizer(word_vector_dictionary_cbow)),
                                    ("SGD_Clf", SGDClassifier())])

# Use cross validation to calculate the f1_score of the pipeline above.
sgd_clf_cbow_score = cross_validate(sgd_model_pipeline_cbow, X, y, scoring='f1_macro')

sgd_clf_cbow_score

{'fit_time': array([3.05295968, 3.99344063, 4.37372613, 3.13203883, 2.98276353]),
 'score_time': array([0.71279097, 0.96511126, 0.98486471, 0.7162931 , 0.70252132]),
 'test_score': array([0.48182998, 0.48182998, 0.48182998, 0.48178795, 0.48178795])}

In [None]:
# Create a pipeline using our fastText mean embedding vectorizer and a stochastic extra random forest classifier.
extra_tree_clf_pipeline_cbow = Pipeline([("ft_word_vectorizer", MeanEmbeddingVectorizer(word_vector_dictionary_cbow)),
                                         ("Extra_Tree_Clf", ExtraTreesClassifier(n_estimators=200))])

# Use cross validation to calculate the f1_score of the pipeline above.
extraTree_clf_cbow_score = cross_validate(extra_tree_clf_pipeline_cbow, X, y, scoring='f1_macro')

extraTree_clf_cbow_score

{'fit_time': array([11.97788239, 12.13678455, 11.87558174, 11.88341928, 12.41749525]),
 'score_time': array([1.05648184, 1.04730582, 1.03632307, 1.05318975, 1.03329182]),
 'test_score': array([0.68917414, 0.66787767, 0.67433512, 0.66750442, 0.6825124 ])}

In [None]:
# Create a pipeline using our fastText mean embedding vectorizer and a multi-layer perceptron classifier.
mlp_pipeline_cbow = Pipeline([("ft_word_vectorizer", MeanEmbeddingVectorizer(word_vector_dictionary_cbow)),
                              ("mlp_clf", MLPClassifier())])

# Use cross validation to calculate the f1_score of the pipeline above.
mlp_clf_score_cbow = cross_validate(mlp_pipeline_cbow, X, y, scoring='f1_macro')

mlp_clf_score_cbow

{'fit_time': array([72.05164409, 19.39311123, 50.95842099, 44.7173655 , 57.14424276]),
 'score_time': array([1.02342415, 0.79099369, 0.99399972, 1.0319469 , 0.97240925]),
 'test_score': array([0.48182998, 0.48182998, 0.48182998, 0.48178795, 0.48178795])}

In [None]:
# Create a pipeline using our fastText mean embedding vectorizer and a multi-layer perceptron classifier.
boost_pipeline_cbow = Pipeline([("ft_word_vectorizer", MeanEmbeddingVectorizer(word_vector_dictionary_cbow)),
                                ("lgbm_clf", LGBMClassifier(n_estimators=200))])

# Use cross validation to calculate the f1_score of the pipeline above.
lbgm_clf_score_cbow = cross_validate(boost_pipeline_cbow, X, y, scoring='f1_macro')

lbgm_clf_score_cbow

{'fit_time': array([8.30938506, 8.31539774, 8.37187767, 8.28513646, 8.25943112]),
 'score_time': array([0.77977586, 0.76945925, 0.75384355, 0.76340342, 0.7754581 ]),
 'test_score': array([0.63844614, 0.63746201, 0.63421533, 0.62205726, 0.64918614])}