<a href="https://colab.research.google.com/github/BradenAnderson/Twitter-Sentiment-Analysis/blob/main/05_fastText_parameters_from_auto_tune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### This notebook explores how to recover, from a trained model, the best hyperparameters found by fastText's autotune.

Some additional information on this topic can be found here:

https://github.com/facebookresearch/fastText/issues/913



In [None]:
# Mount Google Drive at /content/drive; the model and data paths used in later cells live under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Fetch the fastText sources; the Python package is installed from this checkout (/content/fastText).
! git clone https://github.com/facebookresearch/fastText.git

Cloning into 'fastText'...
remote: Enumerating objects: 3854, done.
remote: Total 3854 (delta 0), reused 0 (delta 0), pack-reused 3854
Receiving objects: 100% (3854/3854), 8.22 MiB | 19.63 MiB/s, done.
Resolving deltas: 100% (2417/2417), done.


In [None]:
! pip install /content/fastText

Processing ./fastText
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... done
  Created wheel for fasttext: filename=fasttext-0.9.2-cp37-cp37m-linux_x86_64.whl size=3085711 sha256=a69d2476ad23b9f32b73bf1ae976705838629b8e438245a212f74ee9a95bf364
  Stored in directory: /tmp/pip-ephem-wheel-cache-qjoo7fpr/wheels/a1/9f/52/696ce6c5c46325e840c76614ee5051458c0df10306987e7443
Successfully built fasttext
Installing collected packages: fasttext
Successfully installed fasttext-0.9.2


In [None]:
import pickle
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import multiprocessing

import fasttext
from fasttext.FastText import load_model

In [None]:
# Notebook-wide constants.
# NOTE(review): only LABEL_SEPARATOR is referenced by the cells visible in this notebook;
# the other names may be leftovers from a shared template -- confirm before relying on them.
TEXT_COLUMN = 'text'
LABEL_COLUMN = 'label'
LABEL_SEPARATOR = '__label__'  # prefix fastText puts in front of every label string
PROBABILITY_COLUMN = 'p'
RANDOM_SEED = 42
VERBOSE = 3

In [None]:
# Default hyperparameters for supervised fastText training.
# Only the KEYS matter in this notebook: get_model_parameters() iterates over
# them to decide which attributes to read back from a trained model.
train_parameters = {
    'lr': 0.1,
    'dim': 100,
    'ws': 5,
    'epoch': 5,
    'minCount': 1,
    'minCountLabel': 0,
    'minn': 0,
    'maxn': 0,
    'neg': 5,
    'wordNgrams': 1,
    # Without this key the loss function is never read back from the model,
    # and the `param == 'loss'` branch in get_model_parameters() is dead code.
    'loss': 'softmax',
    'bucket': 2000000,
    # Leave one core free, but never request fewer than one thread.
    'thread': max(1, multiprocessing.cpu_count() - 1),
    'lrUpdateRate': 100,
    't': 1e-4,
    'label': LABEL_SEPARATOR,
    'verbose': 2,
    'pretrainedVectors': '',
    'seed': 0,
}

def get_model_parameters(model, param_names=None):
    """Read hyperparameters back from a trained fastText model.

    Args:
        model: A loaded/trained fastText model exposing ``model.f.getArgs()``.
        param_names: Optional iterable of argument names to read. When omitted,
            the keys of the notebook-level ``train_parameters`` dict are used.

    Returns:
        dict mapping each requested name to the value stored in the model's
        args object. Enum-valued arguments ('loss' and 'model') are reported
        by their ``name`` so the result contains plain strings.

    Raises:
        AttributeError: if a requested name is not an attribute of the args
            object.
    """
    args_getter = model.f.getArgs()

    # Resolve the default lazily so callers passing explicit names do not
    # depend on the notebook-level dict at all.
    if param_names is None:
        param_names = train_parameters

    # These arguments are exposed as enum objects rather than plain values.
    enum_params = ('loss', 'model')

    parameters = {}
    for param in param_names:
        attr = getattr(args_getter, param)
        if param in enum_params:
            # Fall back to the raw value if it is already a plain string.
            attr = getattr(attr, 'name', attr)
        parameters[param] = attr

    return parameters

In [None]:
# Path (on the mounted Drive) to the auto-tuned model that was trained for 8.33 hours.
model_path = "/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/fastText_Models/autoTuned_longTrain_model.bin"

# Load the saved model so its stored arguments can be inspected.
model = load_model(model_path)

In [None]:
# Use the function above to extract the parameters stored in the auto-tuned model.
parameters = get_model_parameters(model)

In [None]:
# Display the best parameters (the bare expression is rendered as the cell's output).
parameters

{'bucket': 0,
 'dim': 41,
 'epoch': 76,
 'label': '__label__',
 'lr': 0.05,
 'lrUpdateRate': 100,
 'maxn': 0,
 'minCount': 1,
 'minCountLabel': 0,
 'minn': 0,
 'neg': 5,
 'pretrainedVectors': '',
 'seed': 0,
 't': 0.0001,
 'thread': 12,
 'verbose': 2,
 'wordNgrams': 1,
 'ws': 5}

In [None]:
# Use the best parameters (shown above) to retrain the model on all of the data.
# The retraining has already been completed, so it is skipped by default and the
# saved model is simply loaded further down. Set RETRAIN_ON_ALL_DATA = True to
# run the training again.
RETRAIN_ON_ALL_DATA = False

if RETRAIN_ON_ALL_DATA:
    # Path to a fastText formatted data file containing all of the data.
    all_data_file = "/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/fastText_Models/all_data.txt"

    # Hyperparameter values copied from the auto-tuned model's arguments.
    all_data_retrained_8hr_best_params_model = fasttext.train_supervised(
        input=all_data_file,
        bucket=0,
        dim=41,
        epoch=76,
        lr=0.05,
        lrUpdateRate=100,
        maxn=0,
        minCount=1,
        minCountLabel=0,
        minn=0,
        neg=5,
        seed=0,
        t=0.0001,
        thread=12,
        verbose=2,
        wordNgrams=1,
        ws=5,
    )

In [None]:
# Location (on the mounted Drive) of the model retrained on all of the data.
model_filepath = "/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/fastText_Models/8hr_retrained_all_data_best_params.bin"

# Save the model that has been retrained on all of the data.
# (Kept commented out: only needed right after a fresh retraining run.)
#all_data_retrained_8hr_best_params_model.save_model(model_filepath)

# Next time we run this notebook, we can just load the model that has been retrained on all of the data, with the best hyperparameters.
all_data_retrained_8hr_best_params_model = load_model(model_filepath)

# Load the previously unseen test data set (about 17k tweets).

The next few cells put the test data into the format that fastText expects.

In [None]:
# Location of the cleaned test tweets on the mounted Drive.
filepath = "/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/train_test_data/test_tweets_clean.csv"

# Read the CSV into a DataFrame.
tweet_df = pd.read_csv(filepath_or_buffer=filepath)

# Preview the first five rows.
tweet_df.head(n=5)

Unnamed: 0,id,tweet,tweet_emoji_cleaned,Fully_Clean_Tweet_Tokenized,Clean_Tweet
0,31963,#studiolife #aislife #requires #passion #dedic...,#studiolife #aislife #requires #passion #dedic...,"['#studiolife', '#aislife', '#requires', '#pas...",#studiolife #aislife #requires #passion #dedic...
1,31964,@user #white #supremacists want everyone to s...,@user #white #supremacists want everyone to s...,"['#white', '#supremacists', 'want', 'new', '#b...",#white #supremacists want new #birds #movie
2,31965,safe ways to heal your #acne!! #altwaystohe...,safe ways to heal your #acne!! #altwaystohe...,"['safe', 'way', 'heal', 'pron', '#acne', '#alt...",safe way heal pron #acne #altwaystoheal #healt...
3,31966,is the hp and the cursed child book up for res...,is the hp and the cursed child book up for res...,"['hp', 'curse', 'child', 'book', 'reservation'...",hp curse child book reservation yes happy love...
4,31967,"3rd #bihday to my amazing, hilarious #nephew...","3rd #bihday to my amazing, hilarious #nephew...","['3rd', '#bihday', 'pron', 'amazing', 'hilario...",3rd #bihday pron amazing hilarious #nephew eli...


In [None]:
# Keep only the tweet id and the cleaned text, as an independent copy of those columns.
wanted_columns = ['id', 'Clean_Tweet']
fastText_df = tweet_df[wanted_columns].copy()

fastText_df.head()

Unnamed: 0,id,Clean_Tweet
0,31963,#studiolife #aislife #requires #passion #dedic...
1,31964,#white #supremacists want new #birds #movie
2,31965,safe way heal pron #acne #altwaystoheal #healt...
3,31966,hp curse child book reservation yes happy love...
4,31967,3rd #bihday pron amazing hilarious #nephew eli...


In [None]:
# To submit the predictions to Analytics Vidhya for grading we need to
# reconstruct two columns: 1) the tweet ids, 2) the predictions.
# Pair every id with its tweet text here so that each fastText prediction
# can be matched back to the id it belongs to.

index_list = list(fastText_df['id'].to_numpy())

tweet_list = list(fastText_df['Clean_Tweet'].to_numpy())

# One (id, tweet) tuple per row, in the original row order.
tweet_data = list(zip(index_list, tweet_list))

In [None]:
# Column-oriented store for the fastText predictions: one list per output column.
predicted_classes = {'id': [], 'label': [], 'tweet_text': []}

# Ask the retrained model for a prediction on every test tweet, keeping each
# tweet's id and text next to the prediction made for it.
for tweet_id, tweet_text in tweet_data:
  prediction = all_data_retrained_8hr_best_params_model.predict(tweet_text)

  predicted_classes['id'].append(tweet_id)
  predicted_classes['label'].append(prediction)
  predicted_classes['tweet_text'].append(tweet_text)

In [None]:
# Reformat the predictions made by fastText for submission to Analytics Vidhya.
#
# Each stored prediction is a (labels, probabilities) pair, so prediction[0][0]
# is the top label string. The recorded output shows labels such as
# '__label__0school' (class digit followed by extra text), so after removing
# the '__label__' prefix only the first character -- the class digit -- is kept.
clean_label = [
    prediction[0][0].replace("__label__", "")[0]
    for prediction in predicted_classes['label']
]

# Build the two-column submission frame directly rather than creating the full
# frame and then dropping/renaming columns in place.
prediction_df = pd.DataFrame({
    'id': predicted_classes['id'],
    'label': clean_label,
})

Unnamed: 0,id,label,tweet_text
0,31963,"((__label__0school,), [0.023810237646102905])",#studiolife #aislife #requires #passion #dedic...
1,31964,"((__label__0work,), [0.05442137271165848])",#white #supremacists want new #birds #movie
2,31965,"((__label__0safe,), [0.8979480266571045])",safe way heal pron #acne #altwaystoheal #healt...
3,31966,"((__label__0yes,), [0.053191687911748886])",hp curse child book reservation yes happy love...
4,31967,"((__label__0thank,), [0.2912096679210663])",3rd #bihday pron amazing hilarious #nephew eli...


In [None]:
# This is what we will submit to Analytics Vidhya (preview of the first rows).
prediction_df.head()

Unnamed: 0,id,label
0,31963,0
1,31964,0
2,31965,0
3,31966,0
4,31967,0


In [None]:
# Write the predictions to the submission file on Drive; the DataFrame index is not written.
submission_path = "/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/test_data_prediction_files/Anderson_Submit_8hr_all_data_ft.csv"
prediction_df.to_csv(path_or_buf=submission_path, index=False)