# Tweet Sentiment Analysis using fastText

## Imports and setup

In [None]:
#SETUP - You need these only the first time you run the code
#!git clone https://github.com/facebookresearch/fastText.git
#cd fastText
#!make

In [None]:
#nltk.download('averaged_perceptron_tagger')
#nltk.download('wordnet')

In [None]:
import fasttext
import pandas as pd
import numpy as np
import ft_helpers as fth
import preprocessing as pp
import nltk

In [None]:
#help(fasttext.FastText)

## Select your (hyper)parameters here

In [None]:
# True  -> also create the submission .csv for the test set
# False -> only test on the validation data
CREATE_SUBMISSION = True

# True -> also create a .csv file with the prediction probability for each tweet
GET_PROBABILITIES = True

# True -> run the tweet bodies through the preprocessing module before training
PREPROCESS = False

# Word n-gram setting passed to fastText as wordNgrams
NGRAMS = 3

# Postfix appended to the output file names so that earlier results are not overwritten.
# NOTE(review): the postfix says "50epochs" but the training cell uses epoch=2 — confirm which is intended.
SUBMISSION_POSTFIX = '_{}grams_50epochs'.format(NGRAMS)

## Load data

In [None]:
# Load the training tweets and the test set through the project helper.
# NOTE(review): full=True presumably selects the full (rather than a reduced) training set — confirm in ft_helpers.load_data.
tweets, test = fth.load_data(full=True)

In [None]:
# Optionally preprocess the tweet bodies of both the training and the test data.
# PREPROCESS is a plain boolean flag, so its truthiness is tested directly
# instead of comparing it to True.
# NOTE: the 'body' columns are overwritten in place, so re-running this cell
# feeds already-preprocessed text through the preprocessing again.
if PREPROCESS:
    tweets['body'] = pp.preprocess_data(tweets['body'])
    test['body'] = pp.preprocess_data(test['body'])

## Train-test split

In [None]:
# Split the tweet bodies and their labels into a training part and a validation part.
# NOTE(review): 0.2 and 42 are presumably the validation fraction and the random seed — confirm against fth.train_val_split.
train, val = fth.train_val_split(tweets['body'], tweets['label'], 0.2, 42)

## Model training

In [None]:
# Reindex the dataframes according to fasttext's format.
# The three names are rebound to the helper's results, so re-running this cell
# passes already-reindexed frames back in.
# NOTE(review): CREATE_SUBMISSION presumably controls whether `test` is processed — confirm in ft_helpers.reindex_dfs.
train, val, test = fth.reindex_dfs(CREATE_SUBMISSION, train, val, test)

In [None]:
# Create data .txt files to be used in fasttext model.
# train_txt is later passed to fasttext.train_supervised as `input`, and val_txt to model.test;
# test_txt is not used again in this notebook.
train_txt, val_txt, test_txt = fth.save_txt(train, val, test, SUBMISSION_POSTFIX, CREATE_SUBMISSION)

In [None]:
# Train the supervised fastText classifier on the training .txt file.
# NOTE(review): lr=0.15 differs from the best learning rate (0.1) recorded in the
# "Learning rate" tuning notes of this notebook — confirm which value is intended.
model = fasttext.train_supervised(input=train_txt, lr=0.15, epoch=2, wordNgrams=NGRAMS)

## Model testing

## On the validation set

In [None]:
# Evaluate the trained model on the validation .txt file; the result is shown as the cell output.
# NOTE(review): per the fastText Python API, test() returns (number of samples, precision@1, recall@1);
# the tuning notes in this notebook label the reported value an F1-score — confirm.
model.test(val_txt)

## On the test set

In [None]:
# Write the test-set outputs selected in the configuration cell.
# Both flags are plain booleans, so their truthiness is tested directly
# instead of comparing them to True.

# Predictions -> 'output<postfix>.csv'
if CREATE_SUBMISSION:
    filename = 'output' + SUBMISSION_POSTFIX + '.csv'
    fth.create_csv_submission(model, test, filename)

# Per-tweet prediction probabilities -> 'prob<postfix>.csv'
if GET_PROBABILITIES:
    filename = 'prob' + SUBMISSION_POSTFIX + '.csv'
    fth.create_probabilities_csv(model, test, filename)

## Hyperparameter Tuning

### Number of epochs

In [None]:
# Sweep the number of training epochs (1..9) and print the validation result for each.
# The trial models get their own name so the sweep does not overwrite `model`,
# the classifier trained in the "Model training" section; otherwise re-running the
# submission cell afterwards would silently use the last tuning model instead.
for n_epochs in range(1, 10):
    trial_model = fasttext.train_supervised(input=train_txt, lr=1.0, epoch=n_epochs, wordNgrams=NGRAMS)
    print(n_epochs, trial_model.test(val_txt))

**number of epochs, F1-score**
- 1, 0.864954
- **2, 0.86699**
- 3, 0.860868
- 4, 0.85543
- 5, 0.85203
- 6, 0.850778
- 7, 0.850452
- 8, 0.84953
- 9, 0.849226

### Learning rate

In [None]:
# Sweep the learning rate over 0.05, 0.10, ..., 1.00 and print the validation result for each.
# The values are derived from an integer step counter rather than by repeatedly adding 0.05:
# accumulated float addition drifts (e.g. 0.15000000000000002), and the accumulated value
# can end up slightly above 1.0 so that the final lr=1.0 trial is skipped by `<= 1.0`.
# The trial models get their own name so the sweep does not overwrite `model`,
# the classifier trained in the "Model training" section.
for step in range(1, 21):
    lr = round(step * 0.05, 2)
    trial_model = fasttext.train_supervised(input=train_txt, lr=lr, epoch=2, wordNgrams=NGRAMS)
    print(lr, trial_model.test(val_txt))

**learning rate, F1-score**

- **0.1, 0.868022**
- 0.2, 0.867472
- 0.3, 0.867402
- 0.4, 0.867426
- 0.5, 0.867228
- 0.6, 0.86709
- 0.7, 0.867224
- 0.8, 0.867068
- 0.9, 0.86718
- 1.0, 0.867068

### 'n'-grams

In [None]:
# Sweep wordNgrams over 1..6 and print the validation result for each.
# The trial models get their own name so the sweep does not overwrite `model`,
# the classifier trained in the "Model training" section; otherwise re-running the
# submission cell afterwards would silently use the last tuning model instead.
for n_grams in range(1, 7):
    trial_model = fasttext.train_supervised(input=train_txt, lr=0.1, epoch=2, wordNgrams=n_grams)
    print(n_grams, trial_model.test(val_txt))

**n, F1-score**
- 1, 0.833644
- 2, 0.865526
- **3, 0.86983**
- 4, 0.867922
- 5, 0.865792
- 6, 0.863836

### fastText's automatic hyperparameter optimization

In [None]:
# Train with fastText's automatic hyperparameter optimization, using the validation file for the search.
# NOTE(review): lr, epoch and wordNgrams are passed explicitly here; fastText's autotune presumably
# keeps explicitly set arguments fixed instead of tuning them — confirm against the fastText
# autotune documentation, and drop them if a fully automatic search is intended.
model_auto = fasttext.train_supervised(input=train_txt, lr=0.1, epoch=2, wordNgrams=NGRAMS, autotuneValidationFile=val_txt)

In [None]:
# Evaluate the auto-tuned model on the validation file.
# NOTE: this is the same file that was passed as autotuneValidationFile, so the
# hyperparameters were selected on the data being scored here.
model_auto.test(val_txt)

The model created using fastText's automatic hyperparameter optimization achieves an accuracy of 86.4% and an F1-score of 86.6 on AICrowd.

In [None]:
# Write the auto-tuned model's submission .csv for the test set via the project helper.
fth.create_csv_submission(model_auto, test, 'ft_completely_auto.csv')

In [None]:
# Write the auto-tuned model's per-tweet prediction probabilities to a .csv via the project helper.
fth.create_probabilities_csv(model_auto, test, 'ft_completely_auto_prob.csv')

## Ensemble models

## Using only n-grams

In [None]:
# Load the saved outputs of the 2- to 6-gram models, one CSV per model.
# NOTE(review): these names have no "_50epochs" part, unlike the files written with the
# current SUBMISSION_POSTFIX — confirm the CSVs exist under these names.
ngram_csv_paths = (
    "output_2grams.csv",
    "output_3grams.csv",
    "output_4grams.csv",
    "output_5grams.csv",
    "output_6grams.csv",
)
df_2grams, df_3grams, df_4grams, df_5grams, df_6grams = [
    pd.read_csv(csv_path) for csv_path in ngram_csv_paths
]

In [None]:
# Build the n-gram ensemble: add up the five models' predictions row by row and
# keep the sign of each total.
# NOTE(review): this is a majority vote only if every 'Prediction' is -1 or +1 — confirm.
ngram_frames = [df_2grams, df_3grams, df_4grams, df_5grams, df_6grams]
vote_total = ngram_frames[0]['Prediction']
for frame in ngram_frames[1:]:
    vote_total = vote_total + frame['Prediction']

# Start from a copy of the 2-gram output and replace its Id and Prediction columns.
df_ensemble = df_2grams.copy()
df_ensemble['Id'] = df_2grams.index + 1  # ids are the row positions, counted from 1
df_ensemble['Prediction'] = vote_total.apply(fth.sign)

In [None]:
# Save the n-gram ensemble into 'ensemble_ngrams.csv', comma-separated and without the dataframe index.
df_ensemble.to_csv('ensemble_ngrams.csv', sep=",", index=False)

The ensemble of the 2-, 3-, 4-, 5- and 6-gram models achieves an accuracy of 86.5% and an F1-score of 86.7 on AICrowd. This is a 0.3% improvement in accuracy over the best single classifier, i.e. the 3-gram model.

## Using n-grams + automatic hyperparameter optimization

In [None]:
# Read the output of the automatically optimized model.
# NOTE(review): the autotune section of this notebook writes 'ft_completely_auto.csv',
# not 'ft_auto.csv' — confirm this is the intended file.
df_auto = pd.read_csv("ft_auto.csv")

# Weighted vote over the five n-gram models plus the auto-tuned model. The 3-gram
# model is counted twice (emphasis on the best single model), so 7 votes are added
# up per tweet and the sign of the total is kept.
# fth.sign is used here, as in the n-gram-only ensemble: a bare `sign` is not
# imported or defined anywhere in this notebook and raises a NameError.
# NOTE: this overwrites the 'Prediction' column of the existing df_ensemble in place.
df_ensemble['Prediction'] = (
    df_2grams['Prediction']
    + 2 * df_3grams['Prediction']
    + df_4grams['Prediction']
    + df_5grams['Prediction']
    + df_6grams['Prediction']
    + df_auto['Prediction']
).apply(lambda x: fth.sign(x))

In [None]:
# Save the weighted ensemble (3-grams emphasis) into its own csv, comma-separated and without the dataframe index.
df_ensemble.to_csv('ensemble_with_auto_3gramsemphasis.csv', sep=",", index=False)

Ensemble of 2,3,4,5,6-grams and automatic hyperparameter optimized model, with an emphasis on the automatic model gives accuracy 86.5% and F1-score 86.7 on AICrowd. No improvement from the ensemble without the automatic model.

Ensemble of 2,3,4,5,6-grams and automatic hyperparameter optimized model, with an emphasis on the 3-grams model gives accuracy 86.5% and F1-score 86.8 on AICrowd. Emphasizing the best single model slightly improves the F1-score.