# Tweet Sentiment Analysis using fastText

## Imports and setup

In [78]:
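# SETUP - needed only the first time you run this notebook (uncomment the lines below)
#!git clone https://github.com/facebookresearch/fastText.git
#%cd fastText
#!make
#%pip install .   # `make` only builds the command-line binary; this installs the Python `fasttext` module imported below
#%cd ..           # return to the notebook directory so local modules and relative paths still resolve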

In [79]:
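# One-time NLTK resource downloads (uncomment to run; this cell comes before the imports cell, so it needs its own `import nltk`)
#import nltk
#nltk.download('averaged_perceptron_tagger')
#nltk.download('wordnet')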

In [80]:
import fasttext
import pandas as pd
import numpy as np
import ft_helpers as fth
import preprocessing as pp
import nltk

In [81]:
#help(fasttext.FastText)

## Select your (hyper)parameters here

In [82]:
# Run mode: if False, the model is only tested on the validation data;
# if True, a submission .csv is created as well.
CREATE_SUBMISSION = True

# Whether to pass the tweet bodies through preprocessing.preprocess_data first.
PREPROCESS = False

# Value passed to fastText as wordNgrams (maximum length of word n-grams).
NGRAMS = 3

# Postfix for the generated files, so that results from different settings
# do not overwrite each other. It is derived from the settings above instead
# of being typed by hand, so it cannot drift out of sync with NGRAMS or
# PREPROCESS. With the defaults above this evaluates to '_3grams'.
SUBMISSION_POSTFIX = '_' + str(NGRAMS) + 'grams' + ('_preprocessed' if PREPROCESS else '')

## Load data

In [83]:
# Load the two frames used by the rest of the notebook: `tweets` (its 'body' and
# 'label' columns are used for the train/validation split) and `test` (its 'body'
# column is used for the final predictions).
# NOTE(review): full=True presumably selects the complete dataset rather than a
# smaller sample — confirm in ft_helpers.load_data.
tweets, test = fth.load_data(full=True)

In [84]:
# Optionally run the project's preprocessing over the tweet bodies of both frames.
# The flag is a plain boolean, so test its truthiness directly instead of
# comparing it with `== True`.
# NOTE: the 'body' columns are replaced in place, so re-running this cell
# without reloading the data would preprocess already-preprocessed text.
if PREPROCESS:
    tweets['body'] = pp.preprocess_data(tweets['body'])
    test['body'] = pp.preprocess_data(test['body'])

## Train-test split

In [85]:
# Split the labelled tweets (bodies and labels) into a training part and a validation part.
# NOTE(review): 0.2 is presumably the validation fraction and 42 the random
# seed — confirm against ft_helpers.train_val_split.
train, val = fth.train_val_split(tweets['body'], tweets['label'], 0.2, 42)

## Model training

In [86]:
# Pass the three frames (and the CREATE_SUBMISSION flag) through the project's
# re-indexing helper before they are written to disk.
# NOTE: train/val/test are overwritten with the helper's output, so re-running
# this cell on its own feeds already-reindexed frames back in; re-run from the
# data-loading cell if in doubt.
train, val, test = fth.reindex_dfs(CREATE_SUBMISSION, train, val, test)

In [87]:
# Export the frames through the project helper. The returned train_txt and
# val_txt are what fastText receives below as its training input and its
# validation file.
# NOTE(review): presumably the helper writes fastText-formatted text files named
# with SUBMISSION_POSTFIX and returns their paths — confirm in ft_helpers.save_txt.
train_txt, val_txt, test_txt = fth.save_txt(train, val, test, SUBMISSION_POSTFIX, CREATE_SUBMISSION)

In [88]:
# Train the supervised fastText classifier on the exported training data.
# lr=0.1 and epoch=2 are the highest-scoring values in the sweeps recorded in
# the "Hyperparameter Tuning" section; wordNgrams comes from the NGRAMS setting.
model = fasttext.train_supervised(input=train_txt, lr=0.1, epoch=2, wordNgrams=NGRAMS)

## Model testing

### On the validation set

In [89]:
# Evaluate the trained model on the validation file. fastText's test() returns
# a tuple: (number of samples, precision@1, recall@1).
model.test(val_txt)

(500000, 0.869858, 0.869858)

### On the test set

In [90]:
# Predictions on the test frame together with their probabilities; the
# probabilities can be reused as inputs to an ensemble classifier.
predictions, probabilities = fth.get_prediction_probabilities(model, test)
probabilities[:5]

[0.99518883228302,
 0.5997893810272217,
 0.5740494132041931,
 0.9601660966873169,
 0.9867045879364014]

In [91]:
# Write the submission file only when submission mode is enabled. The flag is a
# plain boolean, so test its truthiness directly instead of comparing with `== True`.
if CREATE_SUBMISSION:
    # The postfix keeps results from different settings in separate files.
    filename = 'output' + SUBMISSION_POSTFIX + '.csv'
    fth.create_csv_submission(model, test, filename)

## Hyperparameter Tuning

### Number of epochs

In [35]:
# Sweep the number of training epochs (1..9) and print the validation score of each run.
# Each candidate is bound to its own name rather than `model`, so running this
# cell does not replace the final model that the prediction and submission
# cells rely on.
# NOTE: this sweep trains with lr=1.0, whereas the final model above uses
# lr=0.1, so the best epoch count found here was not measured at the final
# learning rate.
for n_epochs in range(1, 10):
    epoch_model = fasttext.train_supervised(input=train_txt, lr=1.0, epoch=n_epochs, wordNgrams=NGRAMS)
    print(n_epochs, epoch_model.test(val_txt))

1 (500000, 0.864954, 0.864954)
2 (500000, 0.86699, 0.86699)
3 (500000, 0.860868, 0.860868)
4 (500000, 0.85543, 0.85543)
5 (500000, 0.85203, 0.85203)
6 (500000, 0.850778, 0.850778)
7 (500000, 0.850452, 0.850452)
8 (500000, 0.84953, 0.84953)
9 (500000, 0.849226, 0.849226)


### Learning rate

In [33]:
# Sweep the learning rate over 0.1, 0.2, ..., 1.0 and print the validation score of each run.
# The rate is derived from an integer counter instead of repeatedly adding 0.1
# to a float: accumulated rounding error produced values such as
# 0.30000000000000004 and 0.9999999999999999, and only by luck did not skip
# the last step of the `<= 1.0` loop.
# Each candidate is bound to its own name rather than `model`, so running this
# cell does not replace the final model that the prediction and submission
# cells rely on.
for step in range(1, 11):
    learning_rate = step / 10
    lr_model = fasttext.train_supervised(input=train_txt, lr=learning_rate, epoch=2, wordNgrams=NGRAMS)
    print(learning_rate, lr_model.test(val_txt))

0.1 (500000, 0.868022, 0.868022)
0.2 (500000, 0.867472, 0.867472)
0.30000000000000004 (500000, 0.867402, 0.867402)
0.4 (500000, 0.867426, 0.867426)
0.5 (500000, 0.867228, 0.867228)
0.6 (500000, 0.86709, 0.86709)
0.7 (500000, 0.867224, 0.867224)
0.7999999999999999 (500000, 0.867068, 0.867068)
0.8999999999999999 (500000, 0.86718, 0.86718)
0.9999999999999999 (500000, 0.867096, 0.867096)


### 'n'-grams

In [34]:
# Sweep the wordNgrams setting (1..6) at lr=0.1 and 2 epochs, printing the
# validation score of each run.
# Each candidate is bound to its own name rather than `model`, so running this
# cell does not replace the final model that the prediction and submission
# cells rely on.
for n_grams in range(1, 7):
    ngram_model = fasttext.train_supervised(input=train_txt, lr=0.1, epoch=2, wordNgrams=n_grams)
    print(n_grams, ngram_model.test(val_txt))

1 (500000, 0.833644, 0.833644)
2 (500000, 0.865526, 0.865526)
3 (500000, 0.86983, 0.86983)
4 (500000, 0.867922, 0.867922)
5 (500000, 0.865792, 0.865792)
6 (500000, 0.863836, 0.863836)


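## Results

Each line below shows the value that was varied, followed by the tuple returned by `model.test` on the validation set: (number of validation samples, precision@1, recall@1). The tuning cells were executed earlier in the session than the main run (execution counts 33–35 versus 78–91), so the value of `NGRAMS` in effect during the epoch and learning-rate sweeps is not recorded here.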

### Trying different numbers of epochs (lr = 1.0):
- 1 (500000, 0.864954, 0.864954)
- 2 (500000, 0.86699, 0.86699)
- 3 (500000, 0.860868, 0.860868)
- 4 (500000, 0.85543, 0.85543)
- 5 (500000, 0.85203, 0.85203)
- 6 (500000, 0.850778, 0.850778)
- 7 (500000, 0.850452, 0.850452)
- 8 (500000, 0.84953, 0.84953)
- 9 (500000, 0.849226, 0.849226)

### Trying different learning rates (2 epochs):
- 0.1 (500000, 0.868022, 0.868022)
- 0.2 (500000, 0.867472, 0.867472)
- 0.30000000000000004 (500000, 0.867402, 0.867402)
- 0.4 (500000, 0.867426, 0.867426)
- 0.5 (500000, 0.867228, 0.867228)
- 0.6 (500000, 0.86709, 0.86709)
- 0.7 (500000, 0.867224, 0.867224)
- 0.7999999999999999 (500000, 0.867068, 0.867068)
- 0.8999999999999999 (500000, 0.86718, 0.86718)
- 0.9999999999999999 (500000, 0.867096, 0.867096)

### Trying different numbers of word n-grams (lr = 0.1, 2 epochs):
- 1 (500000, 0.833644, 0.833644)
- 2 (500000, 0.865526, 0.865526)
- 3 (500000, 0.86983, 0.86983)
- 4 (500000, 0.867922, 0.867922)
- 5 (500000, 0.865792, 0.865792)
- 6 (500000, 0.863836, 0.863836)

To continue, apply the methods suggested in this tutorial: https://fasttext.cc/docs/en/supervised-tutorial.html

More FastText documentation here: https://fasttext.cc