In [4]:
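# SETUP - only needed the first time you run this notebook (uncomment, run once, re-comment)
#!git clone https://github.com/facebookresearch/fastText.git
#cd fastText
#!make
# Note: `make` only builds the fastText command-line binary. The `import fasttext`
# used below needs the Python bindings, which the fastText README installs with
# `pip install .` from inside the cloned fastText directory.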

In [5]:
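# One-time NLTK resource downloads. `nltk` is imported in a later cell, so run
# that import first (or add `import nltk` here) before uncommenting these lines.
#nltk.download('averaged_perceptron_tagger')
#nltk.download('wordnet')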

In [6]:
import fasttext
import pandas as pd
import numpy as np
import ft_helpers as fth
import preprocessing as pp
import nltk

In [7]:
#help(fasttext.FastText)

In [5]:
'''Select your (hyper)parameters here'''

# True  -> a submission .csv is created for the test set.
# False -> the model is only evaluated on the validation data.
CREATE_SUBMISSION = True

# Toggle: run the tweets through the preprocessing step before training.
PREPROCESS = False

# Value passed to fastText as `wordNgrams` (maximum word n-gram length).
NGRAMS = 4

# Suffix for the submission file name, so that earlier results are not overwritten.
SUBMISSION_POSTFIX = '_full'

In [6]:
# Load the tweets and the test set through the project helper module.
# NOTE(review): full=True presumably selects the full dataset rather than the
# small one (cf. the "dataset=small" / "dataset=full" runs listed under
# Results) — confirm in ft_helpers.load_data.
tweets, test = fth.load_data(full=True)

In [7]:
# Optional cleaning step, switched on by the PREPROCESS flag.
# The flag is tested by truthiness rather than `== True` (PEP 8).
# NOTE: the 'body' columns are overwritten in place, so running this cell a
# second time feeds already-preprocessed text back into preprocess_data.
# Reload the data before re-running it.
if PREPROCESS:
    tweets['body'] = pp.preprocess_data(tweets['body'])
    test['body'] = pp.preprocess_data(test['body'])

In [8]:
# Split the tweets (text in 'body', target in 'label') into training and
# validation parts.
# NOTE(review): the positional 0.2 and 42 look like the validation fraction
# and the random seed — confirm against ft_helpers.train_val_split.
train, val = fth.train_val_split(
    tweets['body'],
    tweets['label'],
    0.2,
    42,
)

In [9]:
# Pass the three splits through the project's re-indexing helper.
# NOTE: train / val / test are rebound to the helper's outputs, so re-running
# this cell feeds the already re-indexed frames back in. Whether that is
# harmless depends on ft_helpers.reindex_dfs; re-run from the split cell if
# in doubt.
train, val, test = fth.reindex_dfs(
    CREATE_SUBMISSION,
    train,
    val,
    test,
)

In [10]:
# Hand the splits to the project helper that (going by its name) writes them
# out as text. The returned values are later given to
# fasttext.train_supervised(input=...) and model.test(...), so they are
# presumably file paths — confirm in ft_helpers.save_txt.
train_txt, val_txt, test_txt = fth.save_txt(
    train,
    val,
    test,
    SUBMISSION_POSTFIX,
    CREATE_SUBMISSION,
)

In [11]:
# Train the supervised fastText classifier on the training data.
# lr=1.0 and epoch=1 are the values recorded as best in the Results notes at
# the end of this notebook; the n-gram length comes from the NGRAMS setting.
model = fasttext.train_supervised(
    input=train_txt,
    lr=1.0,
    epoch=1,
    wordNgrams=NGRAMS,
)

In [12]:
# Evaluate the trained model on the validation data. The displayed tuple is
# fastText's (number of examples, precision@1, recall@1).
model.test(val_txt)

(500000, 0.865314, 0.865314)

In [13]:
# Predictions for the test set together with their probabilities; the
# probabilities can be reused as input for ensemble classifiers.
predictions, probabilities = fth.get_prediction_probabilities(model, test)

# Show only the first five probabilities as a quick sanity check.
probabilities[:5]

[0.9898786544799805,
 0.5681026577949524,
 0.6663926839828491,
 0.832999050617218,
 0.9534883499145508]

In [14]:
# Write the submission file only when CREATE_SUBMISSION is set.
# The flag is tested by truthiness rather than `== True` (PEP 8).
# The file is named 'output' + SUBMISSION_POSTFIX + '.csv', so changing the
# postfix keeps earlier submissions from being overwritten.
if CREATE_SUBMISSION:
    filename = 'output' + SUBMISSION_POSTFIX + '.csv'
    fth.create_csv_submission(model, test, filename)

## Results

### w/o hyperparameter tuning:

- dataset=small: (60000, 0.82505, 0.82505)
- dataset=full: (750000, 0.8355986666666667, 0.8355986666666667)

### w/ some hyperparameter tuning:
- dataset=small, lr=1.0, epoch=1, wordNgrams=3: (60000, 0.8371166666666666, 0.8371166666666666)
- dataset=full, lr=1.0, epoch=1, wordNgrams=3: (750000, 0.8639346666666666, 0.8639346666666666)
- dataset=full, lr=0.3, epoch=1, wordNgrams=3: (750000, 0.86366, 0.86366)
- dataset=full, lr=0.3, epoch=20, wordNgrams=3: (750000, 0.847332, 0.847332)
- dataset=full, lr=1.0, epoch=1, wordNgrams=4: (750000, 0.86412, 0.86412)
- dataset=full, lr=1.0, epoch=1, wordNgrams=5: (750000, 0.8628946666666667, 0.8628946666666667)
- dataset=full, lr=1.0, epoch=1, wordNgrams=6: (750000, 0.8615613333333333, 0.8615613333333333)
- dataset=full, lr=1.0, epoch=1, wordNgrams=7: (750000, 0.860692, 0.860692)
- dataset=full, lr=1.0, epoch=1, wordNgrams=8: (750000, 0.859468, 0.859468)
- dataset=full, lr=1.0, epoch=100, wordNgrams=4: (750000, 0.8457546666666667, 0.8457546666666667)
- dataset=full, lr=1.0, epoch=1: (750000, 0.8324173333333333, 0.8324173333333333)
- dataset=full, lr=1.0, epoch=10: (750000, 0.792976, 0.792976)
- Best parameters: lr=1.0, epoch=1, wordNgrams=4

To continue, apply the methods suggested in this tutorial: https://fasttext.cc/docs/en/supervised-tutorial.html

More FastText documentation here: https://fasttext.cc