In [1]:
# SETUP - only needed the first time you run the notebook: uncomment the lines
# below, run this cell once, then comment them out again.
# NOTE(review): `make` builds the fastText command-line binary; the `fasttext`
# Python module imported in the next cell is normally installed by running
# `pip install .` inside the cloned repository — confirm which step this
# environment actually needs.
#!git clone https://github.com/facebookresearch/fastText.git
#cd fastText
#!make

In [12]:
import fasttext
import pandas as pd
import numpy as np
import ft_helpers as fth

In [13]:
# Print the API summary of the fasttext.FastText module (exploration aid only;
# the call's result is not assigned or used by later cells).
help(fasttext.FastText)

Help on module fasttext.FastText in fasttext:

NAME
    fasttext.FastText

DESCRIPTION
    # Copyright (c) 2017-present, Facebook, Inc.
    # All rights reserved.
    #
    # This source code is licensed under the MIT license found in the
    # LICENSE file in the root directory of this source tree.

FUNCTIONS
    cbow(*kargs, **kwargs)
    
    eprint(*args, **kwargs)
    
    load_model(path)
        Load a model given a filepath and return a model object.
    
    read_args(arg_list, arg_dict, arg_names, default_values)
    
    skipgram(*kargs, **kwargs)
    
    supervised(*kargs, **kwargs)
    
    tokenize(text)
        Given a string of text, tokenize it and return a list of tokens
    
    train_supervised(*kargs, **kwargs)
        Train a supervised model and return a model object.
        
        input must be a filepath. The input text does not need to be tokenized
        as per the tokenize function, but it must be preprocessed and encoded
        as UTF-8. You might wan

In [14]:
# Load data
# NOTE(review): `fth.load_data` is a project helper; full=True presumably selects
# the full dataset rather than the small sample — confirm in ft_helpers.
tweets, test = fth.load_data(full=True)

# Split training data into training and validation sets
# NOTE(review): the positional 0.3 and 42 are presumably the validation fraction
# and the random seed — verify against ft_helpers.train_val_split.
train, val = fth.train_val_split(tweets['body'], tweets['label'], 0.3, 42)

In [15]:
# Reindex the dataframe for fasttext format
# NOTE(review): fastText's supervised input expects every line to carry a
# `__label__<class>` prefix; presumably reindex_df arranges the frames that way
# — confirm in ft_helpers.
# Both names are rebound to the helper's result, so re-running this cell on its
# own feeds the already-reindexed frames back through reindex_df.
train = fth.reindex_df(train)
val = fth.reindex_df(val)

In [16]:
# Save dataframes into .txt for fasttext (one row per line, columns separated
# by a single space, every value formatted with '%s').
# fastText requires its input file to be UTF-8, while np.savetxt writes latin1
# by default: that raises UnicodeEncodeError on characters outside latin1 and
# emits invalid UTF-8 for accented latin1 characters. Set the encoding explicitly.
np.savetxt(r'train_full.txt', train.values, fmt='%s', encoding='utf-8')
np.savetxt(r'val_full.txt', val.values, fmt='%s', encoding='utf-8')

In [17]:
# Train fasttext model on training set
# lr=1.0, epoch=1, wordNgrams=4 is the best combination recorded in the Results
# notes at the end of this notebook.
# NOTE(review): the score logged there for these settings (0.86412) differs
# slightly from this run's output (0.86390), so training is presumably not
# deterministic between runs — confirm before comparing close results.
model = fasttext.train_supervised(input="train_full.txt", lr=1.0, epoch=1, wordNgrams=4)

In [18]:
# Test the model on validation set
# The cell displays the value returned by model.test; per the fastText Python
# docs this is presumably (number of examples, precision@1, recall@1) —
# confirm for the installed version.
model.test("val_full.txt")

(750000, 0.8638973333333333, 0.8638973333333333)

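## Results

Each entry below is the tuple returned by `model.test` on the validation file: (number of validation examples, precision@1, recall@1).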

### w/o hyperparameter tuning:
- dataset=small: (60000, 0.82505, 0.82505)
- dataset=full: (750000, 0.8355986666666667, 0.8355986666666667)


### w/ some hyperparameter tuning:
- dataset=small, lr=1.0, epoch=1, wordNgrams=3: (60000, 0.8371166666666666, 0.8371166666666666)
- dataset=full, lr=1.0, epoch=1, wordNgrams=3: (750000, 0.8639346666666666, 0.8639346666666666)

- dataset=full, lr=0.3, epoch=1, wordNgrams=3: (750000, 0.86366, 0.86366)
- dataset=full, lr=0.3, epoch=20, wordNgrams=3: (750000, 0.847332, 0.847332)

- dataset=full, lr=1.0, epoch=1, wordNgrams=4: (750000, 0.86412, 0.86412) 
- dataset=full, lr=1.0, epoch=1, wordNgrams=5: (750000, 0.8628946666666667, 0.8628946666666667)
- dataset=full, lr=1.0, epoch=1, wordNgrams=6: (750000, 0.8615613333333333, 0.8615613333333333)
- dataset=full, lr=1.0, epoch=1, wordNgrams=7: (750000, 0.860692, 0.860692)
- dataset=full, lr=1.0, epoch=1, wordNgrams=8: (750000, 0.859468, 0.859468)

- dataset=full, lr=1.0, epoch=100, wordNgrams=4: (750000, 0.8457546666666667, 0.8457546666666667)

- dataset=full, lr=1.0, epoch=1: (750000, 0.8324173333333333, 0.8324173333333333)
- dataset=full, lr=1.0, epoch=10: (750000, 0.792976, 0.792976)

Best parameters: lr=1.0, epoch=1, wordNgrams=4

- To continue, apply the methods suggested in this tutorial: https://fasttext.cc/docs/en/supervised-tutorial.html
- More FastText documentation here: https://fasttext.cc

### TODO: 
1. Try more ngrams and more hyperparameters -- done
2. Comment more -- done
3. Create more functions (make the code more modular), e.g. another function for train/test splitting alone -- done
4. Try with the preprocessed data
5. Create submission csv and submit to AIcrowd