In [143]:
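#SETUP - You only need these steps the first time you run the notebook
#!git clone https://github.com/facebookresearch/fastText.git
#cd fastText
#!make
#!pip install .   # installs the Python bindings needed for `import fasttext` (make only builds the command-line tool)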

In [144]:
import fasttext
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [145]:
# Exploratory: print the functions exposed by the fastText Python bindings.
help(fasttext.FastText)

Help on module fasttext.FastText in fasttext:

NAME
    fasttext.FastText

DESCRIPTION
    # Copyright (c) 2017-present, Facebook, Inc.
    # All rights reserved.
    #
    # This source code is licensed under the MIT license found in the
    # LICENSE file in the root directory of this source tree.

FUNCTIONS
    cbow(*kargs, **kwargs)
    
    eprint(*args, **kwargs)
    
    load_model(path)
        Load a model given a filepath and return a model object.
    
    read_args(arg_list, arg_dict, arg_names, default_values)
    
    skipgram(*kargs, **kwargs)
    
    supervised(*kargs, **kwargs)
    
    tokenize(text)
        Given a string of text, tokenize it and return a list of tokens
    
    train_supervised(*kargs, **kwargs)
        Train a supervised model and return a model object.
        
        input must be a filepath. The input text does not need to be tokenized
        as per the tokenize function, but it must be preprocessed and encoded
        as UTF-8. You might wan

In [146]:
DATA_PATH = './data/'


def _read_labelled_tweets(path, label):
    """
    Reads one tweet per line from a text file and tags every tweet with the same label.

    Args:
    path (str): text file containing one tweet per line
    label (str): FastText label (including the '__label__' prefix) assigned to every tweet

    Returns:
    (pandas dataframe): the tweets with columns ordered as ['label', 'body']
    """

    # FastText requires UTF-8 text, so decode explicitly instead of relying on
    # the platform's default encoding.
    with open(path, encoding='utf-8') as file:
        bodies = [line.rstrip() for line in file]

    labelled = pd.DataFrame(bodies, columns=['body'])
    labelled['label'] = label

    return reindex_df(labelled)


def load_data_fasttext(full = True):
    """
    Loads the Twitter data and splits the labelled tweets into train and validation sets.

    Args:
    full (bool): if True, reads 'train_pos_full.txt' / 'train_neg_full.txt';
                 if False, reads the smaller 'train_pos.txt' / 'train_neg.txt'

    Returns:
    tweets (pandas dataframe): all positive and negative tweets, columns ['label', 'body']
    train (pandas dataframe): 70% of tweets, columns ['label', 'body']
    val (pandas dataframe): the remaining 30% of tweets, columns ['label', 'body']
    X_train (pandas series): bodies of the training tweets
    X_val (pandas series): bodies of the validation tweets
    y_train (pandas series): labels of the training tweets
    y_val (pandas series): labels of the validation tweets
    test (pandas dataframe): unlabelled test tweets in a single 'body' column
    """

    suffix = '_full' if full else ''

    pos_path = DATA_PATH + 'train_pos' + suffix + '.txt'
    neg_path = DATA_PATH + 'train_neg' + suffix + '.txt'
    test_path = DATA_PATH + 'test_data.txt'

    pos_tweets = _read_labelled_tweets(pos_path, "__label__happyface")
    neg_tweets = _read_labelled_tweets(neg_path, "__label__sadface")

    with open(test_path, encoding='utf-8') as file:
        # each line is '<id>,<tweet>': split on the first comma only, because
        # the tweet itself may contain commas, and keep just the tweet
        test_data = [line.rstrip().split(',', 1)[1] for line in file]
    test = pd.DataFrame(test_data, columns=['body'])

    # merge positive and negative datasets
    # NOTE(review): ignore_index is not set, so both halves keep their own
    # 0-based row labels and the merged index contains duplicates.
    tweets = pd.concat([pos_tweets, neg_tweets], axis = 0)

    # 70% train / 30% validation; the fixed random_state keeps the split
    # reproducible. The split is not stratified by label.
    X_train, X_val, y_train, y_val = train_test_split(tweets['body'], tweets['label'], test_size=0.3, random_state=42)

    # put body and label back together, label first as FastText expects
    train = reindex_df(pd.concat([X_train, y_train], axis = 1))
    val = reindex_df(pd.concat([X_val, y_val], axis = 1))

    return tweets, train, val, X_train, X_val, y_train, y_val, test

In [147]:
def reindex_df(df):
    """
    Puts the columns of a tweet dataframe in the order FastText expects (label first, body second).

    Args:
    df (pandas dataframe): tweets with 'body' and 'label' columns

    Returns:
    (pandas dataframe): new dataframe with columns ordered as ['label', 'body']
    """

    return df.reindex(columns=['label', 'body'])

In [163]:
# Load the full dataset and its 70/30 train/validation split.
(
    tweets,
    train, val,
    X_train, X_val,
    y_train, y_val,
    test,
) = load_data_fasttext(full=True)

In [164]:
# Write one "<label> <tweet>" line per row: the label column comes first and
# np.savetxt joins the columns with its default single-space delimiter.
# FastText requires UTF-8 input, so the encoding is set explicitly rather than
# relying on np.savetxt's default encoding.
np.savetxt(r'train_full.txt', train.values, fmt='%s', encoding='utf-8')
np.savetxt(r'val_full.txt', val.values, fmt='%s', encoding='utf-8')

In [165]:
# Train the supervised classifier on the labelled training file.
model = fasttext.train_supervised(
    input="train_full.txt",
    lr=1.0,
    epoch=1,
    wordNgrams=3,
)

In [166]:
# Evaluate on the held-out validation file; the cell output is the returned tuple.
# Presumably (number of examples, precision, recall) - confirm against the fastText docs.
model.test("val_full.txt")

(750000, 0.8639346666666666, 0.8639346666666666)

Validation results, reported as (number of examples, precision, recall), without hyperparameter tuning:

dataset=small: (60000, 0.82505, 0.82505)
dataset=full: (750000, 0.8355986666666667, 0.8355986666666667)

Validation results with some hyperparameter tuning:

dataset=small, lr=1.0, epoch=1, wordNgrams=3: (60000, 0.8371166666666666, 0.8371166666666666)

dataset=full, lr=1.0, epoch=1, wordNgrams=3: (750000, 0.8639346666666666, 0.8639346666666666)