### Creating a FastText labeled dataset

In [1]:
%load_ext autoreload

import numpy as np 
import pandas as pd 
import helpers 

# Set to True to (re)generate the labeled text files consumed by FastText.
# When False, this cell is a no-op and the files are expected to exist already.
CREATE_NEW_DATASET = False
if CREATE_NEW_DATASET:

    # Load training set
    pos_filename = 'twitter-datasets/train_pos_full_u.txt'
    neg_filename = 'twitter-datasets/train_neg_full_u.txt'
    pos_tweets = helpers.txt_to_list(pos_filename)
    neg_tweets = helpers.txt_to_list(neg_filename)

    # Create a labeled dataset
    all_tweets, y = helpers.merge_shuffle_label(pos_tweets, neg_tweets)

    # Split into train and validation sets
    training_fraction = 0.8
    train, val, y_train, y_val = helpers.split_dataset(training_fraction, all_tweets, y)

    # Create the labeled text files for supervised FastText
    labeled_filename_full = 'twitter-datasets/full_u_labeled.txt'
    labeled_filename_train = 'twitter-datasets/train_u_labeled.txt'
    labeled_filename_val = 'twitter-datasets/val_u_labeled.txt'

    # `write_labeled` is not defined in this notebook; call it through the
    # `helpers` module like every other helper here (a bare name would raise
    # NameError). NOTE(review): assumes helpers.write_labeled exists - confirm.
    helpers.write_labeled(labeled_filename_full, all_tweets, y)
    helpers.write_labeled(labeled_filename_train, train, y_train)
    helpers.write_labeled(labeled_filename_val, val, y_val)


### Evaluating validation performance

In [2]:
import fasttext

# Paths of the labeled text files (validation / train / full).
# Redefined here because the dataset-creation cell only sets them when it
# actually regenerates the files.
labeled_filename_val = 'twitter-datasets/val_u_labeled.txt'
labeled_filename_train = 'twitter-datasets/train_u_labeled.txt'
labeled_filename_full = 'twitter-datasets/full_u_labeled.txt'

# Fit a supervised model on the training split only, so the validation file
# stays unseen for the evaluation that follows.
model = fasttext.train_supervised(
    labeled_filename_train,
    epoch=1,
    dim=100,
    wordNgrams=2,
    ws=5,
    lr=1,
)

def print_results(N, p, r):
    """Print precision and recall, each formatted with three decimals.

    Args:
        N: First element of the result triple. Accepted so the function can
            be called with a star-unpacked 3-tuple; it is not printed.
        p: Precision value.
        r: Recall value.
    """
    for label, score in (("Precision", p), ("Recall", r)):
        print(f"{label} {score:.3f}")

# Evaluate the model on the validation file; the result of model.test is
# star-unpacked positionally into print_results(N, p, r).
print_results(*model.test(labeled_filename_val))

Precision 0.851
Recall 0.851


In [4]:
# Train full model
# NOTE(review): this run uses epoch=10, while the validation run above used
# epoch=1, so the validation score printed there was measured on a different
# configuration than the model that produces the submission.
model = fasttext.train_supervised(labeled_filename_full, epoch=10, dim=100, wordNgrams=2, lr=1)

# Create test set for FastText
test_filename = 'twitter-datasets/test_data.txt'
test_tweets = []
with open(test_filename, encoding='utf-8') as f:
    for line in f:
        # Each line is "<index>,<tweet>". Strip only a trailing newline (the
        # last line may not have one, so slicing off the final character
        # would eat real text), then drop everything up to the first comma so
        # commas inside the tweet itself are preserved.
        _, _, tweet = line.rstrip('\n').partition(',')
        test_tweets.append(tweet)

# Generate predictions
# Map the FastText label strings to integer class ids.
res = {'__label__0': 0, '__label__1': 1}
# predict(...)[0] holds the predicted labels per tweet; with k=1 the single
# label is at position 0 of each entry.
predictions = np.array([res[el[0]] for el in model.predict(test_tweets, k=1)[0]])

# Save predictions
save_filename = 'submissions/submission_fasttext.csv'
helpers.save_pred(save_filename, predictions)