### Creating a FastText labeled dataset

In [1]:
%load_ext autoreload

import numpy as np 
import pandas as pd 
import helpers 

CREATE_NEW_DATASET = False  # Set to True to (re)create the labeled text files for FastText
if CREATE_NEW_DATASET:

    # Load the positive and negative training tweets
    pos_filename = 'twitter-datasets/train_pos_full_u.txt'
    neg_filename = 'twitter-datasets/train_neg_full_u.txt'
    pos_tweets = helpers.txt_to_list(pos_filename)
    neg_tweets = helpers.txt_to_list(neg_filename)

    # Create a single labeled dataset from both classes
    all_tweets, y = helpers.merge_shuffle_label(pos_tweets, neg_tweets)

    # Split into train and validation sets
    training_fraction = 0.8
    train, val, y_train, y_val = helpers.split_dataset(training_fraction, all_tweets, y)

    # Create the labeled text files for supervised FastText
    labeled_filename_full = 'twitter-datasets/full_u_labeled.txt'
    labeled_filename_train = 'twitter-datasets/train_u_labeled.txt'
    labeled_filename_val = 'twitter-datasets/val_u_labeled.txt'

    # NOTE(review): this used to call a bare `write_labeled`, which is neither
    # imported nor defined in this notebook and raised NameError as soon as the
    # flag was switched on. It is assumed to live in `helpers` next to the
    # other dataset utilities used above -- confirm.
    helpers.write_labeled(labeled_filename_full, all_tweets, y)
    helpers.write_labeled(labeled_filename_train, train, y_train)
    helpers.write_labeled(labeled_filename_val, val, y_val)


### Evaluating validation performance

In [2]:
import fasttext

# Paths of the FastText-formatted files: full set, training split, validation split
labeled_filename_full = 'twitter-datasets/full_u_labeled.txt'
labeled_filename_train = 'twitter-datasets/train_u_labeled.txt'
labeled_filename_val = 'twitter-datasets/val_u_labeled.txt'

# Fit a supervised classifier on the training split only, so that the
# validation split stays unseen for the evaluation below.
model = fasttext.train_supervised(
    labeled_filename_train,
    epoch=1,
    dim=100,
    wordNgrams=2,
    ws=5,
    lr=1,
)

def print_results(N, p, r):
    """Print precision and recall, one per line, with three decimals.

    Parameters
    ----------
    N : accepted so that a three-element result can be unpacked straight into
        the call; it is not printed.
    p : precision value to report.
    r : recall value to report.
    """
    for template, value in (("Precision {:.3f}", p), ("Recall {:.3f}", r)):
        print(template.format(value))

# Score the model on the validation file; the result of model.test is unpacked
# positionally into print_results(N, p, r).
print_results(*model.test(labeled_filename_val))

Precision 0.851
Recall 0.851


In [3]:
import helpers

# Read the labeled validation file back in.
# Assumes every line looks like "__label__<digit> <tweet>": the label digit sits
# at index 9 (right after the 9-character "__label__" prefix) and the tweet
# text starts at index 11 -- confirm against the writer of the labeled files.
val_tweets = helpers.txt_to_list(labeled_filename_val)
val_labels = [int(line[9]) for line in val_tweets]
val_clean_tweets = [line[11:] for line in val_tweets]

In [4]:
# Top-1 prediction for every validation tweet: pred[0] carries the label
# strings and pred[1] the matching scores, one entry per tweet.
pred = model.predict(val_clean_tweets, k=1)

# Map FastText label strings back to the integer classes
res = {'__label__0': 0, '__label__1': 1}
predicted_label = [res[top_labels[0]] for top_labels in pred[0]]
confidence = [top_scores[0] for top_scores in pred[1]]

In [5]:
import pandas as pd 

# One row per validation tweet, indexed by the tweet text
df = pd.DataFrame({
    'tweet': val_clean_tweets,
    'label': val_labels,
    'prediction': predicted_label,
    'confidence': confidence,
}).set_index('tweet')

n_ = 10  # number of rows to display
print('{:d} most confident correct predictions of positive tweets'.format(n_))
(
    df.query('label == prediction == 1')
      .sort_values(by='confidence', ascending=False)
      .head(n_)
)

10 most confident correct predictions of positive tweets


Unnamed: 0_level_0,label,prediction,confidence
tweet,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
#yougetmajorpointsif you have an amazing smile,1,1,1.00001
niall eating ) #ifindthatattractive,1,1,1.00001
<user> coool ! ) sure sure ! ) hahaha thanks ys ),1,1,1.00001
<user> thankyou ) xx,1,1,1.00001
blessed blessed blessed blessed blessed,1,1,1.00001
#yougetmajorpointsif you have an awesome smile,1,1,1.00001
<user> hey ),1,1,1.00001
<user> #ifindthatattractive ),1,1,1.00001
pa.shou out ) ) thanks ),1,1,1.00001
<user> thx ! ),1,1,1.00001


In [6]:
# Positive tweets that the model labeled as negative, highest confidence first
print('{:d} most confident incorrect predictions of positive tweets'.format(n_))
(
    df.query('label == 1 & prediction == 0')
      .sort_values(by='confidence', ascending=False)
      .head(n_)
)

10 most confident incorrect predictions of positive tweets


Unnamed: 0_level_0,label,prediction,confidence
tweet,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
headache and my stomach hurts,1,0,1.000009
ohheythere ... <url>,1,0,1.000009
<user> hopefully ( . dontt hurt her ( :,1,0,1.000008
slight headache but nooo hangover,1,0,1.000004
the headache is gone,1,0,0.999999
i been thinking bout uu ( < / 3,1,0,0.999993
without me,1,0,0.999993
"<user> is sad , poor baby",1,0,0.999993
i feel hurt really hurt,1,0,0.999988
<user> that sucks,1,0,0.999987


In [7]:
# Negative tweets that the model labeled as negative, highest confidence first
print('{:d} most confident correct predictions of negative tweets'.format(n_))
(
    df.query('label == prediction == 0')
      .sort_values(by='confidence', ascending=False)
      .head(n_)
)

10 most confident correct predictions of negative tweets


Unnamed: 0_level_0,label,prediction,confidence
tweet,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
how to find gold ( paperback <url>,0,0,1.00001
insights on judo ( paperback <url>,0,0,1.00001
the evolution handbook ( paperback <url>,0,0,1.00001
find a falling star ( paperback <url>,0,0,1.00001
born of betrayal ( paperback <url>,0,0,1.00001
i cried when rue died (,0,0,1.00001
feast of fire ( paperback <url>,0,0,1.00001
the jennifer project ( paperback <url>,0,0,1.00001
drums of autumn ( paperback <url>,0,0,1.00001
when the ocotillo bloom ( paperback <url>,0,0,1.00001


In [8]:
# Negative tweets that the model labeled as positive, highest confidence first
print('{:d} most confident incorrect predictions of negative tweets'.format(n_))
(
    df.query('label == 0 & prediction == 1')
      .sort_values(by='confidence', ascending=False)
      .head(n_)
)


10 most confident incorrect predictions of negative tweets


Unnamed: 0_level_0,label,prediction,confidence
tweet,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
<user> ),0,1,1.00001
<user> thanks megan ! ) x,0,1,1.000008
- this song ),0,1,1.000006
<user> thanx,0,1,1.000001
cuddling with <user> #jk #shehatescuddling ),0,1,0.999989
rt this for shoutout <---> mustbefollowingme ),0,1,0.999989
<user> nf <user> u 2 ),0,1,0.999978
hey <user> thanks for the ff . happy friday ),0,1,0.99997
<user> ohdohsadhasodoiasdioadoasdhso aiiie ) ),0,1,0.999964
<user> who doing it ! ),0,1,0.99996


In [9]:
# Train the final model on the full labeled file.
# Note: this rebinds `model`, replacing the validation model trained earlier.
model = fasttext.train_supervised(
    labeled_filename_full,
    epoch=10,
    dim=100,
    wordNgrams=2,
    lr=1,
)

# Create test set for FastText
# Each line is treated as "<index>,<tweet>"; only the tweet text is kept.
test_filename = 'twitter-datasets/test_data.txt'
test_tweets = []
with open(test_filename, encoding='utf-8') as f:
    for line in f:
        # Cut at the first comma only, so commas inside the tweet are preserved
        tweet = line.partition(',')[2]
        # Strip only the line terminator. The previous `[:-1]` removed the last
        # character unconditionally, which truncated the final tweet whenever
        # the file did not end with a newline.
        test_tweets.append(tweet.rstrip('\n'))
        
# Generate predictions: top-1 label string per test tweet, mapped to its integer class via `res`
predictions = np.array([
    res[top_labels[0]]
    for top_labels in model.predict(test_tweets, k=1)[0]
])

# Save predictions
save_filename = 'submissions/submission_fasttext.csv'
helpers.save_pred(save_filename, predictions)

In [11]:
# Persist the current `model` to disk so it can be reloaded without retraining.
# NOTE(review): the write presumably needs the models/ directory to exist -- confirm.
model.save_model("models/fasttext_model_full_u.bin")

In [18]:
# Reload the saved model under a separate name and embed a sample sentence;
# the resulting vector is displayed as the cell output.
model2 = fasttext.load_model("models/fasttext_model_full_u.bin")
model2.get_sentence_vector('This aint a real tweet')

array([ 0.07232376,  0.01095037, -0.06820733, -0.02932014,  0.28660995,
       -0.12961845,  0.17584221, -0.00178165,  0.02339283, -0.02009064,
        0.03460643, -0.16027077,  0.00555737, -0.00977445,  0.04991233,
        0.10164199, -0.12044072,  0.09444636, -0.11358877, -0.14314677,
       -0.2883728 , -0.00561005,  0.03324628, -0.03244628, -0.03310582,
       -0.15512647, -0.04873398,  0.14345992,  0.12041344,  0.00668113,
       -0.2013052 , -0.12171178,  0.04832705, -0.08066746,  0.08763087,
       -0.15001483, -0.08204266, -0.06245921,  0.06522036, -0.05996551,
       -0.05209793, -0.07358738,  0.1995633 ,  0.01043093,  0.08625706,
        0.17856342, -0.04943788, -0.05338828,  0.10366135,  0.06162642,
        0.05761834, -0.15901682, -0.09433366, -0.0728903 , -0.24006994,
       -0.00135586, -0.21591774, -0.07960388,  0.04337596,  0.01752023,
        0.02447683,  0.00912572,  0.07488441, -0.00126678, -0.11501414,
       -0.03596577, -0.04741618,  0.04333788, -0.08995944, -0.08