### Creating a FastText labeled dataset

In [5]:
%load_ext autoreload

import numpy as np 
import pandas as pd 
import helpers 

# Switch: set to True to (re)generate the labeled / unlabeled text files
# used by FastText; leave False to skip this whole step.
CREATE_NEW_DATASET = False
if CREATE_NEW_DATASET:

    # --- Input and output locations -------------------------------------
    pos_filename = 'twitter-datasets/train_pos_full_u.txt'
    neg_filename = 'twitter-datasets/train_neg_full_u.txt'

    labeled_filename_full = 'twitter-datasets/full_u_labeled.txt'
    labeled_filename_train = 'twitter-datasets/train_u_labeled.txt'
    labeled_filename_val = 'twitter-datasets/val_u_labeled.txt'

    unlabeled_filename_full = 'twitter-datasets/full_u_unlabeled.txt'
    unlabeled_filename_train = 'twitter-datasets/train_u_unlabeled.txt'
    unlabeled_filename_val = 'twitter-datasets/val_u_unlabeled.txt'

    # --- Load the positive and negative training tweets -----------------
    pos_tweets = helpers.txt_to_list(pos_filename)
    neg_tweets = helpers.txt_to_list(neg_filename)

    # --- Merge both classes into one labeled collection -----------------
    all_tweets, y = helpers.merge_shuffle_label(pos_tweets, neg_tweets)

    # --- Hold out part of the data for validation (80% kept for training)
    training_fraction = 0.8
    train, val, y_train, y_val = helpers.split_dataset(
        training_fraction, all_tweets, y
    )

    # --- Labeled files: input for supervised FastText -------------------
    helpers.write_labeled(labeled_filename_full, all_tweets, y)
    helpers.write_labeled(labeled_filename_train, train, y_train)
    helpers.write_labeled(labeled_filename_val, val, y_val)

    # --- Unlabeled files: the same three splits, tweets only ------------
    helpers.write_unlabeled(unlabeled_filename_full, all_tweets)
    helpers.write_unlabeled(unlabeled_filename_train, train)
    helpers.write_unlabeled(unlabeled_filename_val, val)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Evaluating validation performance

In [6]:
import fasttext

# Labeled FastText files: full corpus, training split, validation split
labeled_filename_full = 'twitter-datasets/full_u_labeled.txt'
labeled_filename_train = 'twitter-datasets/train_u_labeled.txt'
labeled_filename_val = 'twitter-datasets/val_u_labeled.txt'

# Fit a supervised classifier on the training split only, then persist it
model = fasttext.train_supervised(
    labeled_filename_train,
    lr=1,
    dim=100,
    ws=5,
    epoch=1,
    wordNgrams=2,
)
model.save_model("models/fasttext_model_train_u.bin")

def print_results(N, p, r):
    """Print precision and recall, one per line, with three decimals.

    Args:
        N: Accepted so a 3-tuple can be unpacked into the call; not used.
        p: Precision value to report.
        r: Recall value to report.
    """
    print("Precision {:.3f}".format(p), "Recall {:.3f}".format(r), sep="\n")

# Evaluate the model on the validation file. The result of model.test(...) is
# unpacked into print_results(N, p, r), which prints only the last two values.
print_results(*model.test(labeled_filename_val))

Precision 0.851
Recall 0.851


In [9]:
import helpers

# Read the labeled validation file back and split each line into label and text.
# NOTE(review): the fixed offsets assume every line looks like
# "__label__<digit> <tweet>" (digit at index 9, tweet starting at index 11) and
# that the final character is a terminator to be dropped. Confirm against
# helpers.write_labeled / helpers.txt_to_list.
val_tweets = helpers.txt_to_list(labeled_filename_val)
val_labels = [int(labeled_line[9]) for labeled_line in val_tweets]
val_clean_tweets = [labeled_line[11:-1] for labeled_line in val_tweets]

In [10]:
# Mapping from FastText label strings to integer classes
res = {'__label__0': 0, '__label__1': 1}

# Top-1 prediction for every validation tweet; pred[0] is indexed for the
# labels and pred[1] for the matching scores.
pred = model.predict(val_clean_tweets, k=1)
predicted_label = [res[top_labels[0]] for top_labels in pred[0]]
confidence = [top_scores[0] for top_scores in pred[1]]

In [11]:
import pandas as pd 

# One row per validation tweet, indexed by the tweet text so the tables
# below show the tweet itself as the row label.
df = pd.DataFrame({
    'tweet': val_clean_tweets,
    'label': val_labels,
    'prediction': predicted_label,
    'confidence': confidence,
}).set_index('tweet')

# Number of rows shown in each of the inspection tables
n_ = 10

# Positive tweets (label 1) that were also predicted as 1
print('{:d} most confident correct predictions of positive tweets'.format(n_))
(
    df.query('label == prediction == 1')
      .sort_values('confidence', ascending=False)
      .head(n_)
)

10 most confident correct predictions of positive tweets


Unnamed: 0_level_0,label,prediction,confidence
tweet,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
#pleasefollowme ),1,1,1.00001
<user> thankyouu ),1,1,1.00001
<user> thank's ),1,1,1.00001
#yougetmajorpointsif you got that sexy smile,1,1,1.00001
<user> thanks ),1,1,1.00001
<user> goodnight thereee ) ;,1,1,1.00001
<user> hello ) pafollow thank you ),1,1,1.00001
#ff <user> ),1,1,1.00001
finally finished ),1,1,1.00001
friday finally ),1,1,1.00001


In [12]:
# Positive tweets (label 1) that the model predicted as 0
print('{:d} most confident incorrect predictions of positive tweets'.format(n_))
(
    df.query('label == 1 & prediction == 0')
      .sort_values('confidence', ascending=False)
      .head(n_)
)

10 most confident incorrect predictions of positive tweets


Unnamed: 0_level_0,label,prediction,confidence
tweet,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
<user> ovestlyyy ( wouldn't misss it .,1,0,1.0
( old thouu . <url>,1,0,0.999983
<user> 14 ( wkwkwk okayy ( tell me by today yah tabb (,1,0,0.999978
pink please ... <url>,1,0,0.999977
<user> ugh,1,0,0.999975
<user> aww poor mummy,1,0,0.999969
miss willams not here,1,0,0.999957
but noo am waching 35 and ticking ( #,1,0,0.999954
"he sad , she sad , every 1 sad !",1,0,0.999939
bored bored bored !,1,0,0.999936


In [13]:
# Negative tweets (label 0) that were also predicted as 0
print('{:d} most confident correct predictions of negative tweets'.format(n_))
(
    df.query('label == prediction == 0')
      .sort_values('confidence', ascending=False)
      .head(n_)
)

10 most confident correct predictions of negative tweets


Unnamed: 0_level_0,label,prediction,confidence
tweet,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
reading the ceiling ( paperback <url>,0,0,1.00001
the bradley method ( paperback paperback book <url>,0,0,1.00001
chinese dictionary ( revised ) ( paperback <url>,0,0,1.00001
<user> ugh ( #uppers,0,0,1.00001
ugh my tummy hurts,0,0,1.00001
<user> so sad,0,0,1.00001
hangin tough ( audio cd <url>,0,0,1.00001
the lover ( paperback <url>,0,0,1.00001
too sad to cry #ripmrsbieber,0,0,1.00001
that'z a really sad quote ( < / 3,0,0,1.00001


In [14]:
# Negative tweets (label 0) that the model predicted as 1
print('{:d} most confident incorrect predictions of negative tweets'.format(n_))
(
    df.query('label == 0 & prediction == 1')
      .sort_values('confidence', ascending=False)
      .head(n_)
)


10 most confident incorrect predictions of negative tweets


Unnamed: 0_level_0,label,prediction,confidence
tweet,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
<user> ),0,1,1.00001
<user> ) love her really ),0,1,1.00001
finally home ),0,1,1.00001
<user> afff ),0,1,1.00001
well alrighty than ),0,1,1.000008
<user> happy 420,0,1,1.000008
thats awesome,0,1,1.000007
watch the boondock > > > _ ),0,1,1.000001
<user> <user> woah ! ),0,1,0.999998
cuddling with <user> #jk #shehatescuddling ),0,1,0.999986


In [15]:
# Train the final model on the full labeled file
model = fasttext.train_supervised(labeled_filename_full, epoch = 10, dim = 100, wordNgrams = 2, lr = 1)

# Create test set for FastText: each line of the test file carries an index,
# a comma, and then the tweet text.
test_filename = 'twitter-datasets/test_data.txt'
test_tweets = []
with open(test_filename, encoding = 'utf-8') as f:
    for line in f:
        # Keep everything after the FIRST comma, so commas inside the tweet
        # survive. A line without any comma yields an empty tweet.
        tweet = line.partition(',')[2]
        # Strip only the line terminator. Slicing off the last character
        # unconditionally would eat a real character when the final line of
        # the file has no trailing newline.
        test_tweets.append(tweet.rstrip('\n'))

# Generate predictions. `res` is the label-string -> integer mapping defined
# in an earlier cell of this notebook.
predictions = np.array([res[el[0]] for el in model.predict(test_tweets, k=1)[0]])

# Save predictions
# NOTE(review): predictions are 0/1 here; confirm helpers.save_pred writes them
# in the label format the submission expects.
save_filename = 'submissions/submission_fasttext.csv'
helpers.save_pred(save_filename, predictions)

In [16]:
# Persist the current `model` to disk so it can be loaded later without retraining
model.save_model("models/fasttext_model_full_u.bin")

In [17]:
# Load the model file back from disk under a separate name and compute the
# sentence vector for a sample string (the array displayed below).
model2 = fasttext.load_model("models/fasttext_model_full_u.bin")
model2.get_sentence_vector('This aint a real tweet')



array([-0.04139576, -0.07969562, -0.14854535, -0.209203  , -0.04080675,
       -0.04067374,  0.0022248 , -0.03968045, -0.24953197, -0.02016892,
       -0.16766147, -0.09062503, -0.07299803, -0.18460302,  0.15368749,
        0.07648121, -0.07046198,  0.08901278, -0.05713234, -0.02254619,
        0.17919917,  0.28306752,  0.18166278, -0.28261054,  0.07417065,
       -0.03832103,  0.12017822, -0.08782136, -0.12535362, -0.20291229,
        0.08289345, -0.05459365, -0.06769999, -0.13456285,  0.20396645,
       -0.12062549, -0.03380526,  0.14870493,  0.03583511,  0.01098658,
       -0.06429038, -0.07257037,  0.05509428,  0.1697615 ,  0.06305418,
        0.16347371,  0.17882311, -0.12726764, -0.14473088, -0.09043496,
       -0.10681625, -0.2274154 , -0.0079439 , -0.04278487, -0.02880332,
       -0.16106004,  0.08497645,  0.02928768, -0.02525478,  0.07026938,
       -0.25062093, -0.12236463,  0.16210869,  0.08052757, -0.2666761 ,
        0.13158643, -0.13980247,  0.01809181, -0.07032972, -0.04