### Creating a FastText labeled dataset

In [1]:
%load_ext autoreload

import numpy as np 
import pandas as pd 
import helpers 

# Set to True to (re)generate the labeled text files that FastText trains on.
# Left False by default so that re-running the notebook does not rewrite them.
CREATE_NEW_DATASET = False
if CREATE_NEW_DATASET:

    # Load the positive and negative training tweets
    pos_filename = 'twitter-datasets/train_pos_full_u.txt'
    neg_filename = 'twitter-datasets/train_neg_full_u.txt'
    pos_tweets = helpers.txt_to_list(pos_filename)
    neg_tweets = helpers.txt_to_list(neg_filename)

    # Merge both classes into a single shuffled, labeled dataset
    all_tweets, y = helpers.merge_shuffle_label(pos_tweets, neg_tweets)

    # Split into train and validation sets
    training_fraction = 0.8
    train, val, y_train, y_val = helpers.split_dataset(training_fraction, all_tweets, y)

    # Create the labeled text files for supervised FastText
    labeled_filename_full = 'twitter-datasets/full_u_labeled.txt'
    labeled_filename_train = 'twitter-datasets/train_u_labeled.txt'
    labeled_filename_val = 'twitter-datasets/val_u_labeled.txt'

    # `write_labeled` is neither defined nor imported in this notebook, so the
    # bare call raised NameError. Every other helper here is accessed through
    # the `helpers` module, so the call is qualified the same way.
    # NOTE(review): assumes `helpers` exposes `write_labeled` - confirm.
    helpers.write_labeled(labeled_filename_full, all_tweets, y)
    helpers.write_labeled(labeled_filename_train, train, y_train)
    helpers.write_labeled(labeled_filename_val, val, y_val)


### Evaluating validation performance

In [2]:
import fasttext

# Paths of the labeled FastText files (full set, training split, validation split)
labeled_filename_full = 'twitter-datasets/full_u_labeled.txt'
labeled_filename_train = 'twitter-datasets/train_u_labeled.txt'
labeled_filename_val = 'twitter-datasets/val_u_labeled.txt'

# Hyperparameters for the model fitted on the training split only;
# it is scored against the validation split further down.
validation_hyperparams = dict(epoch=1, dim=100, wordNgrams=2, ws=5, lr=1)
model = fasttext.train_supervised(labeled_filename_train, **validation_hyperparams)

def print_results(N, p, r):
    """Print precision and recall, each rounded to three decimals.

    The signature matches the 3-tuple returned by ``model.test`` so that the
    result can be star-unpacked straight into this function.

    Args:
        N: First element of the test result; accepted but not printed.
        p: Precision value.
        r: Recall value.
    """
    for template, value in (("Precision {:.3f}", p), ("Recall {:.3f}", r)):
        print(template.format(value))

# Score the training-split model on the held-out validation file and report it
n_val, val_precision, val_recall = model.test(labeled_filename_val)
print_results(n_val, val_precision, val_recall)

Precision 0.851
Recall 0.851


In [21]:
import helpers

# Read the labeled validation file back in as a list of lines
val_tweets = helpers.txt_to_list(labeled_filename_val)

# Fixed character offsets into each labeled line.
# NOTE(review): assumes every line looks like '__label__<digit> <tweet>' - confirm
# against the writer of the labeled files.
LABEL_DIGIT_POS = len('__label__')   # 9: position of the single label digit
TEXT_START = LABEL_DIGIT_POS + 2     # 11: skips the digit and the separator after it

val_clean_tweets = [line[TEXT_START:] for line in val_tweets]
val_labels = [int(line[LABEL_DIGIT_POS]) for line in val_tweets]

In [33]:
# Maps FastText label strings to the integer classes used in val_labels
res = {'__label__0': 0, '__label__1': 1}

# k=1: keep only the top label (and its probability) for each tweet
pred = model.predict(val_clean_tweets, k=1)
top_labels, top_probs = pred

predicted_label = [res[labels[0]] for labels in top_labels]
confidence = [probs[0] for probs in top_probs]

In [41]:
# Sanity check of Python's chained comparison: `a == b == 0` is evaluated as
# `(a == b) and (b == 0)`, so it is False here even though a equals b.
a, b = 1, 1
a == b == 0

False

In [42]:
import pandas as pd 

# One row per validation tweet, indexed by the tweet text
df = pd.DataFrame({
    'tweet': val_clean_tweets,
    'label': val_labels,
    'prediction': predicted_label,
    'confidence': confidence,
}).set_index('tweet')

# Number of rows shown in each of the tables below
n_ = 10

# Positive tweets the model also labeled positive, highest confidence first
print('{:d} most confident correct predictions of positive tweets'.format(n_))
true_positives = df.query('label == prediction == 1')
true_positives.sort_values(by='confidence', ascending=False).head(n_)

10 most confident correct predictions of positive tweets


Unnamed: 0_level_0,label,prediction,confidence
tweet,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
#yougetmajorpoints if you know how to treat a girl right,1,1,1.00001
#whatnottosaytothepolice i aint a pro ),1,1,1.00001
goodnite / goodmorning twitter,1,1,1.00001
<user> awesome thank you,1,1,1.00001
our next follower will be number 200 ) we thank you in advance !,1,1,1.00001
now watching kick ass . ),1,1,1.00001
hi there,1,1,1.00001
#nf <user> ole pretty self please follow back ),1,1,1.00001
good morning twitter ) 0,1,1,1.00001
<user> congrats on promotion ) x,1,1,1.00001


In [44]:
# Positive tweets the model labeled negative, highest confidence first
print('{:d} most confident incorrect predictions of positive tweets'.format(n_))
false_negatives = df.query('label == 1 & prediction == 0')
false_negatives.sort_values(by='confidence', ascending=False).head(n_)

10 most confident incorrect predictions of positive tweets


Unnamed: 0_level_0,label,prediction,confidence
tweet,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
without me,1,0,1.00001
<user> that sucks,1,0,1.000009
i thought this can't be true,1,0,1.000009
damn wishing she wasn't at work i miss her,1,0,1.000008
i just miss you so badly,1,0,1.000008
bored bored bored !,1,0,1.000007
"<user> is sad , poor baby",1,0,1.000006
( about to take a nap,1,0,1.000006
i hate not remembering where i left my water bottle,1,0,1.000004
<user> poor you,1,0,1.0


In [47]:
# Negative tweets the model also labeled negative, highest confidence first
print('{:d} most confident correct predictions of negative tweets'.format(n_))
true_negatives = df.query('label == prediction == 0')
true_negatives.sort_values(by='confidence', ascending=False).head(n_)

10 most confident correct predictions of negative tweets


Unnamed: 0_level_0,label,prediction,confidence
tweet,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"sport fish of florida ( paperback the angler's practical guide the good , the bad , the ugly--for the very first ... <url>",0,0,1.00001
wish i could enter the 4th dimension again,0,0,1.00001
<user> lies i never see any of yawl you guys abandoned me ( ( (,0,0,1.00001
"enumerative combinatorics , volume 1 ( paperback this book , the first of a two-volume basic introduction to enum ... <url>",0,0,1.00001
haunted travels of michigan : a book and web interactive experience ( paperback take a virtual journey into the ... <url>,0,0,1.00001
"programming perl ( 2nd edition ) ( paperback programming perl , 2nd edition is the authoritative guide to perl ver ... <url>",0,0,1.00001
historic print ( l [ german soldiers firing at attacking french troops from prone position ]: historic print ( l ... <url>,0,0,1.00001
"schooled in murder : a tom and scott mystery ( tom & scott mysteries ) ( hardcover tom mason , chicago area high sc ... <url>",0,0,1.00001
"23x24 custom picture frame / poster frame 1.375 "" wide complete walnut wood frame (8 930 this frame is manufactu ... <url>",0,0,1.00001
i cried when steve irwin died,0,0,1.00001


In [50]:
# Negative tweets the model labeled positive, highest confidence first
print('{:d} most confident incorrect predictions of negative tweets'.format(n_))
false_positives = df.query('label == 0 & prediction == 1')
false_positives.sort_values(by='confidence', ascending=False).head(n_)


10 most confident incorrect predictions of negative tweets


Unnamed: 0_level_0,label,prediction,confidence
tweet,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
<user> thanx,0,1,1.00001
- this song ),0,1,1.00001
<user> nf <user> u 2 ),0,1,1.00001
<user> thanks megan ! ) x,0,1,1.00001
<user> thankss,0,1,1.000009
<user> lol oh but thanks haha,0,1,1.000009
<user> no worries,0,1,1.000008
<user> ),0,1,1.000007
hey <user> thanks for the ff . happy friday ),0,1,1.000004
gonna love my family we get right together ),0,1,1.000002


In [3]:
# Train the final model on the full labeled dataset.
# Note: this rebinds `model`, replacing the model trained on the training split.
model = fasttext.train_supervised(
    labeled_filename_full,
    epoch=10,
    dim=100,
    wordNgrams=2,
    lr=1,
)

# Create test set for FastText
test_filename = 'twitter-datasets/test_data.txt'
test_tweets = []
with open(test_filename, encoding = 'utf-8') as f:
    for line in f:
        # Each line is '<index>,<tweet>'. Keep everything after the FIRST comma
        # so commas inside the tweet survive. Strip only an actual trailing
        # newline: slicing off the last character unconditionally would
        # truncate the final tweet when the file does not end with '\n'.
        # A line with no comma yields an empty tweet, as before.
        test_tweets.append(line.rstrip('\n').partition(',')[2])
        
# Generate predictions
# Map FastText label strings to integer classes. Defined here as well so this
# cell does not depend on the validation error-analysis cells having been run.
res = {'__label__0': 0, '__label__1': 1}

# k=1: keep only the top label per tweet; element [0] of the result holds the labels
top_labels = model.predict(test_tweets, k=1)[0]
predictions = np.array([res[labels[0]] for labels in top_labels])

# Save predictions
save_filename = 'submissions/submission_fasttext.csv'
helpers.save_pred(save_filename, predictions)