### Creating a FastText labeled dataset

In [4]:
import numpy as np 
import pandas as pd 
import fasttext_models as mod
import os 
import wget
import fasttext

# Directory that holds the downloaded corpora and every generated dataset file.
root = 'data/'
os.makedirs(root, exist_ok=True)

# Seed forwarded to mod.merge_shuffle_label.
seed = 0

# Set to True to (re)build the labeled / unlabeled text files used by FastText.
CREATE_NEW_DATASET = True
if CREATE_NEW_DATASET:

    # Full negative corpus.
    neg_url = 'https://api.onedrive.com/v1.0/shares/u!aHR0cHM6Ly8xZHJ2Lm1zL3QvcyFBclREZ3U5ejdJT1ZqcDQ0eDZMdDI5WXBlVXYyZGc_ZT1ZZDJn/root/content'
    neg_filename = root + 'train_neg_full_u.txt'
    # Only fetch when the file is missing: wget.download does not overwrite an
    # existing target (it saves a numbered copy instead), so an unconditional
    # call would re-download the corpus on every run and still read the old file.
    if not os.path.exists(neg_filename):
        wget.download(neg_url, neg_filename)
    neg_tweets = mod.txt_to_list(neg_filename)

    # Full positive corpus (same download-once logic as above).
    pos_url = 'https://api.onedrive.com/v1.0/shares/u!aHR0cHM6Ly8xZHJ2Lm1zL3QvcyFBclREZ3U5ejdJT1ZqcDQzcTc3QmNPbUdIWHQ3TXc_ZT01ejdG/root/content'
    pos_filename = root + 'train_pos_full_u.txt'
    if not os.path.exists(pos_filename):
        wget.download(pos_url, pos_filename)
    pos_tweets = mod.txt_to_list(pos_filename)

    # Merge both corpora into one shuffled list of tweets with matching labels.
    all_tweets, y = mod.merge_shuffle_label(pos_tweets, neg_tweets, seed = seed)

    # Split into train and validation sets.
    training_fraction = 0.95
    train, val, y_train, y_val = mod.split_dataset(training_fraction, all_tweets, y)

    # Write the labeled text files used for supervised FastText.
    labeled_filename_full = root + 'full_u_labeled.txt'
    labeled_filename_train = root + 'train_u_labeled.txt'
    labeled_filename_val = root + 'val_u_labeled.txt'

    mod.write_labeled(labeled_filename_full, all_tweets, y)
    mod.write_labeled(labeled_filename_train, train, y_train)
    mod.write_labeled(labeled_filename_val, val, y_val)

    # Write the same three splits without labels.
    unlabeled_filename_full = root + 'full_u_unlabeled.txt'
    unlabeled_filename_train = root + 'train_u_unlabeled.txt'
    unlabeled_filename_val = root + 'val_u_unlabeled.txt'

    mod.write_unlabeled(unlabeled_filename_full, all_tweets)
    mod.write_unlabeled(unlabeled_filename_train, train)
    mod.write_unlabeled(unlabeled_filename_val, val)





### Evaluating validation performance

In [5]:
# Dataset file paths, grouped by kind. They are assigned again here so that this
# cell does not rely on the CREATE_NEW_DATASET branch having been executed.
labeled_filename_full = root + 'full_u_labeled.txt'
labeled_filename_train = root + 'train_u_labeled.txt'
labeled_filename_val = root + 'val_u_labeled.txt'

unlabeled_filename_full = root + 'full_u_unlabeled.txt'
unlabeled_filename_train = root + 'train_u_unlabeled.txt'
unlabeled_filename_val = root + 'val_u_unlabeled.txt'

# Supervised FastText classifier fitted on the training split only.
model = fasttext.train_supervised(
    labeled_filename_train,
    epoch=3,
    dim=100,
    wordNgrams=2,
    ws=5,
    lr=1,
)

def print_results(N, p, r):
    """Print precision and recall, each rounded to three decimals.

    Adapted from the fastText documentation.

    Args:
        N: first element of the result tuple; accepted so the tuple can be
            unpacked straight into this function, but not used here.
        p: precision value to report.
        r: recall value to report.
    """
    report_lines = [
        "Precision {:.3f}".format(p),
        "Recall {:.3f}".format(r),
    ]
    # A single print of the joined lines yields the same two output lines.
    print("\n".join(report_lines))

# Evaluate the trained model on the validation file; the tuple returned by
# model.test is unpacked positionally into print_results' (N, p, r) parameters.
print_results(*model.test(labeled_filename_val))

Precision 0.857
Recall 0.857


### Inspecting easy and hard-to-classify tweets for both categories

In [7]:
# Load validation tweets and labels.
# Each line is expected to look like "__label__<digit> <tweet>": the class digit
# sits right after the prefix and the tweet starts after one separating character.
val_tweets = mod.txt_to_list(labeled_filename_val)
label_prefix_len = len('__label__')
# Strip only a trailing newline instead of blindly dropping the last character,
# so a line without a line terminator keeps its final character.
val_clean_tweets = [el[label_prefix_len + 2:].rstrip('\n') for el in val_tweets]
val_labels = [int(el[label_prefix_len]) for el in val_tweets]

# Make predictions and evaluate confidence.
# With k=1 each entry holds a single (label, probability) pair; keep the first of each.
pred = model.predict(val_clean_tweets, k=1)
confidence = [el[0] for el in pred[1]]
# Map FastText label strings back to the integer classes used in val_labels.
res = {'__label__0': 0, '__label__1': 1}
predicted_label = [res[el[0]] for el in pred[0]]

### Most confident correct predictions of positive tweets

In [8]:
# One row per validation tweet, indexed by the tweet text, holding the true label,
# the predicted label and the model's confidence in that prediction.
df = pd.DataFrame(
    {
        'tweet': val_clean_tweets,
        'label': val_labels,
        'prediction': predicted_label,
        'confidence': confidence,
    }
).set_index('tweet')

# Number of rows shown in each ranking.
n_ = 10

print('{:d} most confident correct predictions of positive tweets'.format(n_))
(
    df.query('label == prediction == 1')
    .sort_values(by='confidence', ascending=False)
    .head(n_)
)

10 most confident correct predictions of positive tweets


Unnamed: 0_level_0,label,prediction,confidence
tweet,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
#yougetmajorpointsif you enjoy the little things in life,1,1,1.00001
<user> awww thanks ),1,1,1.00001
#yougetmajorpointsif you wear boots and wranglers,1,1,1.00001
#yougetmajorpointsif you have a vagina,1,1,1.00001
#yougetmajorpointsif you can kiss and hug good,1,1,1.00001
#yougetmajorpointsif you have that sexual side,1,1,1.00001
#yougetmajorpointsif you can fade my personality,1,1,1.00001
#yougetmajorpointsif you can make me laugh / smile,1,1,1.00001
loving these #yougetmajorpointsif,1,1,1.00001
singing #ifindthatattractive,1,1,1.00001


### Most confident incorrect predictions of positive tweets

In [9]:
# Positive tweets (label 1) predicted as negative (0), highest confidence first.
print('{:d} most confident incorrect predictions of positive tweets'.format(n_))
(
    df.query('label == 1 & prediction == 0')
    .sort_values(by='confidence', ascending=False)
    .head(n_)
)

10 most confident incorrect predictions of positive tweets


Unnamed: 0_level_0,label,prediction,confidence
tweet,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
to the funeral,1,0,1.000008
<user> harsh third wheel #foreveralone,1,0,1.000008
ugh definitly not feeling it today,1,0,0.999985
#ifiwasjustinforadayiwould probably commit suicide,1,0,0.999961
<user> really wish i wasnt,1,0,0.999885
parents are gone,1,0,0.999825
wish i never got annoyed so easy . hate wee love me boots,1,0,0.999782
i wish my friends didn't work all the time . i need a job,1,0,0.999707
<user> wah wah wah . go call the wah-mbulence ... #baby,1,0,0.999705
<user> i wanna i asked char to come with me this weekend but he can't #sad,1,0,0.999621


### Most confident correct predictions of negative tweets

In [10]:
# Negative tweets (label 0) predicted as negative, highest confidence first.
print('{:d} most confident correct predictions of negative tweets'.format(n_))
(
    df.query('label == prediction == 0')
    .sort_values(by='confidence', ascending=False)
    .head(n_)
)

10 most confident correct predictions of negative tweets


Unnamed: 0_level_0,label,prediction,confidence
tweet,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
the big book of garden hens ( hardcover <url>,0,0,1.00001
i hate nights like these #sadtweet,0,0,1.00001
i hate pins and needles . waaah,0,0,1.00001
student of ancient chinese dictionary ( revised ) ( hardcover <url>,0,0,1.00001
a guide to the architecture of st . louis ( paperback <url>,0,0,1.00001
<user> that sounds so sad,0,0,1.00001
my throat is hurting (,0,0,1.00001
crowned in terror ( audio cd <url>,0,0,1.00001
seasons in the abyss ( audio cd <url>,0,0,1.00001
what a difference a day makes ( paperback <url>,0,0,1.00001


### Most confident incorrect predictions of negative tweets

In [11]:
# Negative tweets (label 0) predicted as positive (1), highest confidence first.
print('{:d} most confident incorrect predictions of negative tweets'.format(n_))
(
    df.query('label == 0 & prediction == 1')
    .sort_values(by='confidence', ascending=False)
    .head(n_)
)


10 most confident incorrect predictions of negative tweets


Unnamed: 0_level_0,label,prediction,confidence
tweet,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
<user> thanks ),0,1,1.00001
assets liabilities equity ),0,1,1.000002
20 ; 20 ) to bem sim ) ) ),0,1,0.99999
"<user> thank you weyhh , amin amin amin",0,1,0.999989
ooh itny thory #ff,0,1,0.999967
<user> i mentioned in a previous tweet ),0,1,0.999884
<user> its hilarious,0,1,0.99973
<user> ... lls but thanks,0,1,0.999705
s / o to <user> for thee followback,0,1,0.999698
<user> heyy chill it's not much of a big deal ),0,1,0.999686
