In [4]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn import linear_model
import tfidf_models as mod
import os 
import wget

# Notebook-wide settings: the data directory and the random seed reused by later cells.
root = 'data/'
seed = 0

# Make sure the data directory exists; exist_ok keeps this cell safe to re-run.
os.makedirs(root, exist_ok=True)

In [2]:
# Load training set
full = True # Whether to use full or small dataset

# Pick the remote location and local file name of the negative and positive
# tweet files for the requested dataset size.
if full:
    # Negative full
    neg_url = 'https://api.onedrive.com/v1.0/shares/u!aHR0cHM6Ly8xZHJ2Lm1zL3QvcyFBclREZ3U5ejdJT1ZqcDQ0eDZMdDI5WXBlVXYyZGc_ZT1ZZDJn/root/content'
    neg_filename = root + 'train_neg_full_u.txt'
    # Positive full (the URL must be a bare URL: no leading '!wget ' shell prefix)
    pos_url = 'https://api.onedrive.com/v1.0/shares/u!aHR0cHM6Ly8xZHJ2Lm1zL3QvcyFBclREZ3U5ejdJT1ZqcDQzcTc3QmNPbUdIWHQ3TXc_ZT01ejdG/root/content'
    pos_filename = root + 'train_pos_full_u.txt'
else:
    # Negative small
    neg_url = 'https://api.onedrive.com/v1.0/shares/u!aHR0cHM6Ly8xZHJ2Lm1zL3QvcyFBclREZ3U5ejdJT1ZqcDQyeURtYWFXMzZoMnVEeGc_ZT1IMnhQ/root/content'
    neg_filename = root + 'train_neg_small_u.txt'
    # Positive small
    pos_url = 'https://api.onedrive.com/v1.0/shares/u!aHR0cHM6Ly8xZHJ2Lm1zL3QvcyFBclREZ3U5ejdJT1ZqcDQxYUNPOENKdTBrX19hY2c_ZT1WNW5Y/root/content'
    pos_filename = root + 'train_pos_small_u.txt'

# Download only what is missing. Re-running the cell would otherwise fetch the
# files again, and wget.download does not overwrite an existing target: it
# saves the new copy under a numbered name, so the old file would still be read.
for url, filename in ((neg_url, neg_filename), (pos_url, pos_filename)):
    if not os.path.exists(filename):
        wget.download(url, filename)
    

# Read both downloaded files into Python lists (positive first, then negative).
# NOTE(review): presumably one tweet per line; confirm in tfidf_models.txt_to_list.
pos_tweets, neg_tweets = (
    mod.txt_to_list(path) for path in (pos_filename, neg_filename)
)

# Merge the two classes, shuffle with the notebook seed and get the matching labels
all_tweets, y = mod.merge_shuffle_label(pos_tweets, neg_tweets, seed)

# Hold out the remaining (1 - training_fraction) of the data for validation
training_fraction = 0.9
train, val, y_train, y_val = mod.split_dataset(training_fraction, all_tweets, y)

### Compute TF-IDF (or bag-of-words) features

In [4]:
# Pick the text representation used for training and validation
TFIDF = True # Else use a bag of words representation

# Both representations use unigrams + bigrams and the project tokenizer mod.tk
if TFIDF:
    vect = TfidfVectorizer(
        tokenizer=mod.tk,
        ngram_range=(1, 2),
        use_idf=True,
        sublinear_tf=True,
    )
else:
    vect = CountVectorizer(tokenizer=mod.tk, ngram_range=(1, 2))

# The vocabulary is learned on the training tweets only; the validation tweets
# are projected onto that same vocabulary.
X_train = vect.fit_transform(train)
X_val = vect.transform(val)

## Comparing several classifiers on validation set

In [5]:
# Linear SVM (squared hinge loss, dual formulation) with a small C and a very
# tight stopping tolerance.
svc = LinearSVC(
    C=0.03,
    loss='squared_hinge',
    dual=True,
    tol=1e-9,
    random_state=0,
)
svc.fit(X_train, y_train)

# Report accuracy on the training and validation sets
mod.judge_pred(svc, X_train, X_val, y_train, y_val)

Training set accuracy: 86.69% / validation set: 85.12%


In [53]:
# Multinomial naive Bayes with default settings (fit returns the estimator itself)
mdb = MultinomialNB().fit(X_train, y_train)

# Report accuracy on the training and validation sets
mod.judge_pred(mdb, X_train, X_val, y_train, y_val)

Training set accuracy: 86.29% / validation set: 81.59%


In [54]:
# Bernoulli naive Bayes with default settings (fit returns the estimator itself)
bnb = BernoulliNB().fit(X_train, y_train)

# Report accuracy on the training and validation sets
mod.judge_pred(bnb, X_train, X_val, y_train, y_val)

Training set accuracy: 80.20% / validation set: 75.75%


In [13]:
# Linear model trained with stochastic gradient descent (modified Huber loss).
# SGD shuffles the training data between epochs, so the notebook seed is passed
# as random_state to make the reported accuracy reproducible across runs.
sgdc = linear_model.SGDClassifier(
    loss='modified_huber',
    tol=1e-9,
    alpha=0.0001,
    verbose=False,
    random_state=seed,
)
sgdc.fit(X_train, y_train)

# Report accuracy on the training and validation sets
mod.judge_pred(sgdc, X_train, X_val, y_train, y_val)

Training set accuracy: 83.71% / validation set: 83.25%


### Most useful tokens or bigrams to predict sentiment

In [6]:
clf = svc # Choose classifier with best performance on validation set

# Feature names in the same order as the columns of X_train.
# get_feature_names() was removed from recent scikit-learn releases in favour
# of get_feature_names_out(); use whichever the installed version provides.
if hasattr(vect, 'get_feature_names_out'):
    feature_names = vect.get_feature_names_out()
else:
    feature_names = vect.get_feature_names()

# One row per token/bigram holding the weight learned by the *chosen* classifier
# (read from clf, so changing the choice above changes the table as well).
df = pd.DataFrame(clf.coef_.flatten(), index=feature_names, columns=["coefficient"])

n_ = 10 # Number of tokens to show
pd.set_option('display.max_rows', n_)

print('{:d} strongest bigrams to indicate positive sentiment:'.format(n_))
# Largest weights first: these features push the decision towards the positive class
df.sort_values(by=["coefficient"],ascending=False).head(n_)


10 strongest bigrams to indicate positive sentiment:


Unnamed: 0,coefficient
),6.633215
( (,4.023364
( >,3.974658
can't wait,3.368455
> >,3.081738
thanks,2.82497
smile,2.599727
no problem,2.597634
cant wait,2.559933
hapy,2.426204


In [7]:
print('{:d} strongest bigrams to indicate negative sentiment:'.format(n_))

# Rank all features from largest to smallest weight, keep the n_ at the bottom
# of that ranking, then display them starting with the most negative one.
ranked_desc = df.sort_values(by=["coefficient"], ascending=False)
most_negative = ranked_desc.tail(n_)
most_negative.sort_values(by=["coefficient"], ascending=True)

10 strongest bigrams to indicate negative sentiment:


Unnamed: 0,coefficient
(,-15.245847
... <url>,-10.139855
sad,-5.654967
mis,-4.34151
por,-3.733259
wah,-3.633903
cry,-3.520836
crying,-3.395575
(8,-3.307706
sucks,-3.200722


### Most confident predictions

In [8]:
# One row per validation tweet: the signed decision-function score of the
# chosen classifier and the true label. The score column keeps the name
# 'coefficient' because the following cells query it under that name.
df = pd.DataFrame({
    'tweet': val,
    'coefficient': clf.decision_function(X_val),
    'label': y_val,
}).set_index('tweet')
n_ = 10
print('{:d} most confident correct predictions of positive tweets'.format(n_))
# A positive tweet (label 1) is correctly predicted only when its score is > 0;
# filtering on the sign guarantees every listed row really is a correct prediction.
df.query('label == 1 and coefficient > 0').sort_values(by='coefficient', ascending=False).head(n_)

10 most confident correct predictions of positive tweets


Unnamed: 0_level_0,coefficient,label
tweet,Unnamed: 1_level_1,Unnamed: 2_level_1
<user> thank you too ! )\n,3.878983,1
<user> thank you ! )\n,3.868325,1
<user> thank you cheegu ) )\n,3.816665,1
<user> thank you ! ! )\n,3.611729,1
<user> thank you . )\n,3.555496,1
<user> owkej thanks\n,3.536475,1
<user> hey thank you ! ) )\n,3.414597,1
happy birthday <user> all the best ! )\n,3.413621,1
<user> thaankk you lovee . ! )\n,3.380231,1
<user> thank you ! ! ! )\n,3.326548,1


In [9]:
print('{:d} most confident incorrect predictions of positive tweets'.format(n_))
# Positive tweets (label 1) are misclassified when their score is <= 0; the sign
# filter keeps correctly classified tweets out of the list even when there are
# fewer than n_ mistakes. Most negative score = most confident mistake.
df.query('label == 1 and coefficient <= 0').sort_values(by='coefficient', ascending=True).head(n_)

10 most confident incorrect predictions of positive tweets


Unnamed: 0_level_0,coefficient,label
tweet,Unnamed: 1_level_1,Unnamed: 2_level_1
seeing niall cry is so sad\n,-2.384333,1
i lost my phon ( no seriously i did where is it omg ; _\n,-2.365413,1
<user> what the pqo ? (\n,-2.303933,1
<user> why the sad face ? ! x\n,-2.141454,1
sleepy all the day fatni elarb 3a2 : ' ' ' ( < / / / 3\n,-2.110856,1
<user> i love youuu so much i neeed to see u again ! why did sunday have to end i wish i could go back to that day ! ! xxx\n,-2.06061,1
"gonna do this while i really want to do this : "" "" "" (\n",-2.003104,1
<user> follow me please (\n,-1.971156,1
<user> aww poor mummy\n,-1.968364,1
<user> aw damn it . sorry i can't eat my teeth hurts a lot ! next time\n,-1.962519,1


In [10]:
print('{:d} most confident correct predictions of negative tweets'.format(n_))
# A negative tweet (label 0) is correctly predicted only when its score is <= 0;
# the sign filter guarantees every listed row really is a correct prediction.
# Most negative score = most confident prediction.
df.query('label == 0 and coefficient <= 0').sort_values(by='coefficient', ascending=True).head(n_)

10 most confident correct predictions of negative tweets


Unnamed: 0_level_0,coefficient,label
tweet,Unnamed: 1_level_1,Unnamed: 2_level_1
the lastborn of elvinwood ( paperback <url>\n,-4.558267,0
i have a weadacheee ( ( wahhh\n,-4.258736,0
i miss <user> (\n,-4.25665,0
mis chichas\n,-4.1759,0
<user> when will you notice me ? i'm so sad (\n,-4.063822,0
this song is so sad ... #ripzack (\n,-3.98216,0
ughhh my tummy hurts really bad ( (\n,-3.968909,0
<user> didn't tweet me and i'm so sad ( (\n,-3.937913,0
<user> <user> <user> <user> <user> i miss you guys . ( ( i'm so sad . i want to just cry or something .\n,-3.861699,0
i feeel sooo bad for her . like its sooo sad ( ( ( aww\n,-3.857192,0


In [11]:
print('{:d} most confident incorrect predictions of negative tweets'.format(n_))
# Negative tweets (label 0) are misclassified when their score is > 0; the sign
# filter keeps correctly classified tweets out of the list even when there are
# fewer than n_ mistakes. Largest score = most confident mistake.
df.query('label == 0 and coefficient > 0').sort_values(by='coefficient', ascending=False).head(n_)

10 most confident incorrect predictions of negative tweets


Unnamed: 0_level_0,coefficient,label
tweet,Unnamed: 1_level_1,Unnamed: 2_level_1
<user> to machucada ) )\n,2.901979,0
<user> whatssofunny !\n,2.493353,0
<user> puxis\n,2.310252,0
<user> stoppostingpictureswiththatdeliciouscupcakeoriwilllbiteyoubecauseitlookssoawesomeandiwanttoeatit\n,2.310252,0
<user> #thatawkwardmomentwhenaugieinternettakesaneternitytoloadthevideo\n,2.310252,0
<user> wheheheheh\n,2.310252,0
<user> mthatha\n,2.310252,0
<user> ayudameee\n,2.310252,0
<user> sakina almskeenaelly btdawm\n,2.310252,0
"rt "" <user> happy birthday to #oomf <user> ) "" my favorite smiley !\n",2.130176,0
