In [1]:
%load_ext autoreload

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer 
from nltk.tokenize import TweetTokenizer
from sklearn.svm import LinearSVC
import helpers

In [2]:
# Load training set

full = True

if full: 
    pos_filename = 'twitter-datasets/train_pos_full_u.txt'
    neg_filename = 'twitter-datasets/train_neg_full_u.txt'
else: 
    pos_filename = 'twitter-datasets/train_pos_u.txt'
    neg_filename = 'twitter-datasets/train_neg_u.txt'


pos_tweets = helpers.txt_to_list(pos_filename)
neg_tweets = helpers.txt_to_list(neg_filename)

# Create a labeled dataset 
all_tweets, y = helpers.merge_shuffle_label(pos_tweets, neg_tweets)

# Split into train and validation sets
training_fraction = 0.8
train, val, y_train, y_val = helpers.split_dataset(training_fraction, all_tweets, y)

In [15]:
train.shape

(1816385,)

In [24]:
train.__getitem__(0)

"<user> i'm so upset your not going this weekend have fun at stage coach\n"

### Compute TF-IDF

In [5]:
import preprocessor as p
from preprocessing import process_sentence, to_vec, split_hashtag, words_to_tags, remove_repeats, detect_laugh

def tk(sent):
    tokens = p.tokenize(sent).split()
    return tokens

def tk2(sent):
    tokens = p.tokenize(sent).split()
    return pre.process_sentence(tokens, pre.preproc_pipeline)

preproc_pipeline = [to_vec(split_hashtag), 
                   to_vec(words_to_tags), 
                   to_vec(remove_repeats),
                   to_vec(detect_laugh)]

def tk3(sent):
    tokens = TweetTokenizer().tokenize(sent)
    return process_sentence(tokens, preproc_pipeline)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Matteo\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Matteo\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [7]:
# Build training vectorization 
### tfidf_vectorizer=TfidfVectorizer(use_idf=True, ngram_range = (1,2), tokenizer = TweetTokenizer().tokenize) 
tfidf_vectorizer=TfidfVectorizer(use_idf=True, ngram_range = (1,2), tokenizer = tk3)
X_train = tfidf_vectorizer.fit_transform(train)
X_val = tfidf_vectorizer.transform(val)

In [31]:
X_train.shape

(1816385, 4557026)

In [35]:
# Calculate the number of valid entries in each row
non_zero_by_row = np.diff(X_train.tocsr().indptr)
non_zero_by_row

array([27, 58, 31, ..., 26, 34, 39], dtype=int32)

In [36]:
non_zero_by_row.max()

84

In [28]:
np.array(tfidf_vectorizer.get_feature_names())[[ 617107,  633072,  798423,  807566, 1233576, 1825515, 1825657,
        1913157, 1914351, 2015694, 2018390, 2160663, 2164629, 2931135,
        2933905, 3662285, 3668963, 3736351, 3736443, 4003422, 4013937,
        4219311, 4219643, 4338174, 4338656, 4523796, 4530335]]

array(['alice', "alice i'm", 'at', 'at stage', 'coach', 'fun', 'fun at',
       'going', 'going this', 'have', 'have fun', "i'm", "i'm so", 'not',
       'not going', 'so', 'so upset', 'stage', 'stage coach', 'this',
       'this wekend', 'upset', 'upset your', 'wekend', 'wekend have',
       'your', 'your not'], dtype='<U163')

In [43]:
np.nonzero(tfidf_vectorizer.transform(["i am very sad about the outcome of the election"]).toarray())

(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64),
 array([ 541248,  546111,  671766,  674115, 1565513, 2145806, 2146282,
        2971121, 2993009, 3083452, 3083476, 3462241, 3462338, 3922041,
        3935086, 3950911, 4251646, 4252913], dtype=int64))

In [45]:
np.array(tfidf_vectorizer.get_feature_names())[[ 541248,  546111,  671766,  674115, 1565513, 2145806, 2146282,
        2971121, 2993009, 3083452, 3083476, 3462241, 3462338, 3922041,
        3935086, 3950911, 4251646, 4252913]]

array(['about', 'about the', 'am', 'am very', 'election', 'i', 'i am',
       'of', 'of the', 'outcome', 'outcome of', 'sad', 'sad about', 'the',
       'the election', 'the outcome', 'very', 'very sad'], dtype='<U163')

### Test a few classifiers on validation set

In [5]:
from sklearn.svm import LinearSVC

clf = LinearSVC(random_state=0, tol=1e-9, loss = 'squared_hinge', dual = True, C = 0.03)
clf.fit(X_train, y_train)

helpers.judge_pred(clf, X_train, X_val, y_train, y_val)

Training set accuracy: 86.57% / validation set: 85.11%


In [6]:
df = pd.DataFrame(clf.coef_.flatten(), index=tfidf_vectorizer.get_feature_names(), columns=["coefficient"]) 
n_ = 100
pd.set_option('display.max_rows', 100)

print('{:d} strongest bigrams to indicate positive sentiment'.format(n_))
df.sort_values(by=["coefficient"],ascending=False).head(n_)


100 strongest bigrams to indicate positive sentiment


Unnamed: 0,coefficient
),6.530061
( (,5.821362
( >,3.755154
can't wait,3.350554
> >,3.019469
thanks,2.823874
no problem,2.531068
cant wait,2.525956
smile,2.511491
hapy,2.375419


In [7]:
print('{:d} strongest bigrams to indicate negative sentiment'.format(n_))
df.sort_values(by=["coefficient"],ascending=False).tail(n_).sort_values(by=["coefficient"],ascending=True)

100 strongest bigrams to indicate negative sentiment


Unnamed: 0,coefficient
(,-14.557267
... [UNK],-10.545662
sad,-5.551473
mis,-4.22663
) ),-3.69416
wah,-3.612889
por,-3.609854
cry,-3.431948
crying,-3.282803
can't,-3.150341


In [8]:
df = pd.DataFrame.from_dict(dict(zip(['tweet', 'coefficient', 'label'], [val, clf.decision_function(X_val), y_val])))
df.set_index('tweet', inplace = True)
n_ = 10
print('{:d} most confident correct predictions of positive tweets'.format(n_))
df.query('label == 1').sort_values(by= 'coefficient', ascending = False).head(n_)

10 most confident correct predictions of positive tweets


Unnamed: 0_level_0,coefficient,label
tweet,Unnamed: 1_level_1,Unnamed: 2_level_1
fucklovesexmoneyonly ),6.686995,1
<user> thanks jirah ! ),4.656479,1
<user> thanks dianamite ! ),4.656479,1
<user> coool ! ) sure sure ! ) hahaha thanks ys ),4.050394,1
<user> thank youuu ! ) you too ! ),4.007262,1
<user> owkej thanks,3.539907,1
<user> haha ) yeahh . ilove you more ) lol you are more amazing and sweet ),3.446162,1
<user> <user> thank you ! ) ),3.429393,1
<user> thanks for follow me ) nice to know you ),3.408739,1
<user> thank you ),3.402971,1


In [9]:
print('{:d} most confident incorrect predictions of positive tweets'.format(n_))
df.query('label == 1').sort_values(by= 'coefficient', ascending = True).head(n_)

10 most confident incorrect predictions of positive tweets


Unnamed: 0_level_0,coefficient,label
tweet,Unnamed: 1_level_1,Unnamed: 2_level_1
the weeknd-thursday (,-10.5153,1
ohheythere ... <url>,-6.544099,1
<user> i know (,-3.195289,1
i want to cry,-2.718442,1
<user> ohh (,-2.692335,1
<user> miss you more (,-2.687864,1
<user> i want some (,-2.540041,1
its only tuesday damn ... (,-2.516003,1
its 5:11 ( #itsanalainerthing,-2.47946,1
justkidding no he didn't (,-2.463262,1


In [10]:
print('{:d} most confident correct predictions of negative tweets'.format(n_))
df.query('label == 0').sort_values(by= 'coefficient', ascending = True).head(n_)

10 most confident correct predictions of negative tweets


Unnamed: 0_level_0,coefficient,label
tweet,Unnamed: 1_level_1,Unnamed: 2_level_1
really sad i had to miss the #gsqaud game (,-4.745569,0
it's cold in here ( ( and i'm all alone ( ( i'm cold : ( ( ( guise i can't breathe ( i hate rummm !,-4.489991,0
ughh i hate this (,-4.317277,0
i miss this (,-4.14428,0
rt <user> ( ( ( sad until cannot sad fml,-4.119435,0
<user> omg come cuddle me i'm so sad that i can't go (,-4.097499,0
<user> that's so sad ( i'm sorry !,-4.062759,0
<user> wahhh i miss you more ( we haven't talked in forever . so sad,-3.996591,0
<user> when will you notice me ? i'm so sad (,-3.996374,0
miss my bed .. wish i was home (,-3.946296,0


In [11]:
print('{:d} most confident incorrect predictions of negative tweets'.format(n_))
df.query('label == 0').sort_values(by= 'coefficient', ascending = False).head(n_)

10 most confident incorrect predictions of negative tweets


Unnamed: 0_level_0,coefficient,label
tweet,Unnamed: 1_level_1,Unnamed: 2_level_1
<user> thanks ),3.272382,0
<user> ),3.103269,0
hey <user> thanks for the ff . happy friday ),2.817429,0
<user> happy birthday to him of course ),2.459054,0
<user> thanks megan ! ) x,2.382431,0
<user> stoppostingpictureswiththatdeliciouscupcakeoriwilllbiteyoubecauseitlookssoawesomeandiwanttoeatit,2.341095,0
<user> #dunnowhereuare,2.341095,0
mishing <user>,2.341095,0
<user>,2.341095,0
<user> twitteameee,2.341095,0


In [12]:
from sklearn.naive_bayes import MultinomialNB


    

mdb = MultinomialNB()
mdb.fit(X_train, y_train)

helpers.judge_pred(mdb, X_train, X_val, y_train, y_val)

Training set accuracy: 86.35% / validation set: 81.52%


In [7]:
from sklearn.naive_bayes import BernoulliNB


    

clf = BernoulliNB()
clf.fit(X_train, y_train)

helpers.judge_pred(clf, X_train, X_val, y_train, y_val)

Training set accuracy: 80.62% / validation set: 75.64%


In [8]:
from sklearn.naive_bayes import BernoulliNB


clf = BernoulliNB()
clf.fit(X_train, y_train)

helpers.judge_pred(clf, X_train, X_val, y_train, y_val)

Training set accuracy: 80.62% / validation set: 75.64%


In [9]:
from sklearn import linear_model

clf =  linear_model.SGDClassifier(loss = 'log', max_iter=int(1e7), tol=1e-5, verbose = False)
clf.fit(X_train, y_train)

helpers.judge_pred(clf, X_train, X_val, y_train, y_val)

Training set accuracy: 78.49% / validation set: 78.48%


In [10]:
from sklearn import linear_model

clf =  linear_model.Perceptron()
clf.fit(X_train, y_train)

helpers.judge_pred(clf, X_train, X_val, y_train, y_val)

Training set accuracy: 96.80% / validation set: 81.79%


In [12]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100, random_state=0)  
clf.fit(X_train, y_train)


helpers.judge_pred(clf, X_train, X_val, y_train, y_val)

MemoryError: Unable to allocate 397. MiB for an array with shape (52029241,) and data type float64

### Predictions

In [13]:
# Prepare test set
test_tweets = []
with open('twitter-datasets/test_data.txt', encoding = 'utf-8') as f:
    for line in f:
        sp = line.split(',')
        index = sp[0]
        test_tweets.append(','.join(sp[1:]))
        
# Compute tf-idf on full training set      
X_train_final = tfidf_vectorizer.fit_transform(all_tweets)
X_test = tfidf_vectorizer.transform(test_tweets)

# Check training accuracy
clf = LinearSVC(random_state=0, tol=1e-9, loss = 'squared_hinge', dual = True, C = 0.03)
clf.fit(X_train_final, y)

train_acc = (clf.predict(X_train_final) == y).mean()
print('Training set accuracy: {:.2f}%'.format(100*train_acc))

Training set accuracy: 86.68%


In [14]:

# Save predictions
save_filename = 'submissions/submission_tfidf.csv'
predictions = clf.predict(X_test)
helpers.save_pred(save_filename, predictions)

In [None]:
np.save('embeddings/tfidf_unique_full_train.npy', X_train_final)
np.save('embeddings/tfidf_unique_full_labels.npy', y)
np.save('embeddings/tfidf_unique_full_test.npy', X_test)

In [None]:
X_train_final = np.load('embeddings/tfidf_unique_full_train.npy', allow_pickle = True)
y = np.load('embeddings/tfidf_unique_full_labels.npy', allow_pickle = True)
X_test = np.load('embeddings/tfidf_unique_full_test.npy', allow_pickle = True)

In [None]:
X_train_final