In [1]:
%load_ext autoreload

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer 
from nltk.tokenize import TweetTokenizer
from sklearn.svm import LinearSVC
import helpers



In [2]:
# Load training set.
# Flip USE_FULL_DATASET to True to read the '*_full_u.txt' files instead of
# the default '*_u.txt' ones (previously done by swapping path strings that
# were parked inside a bare triple-quoted string).
USE_FULL_DATASET = False
dataset_suffix = '_full_u' if USE_FULL_DATASET else '_u'

pos_filename = 'twitter-datasets/train_pos{}.txt'.format(dataset_suffix)
neg_filename = 'twitter-datasets/train_neg{}.txt'.format(dataset_suffix)

pos_tweets = helpers.txt_to_list(pos_filename)
neg_tweets = helpers.txt_to_list(neg_filename)

# Create a labeled dataset
# NOTE(review): merge_shuffle_label presumably shuffles with its own RNG;
# confirm it is seeded, otherwise the split below changes on every run.
all_tweets, y = helpers.merge_shuffle_label(pos_tweets, neg_tweets)

# Split into train and validation sets
training_fraction = 0.8
train, val, y_train, y_val = helpers.split_dataset(training_fraction, all_tweets, y)

In [3]:
import preprocessing as pre

# Sanity check: push the whitespace-split first positive tweet through the
# preprocessing pipeline and display the resulting tokens.
first_pos_tokens = pos_tweets[0].split()
pre.process_sentence(first_pos_tokens, pre.preproc_pipeline)

array(['-', '-', '-', 'the', 'type', 'of', 'girl', 'to', 'change', 'your',
       'life'], dtype='<U6')

### Compute TF-IDF

In [6]:
import preprocessor as p

def tk(sent):
    """Tokenizer callable: run ``p.tokenize`` on ``sent`` and split on whitespace.

    Args:
        sent: a single tweet, as accepted by ``p.tokenize``.

    Returns:
        The list of whitespace-separated pieces of ``p.tokenize(sent)``.
    """
    return p.tokenize(sent).split()

def tk2(sent):
    """Tokenizer callable: ``p.tokenize`` + whitespace split + project pipeline.

    Args:
        sent: a single tweet, as accepted by ``p.tokenize``.

    Returns:
        Whatever ``pre.process_sentence`` returns when the whitespace-split
        output of ``p.tokenize(sent)`` is run through ``pre.preproc_pipeline``.
    """
    raw_tokens = p.tokenize(sent).split()
    cleaned = pre.process_sentence(raw_tokens, pre.preproc_pipeline)
    return cleaned

In [9]:
# Build training vectorization: the vocabulary and IDF weights are fitted on
# the training split only; the validation split is projected onto them.
# Earlier tokenizer choice, kept for reference:
#   tokenizer = TweetTokenizer().tokenize
tfidf_vectorizer = TfidfVectorizer(
    use_idf=True,
    ngram_range=(1, 2),  # unigrams and bigrams
    tokenizer=tk2,
)
X_train = tfidf_vectorizer.fit_transform(train)
X_val = tfidf_vectorizer.transform(val)

### Test a few classifiers on validation set

In [10]:
from sklearn.svm import LinearSVC

# Linear SVM on the TF-IDF features; hyper-parameters gathered in one place.
svc_params = dict(random_state=0, tol=1e-9, loss='squared_hinge', dual=True, C=0.03)
clf = LinearSVC(**svc_params)
clf.fit(X_train, y_train)

# Report accuracy on both splits.
helpers.judge_pred(clf, X_train, X_val, y_train, y_val)

Training set accuracy: 84.72% / validation set: 81.39%


In [52]:
# One row per vocabulary entry, holding the weight the fitted linear model
# assigned to it.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    feature_names = tfidf_vectorizer.get_feature_names()

df = pd.DataFrame(clf.coef_.flatten(), index=feature_names, columns=["coefficient"])
n_ = 100
# Let pandas render all n_ rows instead of truncating the table.
pd.set_option('display.max_rows', n_)

# NOTE(review): the vectorizer is built with ngram_range=(1, 2), so the table
# mixes unigrams and bigrams even though the message says "bigrams".
print('{:d} strongest bigrams to indicate positive sentiment'.format(n_))
df.sort_values(by=["coefficient"],ascending=False).head(n_)


100 strongest bigrams to indicate positive sentiment


Unnamed: 0,coefficient
),3.320037
user,2.483102
thanks,1.961815
good,1.910654
cannot wait,1.887231
love,1.81563
haha,1.583769
you,1.511493
lol,1.477279
happy,1.464632


In [53]:
print('{:d} strongest bigrams to indicate negative sentiment'.format(n_))
# Take the n_ lowest weights from the descending ranking, then flip them so
# the most negative entry is shown first.
ranked_desc = df.sort_values(by=["coefficient"], ascending=False)
ranked_desc.tail(n_).sort_values(by=["coefficient"], ascending=True)

100 strongest bigrams to indicate negative sentiment


Unnamed: 0,coefficient
(,-7.857613
[UNK] resource,-5.856171
sad,-3.191328
miss,-2.785188
cannot,-2.076585
sorry,-1.985599
why,-1.912932
wish,-1.879968
hate,-1.849319
did not,-1.787197


In [58]:
# One row per validation tweet. The 'coefficient' column holds the
# classifier's decision_function value for that tweet.
df = pd.DataFrame({
    'tweet': val,
    'coefficient': clf.decision_function(X_val),
    'label': y_val,
})
n_ = 10
print('{:d} most confident correct predictions of positive tweets'.format(n_))
# Positive-labelled tweets with the highest decision scores.
positive_rows = df[df['label'] == 1]
positive_rows.sort_values(by='coefficient', ascending=False).head(n_)

10 most confident correct positive predictions


Unnamed: 0,tweet,coefficient,label
8457,<user> corriii ! )\n,3.000802,1
24754,<user> e7temal abee 3ek wasba 2ek\n,2.517777,1
33903,<user> s'when s'you s'can\n,2.517777,1
1335,<user> wheewww\n,2.517777,1
9943,<user> thankyouthankyouthankyou\n,2.517777,1
9173,<user> wowitscyrus !\n,2.42388,1
14995,<user> holaaa !\n,2.42388,1
4484,<user> if we vote for <user> will you love us ...,2.304681,1
32350,follow <user> for a follow back ! ) ) )\n,2.283133,1
26082,<user> thank you love )\n,2.268742,1


In [62]:
print('{:d} most confident incorrect predictions of positive tweets'.format(n_))
# Positive-labelled tweets with the lowest decision scores.
positive_rows = df[df['label'] == 1]
positive_rows.sort_values(by='coefficient', ascending=True).head(n_)

10 most confident incorrect predictions of positive tweets


Unnamed: 0,tweet,coefficient,label
35823,<user> okkk ( ( and youre welcome ( (\n,-1.984514,1
28669,i miss that mommenttt\n,-1.821187,1
23245,i really really really really like you <3 33 (...,-1.710164,1
17499,( ( did i miss something ? < _ < )\n,-1.619557,1
31321,i's so pretty at da prom ( ( ( <url>\n,-1.530166,1
22547,the fact that i dont care no more ! ! ( (\n,-1.47744,1
31230,( subject homonyms ( word dew ( assignment use...,-1.471885,1
6874,i miss you more than you miss me cause i know ...,-1.355661,1
4192,( ( ( hush up ! <user> comeon this me you talk...,-1.266385,1
26739,"<user> i'm so sad , i miss you tons ! do you s...",-1.257487,1


In [63]:
print('{:d} most confident correct predictions of negative tweets'.format(n_))
# Negative-labelled tweets with the lowest decision scores.
negative_rows = df[df['label'] == 0]
negative_rows.sort_values(by='coefficient', ascending=True).head(n_)

10 most confident correct predictions of negative tweets


Unnamed: 0,tweet,coefficient,label
24302,headache please naman ( ( (\n,-3.252306,0
26939,please conwi <user> ( ( (\n,-3.059332,0
16545,so sad cerita the vow tadi ( ( (\n,-2.819726,0
21146,( ( ( rt <user> wifey dont know what a shone i...,-2.748266,0
5707,i really wish i had my phone ... i kinda need ...,-2.722168,0
21561,<user> no ( ( anty balik esk ... monday ada me...,-2.701241,0
10047,i want to be back in bed ( ( (\n,-2.696749,0
21177,<user> i miss her so badly ( (\n,-2.685108,0
12826,can't do this ( (\n,-2.683372,0
34662,i have school tomorrow ( ( (\n,-2.646809,0


In [65]:
print('{:d} most confident incorrect predictions of negative tweets'.format(n_))
# Negative-labelled tweets with the highest decision scores.
negative_rows = df[df['label'] == 0]
negative_rows.sort_values(by='coefficient', ascending=False).head(n_)

10 most confident incorrect predictions of negative tweets


Unnamed: 0,tweet,coefficient,label
19148,<user> tobe tobe asdfgh\n,2.517777,0
27714,<user> thanks 7beebtii <3\n,2.05214,0
4105,<user> thank you jessa\n,1.954062,0
3712,<user> thanks bitch :d\n,1.700754,0
7951,<user> thank oo's\n,1.503295,0
19976,<user> lol thank you sir\n,1.432536,0
13517,<user> hey hey be nice .\n,1.273699,0
30578,<user> we can't wait for the new photos to be ...,1.263667,0
32298,<user> hahah i love that tweet .\n,1.239638,0
26423,<user> the mr ng dw make costume for meee . ) ...,1.207022,0


In [6]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes with default settings on the TF-IDF features.
clf = MultinomialNB().fit(X_train, y_train)

helpers.judge_pred(clf, X_train, X_val, y_train, y_val)

Training set accuracy: 86.71% / validation set: 81.65%


In [7]:
from sklearn.naive_bayes import BernoulliNB

# Bernoulli naive Bayes with default settings on the TF-IDF features.
clf = BernoulliNB().fit(X_train, y_train)

helpers.judge_pred(clf, X_train, X_val, y_train, y_val)

Training set accuracy: 80.62% / validation set: 75.64%


In [8]:
from sklearn.naive_bayes import BernoulliNB

# NOTE(review): this cell repeats the BernoulliNB experiment from the cell
# just above (same estimator, same data); one of the two can be dropped.
clf = BernoulliNB().fit(X_train, y_train)

helpers.judge_pred(clf, X_train, X_val, y_train, y_val)

Training set accuracy: 80.62% / validation set: 75.64%


In [9]:
import sklearn
from sklearn import linear_model

# scikit-learn 1.1 renamed the logistic loss from 'log' to 'log_loss' (the old
# spelling was removed in 1.3), so pick the name this installation accepts.
sk_major, sk_minor = (int(part) for part in sklearn.__version__.split('.')[:2])
logistic_loss = 'log_loss' if (sk_major, sk_minor) >= (1, 1) else 'log'

# SGD shuffles the training data between epochs; pin the seed (same value as
# the LinearSVC cells) so the reported accuracy is reproducible.
clf = linear_model.SGDClassifier(
    loss=logistic_loss,
    max_iter=int(1e7),
    tol=1e-5,
    verbose=False,
    random_state=0,
)
clf.fit(X_train, y_train)

helpers.judge_pred(clf, X_train, X_val, y_train, y_val)

Training set accuracy: 78.49% / validation set: 78.48%


In [10]:
from sklearn import linear_model

# Perceptron with default settings on the TF-IDF features.
clf = linear_model.Perceptron().fit(X_train, y_train)

helpers.judge_pred(clf, X_train, X_val, y_train, y_val)

Training set accuracy: 96.80% / validation set: 81.79%


In [12]:
from sklearn.ensemble import RandomForestClassifier

# NOTE(review): the recorded run of this cell ended in a MemoryError; consider
# shrinking the feature space before fitting, or dropping the experiment.
forest_params = dict(n_estimators=100, random_state=0)
clf = RandomForestClassifier(**forest_params)
clf.fit(X_train, y_train)

helpers.judge_pred(clf, X_train, X_val, y_train, y_val)

MemoryError: Unable to allocate 397. MiB for an array with shape (52029241,) and data type float64

### Predictions

In [None]:
# Prepare test set: drop the leading comma-separated field of every line (the
# original code named it `index`) and keep the rest as the tweet text.
# partition() splits on the first comma only, so commas inside the tweet
# survive, and a line without any comma yields an empty string.
with open('twitter-datasets/test_data.txt', encoding = 'utf-8') as f:
    test_tweets = [line.partition(',')[2] for line in f]

# Compute tf-idf on full training set
# NOTE(review): the validation cells build their vectorizer with
# tokenizer=tk2, whereas this one uses TweetTokenizer; confirm which tokenizer
# the submission is meant to use, since the validation scores only describe
# the tk2 variant.
tfidf_vectorizer=TfidfVectorizer(use_idf=True, ngram_range = (1,2), tokenizer = TweetTokenizer().tokenize)
X_train_final = tfidf_vectorizer.fit_transform(all_tweets)
X_test = tfidf_vectorizer.transform(test_tweets)

# Check training accuracy
clf = LinearSVC(random_state=0, tol=1e-9, loss = 'squared_hinge', dual = True, C = 0.03)
clf.fit(X_train_final, y)

train_acc = (clf.predict(X_train_final) == y).mean()
print('Training set accuracy: {:.2f}%'.format(100*train_acc))

In [None]:

# Predict on the vectorized test set and write the submission file.
predictions = clf.predict(X_test)
save_filename = 'submissions/submission_tfidf.csv'
helpers.save_pred(save_filename, predictions)

In [None]:
# Persist the full-training-set features, the labels, and the test features.
# NOTE(review): X_train_final and X_test come straight from TfidfVectorizer,
# i.e. presumably SciPy sparse matrices; np.save stores such objects as a
# pickled 0-d object array, so loading them back needs allow_pickle=True
# and .item().
artifacts = [
    ('embeddings/tfidf_unique_full_train.npy', X_train_final),
    ('embeddings/tfidf_unique_full_labels.npy', y),
    ('embeddings/tfidf_unique_full_test.npy', X_test),
]
for artifact_path, artifact in artifacts:
    np.save(artifact_path, artifact)

In [None]:
def _load_saved(path):
    """Load an array written by ``np.save``, unwrapping pickled objects.

    ``np.save`` stores a non-array object (such as a SciPy sparse matrix) as a
    0-d object array; ``np.load`` hands that wrapper back, so ``.item()`` is
    needed to get at the original object. Regular arrays are returned as-is.

    Args:
        path: path of the ``.npy`` file to read.

    Returns:
        The stored object for 0-d object arrays, otherwise the loaded ndarray.
    """
    # allow_pickle=True executes pickle on load: only use on files produced
    # by this notebook, never on untrusted input.
    loaded = np.load(path, allow_pickle = True)
    if loaded.dtype == object and loaded.ndim == 0:
        return loaded.item()
    return loaded


X_train_final = _load_saved('embeddings/tfidf_unique_full_train.npy')
y = _load_saved('embeddings/tfidf_unique_full_labels.npy')
X_test = _load_saved('embeddings/tfidf_unique_full_test.npy')

In [None]:
# Display the reloaded training features to check what came back from disk.
X_train_final