In [12]:
%load_ext autoreload

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer 
from nltk.tokenize import TweetTokenizer
from sklearn.svm import LinearSVC
import helpers



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [23]:
# Load training set

# Switch between the "full" files and the smaller files.
full = True

pos_filename, neg_filename = (
    ('twitter-datasets/train_pos_full_u.txt', 'twitter-datasets/train_neg_full_u.txt')
    if full
    else ('twitter-datasets/train_pos_u.txt', 'twitter-datasets/train_neg_u.txt')
)

# Read each class through the project helper (positive first, then negative).
pos_tweets, neg_tweets = (
    helpers.txt_to_list(pos_filename),
    helpers.txt_to_list(neg_filename),
)

# Combine both classes into one collection of tweets plus the matching labels y.
# NOTE(review): shuffling / label encoding happen inside helpers.merge_shuffle_label
# and are not visible here — confirm whether the shuffle is seeded.
all_tweets, y = helpers.merge_shuffle_label(pos_tweets, neg_tweets)

# Hold out part of the data for validation; training_fraction is handed to the helper.
training_fraction = 0.8
train, val, y_train, y_val = helpers.split_dataset(training_fraction, all_tweets, y)

### Compute TF-IDF

In [24]:
import preprocessor as p

def tk(sent):
    """Tokenize one tweet for the TF-IDF vectorizer.

    Passes ``sent`` through ``p.tokenize`` and splits the result on
    whitespace, returning the resulting list of tokens.
    """
    return p.tokenize(sent).split()

def tk2(sent):
    """Tokenize one tweet, then run the tokens through a processing pipeline.

    The tweet goes through ``p.tokenize`` and a whitespace split; the token
    list is then handed to ``pre.process_sentence`` together with
    ``pre.preproc_pipeline``, and that call's result is returned.

    NOTE(review): ``pre`` is not imported in any cell visible in this
    notebook, so calling this function raises NameError unless ``pre`` is
    defined elsewhere — confirm which module it should be.
    """
    words = p.tokenize(sent).split()
    return pre.process_sentence(words, pre.preproc_pipeline)

In [25]:
# Build training vectorization: the vocabulary and IDF weights are fitted on
# the training split only and then reused unchanged for the validation split.
# (An earlier variant of this cell passed tokenizer=TweetTokenizer().tokenize.)
tfidf_vectorizer = TfidfVectorizer(
    use_idf=True,
    ngram_range=(1, 2),  # unigrams and bigrams
    tokenizer=tk,
)
X_train = tfidf_vectorizer.fit_transform(train)
X_val = tfidf_vectorizer.transform(val)

### Test a few classifiers on validation set

In [26]:
from sklearn.svm import LinearSVC

# Linear SVM on the TF-IDF features; the same hyper-parameters are reused
# later for the final model trained on all tweets.
clf = LinearSVC(
    random_state=0,
    tol=1e-9,
    loss='squared_hinge',
    dual=True,
    C=0.03,
)
clf.fit(X_train, y_train)

# Report train / validation accuracy through the project helper.
helpers.judge_pred(clf, X_train, X_val, y_train, y_val)

Training set accuracy: 86.29% / validation set: 84.91%


In [27]:
# TfidfVectorizer.get_feature_names() was removed in scikit-learn 1.2 in favour
# of get_feature_names_out(); prefer the new accessor and fall back to the old
# one so the cell runs on both old and new releases.
get_names = getattr(tfidf_vectorizer, 'get_feature_names_out', None)
if get_names is None:
    get_names = tfidf_vectorizer.get_feature_names

# One row per vocabulary entry, holding the weight the linear model learned for it.
df = pd.DataFrame(clf.coef_.flatten(), index=get_names(), columns=["coefficient"])
n_ = 100
# Let the notebook render all n_ rows instead of truncating the table.
pd.set_option('display.max_rows', n_)

# NOTE: the vectorizer uses ngram_range=(1, 2), so the ranking contains unigrams
# as well as bigrams even though the header below says "bigrams".
print('{:d} strongest bigrams to indicate positive sentiment'.format(n_))
df.sort_values(by=["coefficient"], ascending=False).head(n_)


100 strongest bigrams to indicate positive sentiment


Unnamed: 0,coefficient
),6.451487
( (,5.496134
( >,3.664865
can't wait,3.245022
> >,2.96717
thanks,2.870058
good,2.619271
smile,2.522988
happy,2.412821
cant wait,2.403861


In [28]:
print('{:d} strongest bigrams to indicate negative sentiment'.format(n_))
# Take the last n_ rows of the descending ranking (the lowest coefficients),
# then re-sort them so the most negative weight is listed first.
(
    df.sort_values(by=["coefficient"], ascending=False)
      .tail(n_)
      .sort_values(by=["coefficient"], ascending=True)
)

100 strongest bigrams to indicate negative sentiment


Unnamed: 0,coefficient
(,-14.028547
... <url>,-10.217622
sad,-5.647014
miss,-4.303458
poor,-3.767273
) ),-3.612997
$SMILEY$ ),-3.561889
cry,-3.451833
crying,-3.242551
sucks,-3.199294


In [29]:
# Table of validation tweets, indexed by tweet text. Despite its name, the
# 'coefficient' column holds clf.decision_function(X_val), i.e. one signed
# score per tweet; 'label' is the ground-truth value from y_val.
# Note: this rebinds `df`, which previously held the per-feature weights.
df = pd.DataFrame({
    'tweet': val,
    'coefficient': clf.decision_function(X_val),
    'label': y_val,
}).set_index('tweet')
n_ = 10
print('{:d} most confident correct predictions of positive tweets'.format(n_))
# Tweets labelled 1, highest decision score first.
df[df['label'] == 1].sort_values(by='coefficient', ascending=False).head(n_)

10 most confident correct predictions of positive tweets


Unnamed: 0_level_0,coefficient,label
tweet,Unnamed: 1_level_1,Unnamed: 2_level_1
<user> thanks daylan ! ),4.600689,1
<user> thanks jirah ! ),4.600689,1
<user> thanks mcsnake ! ),4.600689,1
<user> thanks ! ),3.799298,1
from awesomepictures.me ) <url>,3.691281,1
<user> thank you cheegu ) ),3.595871,1
<user> follbackyaa thanks,3.556296,1
<user> awwuuh thanks,3.556296,1
<user> huhrhurhurhur thanks,3.556296,1
<user> thank you . ),3.484261,1


In [30]:
print('{:d} most confident incorrect predictions of positive tweets'.format(n_))
# Tweets labelled 1, lowest decision score first (no explicit filter on
# whether the prediction was actually wrong).
df[df['label'] == 1].sort_values(by='coefficient').head(n_)

10 most confident incorrect predictions of positive tweets


Unnamed: 0_level_0,coefficient,label
tweet,Unnamed: 1_level_1,Unnamed: 2_level_1
wingnthings ( (,-5.434482,1
<user> 14 ( wkwkwk okayy ( tell me by today yah tabb (,-2.863263,1
gheheheh <user> such a sad story ... but ... <url>,-2.692928,1
<user> miss you more (,-2.651623,1
justkidding no he didn't (,-2.392851,1
<user> ughhh :/ i wish i did but i got home at 8: 30 and it was closed . i miss workk < / 3 working saturday right ?,-2.386928,1
"rt <user> so .. for everbody that hurt , sad , mad , cry , feel bad , etc . because of me .. please forgive me : "" ( no body perfec",-2.384424,1
<user> aww really (,-2.357661,1
<user> thanks gutted i missed glee though . not gutted i missed rachel though #stuckuphorse,-2.345663,1
i wanna talk to #oomf ( ( (,-2.317131,1


In [31]:
print('{:d} most confident correct predictions of negative tweets'.format(n_))
# Tweets labelled 0, lowest decision score first.
df[df['label'] == 0].sort_values(by='coefficient').head(n_)

10 most confident correct predictions of negative tweets


Unnamed: 0_level_0,coefficient,label
tweet,Unnamed: 1_level_1,Unnamed: 2_level_1
wish i was at aviciii (,-4.966417,0
<user> sigghhhss ( (,-4.892636,0
rt <user> nfl draft 1st rd by state : tx ( 5 tn ( 3 al ( 2 ca ( 2 fl ( 2 ok ( 2 oh ( 2 11 states with ( 1 ),-4.851107,0
it's cold in here ( ( and i'm all alone ( ( i'm cold : ( ( ( guise i can't breathe ( i hate rummm !,-4.694607,0
the lastborn of elvinwood ( paperback <url>,-4.361577,0
some other countries in this low category include haiti ( 0.454 zimbabwe ( 0.376 sudan ( 0.408 sierra leone ( ... <url>,-4.16741,0
woke up early . ( 7 hours omg (,-4.033661,0
this movie is making me cry < / 3 ( (,-4.003928,0
<user> i wish i was ( (,-3.965345,0
i miss them (,-3.931083,0


In [32]:
print('{:d} most confident incorrect predictions of negative tweets'.format(n_))
# Tweets labelled 0, highest decision score first (no explicit filter on
# whether the prediction was actually wrong).
df[df['label'] == 0].sort_values(by='coefficient', ascending=False).head(n_)

10 most confident incorrect predictions of negative tweets


Unnamed: 0_level_0,coefficient,label
tweet,Unnamed: 1_level_1,Unnamed: 2_level_1
<user> thank you ),3.452199,0
<user> ),3.068503,0
<user> 7abeeebty thank you,2.910433,0
<user> happy birthday to him of course ),2.31908,0
<user> twitteameee,2.288357,0
<user> i-d-i-o-t-a,2.288357,0
<user> mthatha,2.288357,0
<user> chuta,2.288357,0
<user> thanks #miss you,2.162552,0
<user> thanks rhyno,2.129335,0


In [6]:
from sklearn.naive_bayes import MultinomialNB


    

# Multinomial naive Bayes with default settings on the TF-IDF features.
clf = MultinomialNB().fit(X_train, y_train)

# Report train / validation accuracy through the project helper.
helpers.judge_pred(clf, X_train, X_val, y_train, y_val)

Training set accuracy: 86.71% / validation set: 81.65%


In [7]:
from sklearn.naive_bayes import BernoulliNB


    

# Bernoulli naive Bayes with default settings on the TF-IDF features.
clf = BernoulliNB().fit(X_train, y_train)

# Report train / validation accuracy through the project helper.
helpers.judge_pred(clf, X_train, X_val, y_train, y_val)

Training set accuracy: 80.62% / validation set: 75.64%


In [8]:
from sklearn.naive_bayes import BernoulliNB


# NOTE(review): this cell repeats the preceding BernoulliNB experiment verbatim
# (same model, same data, same recorded accuracy); consider deleting one of them.
clf = BernoulliNB().fit(X_train, y_train)

# Report train / validation accuracy through the project helper.
helpers.judge_pred(clf, X_train, X_val, y_train, y_val)

Training set accuracy: 80.62% / validation set: 75.64%


In [9]:
from sklearn import linear_model

# Linear model with logistic loss, trained by stochastic gradient descent.
# SGD reshuffles the training data every epoch, so random_state is fixed to
# make the reported accuracies reproducible (the LinearSVC and RandomForest
# experiments in this notebook are seeded the same way).
# NOTE: loss='log' is the spelling accepted by older scikit-learn releases;
# from 1.1 on it is called 'log_loss', and 'log' is rejected from 1.3.
clf = linear_model.SGDClassifier(
    loss='log',
    max_iter=int(1e7),
    tol=1e-5,
    verbose=False,
    random_state=0,
)
clf.fit(X_train, y_train)

# Report train / validation accuracy through the project helper.
helpers.judge_pred(clf, X_train, X_val, y_train, y_val)

Training set accuracy: 78.49% / validation set: 78.48%


In [10]:
from sklearn import linear_model

# Perceptron baseline. Training shuffles the data each epoch, so random_state
# is fixed to make the reported accuracies reproducible (the LinearSVC and
# RandomForest experiments in this notebook are seeded the same way).
clf = linear_model.Perceptron(random_state=0)
clf.fit(X_train, y_train)

# Report train / validation accuracy through the project helper.
helpers.judge_pred(clf, X_train, X_val, y_train, y_val)

Training set accuracy: 96.80% / validation set: 81.79%


In [12]:
from sklearn.ensemble import RandomForestClassifier
# Random forest (100 trees, fixed seed) on the same TF-IDF features.
# NOTE(review): the recorded run of this cell ended in a MemoryError (see the
# cell output), so no accuracy is available for this model. Either drop the
# cell or shrink the feature space before retrying.
clf = RandomForestClassifier(n_estimators=100, random_state=0)  
clf.fit(X_train, y_train)


# Report train / validation accuracy through the project helper.
helpers.judge_pred(clf, X_train, X_val, y_train, y_val)

MemoryError: Unable to allocate 397. MiB for an array with shape (52029241,) and data type float64

### Predictions

In [None]:
# Prepare test set
# Each line is split at its first comma: the leading field (a row index) is
# dropped and everything after it, including any further commas, is kept as
# the tweet text. A line without a comma yields an empty string.
with open('twitter-datasets/test_data.txt', encoding='utf-8') as f:
    test_tweets = [line.partition(',')[2] for line in f]

# Compute tf-idf on full training set.
# Use the same tokenizer (tk) as the validation experiment, so the model that
# produces the submission is the configuration that was actually evaluated.
tfidf_vectorizer = TfidfVectorizer(use_idf=True, ngram_range=(1, 2), tokenizer=tk)
X_train_final = tfidf_vectorizer.fit_transform(all_tweets)
X_test = tfidf_vectorizer.transform(test_tweets)

# Refit the linear SVM on every labelled tweet, with the same hyper-parameters
# as in the validation run.
clf = LinearSVC(random_state=0, tol=1e-9, loss='squared_hinge', dual=True, C=0.03)
clf.fit(X_train_final, y)

# Check training accuracy (sanity check only: measured on the data the model was fitted on).
train_acc = (clf.predict(X_train_final) == y).mean()
print('Training set accuracy: {:.2f}%'.format(100*train_acc))

In [None]:

# Predict labels for the test tweets and write them out through the project helper.
predictions = clf.predict(X_test)
save_filename = 'submissions/submission_tfidf.csv'
helpers.save_pred(save_filename, predictions)

In [None]:
# Write the TF-IDF features and the labels to .npy files (the next cell reads them back).
# NOTE(review): X_train_final and X_test come from TfidfVectorizer; if they are
# SciPy sparse matrices, np.save pickles each one inside a 0-d object array
# instead of writing a numeric array, so np.load(..., allow_pickle=True) returns
# a wrapper that needs .item() to get the matrix back. scipy.sparse.save_npz /
# load_npz would be the purpose-built alternative — confirm before switching,
# since the loading cell must change at the same time.
np.save('embeddings/tfidf_unique_full_train.npy', X_train_final)
np.save('embeddings/tfidf_unique_full_labels.npy', y)
np.save('embeddings/tfidf_unique_full_test.npy', X_test)

In [None]:
def _unwrap_object_array(arr):
    """Return the object held by a 0-d object array, or ``arr`` itself otherwise.

    ``np.save`` stores a non-ndarray object (for example a SciPy sparse
    matrix) by pickling it inside a 0-d array of dtype ``object``; ``np.load``
    then returns that wrapper rather than the original object. Ordinary
    numeric arrays are passed through unchanged.
    """
    if arr.dtype == object and arr.ndim == 0:
        return arr.item()
    return arr


# allow_pickle=True unpickles arbitrary objects: only load files produced by
# this notebook, never files from an untrusted source.
X_train_final = _unwrap_object_array(
    np.load('embeddings/tfidf_unique_full_train.npy', allow_pickle=True))
y = _unwrap_object_array(
    np.load('embeddings/tfidf_unique_full_labels.npy', allow_pickle=True))
X_test = _unwrap_object_array(
    np.load('embeddings/tfidf_unique_full_test.npy', allow_pickle=True))

In [None]:
# Display the reloaded training features as a quick check of what np.load returned.
X_train_final