In [1]:
%load_ext autoreload

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.feature_extraction.text import CountVectorizer 
from nltk.tokenize import TweetTokenizer
from sklearn.svm import LinearSVC
import helpers



In [2]:
# Load the labelled training tweets.
# `full` switches between the complete corpus and the smaller sample files.
full = True

pos_filename, neg_filename = (
    ('twitter-datasets/train_pos_full_u.txt', 'twitter-datasets/train_neg_full_u.txt')
    if full else
    ('twitter-datasets/train_pos_u.txt', 'twitter-datasets/train_neg_u.txt')
)

# Read the positive file first, then the negative one.
pos_tweets, neg_tweets = [helpers.txt_to_list(name) for name in (pos_filename, neg_filename)]

# Combine both classes into one dataset: the tweets and their labels `y`.
all_tweets, y = helpers.merge_shuffle_label(pos_tweets, neg_tweets)

# Hold out part of the data for validation; `training_fraction` is the share
# passed to helpers.split_dataset for the training part.
training_fraction = 0.8
train, val, y_train, y_val = helpers.split_dataset(training_fraction, all_tweets, y)

### Compute TF-IDF

In [3]:
import preprocessor as p
from preprocessing import process_sentence, to_vec, split_hashtag, remove_repeats, remove_informal_contractions

def tk(sent):
    """Return the whitespace-separated pieces of ``p.tokenize(sent)`` as a list."""
    return p.tokenize(sent).split()

def tk2(sent):
    """Tokenize ``sent`` with ``p.tokenize`` and post-process the tokens.

    The whitespace-split output of ``p.tokenize`` is passed through
    ``process_sentence`` together with the ``preprocessing`` module's own
    ``preproc_pipeline``.

    Parameters
    ----------
    sent : str
        Raw tweet text.

    Returns
    -------
    Whatever ``preprocessing.process_sentence`` returns for the token list.
    """
    # `pre` was never bound in this notebook, so every call raised NameError.
    # Bind it locally to the module the notebook already imports from.
    import preprocessing as pre

    tokens = p.tokenize(sent).split()
    # NOTE(review): assumes `preprocessing` exposes a module-level
    # `preproc_pipeline` -- confirm against the module.
    return pre.process_sentence(tokens, pre.preproc_pipeline)

# Steps handed to process_sentence by tk3, in this order.
preproc_pipeline = [to_vec(split_hashtag),
                    to_vec(remove_repeats)]

# Built once and reused: tk3 is the vectorizers' tokenizer and therefore runs
# for every tweet, so constructing a tokenizer per call is wasted work.
_tweet_tokenizer = TweetTokenizer()

def tk3(sent):
    """Tokenize ``sent`` with NLTK's TweetTokenizer and apply ``preproc_pipeline``.

    Parameters
    ----------
    sent : str
        Raw tweet text.

    Returns
    -------
    The result of ``process_sentence(tokens, preproc_pipeline)``.
    """
    tokens = _tweet_tokenizer.tokenize(sent)
    return process_sentence(tokens, preproc_pipeline)

# Disabled alternative pipeline, kept verbatim for reference.
# NOTE(review): presumably this was meant to become `preproc_pipeline2` (used by
# tk4 below); `words_to_tags` is not among the names imported in this cell -- confirm.
#preproc_pipeline = [to_vec(split_hashtag), 
                  # to_vec(words_to_tags), 
                  # to_vec(remove_repeats)]

def tk4(sent):
    """Tokenize ``sent`` with NLTK's TweetTokenizer and apply ``preproc_pipeline2``.

    NOTE(review): ``preproc_pipeline2`` is not defined in any visible cell, so
    calling this function raises NameError unless it is defined elsewhere -- confirm.
    """
    tokens = TweetTokenizer().tokenize(sent)
    return process_sentence(tokens, preproc_pipeline2)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\alexf\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\alexf\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Build training vectorization
# TFIDF selects TF-IDF weighting; otherwise raw n-gram counts are used.
TFIDF = True

if TFIDF:
    vect = TfidfVectorizer(use_idf=True, ngram_range=(1, 2), tokenizer=tk3, sublinear_tf=False)
else:
    vect = CountVectorizer(ngram_range=(1, 2), tokenizer=tk3)

# The vocabulary is fitted on the training split only and then reused
# unchanged to transform the validation split.
X_train = vect.fit_transform(train)
X_val = vect.transform(val)

KeyboardInterrupt: 

### Test a few classifiers on validation set

In [None]:
# Linear SVM on the vectorized training split (LinearSVC is imported in the first cell).
clf = LinearSVC(
    random_state=0,
    tol=1e-9,
    loss='squared_hinge',
    dual=True,
    C=0.03,
).fit(X_train, y_train)

# Evaluate on both the training and the validation split.
helpers.judge_pred(clf, X_train, X_val, y_train, y_val)

In [19]:
# One row per vocabulary entry, holding the weight the linear classifier gives it.
# `get_feature_names` was removed in scikit-learn 1.2; use its replacement when
# the installed version provides it and fall back otherwise.
if hasattr(vect, 'get_feature_names_out'):
    feature_names = vect.get_feature_names_out()
else:
    feature_names = vect.get_feature_names()

df = pd.DataFrame(clf.coef_.flatten(), index=feature_names, columns=["coefficient"])
n_ = 100
pd.set_option('display.max_rows', 100)

print('{:d} strongest bigrams to indicate positive sentiment'.format(n_))
# nlargest returns the top rows in descending order without sorting the whole frame.
df.nlargest(n_, "coefficient")


100 strongest bigrams to indicate positive sentiment


Unnamed: 0,coefficient
( >,2.001165
no problem,1.044102
( (,1.00477
ways to make me hapy,0.899695
not mising,0.881708
you get major points if,0.867333
),0.857437
( ^,0.839051
can't wait,0.81774
cant wait,0.789983


In [20]:
print('{:d} strongest bigrams to indicate negative sentiment'.format(n_))
# The n_ most negative weights, most negative first. nsmallest replaces the
# previous "sort descending, take the tail, sort ascending" chain, which
# sorted the entire vocabulary twice to obtain the same rows.
df.nsmallest(n_, "coefficient")

100 strongest bigrams to indicate negative sentiment


Unnamed: 0,coefficient
(8,-1.205499
=) ),-1.107342
(,-1.094412
sad twet,-1.043841
not nice,-0.935314
) >,-0.92566
its sad that,-0.90703
sadest,-0.9062
sad,-0.887672
) ),-0.881423


In [21]:
# Validation tweets with their decision-function score and true label,
# indexed by the tweet text.
df = pd.DataFrame({
    'tweet': val,
    'coefficient': clf.decision_function(X_val),
    'label': y_val,
}).set_index('tweet')
n_ = 10
print('{:d} most confident correct predictions of positive tweets'.format(n_))
# Tweets labelled 1, highest score first.
df[df['label'] == 1].sort_values(by='coefficient', ascending=False).head(n_)

10 most confident correct predictions of positive tweets


Unnamed: 0_level_0,coefficient,label
tweet,Unnamed: 1_level_1,Unnamed: 2_level_1
today's gonna be a good day . 4/20 4/20 4/20 4/20 4/20 4/20 4/20 4/20 4/20 4/20 happy four tweenyyy stoners\n,6.540538,1
yay yay yay yay ) ) chris said he'd sing me a song tomorrow if i go ) woohooo whooohoo but its a surprise which song : b\n,5.781296,1
<user> awww ur welcome i can tell ur so excited i would be too ! ! ) i am happy for you girl ! ! ) ) <3\n,4.830495,1
<user> goodmorning and you have a nice day as well ! thank you and god bless you ! keep that smile on your face ) proverbs 15:13\n,4.736957,1
<user> <user> please greet my cousin ) raya ) we are listening right now ) and please play the song young wild and free ) ...\n,4.674764,1
let's just enjoy life .. smile and don't let anyone ruin your day . ) ) ) smile my dear ) have a great day ! ! imissyou <3\n,4.652457,1
<user> please greet my cousin ) raya ) we are listening right now ) and please play the song young wild and free )\n,4.588453,1
"<user> wants that grace , grace , grace , grace , grace , grace , grace , grace , grace , grace , grace , grace , grace , grace , grace !\n",4.570932,1
makasii tata .. ) rt <user> happy birthdayy <user> ! hope your day is filled with happiness . god bless ! )\n,4.528263,1
"<user> finally ! ! ! best scene ever ! i got butterflies in my stomach ! thank you , thank you , thank youu ! ) elena and damon is trending :d\n",4.43042,1


In [22]:
print('{:d} most confident incorrect predictions of positive tweets'.format(n_))
# Tweets labelled 1, lowest score first.
df[df['label'] == 1].sort_values(by='coefficient', ascending=True).head(n_)

10 most confident incorrect predictions of positive tweets


Unnamed: 0_level_0,coefficient,label
tweet,Unnamed: 1_level_1,Unnamed: 2_level_1
( subject homonyms ( word dew ( assignment use the word in a sentence ( student i wear a dew rag . - - - really ? ! ? !\n,-3.945326,1
"i just wish i could hear that you got beat tf up so bad that your in the hospital , bloody , broken bones , all that hatee this nigga .\n",-3.51909,1
"my timeline is full of sick people that include me .. ader sakit kepala , sakit mata , sakit nak demam .. eh eh sakit hati pon ader ..\n",-3.351556,1
"pride aside , i miss him . i miss what we were & was hoping for a friendship . but it didnt work like that . ( miss that baton too\n",-3.29097,1
crying crying crying crying crying <user> <3 #lifeonmurs\n,-3.263616,1
i woke up crying ... also i was late for school and forgot my glasses at home so i had to cross this mean old mans garden but he let me go\n,-3.136128,1
<user> i hate science ! my friend and i were just talking about how we hate it . it's so hard and confusing and ughh i hate it !\n,-3.133745,1
<user> no i'm not gay ! the spider was huge and just gross . i cried because the show was sad . sorry im an emotional guy #hopoff\n,-3.073798,1
my tan is fading fast . this maketh me sad ... and i don't like feeling sad . some 1 send me some jokes\n,-3.022196,1
<user> how sad you never wish my birthday yesterday .. mind to follow me back it just a dreaming to get wish from you . #120\n,-2.924709,1


In [23]:
print('{:d} most confident correct predictions of negative tweets'.format(n_))
# Tweets labelled 0, lowest score first.
df[df['label'] == 0].sort_values(by='coefficient', ascending=True).head(n_)

10 most confident correct predictions of negative tweets


Unnamed: 0_level_0,coefficient,label
tweet,Unnamed: 1_level_1,Unnamed: 2_level_1
it's cold in here ( ( and i'm all alone ( ( i'm cold : ( ( ( guise i can't breathe ( i hate rummm !\n,-7.723273,0
seriously huhu this is so depressing ( ( ( still watching ollg vids ( ( ( wooo ( ( ( may 10 huhuhuhu pls come back\n,-6.298245,0
smart debt solution : will credit counseling help improve ... : bank ( 4 bankruptcy ( 4 budget ( 23 business ( 1 ... <url>\n,-5.966434,0
"lonely i'm so lonely i have nobody to call my own i'm so lonely , i'm mr . lonely i have nobody to call my own i'm so lonely\n",-5.762936,0
no no no ! ! why did mike haveto die ? ? ? i've never cried so much in all my life ! dh wont be the same without him ! ! the funeral was soo sad\n,-5.700586,0
"hikaru no go , vol . 14 ( hikaru no go ( graphic novels ) ) ( paperback r to l ( japanese style ) after stumbling across ... <url>\n",-5.674365,0
"<user> yep apparently ( ( day ruined ( ( do u have any suggestion to give me ? i can't send dms anymore , don't know what to do ( (\n",-5.606302,0
why must sleep torture me with nightmares ugh this day sucks never been so heart broke in my life i need someone here please help me\n,-5.605954,0
tour depression tour depression tour depression ! can't believe the 4d tour is over in london whyy ? ! can't it just last forever ?\n,-5.539437,0
rose to miss rest of season with torn acl ( yahoo ! sports chicago ( ap ) bulls star derrick rose will miss the re ... <url>\n,-5.5067,0


In [24]:
print('{:d} most confident incorrect predictions of negative tweets'.format(n_))
# Tweets labelled 0, highest score first.
df[df['label'] == 0].sort_values(by='coefficient', ascending=False).head(n_)

10 most confident incorrect predictions of negative tweets


Unnamed: 0_level_0,coefficient,label
tweet,Unnamed: 1_level_1,Unnamed: 2_level_1
"my younger sister , has got her birthday today ! so we ve a big party ! ! :/ ) tomorrow for her :/ / ) ! ! ! happy . birthday . :/ ) <3\n",4.797551,0
<user> 1 ) you're humble 2 ) cool hair 3 ) realistic 4 ) kickass taste in music 5 ) not a spammer ... i don't have enough characters\n,3.264121,0
"<user> <user> lmao mhmmm , 2strokez - - - gotta say it fast ---> that all i got , that all i got . ! ) .. lol na ty will get the job done . !\n",3.252798,0
"<user> <user> thank you both for your words , i appreciate it greatly ! thank you ! ! )\n",3.136788,0
"me singing * if that's ya hoe , thats my hoe too * ) smiley : ayeee ! that's that shit\n",3.020325,0
"boy rubbing on leg ) so wats ur horoscope sign ? girl : "" stop "" boy moves hand ) so wat is it ? girl : nigga the red sign that says stop cuz u need 2\n",2.983551,0
"<user> hahahahah ! christie not'a happy bunny with twitter uhh'ohh hahah ! im seeing you tonightt , yay yaay yaaay ! ! ! #loveyou\n",2.787386,0
"lol "" <user> .. ) i tweet a fuck of a lot sometimes . so while it's great to meet you on twitter , you will unfollow me . happy , haters ? ""\n",2.783875,0
"you : hi person :d o i know u you : no but i know u person , wth ( calls police ) you get arrested ) you get out of jail ) person restraning order )\n",2.687425,0
hey <user> thanks for the ff . happy friday )\n,2.673956,0


In [12]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes on the vectorized training split.
mdb = MultinomialNB().fit(X_train, y_train)

# Evaluate on both the training and the validation split.
helpers.judge_pred(mdb, X_train, X_val, y_train, y_val)

Training set accuracy: 86.35% / validation set: 81.52%


In [7]:
from sklearn.naive_bayes import BernoulliNB

# Bernoulli naive Bayes on the vectorized training split (rebinds `clf`).
clf = BernoulliNB().fit(X_train, y_train)

# Evaluate on both the training and the validation split.
helpers.judge_pred(clf, X_train, X_val, y_train, y_val)

Training set accuracy: 80.62% / validation set: 75.64%


In [8]:
from sklearn.naive_bayes import GaussianNB


# Gaussian naive Bayes on the same features (rebinds `clf`).
# NOTE(review): GaussianNB expects dense input, while X_train comes from the
# vectorizer's fit_transform (presumably a SciPy sparse matrix) -- confirm this
# cell actually runs. The recorded accuracy is identical to the BernoulliNB
# cell's, which suggests the saved output is stale.
clf = GaussianNB()
clf.fit(X_train, y_train)

# Evaluate on both the training and the validation split.
helpers.judge_pred(clf, X_train, X_val, y_train, y_val)

Training set accuracy: 80.62% / validation set: 75.64%


In [28]:
from sklearn import linear_model

# Linear model with hinge loss trained by stochastic gradient descent.
# SGD shuffles the training data, so the seed is pinned (as for the other
# seeded models in this notebook) to make the reported accuracy reproducible.
clf = linear_model.SGDClassifier(
    loss='hinge',
    max_iter=int(1e7),
    tol=1e-5,
    verbose=False,
    random_state=0,
)
clf.fit(X_train, y_train)

# Evaluate on both the training and the validation split.
helpers.judge_pred(clf, X_train, X_val, y_train, y_val)

Training set accuracy: 85.34% / validation set: 84.44%


In [10]:
from sklearn import linear_model

# Perceptron with library defaults on the vectorized training split (rebinds `clf`).
clf = linear_model.Perceptron().fit(X_train, y_train)

# Evaluate on both the training and the validation split.
helpers.judge_pred(clf, X_train, X_val, y_train, y_val)

Training set accuracy: 96.80% / validation set: 81.79%


In [12]:
from sklearn.ensemble import RandomForestClassifier

# Random forest on the vectorized training split (rebinds `clf`).
# NOTE: the recorded run of this cell ended in a MemoryError while allocating
# an array with one float64 entry per feature (shape (52029241,)).
forest_params = {'n_estimators': 100, 'random_state': 0}
clf = RandomForestClassifier(**forest_params)
clf.fit(X_train, y_train)

# Evaluate on both the training and the validation split.
helpers.judge_pred(clf, X_train, X_val, y_train, y_val)

MemoryError: Unable to allocate 397. MiB for an array with shape (52029241,) and data type float64

### Predictions

In [10]:
# Compute tf-idf on full training set
# (refits `vect` on every labelled tweet, not just the training split).
vect = TfidfVectorizer(use_idf=True, ngram_range=(1, 2), tokenizer=tk3, sublinear_tf=False)
X_train_final = vect.fit_transform(all_tweets)

# Fit the final classifier on all labelled tweets (rebinds `clf`).
# NOTE(review): C = 0.01 here, whereas the LinearSVC evaluated on the
# validation split used C = 0.03 -- confirm which value is intended.
clf = LinearSVC(random_state=0, tol=1e-9, loss='squared_hinge', dual=True, C=0.01)
clf.fit(X_train_final, y)

# Check training accuracy
train_acc = (clf.predict(X_train_final) == y).mean()
print('Training set accuracy: {:.2f}%'.format(100*train_acc))

1
2
Training set accuracy: 84.93%


In [11]:
# Prepare test set
# Each line of the test file is "<index>,<tweet>"; only the tweet text is kept.
with open('twitter-datasets/test_data.txt', encoding = 'utf-8') as f:
    # Cut at the first comma only, so commas inside the tweet stay untouched.
    # A line without any comma yields an empty tweet, as before.
    test_tweets = [line.partition(',')[2] for line in f]

# Score the test tweets with the fitted vectorizer and classifier, then store
# the raw decision-function values.
X_test = vect.transform(test_tweets)
df = pd.DataFrame(clf.decision_function(X_test), columns = ['Decision_function'])
df.to_csv('stage2data/pred_test_tfidf.csv')


PermissionError: [Errno 13] Permission denied: 'stage2data/pred_test_tfidf.csv'

In [20]:
# Score the tweets of a training file with the fitted vectorizer and classifier.
# (The file read here is the negative training set, not the test set.)
with open('stage2data/train_neg_full_u.txt', encoding = 'utf-8') as f: # stage2data/train_all_full_u.txt
    tweets = list(f)  # one entry per line, newline included

X = vect.transform(tweets)

# Store the raw decision-function values and the feature matrix.
df = pd.DataFrame(clf.decision_function(X), columns = ['Decision_function'])
df.to_csv('stage2data/pred_train_neg_u_tfidf.csv')
# NOTE(review): X comes from vect.transform and is presumably a SciPy sparse
# matrix; np.save would then pickle it inside an object array. Confirm how the
# file is read downstream (scipy.sparse.save_npz may be the better fit).
np.save('stage2data/tfidf_train_neg_u.npy', X)
