In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score, precision_score,f1_score

%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

In [2]:
# The CSV has no header row, so column names are supplied explicitly (in file order).
cols = ['sentiment', 'id', 'date', 'query_string', 'user', 'text']
data = pd.read_csv(
    "data/train.csv",
    header=None,
    names=cols,
    encoding="ISO-8859-1",
)

In [3]:
# Preview the first rows of the freshly loaded frame.
data.head()

Unnamed: 0,sentiment,id,date,query_string,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [4]:
# Show column dtypes, non-null counts and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 6 columns):
sentiment       1600000 non-null int64
id              1600000 non-null int64
date            1600000 non-null object
query_string    1600000 non-null object
user            1600000 non-null object
text            1600000 non-null object
dtypes: int64(2), object(4)
memory usage: 73.2+ MB


In [5]:
# Count how often each distinct query_string value occurs.
data['query_string'].value_counts()

NO_QUERY    1600000
Name: query_string, dtype: int64

*`query_string` is a constant column, so we can drop it. The `id`, `date` and `user` columns
also do not help in determining the tweet sentiment.*

In [6]:
# Keep only the label and the tweet text; the remaining columns are not used as features.
data.drop(
    columns=['id', 'date', 'query_string', 'user'],
    inplace=True,
)

In [7]:
# Preview the frame again after the column drop.
data.head()

Unnamed: 0,sentiment,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [8]:
# Class balance of the sentiment labels as loaded.
data['sentiment'].value_counts()

4    800000
0    800000
Name: sentiment, dtype: int64

In [9]:
# Map every label greater than 0 to 1 so the target becomes binary (0 / 1).
is_positive = data['sentiment'] > 0
data.loc[is_positive, 'sentiment'] = 1

In [10]:
# Class balance of the sentiment labels after relabelling.
data['sentiment'].value_counts()

1    800000
0    800000
Name: sentiment, dtype: int64

# Data Preprocessing

In [11]:
# Words stripped from every tweet before vectorisation.
# NOTE(review): the list contains no negations ("no", "not", "nor", ...), which
# keeps them in the text -- presumably intentional for sentiment; confirm.
stopwords = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself'
             , 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself','they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that',
             "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and',
             'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'for','with', 'about', 'between', 'through', 'during', 'before', 'after', 'below',
             'to', 'from', 'up', 'in', 'out', 'on', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all','both', 'each', 'more',
             'most', 'other', 'some', 'such', 'same', 'so', 'than', 'very', 's', 't', 'can', 'will', 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y']

# Testing membership against the list is a linear scan per token; a frozenset
# gives constant-time lookups with identical results. `stopwords` itself stays
# a list so any other cell that uses it is unaffected.
stopword_set = frozenset(stopwords)

# Tokens are split on whitespace and compared case-insensitively; surviving
# tokens keep their original casing and are re-joined with single spaces.
data['text'] = data['text'].apply(
    lambda text: ' '.join(word for word in text.split() if word.lower() not in stopword_set)
)

# Data Splitting

In [12]:
from sklearn.model_selection import train_test_split

# Hold out 4% of the rows, then halve that hold-out into validation and test sets.
# Both splits use a fixed random_state so they are repeatable.
x_train, x_validation_and_test, y_train, y_validation_and_test = train_test_split(
    data.text, data.sentiment, test_size=.04, random_state=100
)
x_validation, x_test, y_validation, y_test = train_test_split(
    x_validation_and_test, y_validation_and_test, test_size=.5, random_state=100
)

In [13]:
# For each split print: total shape, shape of the positive rows, shape of the negative rows.
for _texts, _labels in ((x_train, y_train), (x_validation, y_validation), (x_test, y_test)):
    print(_texts.shape, _texts[_labels > 0].shape, _texts[_labels == 0].shape)

(1536000,) (768081,) (767919,)
(32000,) (15946,) (16054,)
(32000,) (15973,) (16027,)


# Count Vectorization

In [14]:
def eval_metric(prediction, truth):
    """Print recall, precision, F1 and the confusion matrix for a set of predictions.

    Args:
        prediction: predicted labels.
        truth: ground-truth labels, aligned element-wise with ``prediction``.

    The scikit-learn metric functions expect ``(y_true, y_pred)`` in that
    order, so the arguments are passed as ``(truth, prediction)``. Passing
    them the other way round swaps recall with precision and transposes the
    confusion matrix.
    """
    print("recall:       {}".format(recall_score(truth, prediction)))
    print("precision:    {}".format(precision_score(truth, prediction)))
    print("f1_score:     {}".format(f1_score(truth, prediction)))
    # Rows are true classes, columns are predicted classes.
    print("\nconfusion_matrix:\n",confusion_matrix(truth, prediction))

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

# CVec = CountVectorizer()
# num_of_features = [10000, 20000, 50000]
# for n_features in num_of_features:
    
#     CVec.set_params(max_features = n_features)
#     CVec.fit(x_train)
    
#     x_train_vector = CVec.transform(x_train)
#     x_validation_vector = CVec.transform(x_validation)
    
#     logreg = LogisticRegression()
#     logreg.fit(x_train_vector, y_train)
#     lr_prediction = logreg.predict(x_validation_vector)
#     eval_metric(lr_prediction, y_validation)

# Word2Vec

In [19]:
def wordlist(sentence):
    """Lower-case ``sentence`` and split it on whitespace into a list of tokens."""
    lowered = sentence.lower()
    return lowered.split()

# Tokenise every training tweet into a list of lower-cased words.
tweets = [wordlist(tweet) for tweet in x_train]
print(len(tweets))

1536000


In [20]:
import logging

# Emit INFO-level records formatted as "<timestamp> : <level> : <message>".
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

In [21]:
# Word2Vec hyper-parameters.
num_features = 300    # dimensionality of each word vector
min_word_count = 40   # words seen fewer times than this are dropped from the vocabulary
num_workers = 4       # number of parallel training threads
context = 10          # context window size
downsampling = 1e-3   # (0.001) down-sampling threshold for frequent words

from gensim.models import word2vec

# Train on the tokenised training tweets.
# NOTE(review): no seed is passed, so runs are presumably not repeatable -- confirm.
print("Training model....")
model = word2vec.Word2Vec(
    tweets,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
)

# Called to make the model more memory efficient.
model.init_sims(replace=True)

# Persist the model under a name that encodes its settings; reload with Word2Vec.load().
model_name = "300features_40minwords_10context"
model.save(model_name)

2018-11-20 19:48:42,105 : INFO : collecting all words and their counts
2018-11-20 19:48:42,106 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-11-20 19:48:42,128 : INFO : PROGRESS: at sentence #10000, processed 85040 words, keeping 26607 word types
2018-11-20 19:48:42,155 : INFO : PROGRESS: at sentence #20000, processed 170553 words, keeping 45177 word types
2018-11-20 19:48:42,178 : INFO : PROGRESS: at sentence #30000, processed 255916 words, keeping 61258 word types
2018-11-20 19:48:42,201 : INFO : PROGRESS: at sentence #40000, processed 341544 words, keeping 76099 word types
2018-11-20 19:48:42,233 : INFO : PROGRESS: at sentence #50000, processed 427514 words, keeping 90342 word types
2018-11-20 19:48:42,259 : INFO : PROGRESS: at sentence #60000, processed 513674 words, keeping 103981 word types
2018-11-20 19:48:42,283 : INFO : PROGRESS: at sentence #70000, processed 599555 words, keeping 116853 word types


Training model....


2018-11-20 19:48:42,314 : INFO : PROGRESS: at sentence #80000, processed 685390 words, keeping 129441 word types
2018-11-20 19:48:42,340 : INFO : PROGRESS: at sentence #90000, processed 771365 words, keeping 141582 word types
2018-11-20 19:48:42,368 : INFO : PROGRESS: at sentence #100000, processed 857442 words, keeping 153214 word types
2018-11-20 19:48:42,394 : INFO : PROGRESS: at sentence #110000, processed 943544 words, keeping 164618 word types
2018-11-20 19:48:42,429 : INFO : PROGRESS: at sentence #120000, processed 1029470 words, keeping 175829 word types
2018-11-20 19:48:42,455 : INFO : PROGRESS: at sentence #130000, processed 1114863 words, keeping 186863 word types
2018-11-20 19:48:42,481 : INFO : PROGRESS: at sentence #140000, processed 1200408 words, keeping 197538 word types
2018-11-20 19:48:42,507 : INFO : PROGRESS: at sentence #150000, processed 1286761 words, keeping 208259 word types
2018-11-20 19:48:42,535 : INFO : PROGRESS: at sentence #160000, processed 1373620 word

2018-11-20 19:48:44,476 : INFO : PROGRESS: at sentence #800000, processed 6868996 words, keeping 723964 word types
2018-11-20 19:48:44,509 : INFO : PROGRESS: at sentence #810000, processed 6953922 words, keeping 730528 word types
2018-11-20 19:48:44,542 : INFO : PROGRESS: at sentence #820000, processed 7040396 words, keeping 737259 word types
2018-11-20 19:48:44,572 : INFO : PROGRESS: at sentence #830000, processed 7126514 words, keeping 743871 word types
2018-11-20 19:48:44,601 : INFO : PROGRESS: at sentence #840000, processed 7212314 words, keeping 750531 word types
2018-11-20 19:48:44,631 : INFO : PROGRESS: at sentence #850000, processed 7298116 words, keeping 756788 word types
2018-11-20 19:48:44,659 : INFO : PROGRESS: at sentence #860000, processed 7382903 words, keeping 763092 word types
2018-11-20 19:48:44,690 : INFO : PROGRESS: at sentence #870000, processed 7469108 words, keeping 769544 word types
2018-11-20 19:48:44,721 : INFO : PROGRESS: at sentence #880000, processed 755471

2018-11-20 19:48:46,502 : INFO : PROGRESS: at sentence #1510000, processed 12958306 words, keeping 1145206 word types
2018-11-20 19:48:46,529 : INFO : PROGRESS: at sentence #1520000, processed 13044091 words, keeping 1150731 word types
2018-11-20 19:48:46,560 : INFO : PROGRESS: at sentence #1530000, processed 13130324 words, keeping 1156121 word types
2018-11-20 19:48:46,577 : INFO : collected 1159308 word types from a corpus of 13182160 raw words and 1536000 sentences
2018-11-20 19:48:46,578 : INFO : Loading a fresh vocabulary
2018-11-20 19:48:47,009 : INFO : effective_min_count=40 retains 19865 unique words (1% of original 1159308, drops 1139443)
2018-11-20 19:48:47,010 : INFO : effective_min_count=40 leaves 10697613 word corpus (81% of original 13182160, drops 2484547)
2018-11-20 19:48:47,061 : INFO : deleting the raw counts dictionary of 1159308 items
2018-11-20 19:48:47,092 : INFO : sample=0.001 downsamples 44 most-common words
2018-11-20 19:48:47,093 : INFO : downsampling leaves 

2018-11-20 19:49:34,521 : INFO : EPOCH 5 - PROGRESS: at 28.27% examples, 927500 words/s, in_qsize 7, out_qsize 0
2018-11-20 19:49:35,521 : INFO : EPOCH 5 - PROGRESS: at 37.29% examples, 919836 words/s, in_qsize 7, out_qsize 0
2018-11-20 19:49:36,526 : INFO : EPOCH 5 - PROGRESS: at 46.61% examples, 920404 words/s, in_qsize 7, out_qsize 0
2018-11-20 19:49:37,534 : INFO : EPOCH 5 - PROGRESS: at 56.24% examples, 925172 words/s, in_qsize 7, out_qsize 0
2018-11-20 19:49:38,541 : INFO : EPOCH 5 - PROGRESS: at 65.95% examples, 930022 words/s, in_qsize 7, out_qsize 0
2018-11-20 19:49:39,546 : INFO : EPOCH 5 - PROGRESS: at 75.58% examples, 932804 words/s, in_qsize 7, out_qsize 0
2018-11-20 19:49:40,548 : INFO : EPOCH 5 - PROGRESS: at 85.22% examples, 935438 words/s, in_qsize 7, out_qsize 0
2018-11-20 19:49:41,551 : INFO : EPOCH 5 - PROGRESS: at 94.71% examples, 935715 words/s, in_qsize 7, out_qsize 0
2018-11-20 19:49:42,075 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-

In [34]:
# Spot-check the embeddings: ask which of these words fits least with the others.
model.wv.doesnt_match("man woman dog child kitchen".split())

'kitchen'

In [35]:
# Spot-check the embeddings: words most similar to "man".
model.wv.most_similar("man")

[('woman', 0.5502930879592896),
 ('man.', 0.5406871438026428),
 ('man..', 0.5360029339790344),
 ('man,', 0.5036314725875854),
 ('boy', 0.5022791624069214),
 ('man!', 0.4940132200717926),
 ('man...', 0.4925822913646698),
 ('men', 0.4890204966068268),
 ('girl', 0.48486149311065674),
 ('dude', 0.47440433502197266)]

In [36]:
# Spot-check the embeddings: words most similar to "awful".
model.wv.most_similar("awful")

[('horrible', 0.8192330002784729),
 ('terrible', 0.8069216012954712),
 ('awful.', 0.7577972412109375),
 ('horrible.', 0.7542978525161743),
 ('terrible.', 0.7356346845626831),
 ('horrid', 0.7129672765731812),
 ('horrible,', 0.71146160364151),
 ('awful,', 0.7110568284988403),
 ('awful!', 0.7045435905456543),
 ('horrible!', 0.6523497104644775)]

In [37]:
# Shape of the embedding matrix: (vocabulary size, vector dimensionality).
# `vectors` is used in place of the deprecated `syn0` alias.
model.wv.vectors.shape

(19865, 300)

In [40]:
def featureVecMethod(words, model, num_features):
    """Average the word vectors of the in-vocabulary tokens of one document.

    Args:
        words: iterable of string tokens.
        model: trained word-vector model; only ``model.wv`` is used, for
            membership tests (``word in model.wv``) and vector lookup
            (``model.wv[word]``).
        num_features: dimensionality of the word vectors.

    Returns:
        A 1-D array of length ``num_features`` holding the mean of the
        vectors of all tokens found in the vocabulary. If no token is in
        the vocabulary (including empty ``words``), a float32 zero vector
        is returned instead of dividing by zero and yielding NaNs.
    """
    # Query the keyed vectors directly; this avoids rebuilding a set of the
    # whole vocabulary on every call.
    keyed_vectors = model.wv

    featureVec = np.zeros(num_features, dtype="float32")
    nwords = 0

    for word in words:
        if word in keyed_vectors:
            nwords += 1
            featureVec = np.add(featureVec, keyed_vectors[word])

    # Nothing to average: keep the zero vector rather than producing NaNs.
    if nwords == 0:
        return featureVec

    return np.divide(featureVec, nwords)

def getAvgFeatureVecs(tweets, model, num_features):
    """Build a (len(tweets), num_features) float32 matrix of per-tweet average vectors.

    Each row is the result of ``featureVecMethod`` for the tweet at the same
    position. A progress line is printed for every 1000th tweet.
    """
    total = len(tweets)
    tweetFeatureVecs = np.zeros((total, num_features), dtype="float32")

    for idx, tweet in enumerate(tweets):
        # Status message every 1000th tweet
        if idx % 1000 == 0:
            print("Review %d of %d"%(idx, total))

        tweetFeatureVecs[idx] = featureVecMethod(tweet, model, num_features)

    return tweetFeatureVecs

# Tokenise the training tweets and turn each one into its average word vector.
train_tweets = [wordlist(tweet) for tweet in x_train]

trainDataVecs = getAvgFeatureVecs(train_tweets, model, num_features)

Review 0 of 1536000
Review 1000 of 1536000
Review 2000 of 1536000
Review 3000 of 1536000
Review 4000 of 1536000
Review 5000 of 1536000
Review 6000 of 1536000
Review 7000 of 1536000
Review 8000 of 1536000
Review 9000 of 1536000
Review 10000 of 1536000
Review 11000 of 1536000
Review 12000 of 1536000
Review 13000 of 1536000
Review 14000 of 1536000
Review 15000 of 1536000
Review 16000 of 1536000
Review 17000 of 1536000
Review 18000 of 1536000
Review 19000 of 1536000
Review 20000 of 1536000
Review 21000 of 1536000
Review 22000 of 1536000
Review 23000 of 1536000
Review 24000 of 1536000
Review 25000 of 1536000
Review 26000 of 1536000
Review 27000 of 1536000
Review 28000 of 1536000
Review 29000 of 1536000
Review 30000 of 1536000
Review 31000 of 1536000
Review 32000 of 1536000
Review 33000 of 1536000
Review 34000 of 1536000
Review 35000 of 1536000
Review 36000 of 1536000
Review 37000 of 1536000
Review 38000 of 1536000
Review 39000 of 1536000
Review 40000 of 1536000
Review 41000 of 1536000
Revie

Review 333000 of 1536000
Review 334000 of 1536000
Review 335000 of 1536000
Review 336000 of 1536000
Review 337000 of 1536000
Review 338000 of 1536000
Review 339000 of 1536000
Review 340000 of 1536000
Review 341000 of 1536000
Review 342000 of 1536000
Review 343000 of 1536000
Review 344000 of 1536000
Review 345000 of 1536000
Review 346000 of 1536000
Review 347000 of 1536000
Review 348000 of 1536000
Review 349000 of 1536000
Review 350000 of 1536000
Review 351000 of 1536000
Review 352000 of 1536000
Review 353000 of 1536000
Review 354000 of 1536000
Review 355000 of 1536000
Review 356000 of 1536000
Review 357000 of 1536000
Review 358000 of 1536000
Review 359000 of 1536000
Review 360000 of 1536000
Review 361000 of 1536000
Review 362000 of 1536000
Review 363000 of 1536000
Review 364000 of 1536000
Review 365000 of 1536000
Review 366000 of 1536000
Review 367000 of 1536000
Review 368000 of 1536000
Review 369000 of 1536000
Review 370000 of 1536000
Review 371000 of 1536000
Review 372000 of 1536000


Review 661000 of 1536000
Review 662000 of 1536000
Review 663000 of 1536000
Review 664000 of 1536000
Review 665000 of 1536000
Review 666000 of 1536000
Review 667000 of 1536000
Review 668000 of 1536000
Review 669000 of 1536000
Review 670000 of 1536000
Review 671000 of 1536000
Review 672000 of 1536000
Review 673000 of 1536000
Review 674000 of 1536000
Review 675000 of 1536000
Review 676000 of 1536000
Review 677000 of 1536000
Review 678000 of 1536000
Review 679000 of 1536000
Review 680000 of 1536000
Review 681000 of 1536000
Review 682000 of 1536000
Review 683000 of 1536000
Review 684000 of 1536000
Review 685000 of 1536000
Review 686000 of 1536000
Review 687000 of 1536000
Review 688000 of 1536000
Review 689000 of 1536000
Review 690000 of 1536000
Review 691000 of 1536000
Review 692000 of 1536000
Review 693000 of 1536000
Review 694000 of 1536000
Review 695000 of 1536000
Review 696000 of 1536000
Review 697000 of 1536000
Review 698000 of 1536000
Review 699000 of 1536000
Review 700000 of 1536000


Review 989000 of 1536000
Review 990000 of 1536000
Review 991000 of 1536000
Review 992000 of 1536000
Review 993000 of 1536000
Review 994000 of 1536000
Review 995000 of 1536000
Review 996000 of 1536000
Review 997000 of 1536000
Review 998000 of 1536000
Review 999000 of 1536000
Review 1000000 of 1536000
Review 1001000 of 1536000
Review 1002000 of 1536000
Review 1003000 of 1536000
Review 1004000 of 1536000
Review 1005000 of 1536000
Review 1006000 of 1536000
Review 1007000 of 1536000
Review 1008000 of 1536000
Review 1009000 of 1536000
Review 1010000 of 1536000
Review 1011000 of 1536000
Review 1012000 of 1536000
Review 1013000 of 1536000
Review 1014000 of 1536000
Review 1015000 of 1536000
Review 1016000 of 1536000
Review 1017000 of 1536000
Review 1018000 of 1536000
Review 1019000 of 1536000
Review 1020000 of 1536000
Review 1021000 of 1536000
Review 1022000 of 1536000
Review 1023000 of 1536000
Review 1024000 of 1536000
Review 1025000 of 1536000
Review 1026000 of 1536000
Review 1027000 of 15360

Review 1305000 of 1536000
Review 1306000 of 1536000
Review 1307000 of 1536000
Review 1308000 of 1536000
Review 1309000 of 1536000
Review 1310000 of 1536000
Review 1311000 of 1536000
Review 1312000 of 1536000
Review 1313000 of 1536000
Review 1314000 of 1536000
Review 1315000 of 1536000
Review 1316000 of 1536000
Review 1317000 of 1536000
Review 1318000 of 1536000
Review 1319000 of 1536000
Review 1320000 of 1536000
Review 1321000 of 1536000
Review 1322000 of 1536000
Review 1323000 of 1536000
Review 1324000 of 1536000
Review 1325000 of 1536000
Review 1326000 of 1536000
Review 1327000 of 1536000
Review 1328000 of 1536000
Review 1329000 of 1536000
Review 1330000 of 1536000
Review 1331000 of 1536000
Review 1332000 of 1536000
Review 1333000 of 1536000
Review 1334000 of 1536000
Review 1335000 of 1536000
Review 1336000 of 1536000
Review 1337000 of 1536000
Review 1338000 of 1536000
Review 1339000 of 1536000
Review 1340000 of 1536000
Review 1341000 of 1536000
Review 1342000 of 1536000
Review 13430