In [1]:
#setting the Keras backend as Tensorflow
import os
os.environ['KERAS_BACKEND'] = 'tensorflow'
import keras as ks

Using TensorFlow backend.


In [2]:
import pandas as pd # provide sql-like data manipulation tools. very handy.
pd.options.mode.chained_assignment = None
import numpy as np # high dimensional vector computing library.
from copy import deepcopy
from string import punctuation
from random import shuffle

import gensim
from gensim.models.word2vec import Word2Vec # the word2vec model gensim class
LabeledSentence = gensim.models.doc2vec.LabeledSentence # we'll talk about this down below

from tqdm import tqdm
tqdm.pandas(desc="progress-bar")

from nltk.tokenize import TweetTokenizer # a tweet tokenizer from nltk.
tokenizer = TweetTokenizer()

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer



In [3]:
#function for data preprocessing
def ingest(path='C:\\Users\\bhumi\\Desktop\\repo\\NLP\\twitter_Sentiment_Train.csv'):
    """Load the tweet sentiment CSV and return a cleaned DataFrame.

    Parameters
    ----------
    path : str, optional
        Location of the CSV file (read as latin1). It must contain the
        columns 'Sentiment' and 'SentimentText'. The default is the location
        that was previously hard-coded, so ``ingest()`` keeps working.

    Returns
    -------
    pandas.DataFrame
        Rows with a usable label and text, re-indexed from 0. 'Sentiment' is
        recoded to integers: 4 -> 1 and 0 -> 0.
    """
    data = pd.read_csv(path, encoding="latin1")
    # Recode first: any label other than 0/4 (and any missing label) becomes
    # NaN, so a single null filter afterwards removes every unusable row.
    # Filtering before the recode let unmapped labels through as NaN.
    data = data.assign(Sentiment=data['Sentiment'].map({4: 1, 0: 0}))
    data = data.dropna(subset=['Sentiment', 'SentimentText'])
    # The mapped column is float whenever a NaN was present; restore ints.
    data = data.assign(Sentiment=data['Sentiment'].astype(int))
    # drop=True discards the old index instead of adding and then deleting
    # an 'index' column.
    data = data.reset_index(drop=True)
    print('dataset loaded with shape', data.shape)
    return data

# Load the cleaned tweets, preview the first five rows and remember how many
# rows were loaded.
data = ingest()
print(data.head())  # head() shows 5 rows by default
n = len(data)



dataset loaded with shape (1048575, 2)
   Sentiment                                      SentimentText
0          0  is upset that he can't update his Facebook by ...
1          0  @Kenichan I dived many times for the ball. Man...
2          0    my whole body feels itchy and like its on fire 
3          0  @nationwideclass no, it's not behaving at all....
4          0                      @Kwesidei not the whole crew 


In [4]:
#tokenizing function that splits each tweet into tokens and removes user mentions, hashtags and urls
def tokenize(tweet):
    """Lower-case a tweet and split it into tokens, dropping noise tokens.

    Tokens that start with '@' (user mentions), '#' (hashtags) or 'http'
    (URLs) are removed.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    list of str, or the string 'NC'
        The remaining tokens, or the sentinel 'NC' when the tweet could not
        be tokenized (for example when it is not a string). postprocess()
        filters rows on this sentinel.
    """
    try:
        tokens = tokenizer.tokenize(tweet.lower())
        # str.startswith accepts a tuple, so one pass replaces three filters.
        return [t for t in tokens if not t.startswith(('@', '#', 'http'))]
    except Exception:
        # Catch Exception rather than everything: a bare except would also
        # swallow KeyboardInterrupt/SystemExit, so interrupting the long
        # tokenization run would just mark tweets as 'NC' and carry on.
        return 'NC'

#The results of the tokenization should now be cleaned to remove lines with 'NC', resulting from a tokenization error
def postprocess(data, n=1600000):
    """Tokenize the first ``n`` tweets and drop the ones that failed.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a 'SentimentText' column. It is not modified.
    n : int, optional
        Maximum number of leading rows to process.

    Returns
    -------
    pandas.DataFrame
        A new frame with an added 'tokens' column, without the rows whose
        tokenization produced the 'NC' sentinel, re-indexed from 0.
    """
    # Work on an explicit copy: assigning a column to the result of head()
    # is a chained assignment that only passed silently because the notebook
    # disables that warning globally.
    subset = data.head(n).copy()
    # progress_map is tqdm's variant of Series.map that shows a progress bar.
    subset['tokens'] = subset['SentimentText'].progress_map(tokenize)
    subset = subset[subset['tokens'] != 'NC']
    # drop=True discards the old index instead of adding and then deleting
    # an 'index' column.
    return subset.reset_index(drop=True)

# Tokenize all n loaded rows and keep only the tweets that tokenized cleanly.
# NOTE: `data` is rebound to the processed frame, so re-running this cell
# tokenizes the already-processed frame again.
data = postprocess(data,n)



progress-bar: 100%|████████████████████████████████████████████████████████| 1048575/1048575 [02:17<00:00, 7602.39it/s]


In [5]:
#Split the tokenized tweets and their labels into train (80%) and test (20%) sets.
#The word2vec model below is built from the training part only.
#A fixed random_state makes the split, and every result derived from it,
#reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(np.array(data.head(n).tokens),
                                                    np.array(data.head(n).Sentiment),
                                                    test_size=0.2, random_state=42)



In [6]:
def labelizeTweets(tweets, label_type):
    """Wrap every tokenized tweet in a LabeledSentence with a unique tag.

    Parameters
    ----------
    tweets : iterable
        Tokenized tweets, one token sequence per tweet.
    label_type : str
        Tag prefix; the tweet at position i is tagged '<label_type>_<i>'.

    Returns
    -------
    list
        One LabeledSentence per tweet, in input order.
    """
    return [
        LabeledSentence(words, ['%s_%s' % (label_type, position)])
        for position, words in tqdm(enumerate(tweets))
    ]

# Wrap the train/test token lists in LabeledSentence objects tagged
# TRAIN_<i> / TEST_<i>, then show the first training example.
# NOTE: x_train and x_test are rebound here, so re-running this cell would
# wrap the already-labelled objects a second time.
x_train = labelizeTweets(x_train, 'TRAIN')
x_test = labelizeTweets(x_test, 'TEST')
print(x_train[0])

  """
838860it [00:08, 96070.35it/s] 
209715it [00:02, 77973.02it/s]


LabeledSentence(['i', 'stood', 'in', 'it'], ['TRAIN_0'])


In [7]:
#Building word2vec of 200 Dimension
n_dim=200
# Collect the training token lists once and reuse them for both the
# vocabulary scan and the training pass instead of rebuilding the list twice.
train_sentences = [x.words for x in tqdm(x_train)]
tweet_w2v = Word2Vec(size=n_dim, min_count=10)
tweet_w2v.build_vocab(train_sentences)
tweet_w2v.train(train_sentences, total_examples=tweet_w2v.corpus_count, epochs=tweet_w2v.iter)
#test built word2vec model
# Word vectors live on the model's KeyedVectors object (`.wv`); indexing the
# model directly and calling model.most_similar are deprecated and emit a
# DeprecationWarning.
print(tweet_w2v.wv['good'])
print(tweet_w2v.wv.most_similar('good'))

100%|█████████████████████████████████████████████████████████████████████| 838860/838860 [00:00<00:00, 1162560.98it/s]
100%|█████████████████████████████████████████████████████████████████████| 838860/838860 [00:00<00:00, 1169051.23it/s]
  after removing the cwd from sys.path.
  


[ 1.00661612  1.60322595 -0.0285238  -0.29310158  2.27271032  0.23961651
 -0.85697293 -0.78719872  0.1139839   1.61173224 -0.78645813  0.17944191
 -2.2367816  -0.99291098  1.42722046 -0.08854914  0.50589973  1.30507839
  0.64027238  1.20982063  1.66881382  0.17526485 -1.12502837  0.5388692
 -0.50227892  0.95561224 -1.06664431 -1.23184681 -1.95974958  1.59780836
 -1.23589766 -0.41052991  1.28144324 -1.20835114 -3.21540833 -1.03669298
 -1.35284543  0.1136034  -2.06808567 -0.92073929  2.38989377 -0.32028002
 -0.34214631 -0.60228592  0.66093701 -1.2007153  -1.37734532 -0.71835512
  0.4950093   1.17118609  0.14439274  0.97243285  1.53685832  0.26616859
  0.93904871 -0.268231   -0.32518822 -0.96176201 -1.3345896  -1.07238901
  1.33539546 -0.45341572  2.51974535  1.22182751 -0.52450651  0.55051321
  0.08234909 -0.38503018 -1.8710115   0.99669969  1.24302304 -1.09327352
 -0.51814073  0.54013836 -0.10833254 -0.8102088  -2.18693709  0.84348094
  0.32521605  0.15980409  1.76375842  2.39031863  0.

  import sys


[('goood', 0.7170765399932861), ('great', 0.700944721698761), ('pleasant', 0.6300460696220398), ('tough', 0.6253600120544434), ('nice', 0.621490478515625), ('fantastic', 0.6195719838142395), ('gd', 0.6188822984695435), ('rough', 0.618777871131897), ('gud', 0.6020275354385376), ('brilliant', 0.6000094413757324)]


In [8]:
print('building tf-idf matrix ...')
# Build a word -> IDF weight mapping from the training tweets.
# The tweets are already token lists, so the analyzer passes each one through
# unchanged; min_df=10 is the vectorizer's minimum document frequency setting.
vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=10)
matrix = vectorizer.fit_transform([x.words for x in x_train])
# Pair every vocabulary term with its learned IDF value.
tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
print('vocab size :', len(tfidf))



building tf-idf matrix ...
vocab size : 23938


In [9]:
def buildWordVector(tokens, size):
    """Build one tf-idf weighted average word vector for a tweet.

    Each token's word2vec vector is multiplied by the token's IDF weight;
    the weighted vectors are summed and divided by the number of tokens that
    contributed.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one tweet.
    size : int
        Dimensionality of the word vectors.

    Returns
    -------
    numpy.ndarray
        Array of shape (1, size). All zeros when no token is present in both
        the word2vec vocabulary and the tf-idf mapping.
    """
    vec = np.zeros((1, size))
    count = 0.
    for word in tokens:
        try:
            # Look the vector up on the KeyedVectors object (`.wv`); indexing
            # the model itself is deprecated and warns on every call.
            vec += tweet_w2v.wv[word].reshape((1, size)) * tfidf[word]
            count += 1.
        except KeyError:
            # The token is missing from the word2vec vocabulary or from the
            # tf-idf mapping (e.g. unseen test words); skip it.
            continue
    if count != 0:
        vec /= count
    return vec



In [10]:
#Turn every tweet into one tf-idf weighted average word vector, then standardize the features
from sklearn.preprocessing import StandardScaler, scale
print('building train combines word_vectors with tf-idf ...')
train_vecs_w2v = np.concatenate([buildWordVector(z.words, n_dim) for z in tqdm(x_train)])
# Fit the scaler on the training vectors only and reuse it for the test set.
# Standardizing the test set with its own mean/std would put train and test
# features on different scales.
scaler = StandardScaler().fit(train_vecs_w2v)
train_vecs_w2v = scaler.transform(train_vecs_w2v)
print('train_vecs_w2v shape', train_vecs_w2v.shape)
print('building test combines word_vectors with tf-idf ...')
test_vecs_w2v = np.concatenate([buildWordVector(z.words, n_dim) for z in tqdm(x_test)])
test_vecs_w2v = scaler.transform(test_vecs_w2v)
print('test_vecs_w2v shape', test_vecs_w2v.shape)


building train combines word_vectors with tf-idf ...


  
838860it [03:39, 3817.58it/s]


train_vecs_w2v shape (838860, 200)
building test combines word_vectors with tf-idf ...


209715it [00:56, 3699.03it/s]


test_vecs_w2v shape (209715, 200)


In [11]:
from keras.models import Sequential
from keras.layers import Dropout
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Dense, Input,  Flatten
from keras.layers import Conv1D, MaxPooling1D, AveragePooling1D, Embedding
from keras.callbacks import ModelCheckpoint

print('begin to train DNN model for sentiment analysis...')
# Feed-forward classifier: one hidden ReLU layer of 64 units followed by a
# single sigmoid output unit for the binary sentiment label.
model = Sequential([
    Dense(64, activation='relu', input_dim=n_dim),
    Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

# Train on the standardized tweet vectors.
model.fit(train_vecs_w2v, y_train, batch_size=256, epochs=100, verbose=2)

print('Evaluate trained model on test dataset...')
# evaluate() returns [loss, accuracy] for the metrics compiled above.
score = model.evaluate(test_vecs_w2v, y_test, verbose=2, batch_size=256)
print('Accuracy: ', score[1])

begin to train DNN model for sentiment analysis...
Epoch 1/100
 - 11s - loss: 0.3795 - acc: 0.8348
Epoch 2/100
 - 10s - loss: 0.3603 - acc: 0.8441
Epoch 3/100
 - 10s - loss: 0.3553 - acc: 0.8468
Epoch 4/100
 - 10s - loss: 0.3522 - acc: 0.8479
Epoch 5/100
 - 11s - loss: 0.3498 - acc: 0.8490
Epoch 6/100
 - 10s - loss: 0.3481 - acc: 0.8499
Epoch 7/100
 - 10s - loss: 0.3467 - acc: 0.8505
Epoch 8/100
 - 10s - loss: 0.3456 - acc: 0.8511
Epoch 9/100
 - 10s - loss: 0.3446 - acc: 0.8515
Epoch 10/100
 - 10s - loss: 0.3437 - acc: 0.8517
Epoch 11/100
 - 11s - loss: 0.3429 - acc: 0.8522
Epoch 12/100
 - 11s - loss: 0.3423 - acc: 0.8527
Epoch 13/100
 - 11s - loss: 0.3417 - acc: 0.8528
Epoch 14/100
 - 11s - loss: 0.3412 - acc: 0.8532
Epoch 15/100
 - 10s - loss: 0.3408 - acc: 0.8534
Epoch 16/100
 - 11s - loss: 0.3403 - acc: 0.8535
Epoch 17/100
 - 10s - loss: 0.3398 - acc: 0.8538
Epoch 18/100
 - 10s - loss: 0.3395 - acc: 0.8540
Epoch 19/100
 - 11s - loss: 0.3392 - acc: 0.8539
Epoch 20/100
 - 11s - loss: