In [47]:
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np
from copy import deepcopy
from string import punctuation
from random import shuffle
import pickle
import h5py
import json
import matplotlib.pyplot as plt 

import gensim
from gensim.models.word2vec import Word2Vec

from tqdm import tqdm
tqdm.pandas(desc="progress-bar")

from nltk.tokenize import TweetTokenizer # a tweet tokenizer from nltk.
from nltk import word_tokenize

from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Flatten
from keras.constraints import maxnorm
from keras.optimizers import SGD
from keras.layers.convolutional import Conv2D
from keras.layers.convolutional import MaxPooling2D
from keras.utils import np_utils
from keras.callbacks import Callback
from keras.models import model_from_json

# importing bokeh library for interactive dataviz
import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook

  from pandas import Panel


In [73]:
# Combining all Data

import glob

path = r'Data/downloaded' # use your path
all_files = glob.glob(path + "/*.tsv")

li = []

for filename in all_files:
    df = pd.read_csv(filename, index_col=None,sep='\t', encoding='latin-1', header=None, error_bad_lines=False)
    li.append(df)

data = pd.concat(li, axis=0, ignore_index=True)

data.columns=["ItemID","Sentiment","SentimentText","NaN"]
data.drop(['NaN'], axis=1, inplace=True)

data.to_csv('Combined-Twitter-Data.csv', sep='\t', encoding='utf-8', index=False)

In [74]:
data.head(10)

Unnamed: 0,ItemID,Sentiment,SentimentText
0,638060586258038784,neutral,05 Beat it - Michael Jackson - Thriller (25th ...
1,638061181823922176,positive,Jay Z joins Instagram with nostalgic tribute t...
2,638083821364244480,neutral,Michael Jackson: Bad 25th Anniversary Edition ...
3,638091450132078593,positive,I liked a @YouTube video http://t.co/AaR3pjp2P...
4,638125563790557184,positive,18th anniv of Princess Diana's death. I still ...
5,638130776727535617,positive,@oridaganjazz The 1st time I heard Michael Jac...
6,638134980862828544,neutral,'Michael Jackson' appeared on Saturday 29 at t...
7,638156605448695808,positive,Are you old enough to remember Michael Jackson...
8,638162155250954241,negative,@etbowser do u enjoy his 2nd rate Michael Jack...
9,638163324752891908,neutral,The Weeknd is the closest thing we may get to ...


In [48]:
# Loading Data

def ingest():
    data = pd.read_csv('./Data/Combined-Twitter-Data.csv', sep='\t', encoding='latin-1')
    data.columns=["ItemID","Sentiment","SentimentText"]
    data.drop(['ItemID'], axis=1, inplace=True)
    data.columns=["Sentiment","SentimentText"]
    mapping = {'neutral': 0, 'positive': 1, 'negative': -1} # convert strings into intergers as 1,0,-1
    data = data.replace({'Sentiment': mapping})
    data = data[data.Sentiment.isnull() == False]
    data = data[data['SentimentText'].isnull() == False]
    data.reset_index(inplace=True)
    data.drop('index', axis=1, inplace=True)
    print ('dataset loaded with shape', data.shape)  
    return data

data = ingest()

dataset loaded with shape (50132, 2)


In [49]:
data.head()

Unnamed: 0,Sentiment,SentimentText
0,0,05 Beat it - Michael Jackson - Thriller (25th ...
1,1,Jay Z joins Instagram with nostalgic tribute t...
2,0,Michael Jackson: Bad 25th Anniversary Edition ...
3,1,I liked a @YouTube video http://t.co/AaR3pjp2P...
4,1,18th anniv of Princess Diana's death. I still ...


In [50]:
# Processing Data

tokenizer = TweetTokenizer()
def tokenize(tweet):
    try:
        tweet = tweet.lower()
        tokens = tokenizer.tokenize(tweet)
        tokens = list(filter(lambda t: not t.startswith('@'), tokens))
        tokens = list(filter(lambda t: not t.startswith('#'), tokens))
        tokens = list(filter(lambda t: not t.startswith('http'), tokens))
        return tokens
    except:
        return 'NC'
    
def postprocess(data):
    data['tokens'] = data['SentimentText'].progress_map(tokenize)  ## progress_map is a variant of the map function plus a progress bar. Handy to monitor DataFrame creations.
    data = data[data.tokens != 'NC']
    data.reset_index(inplace=True)
    data.drop('index', inplace=True, axis=1)
    return data

data = postprocess(data)

progress-bar: 100%|██████████| 50132/50132 [00:03<00:00, 13897.00it/s]


In [51]:
# Labelise Data

LabeledSentence = gensim.models.doc2vec.LabeledSentence # we'll talk about this down below

def labelizeTweets(tweets, label_type):
    labelized = []
    for i,v in tqdm(enumerate(tweets)):
        label = '%s_%s'%(label_type,i)
        labelized.append(LabeledSentence(v, [label]))
    return labelized

In [52]:
#Splitting for Training and Testing

x_train, x_test, y_train, y_test = train_test_split(np.array(data.head(1000000).tokens),
                                                    np.array(data.head(1000000).Sentiment), test_size=0.2)

x_train = labelizeTweets(x_train, 'TRAIN')
x_test = labelizeTweets(x_test, 'TEST')

data_labellised= labelizeTweets(np.array(data.tokens), 'data')

  if __name__ == '__main__':
40105it [00:00, 143437.47it/s]
10027it [00:00, 250649.84it/s]
50132it [00:00, 155870.51it/s]


In [53]:
# Builidng Word2Vec Vocabulary and Training

n=1000000
n_dim = 200
tweet_w2v = Word2Vec(size=n_dim, min_count=10)
tweet_w2v.build_vocab([x.words for x in tqdm(data_labellised)])

100%|██████████| 50132/50132 [00:00<00:00, 2263218.57it/s]


In [54]:
tweet_w2v.train([x.words for x in tqdm(data_labellised)],total_examples=tweet_w2v.corpus_count, epochs=tweet_w2v.iter)

100%|██████████| 50132/50132 [00:00<00:00, 2732432.11it/s]
  """Entry point for launching an IPython kernel.


(3792331, 5425965)

In [55]:
#Save the w2v Model
tweet_w2v.save('w2vmodel')

#Load the w2v Model
new_w2vmodel = gensim.models.Word2Vec.load('w2vmodel')

In [56]:
# Plotting the Vectors

# defining the chart
output_notebook()
plot_tfidf = bp.figure(plot_width=700, plot_height=600, title="A map of 10000 word vectors",
                        tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
                        x_axis_type=None, y_axis_type=None, min_border=1)

# getting a list of word vectors. limit to 10000. each is of 200 dimensions
word_vectors = [tweet_w2v[w] for w in list(tweet_w2v.wv.vocab.keys())[:5000]]

# dimensionality reduction. converting the vectors to 2d vectors
from sklearn.manifold import TSNE
tsne_model = TSNE(n_components=2, verbose=1, random_state=0)
tsne_w2v = tsne_model.fit_transform(word_vectors)

# putting everything in a dataframe
tsne_df = pd.DataFrame(tsne_w2v, columns=['x', 'y'])
tsne_df['words'] = list(tweet_w2v.wv.vocab.keys())[:5000]

# plotting. the corresponding word appears when you hover on the data point.
plot_tfidf.scatter(x='x', y='y', source=tsne_df)
hover = plot_tfidf.select(dict(type=HoverTool))
hover.tooltips={"word": "@words"}
show(plot_tfidf)

  # Remove the CWD from sys.path while we load stuff.


[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 5000 samples in 0.026s...
[t-SNE] Computed neighbors for 5000 samples in 9.259s...
[t-SNE] Computed conditional probabilities for sample 1000 / 5000
[t-SNE] Computed conditional probabilities for sample 2000 / 5000
[t-SNE] Computed conditional probabilities for sample 3000 / 5000
[t-SNE] Computed conditional probabilities for sample 4000 / 5000
[t-SNE] Computed conditional probabilities for sample 5000 / 5000
[t-SNE] Mean sigma: 0.177818
[t-SNE] KL divergence after 250 iterations with early exaggeration: 83.090424
[t-SNE] KL divergence after 1000 iterations: 2.251121


In [57]:
# TF-IDF matrix of data

print ('building tf-idf matrix ...')
vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=10)
matrix = vectorizer.fit_transform([x.words for x in data_labellised])
tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
print ('vocab size :', len(tfidf))

building tf-idf matrix ...
vocab size : 6725


In [58]:
#Save the tfidf 
with open("tfidfdict.txt", "wb") as myFile:
    pickle.dump(tfidf, myFile)

with open("tfidfdict.txt", "rb") as myFile:
    tfidf = pickle.load(myFile)

In [59]:
# Build tweet vector to give input to FFNN

def buildWordVector(tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in tokens:
        try:
            vec += tweet_w2v[word].reshape((1, size)) * tfidf[word] #combining w2v vectors with tfidf value of words in the tweet.
            count += 1.
        except KeyError: # handling the case where the token is not
                         # in the corpus. useful for testing.
            continue
    if count != 0:
        vec /= count
    return vec

In [60]:
train_vecs_w2v = np.concatenate([buildWordVector(z, n_dim) for z in tqdm(map(lambda x: x.words, x_train))])
train_vecs_w2v = scale(train_vecs_w2v)

test_vecs_w2v = np.concatenate([buildWordVector(z, n_dim) for z in tqdm(map(lambda x: x.words, x_test))])
test_vecs_w2v = scale(test_vecs_w2v)

  
40105it [00:08, 4652.26it/s]
  
10027it [00:02, 4444.17it/s]


In [61]:
# Training 3 layered FFNN

model = Sequential()
model.add(Dense(32, activation='relu', input_dim=200))
model.add(Dense(16, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.fit(train_vecs_w2v, y_train, epochs=100, batch_size=10000, verbose=2)

Epoch 1/100
 - 0s - loss: 0.5520 - accuracy: 0.4866
Epoch 2/100
 - 0s - loss: 0.4149 - accuracy: 0.5127
Epoch 3/100
 - 0s - loss: 0.3488 - accuracy: 0.5187
Epoch 4/100
 - 0s - loss: 0.3034 - accuracy: 0.5255
Epoch 5/100
 - 0s - loss: 0.2504 - accuracy: 0.5304
Epoch 6/100
 - 0s - loss: 0.2104 - accuracy: 0.5390
Epoch 7/100
 - 0s - loss: 0.1491 - accuracy: 0.5420
Epoch 8/100
 - 0s - loss: 0.0926 - accuracy: 0.5469
Epoch 9/100
 - 0s - loss: 0.0230 - accuracy: 0.5458
Epoch 10/100
 - 0s - loss: -2.0365e-02 - accuracy: 0.5513
Epoch 11/100
 - 0s - loss: -8.8434e-02 - accuracy: 0.5539
Epoch 12/100
 - 0s - loss: -1.6506e-01 - accuracy: 0.5563
Epoch 13/100
 - 0s - loss: -2.6402e-01 - accuracy: 0.5572
Epoch 14/100
 - 0s - loss: -3.5190e-01 - accuracy: 0.5583
Epoch 15/100
 - 0s - loss: -4.1093e-01 - accuracy: 0.5596
Epoch 16/100
 - 0s - loss: -5.0949e-01 - accuracy: 0.5591
Epoch 17/100
 - 0s - loss: -6.2698e-01 - accuracy: 0.5594
Epoch 18/100
 - 0s - loss: -7.2909e-01 - accuracy: 0.5617
Epoch 19/1

<keras.callbacks.callbacks.History at 0x1a36c3bb50>

In [62]:
# Evaluating accuracy score

score = model.evaluate(test_vecs_w2v, y_test, batch_size=128, verbose=2)
print(model.metrics_names[0],": ",score[0],"\n",model.metrics_names[1],": ",score[1])

loss :  -86.91042879598619 
 accuracy :  0.5304677486419678


In [63]:
# Saving model

#Saving the model
model_json = model.to_json() # serialize model to JSON
with open("model.json", "w") as json_file:
    json_file.write(model_json)

model.save_weights("smodel.h5") # serialize weights to HDF5
print("Saved model to disk")

Saved model to disk


In [64]:
# Loading the model

newmodel = model_from_json(open('model.json').read())
newmodel.load_weights('smodel.h5')

In [65]:
# Predicting for test file (Validation)

def ingesttest():
    testdata = pd.read_csv('./Data/gold/SemEval2017-task4-test.subtask-A.english.txt', sep='\t', encoding='latin-1')
    testdata.columns=["ItemID","Sentiment","SentimentText"]
    testdata.drop(['ItemID'], axis=1, inplace=True)
    testdata.columns=["Sentiment","SentimentText"]
    mapping = {'neutral': 0, 'positive': 1, 'negative': -1}
    testdata = testdata.replace({'Sentiment': mapping})
    testdata = testdata[testdata.Sentiment.isnull() == False]
    testdata = testdata[testdata['SentimentText'].isnull() == False]
    testdata.reset_index(inplace=True)
    testdata.drop('index', axis=1, inplace=True)
    print ('dataset loaded with shape', testdata.shape  )  
    return testdata

testdata = ingesttest()

dataset loaded with shape (11905, 2)


In [66]:
testdata = postprocess(testdata)
testdata.head(5)

progress-bar: 100%|██████████| 11905/11905 [00:00<00:00, 17095.19it/s]


Unnamed: 0,Sentiment,SentimentText,tokens
0,1,Ariana Grande KIIS FM Yours Truly CD listening...,"[ariana, grande, kiis, fm, yours, truly, cd, l..."
1,1,Ariana Grande White House Easter Egg Roll in W...,"[ariana, grande, white, house, easter, egg, ro..."
2,1,#CD #Musics Ariana Grande Sweet Like Candy 3.4...,"[ariana, grande, sweet, like, candy, 3.4, oz, ..."
3,0,SIDE TO SIDE ð @arianagrande #sidetoside #a...,"[side, to, side, ð, , , , , , , , ¦]"
4,1,Hairspray Live! Previews at the Macy's Thanksg...,"[hairspray, live, !, previews, at, the, macy's..."


In [67]:
test_X=np.array(testdata.tokens)
test_y=np.array(testdata.Sentiment)

In [68]:
test_w2v_vecs = np.concatenate([buildWordVector(z, n_dim) for z in tqdm(map(lambda x:x, test_X))])

  
11905it [00:01, 6750.17it/s]


In [69]:
test_w2v_vecs.shape

(11905, 200)

In [70]:
# Model Prediction 

score = model.evaluate(test_w2v_vecs,test_y, batch_size=128, verbose=2)
print(model.metrics_names[0],": ",score[0],"\n",model.metrics_names[1],": ",score[1])

loss :  -285.31915347653245 
 accuracy :  0.5002099871635437
