In [1]:
#read input
f=open('Dataset/sentiment.txt','r')
lines=f.readlines()
f.close()
print (len(lines))


1143


In [2]:
import re
import numpy as np

stopwords = ["a", "about", "above", "above", "across", "after", "afterwards", "again", "against", "all", "almost", "alone", "along", "already", "also","although","always","am","among", "amongst", "amoungst", "amount",  "an", "and", "another", "any","anyhow","anyone","anything","anyway", "anywhere", "are", "around", "as",  "at", "back","be","became", "because","become","becomes", "becoming", "been", "before", "beforehand", "behind", "being", "below", "beside", "besides", "between", "beyond", "bill", "both", "bottom","but", "by", "call", "can", "cannot", "cant", "co", "con", "could", "couldnt", "cry", "de", "describe", "detail", "do", "done", "down", "due", "during", "each", "eg", "eight", "either", "eleven","else", "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone", "everything", "everywhere", "except", "few", "fifteen", "fify", "fill", "find", "fire", "first", "five", "for", "former", "formerly", "forty", "found", "four", "from", "front", "full", "further", "get", "give", "go", "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his", "how", "however", "hundred", "ie", "if", "in", "inc", "indeed", "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter", "latterly", "least", "less", "ltd", "made", "many", "may", "me", "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly", "move", "much", "must", "my", "myself", "name", "namely", "neither", "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone", "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over", "own","part", "per", "perhaps", "please", "put", "rather", "re", "same", "see", "seem", "seemed", "seeming", "seems", "serious", "several", "she", "should", "show", "side", "since", "sincere", "six", "sixty", "so", "some", "somehow", "someone", "something", "sometime", "sometimes", "somewhere", "still", "such", "system", "take", "ten", "than", "that", "the", "their", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein", "thereupon", "these", "they", "thickv", "thin", "third", "this", "those", "though", "three", "through", "throughout", "thru", "thus", "to", "together", "too", "top", "toward", "towards", "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us", "very", "via", "was", "we", "well", "were", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who", "whoever", "whole", "whom", "whose", "why", "will", "with", "within", "without", "would", "yet", "you", "your", "yours", "yourself", "yourselves", "the"]

def text_cleaner(text): 
    text=remove_link(text.lower())
    long_words=[]
    for i in text.split():
        if i not in stopwords:                  
            long_words.append(i)
    return long_words

def remove_link(text):
    regex = r'https?://[^\s<>)"‘’]+'
    match = re.sub(regex,' ', text)
    regex = r'https?:|urls?|[/\:,-."\'?!;…]+'
    tweet = re.sub(regex,' ', match)
    tweet = re.sub("[^a-zA-Z_]", " ", tweet)
    tweet = re.sub("[ ]+", " ", tweet) 
    return tweet


In [3]:
#divide the data into train set and test set
import random
random.seed(1)
random.shuffle(lines) #shuffle the dataset before dividing it into train and test set
split_size = int(0.8*len(lines)) #use 80% of total data as train set and 20% as test set
train_lines = lines[:split_size]
test_lines = lines[split_size:]

print ("Training set size : ", len(train_lines))
print ("Test set size : ", len(test_lines))


Training set size :  914
Test set size :  229


In [4]:
#convert string tokens to integers
#create a vocabulary set and assign a unique id to each word in the vocabulary

#load all unique vocabulary
vocab = []
maxlen = []
for l in train_lines:
    s = l.strip().split('\t')
    words = text_cleaner(s[0].strip())
    vocab += words
    maxlen.append(len(words))
vocab = list(set(vocab))
print ("Vocabulary size : ", len(vocab))

#assign unique id to each vocabulary
word2id = dict()
for i,v in enumerate(vocab,1):
    word2id[v] = i
word2id['PAD'] = 0 #special token to take care of unseen words in the test set
maxlen = max(maxlen)
print ("Max sentence length : ",maxlen)


Vocabulary size :  4012
Max sentence length :  19


In [5]:
#Prepare train and test set
#Convert strings to integers
#prepare train and test set

#prepare train set
import numpy as np
train_X = []
train_Y = []
for l in train_lines:
    s = l.strip().split('\t')
    text = text_cleaner(s[0])
    label = int(s[1].strip())
    temp_x = [word2id[x] for x in text]
    temp_x += [0]* (maxlen-len(temp_x)) #convert all input to equal size to enable training in batches
    temp_y = [0]*2
    temp_y[label] = 1
    train_X.append(temp_x)
    train_Y.append(temp_y)
print (len(train_X))
print (len(train_Y))
print (train_X[0])
print (train_Y[0])
train_X = np.array(train_X)
train_Y = np.array(train_Y)


914
914
[1618, 190, 1232, 3502, 254, 1111, 1111, 776, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 0]


In [6]:
train_Y.shape

(914, 2)

In [7]:
#prepare test set
test_X = []
test_Y = []

for l in test_lines:
    s = l.strip().split('\t')
    text = text_cleaner(s[0].strip())
    label = int(s[1].strip())
    temp_x = [word2id[x] if x in word2id else 0 for x in text ][:maxlen]
    temp_x += [0]* (maxlen-len(temp_x)) #convert all input to equal size to enable training in batches
    temp_y = [0]*2
    temp_y[label] = 1
    test_X.append(temp_x)
    test_Y.append(temp_y)
print (len(test_X))
print (len(test_Y))
print (test_X[0])
print (test_Y[0])
test_X = np.array(test_X)
test_Y = np.array(test_Y)

229
229
[3821, 878, 3871, 1615, 0, 0, 3940, 944, 890, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 0]


In [8]:
len(train_X[0])

19

In [9]:
from keras.models import Sequential #Sequential is the class in Keras library that defines a model comprising of linear stack of models
from keras.models import Model
from keras.layers import Input, Dense, Dropout, Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers import LSTM, Bidirectional,SimpleRNN

input_tweet=len(train_X[0])
input_node=Input(shape=(input_tweet,))
emb=60
lstmnode=64

Using TensorFlow backend.


In [10]:
def CNN(input_node,max_len):
    encode=Embedding(len(vocab)+2, emb, input_length=max_len, trainable=True)(input_node)    #,weights=[vectors],trainable=False
    encode=Conv1D(128,
                         3,
                         padding='same',
                         activation='relu',
                         strides=1)(encode)
    encode=GlobalMaxPooling1D()(encode)
    return encode

cnn_out=CNN(input_node,input_tweet)
senti_class=Dense(2,activation='softmax')(cnn_out)
model=Model(input_node,senti_class)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(train_X, train_Y,
          batch_size=512,
          epochs=10)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x7fdb82502f90>

In [11]:
res = model.evaluate(test_X,test_Y)
res



[0.6339120339097934, 0.6462882161140442]

In [12]:
p = model.predict(test_X)
for i,l in enumerate(test_lines[:5]):
  print (l.strip(),p[i])

We blame cities for the majority of CO2 emissions without acknowledging their vulnerability to #CFCC15 #journey2015 #S2228 #SemST	0 [0.7098524  0.29014763]
Feminists who go for a gender studies degree should also blame the patriarchy for their mediocre grades in science. #SemST	0 [0.7279925  0.27200747]
Just wrote my blog to help @CalAlimony pass a vital law that ends #alimony. Posting soon. #Divorce #leanin #SemST	1 [0.7111126  0.28888738]
RT @JohnFugelsang: They should just make the GOP primaries a reality game show called "Who Wants To Get Beat Up By A Girl? #SemST	0 [0.72295177 0.2770483 ]
It's incredibly easy to identify shitty females with a poor view on the world and what's important thanks to #SemST	0 [0.71841705 0.28158298]


In [13]:
def RNN(input_node,max_len):
    encode=Embedding(len(vocab)+2, emb, input_length=max_len, trainable=True)(input_node)   
    encoder_RNN=SimpleRNN(lstmnode,activation='tanh')(encode)
    return encoder_RNN

rnn_out=RNN(input_node,input_tweet)
senti_class=Dense(2,activation='softmax')(rnn_out)
model=Model(input_node,senti_class)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(train_X, train_Y,
          batch_size=512,
          epochs=10)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x7fdb707d7190>

In [14]:
def f_LSTM(input_node,max_len):
    encode=Embedding(len(vocab)+2, emb, input_length=max_len, trainable=True)(input_node)   
    encoder_LSTM=LSTM(lstmnode,activation='tanh')(encode)
    return encoder_LSTM

LSTM_out=f_LSTM(input_node,input_tweet)
senti_class=Dense(2,activation='softmax')(LSTM_out)
model=Model(input_node,senti_class)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(train_X, train_Y,
          batch_size=512,
          epochs=10)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x7fdb701df090>

In [15]:
res = model.evaluate(test_X,test_Y)
res



[0.6610513730340649, 0.6462882161140442]

In [16]:
p = model.predict(test_X)
for i,l in enumerate(test_lines[:5]):
  print (l.strip(),p[i])

We blame cities for the majority of CO2 emissions without acknowledging their vulnerability to #CFCC15 #journey2015 #S2228 #SemST	0 [0.7246813  0.27531865]
Feminists who go for a gender studies degree should also blame the patriarchy for their mediocre grades in science. #SemST	0 [0.7384692 0.2615308]
Just wrote my blog to help @CalAlimony pass a vital law that ends #alimony. Posting soon. #Divorce #leanin #SemST	1 [0.7358404 0.2641597]
RT @JohnFugelsang: They should just make the GOP primaries a reality game show called "Who Wants To Get Beat Up By A Girl? #SemST	0 [0.7208876  0.27911237]
It's incredibly easy to identify shitty females with a poor view on the world and what's important thanks to #SemST	0 [0.7100063  0.28999373]


In [17]:
def BiLSTM(input_node,max_len):
    encode=Embedding(len(vocab)+2, emb, input_length=max_len, trainable=True)(input_node)   
    encoder_BiLSTM=Bidirectional(LSTM(lstmnode,activation='tanh'))(encode)
    return encoder_BiLSTM

BiLSTM_out=BiLSTM(input_node,input_tweet)
senti_class=Dense(2,activation='softmax')(BiLSTM_out)
model=Model(input_node,senti_class)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(train_X, train_Y,
          batch_size=512,
          epochs=10)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x7fdb584fbd90>

In [18]:
res = model.evaluate(test_X,test_Y)
res



[0.6337413311525203, 0.6462882161140442]

In [19]:
p = model.predict(test_X)
for i,l in enumerate(test_lines[:5]):
  print (l.strip(),p[i])

We blame cities for the majority of CO2 emissions without acknowledging their vulnerability to #CFCC15 #journey2015 #S2228 #SemST	0 [0.71733654 0.28266352]
Feminists who go for a gender studies degree should also blame the patriarchy for their mediocre grades in science. #SemST	0 [0.7383245  0.26167548]
Just wrote my blog to help @CalAlimony pass a vital law that ends #alimony. Posting soon. #Divorce #leanin #SemST	1 [0.6997612 0.3002388]
RT @JohnFugelsang: They should just make the GOP primaries a reality game show called "Who Wants To Get Beat Up By A Girl? #SemST	0 [0.7068976  0.29310232]
It's incredibly easy to identify shitty females with a poor view on the world and what's important thanks to #SemST	0 [0.70399135 0.2960087 ]
