# Tutorial on building Deep learning models using Keras toolkit

In [1]:
#read input
f=open('Dataset/sentiment.txt','r')
lines=f.readlines()
f.close()
print (len(lines))


1143


In [2]:
import re
import numpy as np

stopwords = ["a", "about", "above", "above", "across", "after", "afterwards", "again", "against", "all", "almost", "alone", "along", "already", "also","although","always","am","among", "amongst", "amoungst", "amount",  "an", "and", "another", "any","anyhow","anyone","anything","anyway", "anywhere", "are", "around", "as",  "at", "back","be","became", "because","become","becomes", "becoming", "been", "before", "beforehand", "behind", "being", "below", "beside", "besides", "between", "beyond", "bill", "both", "bottom","but", "by", "call", "can", "cannot", "cant", "co", "con", "could", "couldnt", "cry", "de", "describe", "detail", "do", "done", "down", "due", "during", "each", "eg", "eight", "either", "eleven","else", "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone", "everything", "everywhere", "except", "few", "fifteen", "fify", "fill", "find", "fire", "first", "five", "for", "former", "formerly", "forty", "found", "four", "from", "front", "full", "further", "get", "give", "go", "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his", "how", "however", "hundred", "ie", "if", "in", "inc", "indeed", "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter", "latterly", "least", "less", "ltd", "made", "many", "may", "me", "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly", "move", "much", "must", "my", "myself", "name", "namely", "neither", "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone", "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over", "own","part", "per", "perhaps", "please", "put", "rather", "re", "same", "see", "seem", "seemed", "seeming", "seems", "serious", "several", "she", "should", "show", "side", "since", "sincere", "six", "sixty", "so", "some", "somehow", "someone", "something", "sometime", "sometimes", "somewhere", "still", "such", "system", "take", "ten", "than", "that", "the", "their", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein", "thereupon", "these", "they", "thickv", "thin", "third", "this", "those", "though", "three", "through", "throughout", "thru", "thus", "to", "together", "too", "top", "toward", "towards", "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us", "very", "via", "was", "we", "well", "were", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who", "whoever", "whole", "whom", "whose", "why", "will", "with", "within", "without", "would", "yet", "you", "your", "yours", "yourself", "yourselves", "the"]

def text_cleaner(text): 
    text=remove_link(text.lower())
    long_words=[]
    for i in text.split():
        if i not in stopwords:                  
            long_words.append(i)
    return long_words

def remove_link(text):
    regex = r'https?://[^\s<>)"‘’]+'
    match = re.sub(regex,' ', text)
    regex = r'https?:|urls?|[/\:,-."\'?!;…]+'
    tweet = re.sub(regex,' ', match)
    tweet = re.sub("[^a-zA-Z_]", " ", tweet)
    tweet = re.sub("[ ]+", " ", tweet) 
    return tweet


**Cross validation**

In [3]:
#divide the data into train set and test set
import random
random.seed(1)
random.shuffle(lines) #shuffle the dataset before dividing it into train and test set
split_size = int(0.8*len(lines)) #use 80% of total data as train set and 20% as test set
train_lines = lines[:split_size]
test_lines = lines[split_size:]

print ("Training set size : ", len(train_lines))
print ("Test set size : ", len(test_lines))


Training set size :  914
Test set size :  229


In [4]:
#convert string tokens to integers
#create a vocabulary set and assign a unique id to each word in the vocabulary

#load all unique vocabulary
vocab = []
maxlen = []
for l in train_lines:
    s = l.strip().split('\t')
    words = text_cleaner(s[0].strip())
    vocab += words
    maxlen.append(len(words))
vocab = list(set(vocab))
print ("Vocabulary size : ", len(vocab))

#assign unique id to each vocabulary
word2id = dict()
for i,v in enumerate(vocab,1):
    word2id[v] = i
word2id['PAD'] = 0 #special token to take care of unseen words in the test set
maxlen = max(maxlen)
print ("Max sentence length : ",maxlen)


Vocabulary size :  4012
Max sentence length :  19


In [5]:
#Prepare train and test set
#Convert strings to integers
#prepare train and test set

#prepare train set
import numpy as np
train_X = []
train_Y = []
for l in train_lines:
    s = l.strip().split('\t')
    text = text_cleaner(s[0])
    label = int(s[1].strip())
    temp_x = [word2id[x] for x in text]
    temp_x += [0]* (maxlen-len(temp_x)) #convert all input to equal size to enable training in batches
    temp_y = [0]*2
    temp_y[label] = 1
    train_X.append(temp_x)
    train_Y.append(temp_y)
print (len(train_X))
print (len(train_Y))
print (train_X[0])
print (train_Y[0])
train_X = np.array(train_X)
train_Y = np.array(train_Y) 

914
914
[3218, 3130, 3341, 1089, 423, 2087, 2087, 3025, 1384, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 0]


In [6]:
#prepare test set
test_X = []
test_Y = []

for l in test_lines:
    s = l.strip().split('\t')
    text = text_cleaner(s[0].strip())
    label = int(s[1].strip())
    temp_x = [word2id[x] if x in word2id else 0 for x in text ][:maxlen]
    temp_x += [0]* (maxlen-len(temp_x)) #convert all input to equal size to enable training in batches
    temp_y = [0]*2
    temp_y[label] = 1
    test_X.append(temp_x)
    test_Y.append(temp_y)
print (len(test_X))
print (len(test_Y))
print (test_X[0])
print (test_Y[0])
test_X = np.array(test_X)
test_Y = np.array(test_Y)

229
229
[1060, 3396, 2791, 3651, 0, 0, 837, 28, 2134, 1384, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 0]


In [7]:
from keras.models import Sequential #Sequential is the class in Keras library that defines a model comprising of linear stack of models
from keras.layers import Dense
##initialize a sequential model
model = Sequential() 

#Add layers to the sequential model
#For the first hidden layer, input_dim must be mentioned which is the dimension of the input that the hidden layer receives from the input layer
#The below line adds a hidden layer of 50 nodes that is connected to a input layer with maxlen nodes
#tanh activation function applies nonlinear transformation to the output of the hidden layer
model.add(Dense(50,input_dim=maxlen,activation='tanh')) 

#Add output layer with two nodes since our dataset comprises of two classes
#Softmax converts the output to a probability distribution.
model.add(Dense(2,activation='softmax'))

#Compile the model by specifying the optimizers and loss functions
model.compile(optimizer='sgd', loss='categorical_crossentropy',metrics=['accuracy'])

#train the model
model.fit(train_X, train_Y, epochs=20, batch_size=64)

Using TensorFlow backend.


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.callbacks.History at 0x7f8f63dd5590>

In [8]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 50)                1000      
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 102       
Total params: 1,102
Trainable params: 1,102
Non-trainable params: 0
_________________________________________________________________


In [9]:
res = model.evaluate(test_X,test_Y)



In [10]:
res

[0.7072132375563076, 0.5938864350318909]

In [11]:
p = model.predict(test_X)

In [12]:
for i,l in enumerate(test_lines[:5]):
  print (l.strip(),p[i])

We blame cities for the majority of CO2 emissions without acknowledging their vulnerability to #CFCC15 #journey2015 #S2228 #SemST	0 [0.81120926 0.18879077]
Feminists who go for a gender studies degree should also blame the patriarchy for their mediocre grades in science. #SemST	0 [0.6729641  0.32703587]
Just wrote my blog to help @CalAlimony pass a vital law that ends #alimony. Posting soon. #Divorce #leanin #SemST	1 [0.90414804 0.09585188]
RT @JohnFugelsang: They should just make the GOP primaries a reality game show called "Who Wants To Get Beat Up By A Girl? #SemST	0 [0.5836141  0.41638583]
It's incredibly easy to identify shitty females with a poor view on the world and what's important thanks to #SemST	0 [0.58727384 0.4127261 ]


In [13]:
#Generate word2vec embeddings using gensim
import gensim, logging, os
from gensim import corpora
from collections import defaultdict
from pprint import pprint
import numpy as np

import time
t1 = time.time()
texts = [[word for word in text_cleaner(line.strip().split('\t')[0])]  for line in open('Dataset/sentiment.txt')]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
model = gensim.models.Word2Vec(texts, min_count=0,sample=0.001, seed=1, workers=8, min_alpha=0.0001, sg=0, hs=0, negative=5,iter=100,size=50)


In [14]:
##word2vec model trained
##Now dump the generated embeddings into text files
f2=open('Embeddings/words.txt','w')
X = []
for key, value in dictionary.token2id.items() :
    #print (key, value)
    f2.write(key+"\n")
    X.append(list(model[key]))
f2.close()

fp=open('Embeddings/vectors.txt','w')
for w in list(X):
    x=list(w)
    for wi in x:
        fp.write(str(wi).replace('\n','')+'\t')
    fp.write('\n')
fp.close()
t2 = time.time()
print ("Total time taken ",t2-t1)

  


Total time taken  4.447230100631714


In [15]:
##train a network using the generated embeddings as features
#first load the embeddings from the files into a matrix
f = open('Embeddings/words.txt')
words = f.readlines()
f.close()
f = open('Embeddings/vectors.txt')
vectors = f.readlines()
f.close()

embeddings = dict()
for i,w in enumerate(words):
  embeddings[w.strip()] = np.array([float(x) for x in vectors[i].strip().split('\t')])
embedding_size = len(vectors[i].strip().split('\t'))


In [16]:
#for each sentence in the training set, generate sentence embeddings by averaging word embeddings
train_X = []
train_Y = []
for l in train_lines:
    s = l.lower().strip().split('\t')
    text = text_cleaner(s[0].strip())
    label = int(s[1].strip())
    temp_x = np.array([0]*embedding_size)
    #represent each sentence by a sentence embedding obtained by averaging word embeddings
    for v in text:
        if v in embeddings:
            temp_x = np.add(temp_x,embeddings[v])
        else:
            temp_x = np.add(temp_x,np.array([0]*embedding_size))
    temp_x = np.true_divide(temp_x,len(text))
    
    #represent the labels as one-hot vectors
    temp_y = [0]*2
    temp_y[label] = 1
    
    train_X.append(temp_x)
    train_Y.append(temp_y)
    
print (len(train_X))
print (len(train_Y))
print (train_X[1])
print (train_Y[1])
train_X = np.array(train_X)
train_Y = np.array(train_Y)


914
914
[-1.82360578  0.34138749  0.29822487 -0.75997798  0.83217859  0.601778
  0.99570207  0.13635753  0.61127107  0.02886601  0.83160091  0.517771
 -0.73988049 -0.01649273  0.25642619 -0.13726492  0.55010723 -1.24789482
  1.08451814 -0.2062002  -0.50446282  0.17046533 -0.27019463  0.18842409
 -0.70414968 -0.6173791   1.05704221 -0.9409136  -0.08095331  0.557665
  0.54538365 -0.21563965  0.05484496 -0.27321648 -0.45208534 -0.01071257
  1.30831528  0.52625068  0.13732151 -0.21848837  0.51412899  0.54774888
 -0.29085673 -0.94079209  1.30343695  0.01587903  1.73652809 -0.67173743
  0.70545936 -1.6212308 ]
[1, 0]


In [17]:
#Similarly represent the test set using word embeddings
test_X = []
test_Y = []
for l in test_lines:
    s = l.lower().strip().split('\t')
    text = text_cleaner(s[0].strip())
    label = int(s[1].strip())
    temp_x = np.array([0]*embedding_size)
    #represent each sentence by a sentence embedding obtained by averaging word embeddings
    for v in text:
        if v in embeddings:
            temp_x = np.add(temp_x,embeddings[v])
        else:
            temp_x = np.add(temp_x,np.array([0]*embedding_size))
    temp_x = np.true_divide(temp_x,len(text))
    
    #represent the labels as one-hot vectors
    temp_y = [0]*2
    temp_y[label] = 1
    
    test_X.append(temp_x)
    test_Y.append(temp_y)
    
print (len(test_X))
print (len(test_Y))
print (test_X[1])
print (test_Y[1])
test_X = np.array(test_X)
test_Y = np.array(test_Y)


229
229
[-0.7830701  -0.02735568  0.12761055 -0.10032103  0.16476611  0.83002078
  0.16052663 -0.1633243   0.62122573  0.42775442  0.20475679  0.6044208
 -0.14724428 -0.04233229  0.07015692  0.05374405 -0.05922202 -0.13086578
  0.12168479  0.22908168 -0.07538843  0.43966649 -0.3635379  -0.09805278
 -0.3774895  -0.03848179  0.2325461   0.05886424 -0.00449832 -0.2243597
  0.69115194 -0.18041525 -0.44948333  0.35550103  0.04634367  0.03371075
  0.12027452  0.49152738 -0.14274248  0.01306736 -0.06210457  0.00196507
  0.3891153  -0.6353913   0.71120173  0.2722335   0.68436854 -0.44881822
  0.56755404 -0.73409353]
[1, 0]


# Define a model and train the model

In [18]:

model = Sequential() 
model.add(Dense(50,input_dim=embedding_size,activation='tanh')) 
model.add(Dense(2,activation='softmax'))
model.compile(optimizer='sgd', loss='categorical_crossentropy',metrics=['accuracy'])
model.fit(train_X, train_Y, epochs=20, batch_size=64)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.callbacks.History at 0x7f8ee6232890>

In [19]:
res = model.evaluate(test_X,test_Y)



In [20]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import NearestCentroid
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression

In [21]:
#for each sentence in the training set, generate sentence embeddings by averaging word embeddings
train_X = []
train_Y = []
for l in train_lines:
    s = l.lower().strip().split('\t')
    text = text_cleaner(s[0].strip())
    label = int(s[1].strip())
    temp_x = np.array([0]*embedding_size)
    #represent each sentence by a sentence embedding obtained by averaging word embeddings
    for v in text:
        if v in embeddings:
            temp_x = np.add(temp_x,embeddings[v])
        else:
            temp_x = np.add(temp_x,np.array([0]*embedding_size))
    temp_x = np.true_divide(temp_x,len(text))
    
    #represent the labels as one-hot vectors
    train_X.append(temp_x)
    train_Y.append(label)
    
print (len(train_X))
print (len(train_Y))
print (train_X[1])
print (train_Y[1])
train_X = np.array(train_X)
train_Y = np.array(train_Y)

914
914
[-1.82360578  0.34138749  0.29822487 -0.75997798  0.83217859  0.601778
  0.99570207  0.13635753  0.61127107  0.02886601  0.83160091  0.517771
 -0.73988049 -0.01649273  0.25642619 -0.13726492  0.55010723 -1.24789482
  1.08451814 -0.2062002  -0.50446282  0.17046533 -0.27019463  0.18842409
 -0.70414968 -0.6173791   1.05704221 -0.9409136  -0.08095331  0.557665
  0.54538365 -0.21563965  0.05484496 -0.27321648 -0.45208534 -0.01071257
  1.30831528  0.52625068  0.13732151 -0.21848837  0.51412899  0.54774888
 -0.29085673 -0.94079209  1.30343695  0.01587903  1.73652809 -0.67173743
  0.70545936 -1.6212308 ]
0


In [22]:
#Similarly represent the test set using word embeddings
test_X = []
test_Y = []
for l in test_lines:
    s = l.lower().strip().split('\t')
    text = text_cleaner(s[0].strip())
    label = int(s[1].strip())
    temp_x = np.array([0]*embedding_size)
    #represent each sentence by a sentence embedding obtained by averaging word embeddings
    for v in text:
        if v in embeddings:
            temp_x = np.add(temp_x,embeddings[v])
        else:
            temp_x = np.add(temp_x,np.array([0]*embedding_size))
    temp_x = np.true_divide(temp_x,len(text))
    
    #represent the labels as one-hot vectors
    
    
    test_X.append(temp_x)
    test_Y.append(label)
    
print (len(test_X))
print (len(test_Y))
print (test_X[1])
print (test_Y[1])
test_X = np.array(test_X)
test_Y = np.array(test_Y)

229
229
[-0.7830701  -0.02735568  0.12761055 -0.10032103  0.16476611  0.83002078
  0.16052663 -0.1633243   0.62122573  0.42775442  0.20475679  0.6044208
 -0.14724428 -0.04233229  0.07015692  0.05374405 -0.05922202 -0.13086578
  0.12168479  0.22908168 -0.07538843  0.43966649 -0.3635379  -0.09805278
 -0.3774895  -0.03848179  0.2325461   0.05886424 -0.00449832 -0.2243597
  0.69115194 -0.18041525 -0.44948333  0.35550103  0.04634367  0.03371075
  0.12027452  0.49152738 -0.14274248  0.01306736 -0.06210457  0.00196507
  0.3891153  -0.6353913   0.71120173  0.2722335   0.68436854 -0.44881822
  0.56755404 -0.73409353]
0


In [23]:
gnb= GaussianNB()
model = gnb.fit(train_X,train_Y)

In [24]:
y = model.predict(test_X)

In [25]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [26]:
print("Accuracy ",accuracy_score(test_Y, y, normalize = True))

Accuracy  0.74235807860262


In [27]:
y

array([0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 1, 1, 0, 1, 0])

In [28]:
svml=svm.LinearSVC()

In [29]:
model = svml.fit(train_X,train_Y)



In [30]:
y = model.predict(test_X)

In [31]:
print("Accuracy ",accuracy_score(test_Y, y, normalize = True))

Accuracy  0.6812227074235808
