## **Part I: Building deeplearning models using Keras toolkit**

**PreProcessing Modules**

In [1]:
#read input
f=open('Dataset/sentiment.txt','r')
lines=f.readlines()
f.close()
print (len(lines))

1143


In [2]:
import re
import numpy as np

stopwords = ["a", "about", "above", "above", "across", "after", "afterwards", "again", "against", "all", "almost", "alone", "along", "already", "also","although","always","am","among", "amongst", "amoungst", "amount",  "an", "and", "another", "any","anyhow","anyone","anything","anyway", "anywhere", "are", "around", "as",  "at", "back","be","became", "because","become","becomes", "becoming", "been", "before", "beforehand", "behind", "being", "below", "beside", "besides", "between", "beyond", "bill", "both", "bottom","but", "by", "call", "can", "cannot", "cant", "co", "con", "could", "couldnt", "cry", "de", "describe", "detail", "do", "done", "down", "due", "during", "each", "eg", "eight", "either", "eleven","else", "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone", "everything", "everywhere", "except", "few", "fifteen", "fify", "fill", "find", "fire", "first", "five", "for", "former", "formerly", "forty", "found", "four", "from", "front", "full", "further", "get", "give", "go", "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his", "how", "however", "hundred", "ie", "if", "in", "inc", "indeed", "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter", "latterly", "least", "less", "ltd", "made", "many", "may", "me", "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly", "move", "much", "must", "my", "myself", "name", "namely", "neither", "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone", "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over", "own","part", "per", "perhaps", "please", "put", "rather", "re", "same", "see", "seem", "seemed", "seeming", "seems", "serious", "several", "she", "should", "show", "side", "since", "sincere", "six", "sixty", "so", "some", "somehow", "someone", "something", "sometime", "sometimes", "somewhere", "still", "such", "system", "take", "ten", "than", "that", "the", "their", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein", "thereupon", "these", "they", "thickv", "thin", "third", "this", "those", "though", "three", "through", "throughout", "thru", "thus", "to", "together", "too", "top", "toward", "towards", "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us", "very", "via", "was", "we", "well", "were", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who", "whoever", "whole", "whom", "whose", "why", "will", "with", "within", "without", "would", "yet", "you", "your", "yours", "yourself", "yourselves", "the"]

def text_cleaner(text): 
    text=remove_link(text.lower())
    long_words=[]
    for i in text.split():
        if i not in stopwords:                  
            long_words.append(i)
    return long_words

def remove_link(text):
    regex = r'https?://[^\s<>)"‘’]+'
    match = re.sub(regex,' ', text)
    regex = r'https?:|urls?|[/\:,-."\'?!;…]+'
    tweet = re.sub(regex,' ', match)
    tweet = re.sub("[^a-zA-Z_]", " ", tweet)
    tweet = re.sub("[ ]+", " ", tweet) 
    return tweet


In [3]:
#divide the data into train set and test set
import random
random.seed(1)
random.shuffle(lines) #shuffle the dataset before dividing it into train and test set
split_size = int(0.8*len(lines)) #use 80% of total data as train set and 20% as test set
train_lines = lines[:split_size]
test_lines = lines[split_size:]

print ("Training set size : ", len(train_lines))
print ("Test set size : ", len(test_lines))

Training set size :  914
Test set size :  229


In [4]:
#convert string tokens to integers
#create a vocabulary set and assign a unique id to each word in the vocabulary

#load all unique vocabulary
vocab = []
maxlen = []
for l in train_lines:
    s = l.strip().split('\t')
    words = text_cleaner(s[0].strip())
    vocab += words
    maxlen.append(len(words))
vocab = list(set(vocab))
print ("Vocabulary size : ", len(vocab))

#assign unique id to each vocabulary
word2id = dict()
id2word = dict()
for i,v in enumerate(vocab,1):
    word2id[v] = i
    id2word[i] = v 
word2id['PAD'] = 0 #special token to take care of unseen words in the test set
id2word[0] = 'PAD'
maxlen = max(maxlen)
print ("Max sentence length : ",maxlen)

Vocabulary size :  4012
Max sentence length :  19


In [5]:
#Prepare train and test set
#Convert strings to integers
#prepare train and test set

#prepare train set
import numpy as np
train_X = []
train_Y = []
for l in train_lines:
    s = l.strip().split('\t')
    text = text_cleaner(s[0])
    label = int(s[1].strip())
    temp_x = [word2id[x] for x in text]
    temp_x += [0]* (maxlen-len(temp_x)) #convert all input to equal size to enable training in batches
    temp_y = [0]*2
    temp_y[label] = 1
    train_X.append(temp_x)
    train_Y.append(temp_y)
print (len(train_X))
print (len(train_Y))
print (train_X[0])
print (train_Y[:5])
train_X = np.array(train_X)
train_Y = np.array(train_Y) 

914
914
[1924, 467, 506, 337, 1081, 2333, 2333, 1676, 464, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[[1, 0], [1, 0], [1, 0], [1, 0], [1, 0]]


In [6]:
test_X = []
test_Y = []

for l in test_lines:
    s = l.strip().split('\t')
    text = text_cleaner(s[0].strip())
    label = int(s[1].strip())
    temp_x = [word2id[x] if x in word2id else 0 for x in text ][:maxlen]
    temp_x += [0]* (maxlen-len(temp_x)) #convert all input to equal size to enable training in batches
    temp_y = [0]*2
    temp_y[label] = 1
    test_X.append(temp_x)
    test_Y.append(temp_y)
print (len(test_X))
print (len(test_Y))
print (test_X[0])
print (test_Y[0])
test_X = np.array(test_X)
test_Y = np.array(test_Y)

229
229
[2011, 3636, 3333, 657, 0, 0, 654, 81, 1121, 464, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 0]


## Part I: Building deeplearning models using Keras toolkit

In [8]:
from keras.models import Model
from keras.layers import Input, Dense, Dropout, Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers import LSTM, Bidirectional,SimpleRNN

Using TensorFlow backend.


Text Classification Using CNN

In [9]:
embedding_size = 50
input=Input(shape=(maxlen,))
embedding = Embedding(len(word2id), embedding_size, input_length=maxlen, trainable=True)(input)
convolution = Conv1D(128,3,padding='same',activation='relu',strides=1)(embedding)
pooling = GlobalMaxPooling1D()(convolution)
output = Dense(2,activation='softmax')(pooling)
model=Model(input,output)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(train_X, train_Y,batch_size=512,epochs=10)



  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x7fc8244475d0>

In [10]:
res = model.evaluate(test_X,test_Y)




In [11]:
p = model.predict(test_X)
for i,l in enumerate(test_lines[:5]):
  print (l.strip(),p[i])

We blame cities for the majority of CO2 emissions without acknowledging their vulnerability to #CFCC15 #journey2015 #S2228 #SemST	0 [0.70769495 0.29230514]
Feminists who go for a gender studies degree should also blame the patriarchy for their mediocre grades in science. #SemST	0 [0.7182329 0.2817671]
Just wrote my blog to help @CalAlimony pass a vital law that ends #alimony. Posting soon. #Divorce #leanin #SemST	1 [0.68872374 0.31127623]
RT @JohnFugelsang: They should just make the GOP primaries a reality game show called "Who Wants To Get Beat Up By A Girl? #SemST	0 [0.71024686 0.28975317]
It's incredibly easy to identify shitty females with a poor view on the world and what's important thanks to #SemST	0 [0.7099096  0.29009038]


Loading Pre-trained Embeddings

In [12]:
f = open('Embeddings/words.txt')
words = f.readlines()
f.close()
f = open('Embeddings/vectors.txt')
vectors = f.readlines()
f.close()

embeddings = dict()
for i,w in enumerate(words):
  embeddings[w.strip()] = np.array([float(x) for x in vectors[i].strip().split('\t')])
embedding_size = len(vectors[i].strip().split('\t'))


In [13]:
embeddings_matrix = []
for i in range(len(word2id)):
  if id2word[i] in embeddings:
    embeddings_matrix.append(embeddings[id2word[i]])
  else:
    embeddings_matrix.append(np.array([0.0]*embedding_size))
embeddings_matrix = np.array(embeddings_matrix)

Text Classification using CNN and pre-trained embeddings

In [14]:
embedding_size = 50
input=Input(shape=(maxlen,))
embedding = Embedding(len(word2id), embedding_size, input_length=maxlen, weights = [embeddings_matrix],trainable=True)(input)
convolution = Conv1D(128,3,padding='same',activation='relu',strides=1)(embedding)
pooling = GlobalMaxPooling1D()(convolution)
output = Dense(2,activation='softmax')(pooling)
model=Model(input,output)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(train_X, train_Y,batch_size=512,epochs=10)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x7fc81c6d7850>

In [15]:
res = model.evaluate(test_X,test_Y)



Text Classification using Vanilla RNN

In [16]:
embedding_size = 50
input=Input(shape=(maxlen,))
embedding = Embedding(len(word2id), embedding_size, input_length=maxlen, weights = [embeddings_matrix],trainable=True)(input)
rnn = SimpleRNN(64,activation='tanh')(embedding)
output = Dense(2,activation='softmax')(rnn)
model=Model(input,output)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(train_X, train_Y,batch_size=512,epochs=10)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x7fc8242b91d0>

In [17]:
res = model.evaluate(test_X,test_Y)



Text Classification Using LSTM

In [18]:
embedding_size = 50
input=Input(shape=(maxlen,))
embedding = Embedding(len(word2id), embedding_size, input_length=maxlen, weights = [embeddings_matrix],trainable=True)(input)
rnn = LSTM(64,activation='tanh')(embedding)
output = Dense(2,activation='softmax')(rnn)
model=Model(input,output)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(train_X, train_Y,batch_size=512,epochs=10)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x7fc7fc473a10>

In [19]:
res = model.evaluate(test_X,test_Y)



Text Classification using Bidirectional LSTM


In [20]:
embedding_size = 50
input=Input(shape=(maxlen,))
embedding = Embedding(len(word2id), embedding_size, input_length=maxlen, weights = [embeddings_matrix],trainable=True)(input)
rnn = Bidirectional(LSTM(64,activation='tanh'))(embedding)
output = Dense(2,activation='softmax')(rnn)
model=Model(input,output)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(train_X, train_Y,batch_size=512,epochs=10)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x7fc7fc076f50>

In [21]:
res = model.evaluate(test_X,test_Y)



# **Part II: Sequential Tagging**

In [22]:
fp=open('Dataset/seq_tagging_dataset.txt')
lines = fp.readlines()
fp.close()
print (len(lines))

100


In [23]:
split_size = int(0.8*len(lines)) 
train_lines = lines[:split_size]
test_lines = lines[split_size:]
print (len(test_lines))
print (len(train_lines))

20
80


Pre-processing modules for sequence tagging


In [24]:
#assign unique ids to words and labels

#first load all vocabulary and labels
vocab = []
classes = []
maxlen = []
for l in train_lines:
  s = l.strip().split()
  words = [w.strip().split('/')[0] for w in s]
  labels = [w.strip().split('/')[1] for w in s]
  vocab += words
  classes += labels
  maxlen.append(len(words))
vocab = list(set(vocab))
classes = list(set(classes))
maxlen = len(maxlen)
print ("Vocabulary size : ", len(vocab))
print ("Number of classes : ", len(classes))
print ("Max sentence length : ", maxlen)
maxlen = 10
print ("Set max length to 10", maxlen)

Vocabulary size :  1032
Number of classes :  6
Max sentence length :  80
Set max length to 10 10


In [25]:
#assin unique id to each word
word2id = dict()
id2word = dict()
for i,v in enumerate(vocab,1):
  word2id[v] = i
  id2word[i] = v
word2id['PAD'] = 0
id2word[0] = 'PAD'

In [26]:
print ("Dictionary size : ", len(word2id))

Dictionary size :  1033


In [27]:
#assign unique id to each class
class2id = dict()
id2class = dict()
for i,c in enumerate(classes):
  class2id[c] = i
  id2class[i] = c
print ("Class dictionary size : ", len(class2id))

Class dictionary size :  6


In [28]:
#prepare training and test data
#input : list of tokens
#output : lits of labels corresponding to each token

#training data preparation
train_X = []
train_Y = []
for l in train_lines:
  s = l.strip().split()
  words = [w.strip().split('/')[0] for w in s]
  labels = [w.strip().split('/')[1] for w in s]
  words = words[:maxlen]
  labels = labels[:maxlen]
  temp_x = []
  temp_y = []
  temp_x = [word2id[w] if w in word2id else 0 for w in words]
  temp_y = []
  for label in labels:
    y = [0]*len(class2id)
    y[class2id[label]] = 1
    temp_y.append(y)
  padlen = maxlen - len(words)
  temp_x += [0]*padlen
  temp_y += ([[0]*len(class2id)])*padlen
  train_X.append(temp_x)
  train_Y.append(temp_y)
print (len(train_X),len(train_Y))
train_X = np.array(train_X)
train_Y = np.array(train_Y)


80 80


In [29]:
#test data preparation
test_X = []
test_Y = []
for l in test_lines:
  s = l.strip().split()
  words = [w.strip().split('/')[0] for w in s]
  labels = [w.strip().split('/')[1] for w in s]
  words = words[:maxlen]
  labels = labels[:maxlen]
  temp_x = []
  temp_y = []
  temp_x = [word2id[w] if w in word2id else 0 for w in words]
  temp_y = []
  for label in labels:
    y = [0]*len(class2id)
    y[class2id[label]] = 1
    temp_y.append(y)
  padlen = maxlen - len(words)
  temp_x += [0]*padlen
  temp_y += ([[0]*len(class2id)])*padlen
  test_X.append(temp_x)
  test_Y.append(temp_y)
print (len(test_X),len(test_Y))
print (test_X[0],test_Y[0])
test_X = np.array(test_X)
test_Y = np.array(test_Y)


20 20
[0, 0, 154, 0, 445, 0, 198, 0, 0, 0] [[0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 1]]


Defining and Training a Sequential Tagging Model

In [30]:
##Define a model
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
input = Input(shape=(maxlen,))
model = Embedding(input_dim=len(word2id), output_dim=50, input_length=maxlen)(input)  # 50-dim embedding
model = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(model)
out = TimeDistributed(Dense(len(class2id), activation="softmax"))(model)
model = Model(input, out)
model.compile(optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"])
model.fit(train_X, train_Y, batch_size=32, epochs=5)


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.callbacks.History at 0x7fc7f6d73690>

In [31]:
res = model.evaluate(test_X,test_Y)



In [32]:
p = model.predict(test_X)

In [33]:
p[0]

array([[0.15600762, 0.08643273, 0.05339229, 0.03950289, 0.24727088,
        0.4173935 ],
       [0.16425225, 0.10161273, 0.0692241 , 0.05476965, 0.2387815 ,
        0.37135977],
       [0.16236204, 0.10985445, 0.08220138, 0.06752402, 0.2342214 ,
        0.34383672],
       [0.16791405, 0.12003491, 0.09197135, 0.07857449, 0.22251144,
        0.31899372],
       [0.1577529 , 0.1199783 , 0.09640178, 0.08246656, 0.21797825,
        0.32542217],
       [0.15759075, 0.12238329, 0.09842084, 0.0842876 , 0.21450791,
        0.32280964],
       [0.13912931, 0.11425178, 0.09257907, 0.07742217, 0.21672826,
        0.35988936],
       [0.12731816, 0.10552424, 0.08256885, 0.06679167, 0.21888618,
        0.39891085],
       [0.10238122, 0.08877945, 0.06649007, 0.05044181, 0.21806842,
        0.47383907],
       [0.07080916, 0.06428806, 0.04472227, 0.03092928, 0.20671298,
        0.5825382 ]], dtype=float32)