In [None]:
#read input
# Load the raw dataset as a list of lines (later cells split each line on a tab).
# The context manager closes the file even if reading raises part-way through.
with open('/content/drive/My Drive/Colab Notebooks/dataset/sentiment.txt','r') as f:
    lines = f.readlines()
print (len(lines))


In [None]:
import re
import numpy as np

# Words that text_cleaner() drops from every sentence (tokens are lowercased before the lookup).
# NOTE(review): the list contains duplicate entries ("above" and "the" each appear twice); this is
# harmless for membership tests. Because it is a list, each `in` check is a linear scan.
stopwords = ["a", "about", "above", "above", "across", "after", "afterwards", "again", "against", "all", "almost", "alone", "along", "already", "also","although","always","am","among", "amongst", "amoungst", "amount",  "an", "and", "another", "any","anyhow","anyone","anything","anyway", "anywhere", "are", "around", "as",  "at", "back","be","became", "because","become","becomes", "becoming", "been", "before", "beforehand", "behind", "being", "below", "beside", "besides", "between", "beyond", "bill", "both", "bottom","but", "by", "call", "can", "cannot", "cant", "co", "con", "could", "couldnt", "cry", "de", "describe", "detail", "do", "done", "down", "due", "during", "each", "eg", "eight", "either", "eleven","else", "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone", "everything", "everywhere", "except", "few", "fifteen", "fify", "fill", "find", "fire", "first", "five", "for", "former", "formerly", "forty", "found", "four", "from", "front", "full", "further", "get", "give", "go", "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his", "how", "however", "hundred", "ie", "if", "in", "inc", "indeed", "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter", "latterly", "least", "less", "ltd", "made", "many", "may", "me", "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly", "move", "much", "must", "my", "myself", "name", "namely", "neither", "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone", "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over", "own","part", "per", "perhaps", "please", "put", "rather", "re", "same", "see", "seem", "seemed", "seeming", "seems", "serious", "several", "she", "should", "show", "side", "since", "sincere", "six", "sixty", "so", "some", 
"somehow", "someone", "something", "sometime", "sometimes", "somewhere", "still", "such", "system", "take", "ten", "than", "that", "the", "their", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein", "thereupon", "these", "they", "thickv", "thin", "third", "this", "those", "though", "three", "through", "throughout", "thru", "thus", "to", "together", "too", "top", "toward", "towards", "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us", "very", "via", "was", "we", "well", "were", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who", "whoever", "whole", "whom", "whose", "why", "will", "with", "within", "without", "would", "yet", "you", "your", "yours", "yourself", "yourselves", "the"]

def text_cleaner(text):
    """Lowercase ``text``, scrub it with ``remove_link`` and drop stop words.

    Returns the surviving whitespace-separated tokens as a list, in their
    original order.
    """
    scrubbed = remove_link(text.lower())
    return [token for token in scrubbed.split() if token not in stopwords]

def remove_link(text):
    """Strip URLs, punctuation and every non-letter character from ``text``.

    The substitutions run in order, each replacing its matches with one space:
      1. full ``http://`` / ``https://`` links;
      2. leftover ``http:`` / ``https:`` prefixes, the substrings ``url`` /
         ``urls`` (wherever they occur), and runs of punctuation
         (``,-.`` inside the class is a range covering comma, hyphen, period);
      3. any character that is not an ASCII letter or underscore;
      4. runs of spaces, collapsed to a single space.

    The result is not trimmed, so it may start or end with one space.
    """
    substitutions = (
        r'https?://[^\s<>)"‘’]+',
        r'https?:|urls?|[/\:,-."\'?!;…]+',
        "[^a-zA-Z_]",
        "[ ]+",
    )
    cleaned = text
    for pattern in substitutions:
        cleaned = re.sub(pattern, ' ', cleaned)
    return cleaned


**Train/test split (hold-out validation)**

In [None]:
#divide the data into train set and test set
import random

# Shuffle a copy instead of `lines` itself. Shuffling in place made this cell
# non-idempotent: running it a second time re-shuffled the already shuffled list
# and silently produced a different train/test split.
random.seed(1) #fixed seed so the same permutation is drawn on every run
shuffled_lines = list(lines)
random.shuffle(shuffled_lines) #shuffle the dataset before dividing it into train and test set
split_size = int(0.8*len(shuffled_lines)) #use 80% of total data as train set and 20% as test set
train_lines = shuffled_lines[:split_size]
test_lines = shuffled_lines[split_size:]

print ("Training set size : ", len(train_lines))
print ("Test set size : ", len(test_lines))


Training set size :  914
Test set size :  229


In [None]:
#convert string tokens to integers
#create a vocabulary set and assign a unique id to each word in the vocabulary

#collect every distinct token of the training set, plus each sentence's token count
vocab_tokens = set()
sentence_lengths = []
for l in train_lines:
    s = l.strip().split('\t')
    words = text_cleaner(s[0].strip())
    vocab_tokens.update(words)
    sentence_lengths.append(len(words))
#Sort the vocabulary: the iteration order of a set of strings depends on string
#hashing, which can differ between interpreter sessions, so an unsorted set gave
#a different word -> id mapping (and different features) on every kernel restart.
vocab = sorted(vocab_tokens)
print ("Vocabulary size : ", len(vocab))

#assign unique id to each vocabulary word, starting at 1
word2id = {v: i for i, v in enumerate(vocab, 1)}
#id 0 is reserved: later cells use it both to pad short sentences and for words
#missing from word2id. text_cleaner lowercases its tokens, so the upper-case
#key 'PAD' cannot collide with a real word.
word2id['PAD'] = 0
maxlen = max(sentence_lengths) #longest training sentence, in tokens
print ("Max sentence length : ",maxlen)


Vocabulary size :  4012
Max sentence length :  19


In [None]:
#Prepare the train set: turn every training sentence into a fixed-length list of word ids
import numpy as np

train_X = []
train_Y = []
for raw_line in train_lines:
    fields = raw_line.strip().split('\t')
    tokens = text_cleaner(fields[0])
    class_index = int(fields[1].strip())

    #every training token is in word2id because the vocabulary was built from these lines
    ids = [word2id[tok] for tok in tokens]
    #right-pad with 0 so all inputs have equal size, which enables training in batches
    ids.extend([0] * (maxlen - len(ids)))

    #one-hot encode the label over the two classes
    one_hot = [0, 0]
    one_hot[class_index] = 1

    train_X.append(ids)
    train_Y.append(one_hot)

for preview in (len(train_X), len(train_Y), train_X[0], train_Y[0]):
    print (preview)
train_X = np.array(train_X)
train_Y = np.array(train_Y)

914
914
[2639, 3937, 2155, 821, 1900, 495, 495, 3442, 445, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 0]


In [None]:
#prepare test set
test_X = []
test_Y = []

for raw_line in test_lines:
    fields = raw_line.strip().split('\t')
    tokens = text_cleaner(fields[0].strip())
    class_index = int(fields[1].strip())

    #words missing from word2id map to id 0; sentences longer than maxlen are truncated
    ids = [word2id.get(tok, 0) for tok in tokens][:maxlen]
    #right-pad with 0 so all inputs have equal size
    ids.extend([0] * (maxlen - len(ids)))

    #one-hot encode the label over the two classes
    one_hot = [0, 0]
    one_hot[class_index] = 1

    test_X.append(ids)
    test_Y.append(one_hot)

for preview in (len(test_X), len(test_Y), test_X[0], test_Y[0]):
    print (preview)
test_X = np.array(test_X)
test_Y = np.array(test_Y)

229
229
[2750, 2299, 2121, 2651, 0, 0, 1336, 3701, 3, 445, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 0]


In [None]:
from keras.models import Sequential #Sequential is the Keras class for a model built as a linear stack of layers
from keras.layers import Dense

#Build the network by passing the layer stack straight to the constructor.
#  - hidden layer: 50 nodes fed by an input of maxlen values (input_dim must be given
#    on the first layer); tanh applies a nonlinear transformation to its output
#  - output layer: two nodes, one per class; softmax turns them into a probability distribution
model = Sequential([
    Dense(50,input_dim=maxlen,activation='tanh'),
    Dense(2,activation='softmax'),
])

#Compile the model by specifying the optimizer, the loss function and the reported metric
model.compile(optimizer='sgd', loss='categorical_crossentropy',metrics=['accuracy'])

#train the model
model.fit(train_X, train_Y, epochs=20, batch_size=64)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f3831989da0>

In [None]:
#print the layer-by-layer architecture with output shapes and parameter counts
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 50)                1000      
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 102       
Total params: 1,102
Trainable params: 1,102
Non-trainable params: 0
_________________________________________________________________


In [None]:
#score the trained network on the held-out test set; the model was compiled with a
#loss and one metric, so the result holds [loss, accuracy]
res = model.evaluate(test_X,test_Y)



In [None]:
#display the evaluation result: [test loss, test accuracy]
res

[0.7059223055839539, 0.6331877708435059]

In [None]:
#predict on the test set; the softmax output layer yields one probability per class for each row
p = model.predict(test_X)

In [None]:
#show the first five raw test lines (text and gold label) next to the predicted class probabilities
for raw_line, probs in zip(test_lines[:5], p):
  print (raw_line.strip(),probs)

We blame cities for the majority of CO2 emissions without acknowledging their vulnerability to #CFCC15 #journey2015 #S2228 #SemST	0 [0.5820220708847046, 0.417977899312973]
Feminists who go for a gender studies degree should also blame the patriarchy for their mediocre grades in science. #SemST	0 [0.7086774706840515, 0.2913225591182709]
Just wrote my blog to help @CalAlimony pass a vital law that ends #alimony. Posting soon. #Divorce #leanin #SemST	1 [0.8042789101600647, 0.1957210749387741]
RT @JohnFugelsang: They should just make the GOP primaries a reality game show called "Who Wants To Get Beat Up By A Girl? #SemST	0 [0.46765077114105225, 0.532349169254303]
It's incredibly easy to identify shitty females with a poor view on the world and what's important thanks to #SemST	0 [0.8346851468086243, 0.16531486809253693]


In [None]:
#Generate word2vec embeddings using gensim
import gensim, logging, os
from gensim import corpora
from collections import defaultdict
from pprint import pprint
import numpy as np

import time
t1 = time.time() #start of the timing that the embedding-dump cell reports

#Tokenise every line of the dataset. The file is read inside a context manager so the
#handle is closed afterwards (it was previously opened inline and never closed).
#NOTE(review): this is the full dataset file, so the embeddings are also trained on
#the lines that end up in test_lines.
with open('/content/drive/My Drive/Colab Notebooks/dataset/sentiment.txt') as corpus_file:
    texts = [text_cleaner(line.strip().split('\t')[0]) for line in corpus_file]
dictionary = corpora.Dictionary(texts) #token <-> id mapping; its tokens drive the embedding dump
corpus = [dictionary.doc2bow(text) for text in texts] #bag-of-words corpus (not used by the cells shown here)

#Train 50-dimensional embeddings (sg=0, negative=5, hs=0), keeping every token (min_count=0).
#NOTE(review): this rebinds `model`, which previously referred to the Keras network.
#NOTE(review): `iter` and `size` are the gensim 3.x keyword names; gensim 4 renamed them
#to `epochs` and `vector_size` - confirm the installed version before upgrading.
model = gensim.models.Word2Vec(texts, min_count=0,sample=0.001, seed=1, workers=8, min_alpha=0.0001, sg=0, hs=0, negative=5,iter=100,size=50)


In [None]:
##word2vec model trained
##Now dump the generated embeddings into text files
#words.txt gets one token per line; vectors.txt gets the matching vector on the same
#line number, its components separated (and terminated) by tabs.
X = []
with open('/content/drive/My Drive/Colab Notebooks/embeddings/words.txt','w') as f2:
    for key in dictionary.token2id:
        f2.write(key+"\n")
        #look vectors up through model.wv: indexing the Word2Vec object directly is
        #deprecated in gensim 3.x and no longer supported in gensim 4
        X.append(list(model.wv[key]))

with open('/content/drive/My Drive/Colab Notebooks/embeddings/vectors.txt','w') as fp:
    for w in X:
        for wi in w:
            fp.write(str(wi).replace('\n','')+'\t')
        fp.write('\n')
t2 = time.time()
print ("Total time taken ",t2-t1) #t1 was taken at the start of the word2vec training cell

  


Total time taken  159.20494103431702


In [None]:
##train a network using the generated embeddings as features
#first load the embeddings from the files into a matrix
with open('/content/drive/My Drive/Colab Notebooks/embeddings/words.txt') as f:
    words = f.readlines()
with open('/content/drive/My Drive/Colab Notebooks/embeddings/vectors.txt') as f:
    vectors = f.readlines()

#the two files are paired by line number, so they must be non-empty and equally long
if not words or len(words) != len(vectors):
    raise ValueError("words.txt and vectors.txt must be non-empty and have the same number of lines")

#map each word to its vector (tab-separated floats on the matching line of vectors.txt)
embeddings = dict()
for w, vector_line in zip(words, vectors):
    embeddings[w.strip()] = np.array([float(x) for x in vector_line.strip().split('\t')])
#dimensionality of the vectors; read from an explicit row rather than from a loop
#variable left over after the loop
embedding_size = len(vectors[0].strip().split('\t'))


In [None]:
#for each sentence in the training set, generate sentence embeddings by averaging word embeddings
train_X = []
train_Y = []
for l in train_lines:
    s = l.lower().strip().split('\t')
    text = text_cleaner(s[0].strip())
    label = int(s[1].strip())
    #sum the vectors of the words that have an embedding; words without one add nothing
    temp_x = np.zeros(embedding_size)
    for v in text:
        if v in embeddings:
            temp_x = temp_x + embeddings[v]
    #average over all tokens of the sentence (with or without an embedding).
    #max(..., 1) keeps a sentence with no tokens left after cleaning as an all-zero
    #vector instead of dividing 0 by 0 and putting NaNs into the feature matrix.
    temp_x = temp_x / max(len(text), 1)

    #represent the labels as one-hot vectors
    temp_y = [0]*2
    temp_y[label] = 1

    train_X.append(temp_x)
    train_Y.append(temp_y)

print (len(train_X))
print (len(train_Y))
print (train_X[1])
print (train_Y[1])
train_X = np.array(train_X)
train_Y = np.array(train_Y)


914
914
[ 0.34400658  0.27160275  0.01452614  0.24148613  0.65151931 -0.30304693
 -0.26461193  0.55967513  2.00598388  0.24282182 -0.9591095  -0.36426097
 -0.27262755 -0.50897189  0.37229868 -0.86615524 -0.90331611  0.07527125
 -1.27128243  1.16274885  0.32932287 -0.01681146  0.72154313 -0.16764784
 -0.19144638 -0.3907647   1.31546959  0.1781115   1.63848702  0.35916511
  0.47530726  0.13169616  0.75663928 -0.94713477  0.33700043 -0.06840539
  0.18772569  1.31664211 -0.86507743 -1.16744186  0.34270857  0.11727037
  0.12892523  0.38072168 -0.98211067 -1.09526317  0.55164504 -0.95999779
  0.21279183 -0.58756922]
[1, 0]


In [None]:
#Similarly represent the test set using word embeddings
test_X = []
test_Y = []
for l in test_lines:
    s = l.lower().strip().split('\t')
    text = text_cleaner(s[0].strip())
    label = int(s[1].strip())
    #sum the vectors of the words that have an embedding; words without one add nothing
    temp_x = np.zeros(embedding_size)
    for v in text:
        if v in embeddings:
            temp_x = temp_x + embeddings[v]
    #average over all tokens of the sentence (with or without an embedding).
    #max(..., 1) keeps a sentence with no tokens left after cleaning as an all-zero
    #vector instead of dividing 0 by 0 and putting NaNs into the feature matrix.
    temp_x = temp_x / max(len(text), 1)

    #represent the labels as one-hot vectors
    temp_y = [0]*2
    temp_y[label] = 1

    test_X.append(temp_x)
    test_Y.append(temp_y)

print (len(test_X))
print (len(test_Y))
print (test_X[1])
print (test_Y[1])
test_X = np.array(test_X)
test_Y = np.array(test_Y)


229
229
[ 1.43168587e-01  3.62731191e-01 -8.37483342e-02 -4.34456608e-01
 -2.62502466e-02  6.23550179e-02 -1.30325140e-04 -1.78436619e-01
  8.72878291e-01  1.65309610e-02 -1.06327978e+00 -4.53096895e-01
  1.00373781e-01 -2.79473894e-01  2.51277588e-01 -2.30466514e-01
 -7.11768351e-01 -3.55504187e-01 -1.14342608e-01  3.92006466e-01
  1.69326919e-01  3.89009905e-01  4.11167598e-01 -1.55029858e-01
 -7.80328339e-02 -2.64823194e-02 -9.56052329e-02  9.48695151e-01
  3.96399158e-01  1.87547960e-01  7.51962853e-03 -2.87542269e-02
  2.93273877e-01 -2.21789745e-01  5.25936502e-02  2.74468541e-01
 -8.94138189e-02  5.75865273e-01  7.20069220e-03 -2.90708786e-01
  2.23370578e-01 -2.26276361e-01 -2.15931485e-01  3.35309095e-01
 -3.74575353e-01 -3.80146742e-01 -6.00782197e-02 -7.29343560e-01
 -3.25310811e-01 -3.18310834e-01]
[1, 0]


In [None]:
#define a model and train the model
#same architecture as the earlier network, but the input is now a sentence embedding
#of embedding_size values: 50 tanh hidden nodes followed by a 2-way softmax output
model = Sequential([
    Dense(50,input_dim=embedding_size,activation='tanh'),
    Dense(2,activation='softmax'),
])
model.compile(optimizer='sgd', loss='categorical_crossentropy',metrics=['accuracy'])
model.fit(train_X, train_Y, epochs=20, batch_size=64)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f381f3f95c0>

In [None]:
#score the embedding-based network on the test set; the result holds [loss, accuracy]
#NOTE(review): `res` is stored here but not displayed in the cells that follow
res = model.evaluate(test_X,test_Y)



In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import NearestCentroid
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression

In [None]:
#for each sentence in the training set, generate sentence embeddings by averaging word embeddings
train_X = []
train_Y = []
for l in train_lines:
    s = l.lower().strip().split('\t')
    text = text_cleaner(s[0].strip())
    label = int(s[1].strip())
    #sum the vectors of the words that have an embedding; words without one add nothing
    temp_x = np.zeros(embedding_size)
    for v in text:
        if v in embeddings:
            temp_x = temp_x + embeddings[v]
    #average over all tokens of the sentence (with or without an embedding).
    #max(..., 1) keeps a sentence with no tokens left after cleaning as an all-zero
    #vector instead of dividing 0 by 0 and putting NaNs into the feature matrix.
    temp_x = temp_x / max(len(text), 1)

    #keep the label as a plain integer class id (no one-hot encoding in this cell)
    train_X.append(temp_x)
    train_Y.append(label)

print (len(train_X))
print (len(train_Y))
print (train_X[1])
print (train_Y[1])
train_X = np.array(train_X)
train_Y = np.array(train_Y)

914
914
[ 0.34400658  0.27160275  0.01452614  0.24148613  0.65151931 -0.30304693
 -0.26461193  0.55967513  2.00598388  0.24282182 -0.9591095  -0.36426097
 -0.27262755 -0.50897189  0.37229868 -0.86615524 -0.90331611  0.07527125
 -1.27128243  1.16274885  0.32932287 -0.01681146  0.72154313 -0.16764784
 -0.19144638 -0.3907647   1.31546959  0.1781115   1.63848702  0.35916511
  0.47530726  0.13169616  0.75663928 -0.94713477  0.33700043 -0.06840539
  0.18772569  1.31664211 -0.86507743 -1.16744186  0.34270857  0.11727037
  0.12892523  0.38072168 -0.98211067 -1.09526317  0.55164504 -0.95999779
  0.21279183 -0.58756922]
0


In [None]:
#Similarly represent the test set using word embeddings
test_X = []
test_Y = []
for l in test_lines:
    s = l.lower().strip().split('\t')
    text = text_cleaner(s[0].strip())
    label = int(s[1].strip())
    #sum the vectors of the words that have an embedding; words without one add nothing
    temp_x = np.zeros(embedding_size)
    for v in text:
        if v in embeddings:
            temp_x = temp_x + embeddings[v]
    #average over all tokens of the sentence (with or without an embedding).
    #max(..., 1) keeps a sentence with no tokens left after cleaning as an all-zero
    #vector instead of dividing 0 by 0 and putting NaNs into the feature matrix.
    temp_x = temp_x / max(len(text), 1)

    #keep the label as a plain integer class id (no one-hot encoding in this cell)
    test_X.append(temp_x)
    test_Y.append(label)

print (len(test_X))
print (len(test_Y))
print (test_X[1])
print (test_Y[1])
test_X = np.array(test_X)
test_Y = np.array(test_Y)

229
229
[ 1.43168587e-01  3.62731191e-01 -8.37483342e-02 -4.34456608e-01
 -2.62502466e-02  6.23550179e-02 -1.30325140e-04 -1.78436619e-01
  8.72878291e-01  1.65309610e-02 -1.06327978e+00 -4.53096895e-01
  1.00373781e-01 -2.79473894e-01  2.51277588e-01 -2.30466514e-01
 -7.11768351e-01 -3.55504187e-01 -1.14342608e-01  3.92006466e-01
  1.69326919e-01  3.89009905e-01  4.11167598e-01 -1.55029858e-01
 -7.80328339e-02 -2.64823194e-02 -9.56052329e-02  9.48695151e-01
  3.96399158e-01  1.87547960e-01  7.51962853e-03 -2.87542269e-02
  2.93273877e-01 -2.21789745e-01  5.25936502e-02  2.74468541e-01
 -8.94138189e-02  5.75865273e-01  7.20069220e-03 -2.90708786e-01
  2.23370578e-01 -2.26276361e-01 -2.15931485e-01  3.35309095e-01
 -3.74575353e-01 -3.80146742e-01 -6.00782197e-02 -7.29343560e-01
 -3.25310811e-01 -3.18310834e-01]
0


In [None]:
#fit a Gaussian naive Bayes classifier on the averaged sentence embeddings
#NOTE(review): this rebinds `model`, which previously held the Keras network
gnb= GaussianNB()
model = gnb.fit(train_X,train_Y)

In [None]:
#predict labels for the test sentences with the classifier currently bound to `model`
y = model.predict(test_X)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [None]:
#compare predictions with the gold test labels; normalize = True reports the fraction
#of correct predictions rather than the raw count
print("Accuracy ",accuracy_score(test_Y, y, normalize = True))

Accuracy  0.7205240174672489


In [None]:
#linear support vector classifier, created with scikit-learn's default settings
svml=svm.LinearSVC()

In [None]:
#train the linear SVM on the same sentence-embedding features; `model` is rebound again
model = svml.fit(train_X,train_Y)



In [None]:
#predict labels for the test sentences with the linear SVM (overwrites the earlier `y`)
y = model.predict(test_X)

In [None]:
#fraction of test sentences the linear SVM labels correctly
print("Accuracy ",accuracy_score(test_Y, y, normalize = True))

Accuracy  0.7379912663755459
