# Sentiment Analysis using pre-trained Glove Embeddings

In [104]:
import numpy as np 
import pandas as pd 
import nltk
from keras.preprocessing import sequence,text
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense,Embedding,LSTM
from keras.utils import to_categorical
from keras.losses import categorical_crossentropy
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize

In [105]:
df = pd.read_csv("train.tsv", delimiter = '\t')

In [106]:
print(df.shape)

(156060, 4)


### refining the data so tht it will be easy to train

In [107]:
from nltk.tokenize import word_tokenize
import re

In [108]:
corpus = []
from nltk.stem.porter import PorterStemmer
c = df['SentenceId'].unique()
n = 0
for i in df['SentenceId'] : 
    review = re.sub('[^a-zA-Z]',' ',df['Phrase'][n]) 
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review]
    review = ' '.join(review)
    corpus.append(review)
    n = n+1
print(len(corpus))

156060


In [109]:
df['clean_review']=corpus
df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment,clean_review
0,1,1,A series of escapades demonstrating the adage ...,1,a seri of escapad demonstr the adag that what ...
1,2,1,A series of escapades demonstrating the adage ...,2,a seri of escapad demonstr the adag that what ...
2,3,1,A series,2,a seri
3,4,1,A,2,a
4,5,1,series,2,seri


In [110]:
df.shape

(156060, 5)

#### Reading the pre-trained GLove embedding file and then counting total number of words in it.

In [111]:
file =  "glove.6B.100d.txt"
f = open(file , encoding="utf8")
embedd_index = {}
for line in f:
    val = line.split()
    word = val[0]
    coff = np.asarray(val[1:],dtype = 'float')
    embedd_index[word] = coff
    
print(len(embedd_index))

400000


In [112]:
index_of_words = tokenizer.word_index
print(len(index_of_words))
embed_dims = 100
max_len = 1000
embedding_matrix = np.zeros((len(index_of_words)+1, embed_dims))

for word,i in index_of_words.items():
    temp = embedd_index.get(word)
    if temp is not None:
        embedding_matrix[i] = temp
print(type(index_of_words))
print(embedding_matrix.shape)

10607
<class 'dict'>
(10608, 100)


In [113]:
train_text=df.clean_review.values
t = df.Sentiment.values

In [114]:
y=to_categorical(t)
print(train_text.shape,target.shape,y.shape)

(156060,) (156060,) (156060, 5)


In [115]:
x_train_text,x_val_text,y_train,y_val=train_test_split(train_text,y,test_size=0.2)

In [116]:
print(x_train_text.shape,y_train.shape)
print(x_val_text.shape,y_val.shape)

(124848,) (124848, 5)
(31212,) (31212, 5)


#### counting the maximum number of words in the each review

In [117]:
r_len=[]
for text in x_train_text:
    word=word_tokenize(text)
    l=len(word)
    r_len.append(l)
    
max_len=np.max(r_len)
max_len

48

In [118]:
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(x_train_text))

In [119]:
X_train = tokenizer.texts_to_sequences(x_train_text)
X_val = tokenizer.texts_to_sequences(x_val_text)

In [120]:
X_train = sequence.pad_sequences(X_train, maxlen=max_words)
X_val = sequence.pad_sequences(X_val, maxlen=max_words)
print(X_train.shape,X_val.shape)

(124848, 48) (31212, 48)


In [121]:
max_features = embedding_matrix.shape[0]
max_words = max_len
batch_size = 128

## Buliding the model with the pre-trained embeddings and LSTM

In [122]:
model = Sequential()
model.add(Embedding(max_features, 100, input_length=X_train.shape[1],weights=[embedding_matrix],trainable=True))
model.add(LSTM(64,return_sequences=False))
model.add(Dense(5, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 48, 100)           1060800   
_________________________________________________________________
lstm_5 (LSTM)                (None, 64)                42240     
_________________________________________________________________
dense_6 (Dense)              (None, 5)                 325       
Total params: 1,103,365
Trainable params: 1,103,365
Non-trainable params: 0
_________________________________________________________________


In [123]:
model.fit(X_train, y_train, validation_data=(X_val, y_val),epochs=4, batch_size=batch_size, verbose=1)

Train on 124848 samples, validate on 31212 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x20d149cff60>