# Sentiment Analysis using pre-trained GloVe Embeddings

### GloVe embeddings are vector representations of words, trained on global word-word co-occurrence statistics from a corpus, i.e. counts of how frequently each word appears in the context of another word.

### GloVe embeddings are often preferred over word2vec on larger datasets because they are fit to corpus-wide co-occurrence counts, which in effect is a dimensionality reduction of the word-word co-occurrence matrix.

In [65]:
import numpy as np 
import pandas as pd 
import nltk
from keras.preprocessing import sequence,text
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense,Embedding,LSTM
from keras.utils import to_categorical
from keras.losses import categorical_crossentropy
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize

### Loading the dataset

In [66]:
# Read the tab-separated training file into a DataFrame.
df = pd.read_csv("train.tsv", sep="\t")

In [67]:
# Show the (rows, columns) shape of the loaded frame.
print(df.shape)

(156060, 4)


In [68]:
from nltk.tokenize import word_tokenize
import re

### Cleaning the text so that the model is easier to train on it

### NLTK's Porter stemmer reduces each word to its stem by stripping tense and other inflectional endings (e.g. "series" becomes "seri")

In [69]:
from nltk.stem.porter import PorterStemmer

# Distinct sentence ids; not used by the cleaning below.
c = df['SentenceId'].unique()

# One stemmer and one compiled pattern shared by every phrase, instead of
# re-creating them per row. `ps` stays at module level because the
# prediction helper further down re-uses it.
ps = PorterStemmer()
non_letters = re.compile('[^a-zA-Z]')

# Clean each phrase: replace every non-letter with a space, lower-case,
# split on whitespace, stem each token and re-join with single spaces.
# Iterating the column's values directly (rather than `df['Phrase'][n]`
# with a manual counter) does not rely on the frame having a default
# 0..N-1 index.
corpus = [
    ' '.join(
        ps.stem(word)
        for word in non_letters.sub(' ', phrase).lower().split()
    )
    for phrase in df['Phrase']
]
print(len(corpus))

156060


In [70]:
# Store the cleaned, stemmed text in the 'clean_review' column; `corpus`
# was built in row order, so entry k belongs to row k.
df['clean_review']=corpus
# Preview the first rows, including the new column.
df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment,clean_review
0,1,1,A series of escapades demonstrating the adage ...,1,a seri of escapad demonstr the adag that what ...
1,2,1,A series of escapades demonstrating the adage ...,2,a seri of escapad demonstr the adag that what ...
2,3,1,A series,2,a seri
3,4,1,A,2,a
4,5,1,series,2,seri


In [71]:
# Shape check after adding the 'clean_review' column.
df.shape

(156060, 5)

In [72]:
# Pull the model inputs (cleaned phrases) and the sentiment labels out of
# the frame as plain arrays.
train_text = df['clean_review'].values
t = df['Sentiment'].values

In [73]:
# One-hot encode the sentiment labels so they match the softmax output and
# the categorical cross-entropy loss used by the model.
y = to_categorical(t)
print(train_text.shape, y.shape)

(156060,) (156060, 5)


In [74]:
# Hold out 20% of the phrases for validation. The fixed seed makes the
# split reproducible, and with it everything derived from the training
# part (vocabulary, tokenizer indices, padding length).
# NOTE(review): phrases that share a SentenceId can end up on both sides of
# this split; consider a group-aware split if validation scores look
# optimistic.
x_train_text, x_val_text, y_train, y_val = train_test_split(
    train_text, y, test_size=0.2, random_state=42
)

In [75]:
# Sanity check: inputs and one-hot labels must have matching row counts in
# both the training and the validation part.
print(x_train_text.shape,y_train.shape)
print(x_val_text.shape,y_val.shape)

(124848,) (124848, 5)
(31212,) (31212, 5)


In [76]:
# Token count of every training phrase. The loop variable is deliberately
# not named `text`: that would shadow the `text` module imported from
# keras.preprocessing at the top of the notebook.
r_len = []
for phrase in x_train_text:
    word = word_tokenize(phrase)
    r_len.append(len(word))

# Longest training phrase in tokens; the padding step uses it as the fixed
# sequence length.
max_len = np.max(r_len)
# Show the tokens of the last phrase processed as a quick sanity check.
print(word)

['not', 'count', 'a', 'few', 'gross', 'out', 'comedi', 'i', 've', 'been', 'tri', 'to', 'forget']


In [77]:
from nltk import FreqDist

# Tokenise the whole training corpus as one long string and count how often
# each token occurs.
all_words = word_tokenize(' '.join(x_train_text))
dist = FreqDist(all_words)

# The number of distinct tokens is the vocabulary size handed to the
# tokenizer.
num_unique_word = len(dist)
max_features = num_unique_word

In [78]:
# Keras' Tokenizer assigns word indices starting at 1 and only keeps words
# whose index is strictly below `num_words`. Passing the vocabulary size
# itself would silently drop the least frequent word, so ask for one more.
tokenizer = Tokenizer(num_words=max_features + 1)
tokenizer.fit_on_texts(x_train_text)
print(x_train_text[0])

children s song


In [79]:
# Encode every phrase of both splits as the list of its words' integer
# indices, using the vocabulary fitted on the training split.
X_train, X_val = (
    tokenizer.texts_to_sequences(split)
    for split in (x_train_text, x_val_text)
)
print(X_train[0])

[377, 7, 827]


In [80]:
# Bring every encoded phrase to the same fixed length (max_len) so both
# splits become rectangular 2-D arrays.
X_train, X_val = (
    sequence.pad_sequences(encoded, maxlen=max_len)
    for encoded in (X_train, X_val)
)
print(X_train.shape, X_val.shape)

(124848, 48) (31212, 48)


In [81]:
file = "glove.6B.100d.txt"
embedd_index = {}
# Parse the GloVe text file into {word: vector}. The `with` block closes
# the file handle even if a line fails to parse.
with open(file, encoding="utf8") as f:
    for line in f:
        val = line.split()
        if not val:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        # First field is the word, the remaining fields are its vector.
        word = val[0]
        coff = np.asarray(val[1:], dtype='float')
        embedd_index[word] = coff

print(len(embedd_index))

400000


In [82]:
index_of_words = tokenizer.word_index
print(len(index_of_words))
embed_dims = 100
# `max_len` is intentionally NOT reassigned here: it holds the padded
# sequence length computed from the training data, and overwriting it with
# an unrelated constant would break any later padding that relies on it.

# Row i holds the GloVe vector of the word with tokenizer index i. The
# matrix has one row more than there are words because the tokenizer's
# indices start at 1; that extra row 0, and the rows of words missing from
# GloVe, stay all-zero.
embedding_matrix = np.zeros((len(index_of_words) + 1, embed_dims))

for word, i in index_of_words.items():
    temp = embedd_index.get(word)
    if temp is not None:
        embedding_matrix[i] = temp
print(type(index_of_words))
print(embedding_matrix.shape)

10609
<class 'dict'>
(10610, 100)


In [83]:
# The embedding layer needs one input slot per row of the embedding matrix.
max_features, batch_size = len(embedding_matrix), 128
max_words = max_len

In [84]:
# Embedding (initialised from the GloVe matrix, still trainable) -> LSTM
# -> 5-way softmax over the sentiment classes.
model = Sequential([
    Embedding(
        input_dim=max_features,
        output_dim=100,
        input_length=X_train.shape[1],
        weights=[embedding_matrix],
        trainable=True,
    ),
    LSTM(64, return_sequences=False),
    Dense(5, activation='softmax'),
])
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 48, 100)           1061000   
_________________________________________________________________
lstm_2 (LSTM)                (None, 64)                42240     
_________________________________________________________________
dense_2 (Dense)              (None, 5)                 325       
Total params: 1,103,565
Trainable params: 1,103,565
Non-trainable params: 0
_________________________________________________________________


In [85]:
# Train for 4 epochs, reporting loss/accuracy on the held-out split after
# each epoch.
model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=4,
    verbose=1,
    validation_data=(X_val, y_val),
)

Train on 124848 samples, validate on 31212 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x227170546d8>

In [97]:
def pred(rev):
    """Predict the sentiment class of a single raw review string.

    The text gets the same cleaning as the training phrases (non-letters
    replaced by spaces, lower-cased, Porter-stemmed), is encoded with the
    fitted ``tokenizer`` and padded to the sequence length the model was
    built with.

    Args:
        rev: Raw review text.

    Returns:
        A NumPy array with one element: the index of the class with the
        highest predicted probability.
    """
    words = re.sub('[^a-zA-Z]', ' ', rev).lower().split()
    cleaned = ' '.join(ps.stem(word) for word in words)
    # texts_to_sequences expects a list of texts. A bare string would be
    # iterated character by character, so the model would score single
    # letters instead of the review.
    encoded = tokenizer.texts_to_sequences([cleaned])
    # Pad to the length the model was trained with rather than a hard-coded
    # number, so this stays correct if the training data changes.
    encoded = sequence.pad_sequences(encoded, maxlen=X_train.shape[1])
    # argmax over the softmax output is what Sequential.predict_classes
    # returned; unlike predict_classes it also exists in newer Keras.
    return np.argmax(model.predict(encoded), axis=-1)

In [99]:
# Read one review from standard input and print the first element of the
# array returned by pred().
inp = input()
print(pred(inp)[0])

good movie
2
