Import Libraries

In [None]:
import numpy as np 
import pandas as pd 
import nltk
import os
import gc
import tqdm
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import * 
from keras.utils import to_categorical
from keras.losses import categorical_crossentropy
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from keras.preprocessing.sequence import pad_sequences
import re
import warnings
warnings.filterwarnings("ignore")

Extract Zip Files

In [None]:
from zipfile import ZipFile 
# Unpack both competition archives into the current working directory.
# The handle is called `archive` (not `zip`) so the built-in zip() is not
# rebound to a closed ZipFile object for the rest of the kernel session.
for archive_path in (
    '../input/movie-review-sentiment-analysis-kernels-only/train.tsv.zip',
    '../input/movie-review-sentiment-analysis-kernels-only/test.tsv.zip',
):
    with ZipFile(archive_path, 'r') as archive:
        archive.extractall()

Import .tsv Files

In [None]:
# Read the extracted tab-separated train and test files into DataFrames.
train, test = (
    pd.read_csv(tsv_path, sep="\t")
    for tsv_path in ('./train.tsv', './test.tsv')
)

Analyse the Data

In [None]:
# Preview the first rows only instead of rendering the whole frame.
train.head()

In [None]:
# Number of phrases per sentiment class.
train['Sentiment'].value_counts()

In [None]:
# Column labels of the training frame.
train.columns

In [None]:
# Preview the first rows only instead of rendering the whole frame.
test.head()

In [None]:
# Distinct sentiment labels present in the training data.
train['Sentiment'].unique()

In [None]:
# Distinct sentence identifiers present in the training data.
train['SentenceId'].unique()

Balancing the Unbalanced Dataset

In [None]:
# Down-sample every sentiment class to the size of the rarest class so that
# training is not dominated by the most frequent label.
BALANCE_SEED = 50  # fixed seed: Restart & Run All must pick the same rows

# Size of the smallest class, computed from the data rather than hard-coded.
sample_size = train['Sentiment'].value_counts().min()

# One equally sized random sample per class (works for any set of labels).
balanced_parts = [
    class_rows.sample(n=sample_size, random_state=BALANCE_SEED)
    for _, class_rows in train.groupby('Sentiment')
]

# Concatenate and shuffle so the classes are interleaved.
train = pd.concat(balanced_parts).sample(frac=1, random_state=BALANCE_SEED)

In [None]:
# Preview the balanced training frame (first rows only).
train.head()

Sentence Cleaning

In [None]:
from nltk.tokenize import word_tokenize
from nltk import FreqDist
from nltk.stem import SnowballStemmer,WordNetLemmatizer
from string import punctuation
import re
from tqdm import tqdm

# Stemmer instance is kept available for experimentation; cleaner() below
# uses the lemmatizer only.
stemmer=SnowballStemmer('english')
lemma=WordNetLemmatizer()

def cleaner(phrase):
    """Normalise raw review phrases.

    Each item is cast to ``str``, every character that is not an ASCII
    letter is replaced by a space, the text is lower-cased, tokenised with
    NLTK's ``word_tokenize``, each token is lemmatised with the module-level
    ``lemma`` and the tokens are re-joined with single spaces.

    Parameters
    ----------
    phrase : iterable
        Raw phrases (list, NumPy array, pandas Series, ...). Items are
        visited in iteration order, so a Series with a shuffled index is
        processed positionally rather than by label.

    Returns
    -------
    list of str
        One cleaned string per input item, in the same order.
    """
    cleaned=[]
    for raw in tqdm(phrase):
        letters_only=re.sub('[^a-zA-Z]',' ',str(raw))
        tokens=word_tokenize(letters_only.lower())
        cleaned.append(' '.join(lemma.lemmatize(tok) for tok in tokens))
    return cleaned


In [None]:
# Store the cleaned text next to the raw phrase in both frames.
for frame in (train, test):
    frame['cleaned_phrase'] = cleaner(frame.Phrase.values)


In [None]:

# The raw text is no longer needed in `train`; from here on `test` holds
# only the array of cleaned phrases, not the DataFrame.
train = train.drop(columns='Phrase')
test = test['cleaned_phrase'].values


In [None]:
# Preview the cleaned training frame (first rows only).
train.head()

In [None]:
from keras.utils import to_categorical
# Inputs are the cleaned phrases; targets are the sentiment ids one-hot
# encoded with to_categorical.
X = train['cleaned_phrase'].values
y = to_categorical(train['Sentiment'].values)

Splitting into Training and Validation Sets

In [None]:
# Hold out 20% of the data for validation, stratified on the labels, with a
# fixed seed so the split is repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    random_state=50,
)

In [None]:
# Vocabulary size: the number of distinct tokens across all training phrases.
train_vocab = set(word_tokenize(' '.join(X_train)))
max_features = len(train_vocab)
max_features

In [None]:
# The maximum sentence length (in tokens) over the training phrases; it is
# used as the padding length for every sequence.
len_of_words = [len(word_tokenize(text)) for text in tqdm(X_train)]

# list.append() returns None, so the maximum has to be taken over the
# finished list rather than over the result of append().
max_phrase_len = max(len_of_words)

In [None]:
# Fit the tokenizer on the training phrases only, then turn both splits into
# integer sequences padded/truncated to max_phrase_len.
tokenizer = Tokenizer(num_words=max_features, oov_token='<unw>')
tokenizer.fit_on_texts(list(X_train))

X_train = sequence.pad_sequences(
    tokenizer.texts_to_sequences(X_train), maxlen=max_phrase_len
)
X_val = sequence.pad_sequences(
    tokenizer.texts_to_sequences(X_val), maxlen=max_phrase_len
)


In [None]:
# Shape of the padded training sequences.
X_train.shape

In [None]:
# Shape of the one-hot encoded training labels.
y_train.shape

GloVe Word Embeddings

In [None]:

def get_coefs(word, *arr):
    """Return ``(word, vector)`` where ``vector`` is ``arr`` as a float32 array."""
    return word, np.asarray(arr, dtype='float32')


def get_embed_mat(EMBEDDING_FILE, max_features, embed_dim, word_index=None):
    """Build an embedding matrix for a vocabulary from a GloVe-style text file.

    Every line of the file is expected to be ``word v1 v2 ... v<embed_dim>``
    separated by single spaces. Lines that do not have exactly ``embed_dim``
    values (for example blank lines) are skipped.

    Parameters
    ----------
    EMBEDDING_FILE : str
        Path to the UTF-8 encoded embedding file.
    max_features : int
        Upper bound on the number of rows of the matrix; vocabulary entries
        whose index is not below the resulting row count are ignored.
    embed_dim : int
        Dimensionality of the word vectors.
    word_index : dict, optional
        Mapping ``word -> integer index``. When omitted, the ``word_index``
        of the notebook-level ``tokenizer`` is used.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(min(max_features, len(word_index) + 1), embed_dim)``.
        Rows of words found in the file hold their pre-trained vector; all
        other rows are drawn from a normal distribution with the mean and
        standard deviation of the loaded vectors.

    Raises
    ------
    ValueError
        If no vector of dimension ``embed_dim`` could be read from the file.
    """
    if word_index is None:
        # Fall back to the tokenizer fitted earlier in the notebook.
        word_index = tokenizer.word_index

    # word vectors -- read inside a context manager so the handle is closed
    embeddings_index = {}
    with open(EMBEDDING_FILE, encoding='utf8') as emb_file:
        for line in emb_file:
            fields = line.rstrip().split(' ')
            if len(fields) != embed_dim + 1:
                continue  # blank or malformed line
            word, vector = get_coefs(*fields)
            embeddings_index[word] = vector
    print('Found %s word vectors.' % len(embeddings_index))

    if not embeddings_index:
        raise ValueError(
            'No %d-dimensional word vectors found in %s' % (embed_dim, EMBEDDING_FILE)
        )

    # embedding matrix
    num_words = min(max_features, len(word_index) + 1)
    # np.stack needs a real sequence; a dict_values view is rejected by
    # current NumPy releases.
    all_embs = np.stack(list(embeddings_index.values()))  # for random init
    embedding_matrix = np.random.normal(all_embs.mean(), all_embs.std(),
                                        (num_words, embed_dim))
    for word, i in word_index.items():
        if i >= num_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return embedding_matrix

# Pre-trained vectors: file path and the dimensionality of each vector.
EMBEDDING_FILE = '../input/glove6b100dtxt/glove.6B.100d.txt'
embed_dim = 100
EMBEDDING_DIM = embed_dim

# Embedding weights aligned with the tokenizer's vocabulary.
embedding_matrix = get_embed_mat(EMBEDDING_FILE, max_features, embed_dim)

Model Creation with Bidirectional LSTM Cells and GloVe Embeddings

In [None]:
# Size the embedding layer from the weight matrix itself so the layer and
# the pre-trained weights can never disagree on vocabulary size or dimension.
vocab_size, embedding_dim = embedding_matrix.shape

model = Sequential()

# Frozen pre-trained embeddings followed by dropout over whole feature maps.
model.add(Embedding(vocab_size, embedding_dim, weights=[embedding_matrix],
                    input_length=X_train.shape[1], trainable=False))
model.add(SpatialDropout1D(0.25))

# Two stacked bidirectional LSTMs; only the first returns the full sequence.
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(Bidirectional(LSTM(64)))

model.add(Dropout(0.5))

# One softmax output per sentiment class.
model.add(Dense(5, activation='softmax'))

# `learning_rate` replaces the deprecated `lr` keyword of the optimizer.
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

model.summary()


Training Time

In [None]:
# Train for 5 epochs in batches of 64, evaluating on the validation split
# after every epoch.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=5,
    batch_size=64,
    verbose=1,
)

Plotting the Accuracy and Loss

In [None]:
# Training vs. validation accuracy per epoch.
for curve in ('accuracy', 'val_accuracy'):
    plt.plot(history.history[curve])
plt.title('model accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()

In [None]:
# Training vs. validation loss per epoch.
for curve in ('loss', 'val_loss'):
    plt.plot(history.history[curve])
plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()

Predictions on the test data

In [None]:
# Reload the raw test file and preview its first rows only.
t = pd.read_csv('./test.tsv', sep="\t")
t.head()

In [None]:
# Apply the same cleaning and tokenisation as for the training data.
test = cleaner(t.Phrase.values)
test = tokenizer.texts_to_sequences(test)
# Pad to the sequence length the model was built with (the second dimension
# of the padded training matrix) instead of a hard-coded number.
test = sequence.pad_sequences(test, maxlen=X_train.shape[1])

In [None]:
# Sequential.predict_classes() is not available in recent TensorFlow/Keras
# releases; taking the argmax over the softmax outputs gives the same ids.
y_pred = np.argmax(model.predict(test, verbose=1), axis=-1)

In [None]:
# Read the test file again so `test` is a DataFrame once more.
test = pd.read_csv('./test.tsv', delimiter="\t")

In [None]:
# Store the predicted values in a new `Sentiment` column of the test frame.
test['Sentiment'] = y_pred

In [None]:
# Preview the test frame with its predictions (first rows only).
test.head()