In [1]:
# Pin every package so a fresh kernel rebuilds the same environment.
# pyvi 0.1.1 is the version the recorded install output resolved to.
!pip install pyvi==0.1.1
!pip install tensorflow==1.14.0

Collecting pyvi
[?25l  Downloading https://files.pythonhosted.org/packages/2c/27/27ffee2663f42430cf3434da963f04224fec157b90799fe9e92a3564c1a6/pyvi-0.1.1-py2.py3-none-any.whl (8.5MB)
[K     |████████████████████████████████| 8.5MB 906kB/s eta 0:00:01
[?25hCollecting sklearn-crfsuite (from pyvi)
  Downloading https://files.pythonhosted.org/packages/25/74/5b7befa513482e6dee1f3dd68171a6c9dfc14c0eaa00f885ffeba54fe9b0/sklearn_crfsuite-0.3.6-py2.py3-none-any.whl
Collecting python-crfsuite>=0.8.3 (from sklearn-crfsuite->pyvi)
[?25l  Downloading https://files.pythonhosted.org/packages/5b/b9/b6f48d74e10136ccfafbadcae751f3e81d143b40847d0f20728026783834/python-crfsuite-0.9.8.tar.gz (440kB)
[K     |████████████████████████████████| 440kB 40.4MB/s eta 0:00:01
Building wheels for collected packages: python-crfsuite
  Building wheel for python-crfsuite (setup.py) ... [?25ldone
[?25h  Stored in directory: /root/.cache/pip/wheels/40/fa/ee/3afb15958ad26f3aef88d61c316b4d1af8a97660aa24e6e6d7
Success

In [None]:
import os
#Ignoring the warnings
import warnings
warnings.filterwarnings('ignore')

#Importing the required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re, string, unicodedata
import nltk
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers.pooling import GlobalMaxPooling1D
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import load_model
from keras.layers import *
from keras import backend
from sklearn.metrics import f1_score, confusion_matrix
import tensorflow as tf

from pyvi import ViTokenizer
from pyvi import ViUtils

### Importing the dataset
***

The dataset `full_train.csv` is read and loaded as a pandas DataFrame.  
Let's have a look at the data.

# Importing the dataset


In [None]:
# Read the labelled training comments and preview the first rows.
df2 = pd.read_csv(
    '/kaggle/input/int3405-sentiment-analysis-problem/full_train.csv'
)
df2.head()

In [None]:
# Keep only the two columns the model needs, under the names used downstream.
# Incomplete rows are dropped BEFORE the label is binarised: mapping first
# would turn a missing Rating into 0.0, so dropna() could never remove it.
data_train2 = (
    df2[['Comment', 'Rating']]
    .rename(columns={'Comment': 'input', 'Rating': 'label'})
    .dropna()
    .reset_index(drop=True)
    # A Rating equal to 1 becomes 1.0, anything else becomes 0.0.
    .assign(label=lambda frame: (frame['label'] == 1).astype(float))
)
X_train = list(data_train2['input'].values)
y_train = list(data_train2['label'].values)

### Text Preprocessing
***
Preprocessing the text to give the model cleaner input.  
The steps are: converting to lower-case and stripping punctuation, segmenting Vietnamese words with pyvi, and adding an accent-stripped copy of every comment as an extra training sample.

In [None]:
#Function for Text Preprocessing
def clean_text(X,y):
    """Normalise raw comments and emit an accent-free copy of each one.

    Every comment is split into words with Keras' ``text_to_word_sequence``,
    re-joined with single spaces and word-segmented with ``ViTokenizer``.
    Two samples are produced per comment, in this order: the segmented text
    with accents, then the segmented text after ``ViUtils.remove_accents``.
    The comment's label is repeated for both samples.

    Args:
        X: iterable of raw comments. Entries are coerced with ``str()`` so a
            non-string value is handled the same way ``clean_text_test``
            handles it instead of crashing.
        y: labels, indexable in parallel with ``X``.

    Returns:
        A ``(processed, labels)`` tuple of two lists, each with two entries
        per element of ``X``.
    """
    processed = []
    labels = []
    for idx, raw in enumerate(X):
        words = tf.keras.preprocessing.text.text_to_word_sequence(str(raw))
        text = " ".join(words)
        # remove_accents hands back bytes, hence the explicit decode.
        text_no_accent = ViUtils.remove_accents(text).decode("utf-8")
        processed.append(ViTokenizer.tokenize(text))
        processed.append(ViTokenizer.tokenize(text_no_accent))
        # One label per emitted sample keeps inputs and labels aligned.
        labels.extend((y[idx], y[idx]))
    return processed, labels

Preprocessing the training set (the test set is preprocessed later, in the Testing section).

In [None]:
# Labels are read from data_train2 rather than from y_train: this cell rebinds
# y_train to the doubled label list, so passing y_train back in on a re-run
# would pair the comments with the wrong labels.
X_train_final, y_train = clean_text(X_train, list(data_train2['label'].values))
assert len(X_train_final) == len(y_train)

### Attention Layer
***

The basic concept of attention is that not all words contribute equally to the meaning of a sentence. Hence, their contribution must be weighted.  
Attention extracts the words that are important to the meaning of the sentence and aggregates the representations of those informative words to form a sentence vector.

In [None]:
# Attention Layer
class AttentionWithContext(Layer):
    """Attention pooling over axis 1 of a 3-D input with a learned context vector.

    For an input ``x`` the layer computes ``uit = tanh(x . W + b)``, scores
    every step with ``ait = uit . u``, normalises ``exp(ait)`` over axis 1
    and returns the sum of ``x`` weighted by those scores, so the output has
    shape ``(input_shape[0], input_shape[-1])``.

    Args:
        W_regularizer, u_regularizer, b_regularizer: optional regularizers
            for the projection matrix, context vector and bias.
        W_constraint, u_constraint, b_constraint: optional constraints for
            the same three weights.
        bias: whether to add the bias ``b`` before the tanh.
        **kwargs: forwarded to the base ``Layer`` constructor.
    """

    def __init__(self, W_regularizer=None, u_regularizer=None, b_regularizer=None,
                 W_constraint=None, u_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        # Base constructor first: it initialises the layer's bookkeeping and
        # may reset supports_masking, so the flag has to be set afterwards.
        super(AttentionWithContext, self).__init__(**kwargs)

        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.u_regularizer = regularizers.get(u_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.u_constraint = constraints.get(u_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias

    def build(self, input_shape):
        """Create W (dim x dim), the optional bias b (dim,) and u (dim,)."""
        assert len(input_shape) == 3
        feature_dim = input_shape[-1]

        # `shape` is passed by keyword: add_weight's first positional
        # parameter is `name`, so a positional shape only works through a
        # legacy-signature shim.
        self.W = self.add_weight(shape=(feature_dim, feature_dim),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        if self.bias:
            self.b = self.add_weight(shape=(feature_dim,),
                                     initializer='zeros',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)

        self.u = self.add_weight(shape=(feature_dim,),
                                 initializer=self.init,
                                 name='{}_u'.format(self.name),
                                 regularizer=self.u_regularizer,
                                 constraint=self.u_constraint)

        super(AttentionWithContext, self).build(input_shape)

    def compute_mask(self, input, input_mask=None):
        """Return None so no mask is passed on to the following layers."""
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of ``x`` over axis 1."""
        uit = dot_product(x, self.W)

        if self.bias:
            uit += self.b

        uit = K.tanh(uit)
        ait = dot_product(uit, self.u)

        a = K.exp(ait)

        # Apply the mask after the exp; the scores are re-normalised below.
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano.
            a *= K.cast(mask, K.floatx())

        # epsilon keeps the division finite when every score is masked out.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        """Axis 1 is summed away: (input_shape[0], input_shape[-1])."""
        return input_shape[0], input_shape[-1]

    def get_config(self):
        """Serialise the constructor arguments so a saved model restores them."""
        config = {
            'W_regularizer': regularizers.serialize(self.W_regularizer),
            'u_regularizer': regularizers.serialize(self.u_regularizer),
            'b_regularizer': regularizers.serialize(self.b_regularizer),
            'W_constraint': constraints.serialize(self.W_constraint),
            'u_constraint': constraints.serialize(self.u_constraint),
            'b_constraint': constraints.serialize(self.b_constraint),
            'bias': self.bias,
        }
        base_config = super(AttentionWithContext, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

def dot_product(x, kernel):
    """Backend-aware wrapper around ``K.dot``.

    On the TensorFlow backend the kernel is passed through ``K.expand_dims``
    before the dot product and the last axis of the result is squeezed away
    again; on any other backend ``K.dot(x, kernel)`` is returned directly.

    Args:
        x: left operand of the dot product.
        kernel: right operand of the dot product.

    Returns:
        The backend tensor produced by the dot product.
    """
    if K.backend() != 'tensorflow':
        return K.dot(x, kernel)
    expanded_kernel = K.expand_dims(kernel)
    product = K.dot(x, expanded_kernel)
    return K.squeeze(product, axis=-1)

Some Useful Variables  


In [None]:
#Tokenization and Padding
vocab_size = 60000
maxlen = 300
encode_dim = 20
batch_size = 32
# num_words caps the ids produced by texts_to_sequences below vocab_size,
# which is the Embedding layer's input_dim; without the cap a larger fitted
# vocabulary would yield ids outside the embedding table.
# The filters are Keras' defaults minus '_': ViTokenizer joins the syllables
# of one word with underscores, and the default filters would split them apart
# again.
tokenizer = Tokenizer(num_words = vocab_size,
                      filters = '!"#$%&()*+,-./:;<=>?@[\\]^`{|}~\t\n')
tokenizer.fit_on_texts(X_train_final)
tokenized_word_list = tokenizer.texts_to_sequences(X_train_final)
# Pad (or truncate) every sequence to maxlen ids, padding at the end.
X_train_padded = pad_sequences(tokenized_word_list, maxlen = maxlen, padding='post')

In [None]:
#ModelCheckpoint
# Write the model to disk only when the monitored val_acc reaches a new maximum.
checkpoint = ModelCheckpoint(
    'sentiment_classifier.h5',
    monitor = 'val_acc',
    mode = 'max',
    verbose = 1,
    save_best_only = True,
)

### Building the Model
***
The model consists of a bidirectional CuDNNLSTM with an attention layer on top of it, followed by two dense layers with ReLU activations and a final dense layer with a sigmoid activation that outputs the sentiment class.  
The optimizer is Adam.

In [None]:
# Building the model
from keras.optimizers import Adam

model = Sequential()
embed = Embedding(input_dim = vocab_size, output_dim = 32, input_length = X_train_padded.shape[1])
model.add(embed)
# Embedding no longer accepts a `dropout` argument (Keras 2 drops it with
# only a warning, which this notebook silences), so the intended 0.4 dropout
# is applied with the replacement Keras recommends.
model.add(SpatialDropout1D(0.4))
model.add(Bidirectional(CuDNNLSTM(256, return_sequences = True)))
model.add(Dropout(0.5))
# Collapse the per-step LSTM outputs into a single vector per comment.
model.add(AttentionWithContext())
model.add(Dropout(0.5))
model.add(Dense(512))
model.add(ReLU())
model.add(Dense(256))
model.add(ReLU())
model.add(Dense(1, activation = 'sigmoid'))
optim = Adam(lr=1e-4)
# 'acc' (not 'accuracy') makes the logged validation metric 'val_acc' on every
# Keras 2.x release, which is the key the ModelCheckpoint callbacks monitor.
model.compile(loss = 'binary_crossentropy',
              optimizer = optim,
              metrics = ['acc'])
model.summary()

### Training
***
Splitting the Training set into Training set and Validation set

In [None]:
from sklearn.model_selection import GroupShuffleSplit, train_test_split

# clean_text emits two consecutive rows per comment (with and without
# accents). A plain random split can put one copy in training and the other in
# validation, which inflates val_acc, so the split is done per comment: rows
# 2*i and 2*i + 1 share group id i and always land on the same side.
labels_all = np.asarray(y_train, dtype = 'float32')
pair_ids = np.arange(len(X_train_padded)) // 2
# Fixed seed so Restart & Run All reproduces the same split.
splitter = GroupShuffleSplit(n_splits = 1, test_size = 0.1, random_state = 42)
train_idx, val_idx = next(splitter.split(X_train_padded, labels_all, groups = pair_ids))
X_train_final2, X_val = X_train_padded[train_idx], X_train_padded[val_idx]
y_train_final2, y_val = labels_all[train_idx], labels_all[val_idx]

In [None]:
#class weight
# Each class is weighted by the inverse of a hard-coded share
# (presumably the class frequencies in the training labels - TODO confirm).
class_weight = {
    label: 1 / share
    for label, share in ((0, 0.27), (1, 0.73))
}
weight_for_0 = class_weight[0]
weight_for_1 = class_weight[1]

In [None]:
# Callback handed to model.fit: saves the model whenever val_acc hits a new maximum.
checkpoint = ModelCheckpoint(
    'sentiment_classifier.h5',
    monitor = 'val_acc',
    mode = 'max',
    verbose = 1,
    save_best_only = True,
)

In [None]:
#Fitting the model
batch_size= 256

# Labels are converted to arrays and validation_data is passed as the
# documented (x_val, y_val) tuple; plain Python lists are only accepted by
# some Keras releases.
model.fit(np.asarray(X_train_final2), np.asarray(y_train_final2),
          epochs = 35, batch_size = batch_size, verbose = 1,
          validation_data = (np.asarray(X_val), np.asarray(y_val)),
          callbacks = [checkpoint], class_weight = class_weight)

### Testing
***
Converting the test data into sequences of integers and padding them.  
Loading the best saved model and predicting the ratings for the submission file.

In [None]:
# Read the unlabelled test comments; RevId is kept for the submission file.
df = pd.read_csv(
    '/kaggle/input/int3405-sentiment-analysis-problem/test.csv'
)
data_test = df[['Comment', 'RevId']].rename(
    columns = {'Comment': 'input', 'RevId': 'id'}
)
X_test = data_test['input'].values

def clean_text_test(X):
    """Normalise test comments the same way the accented training copy is built.

    Every entry is coerced to ``str``, split into words with Keras'
    ``text_to_word_sequence``, re-joined with single spaces and
    word-segmented with ``ViTokenizer``. No accent-stripped copy is produced:
    the function returns exactly one string per input entry.

    Args:
        X: iterable of raw comments (any type; ``str()`` is applied).

    Returns:
        List of segmented comments, one per element of ``X``.
    """
    processed = []
    for raw in X:
        words = tf.keras.preprocessing.text.text_to_word_sequence(str(raw))
        processed.append(ViTokenizer.tokenize(" ".join(words)))
    return processed

X_test_final = clean_text_test(X_test)

# Encode with the tokenizer fitted on the training text and pad to the
# same length as the training sequences.
tokenized_word_list = tokenizer.texts_to_sequences(X_test_final)
X_test_padded = pad_sequences(
    tokenized_word_list,
    maxlen = maxlen,
    padding = 'post',
)
# Predictions from the in-memory model as it stands after the last epoch.
y_pred = model.predict(X_test_padded)

In [None]:
from keras.models import load_model

# The checkpoint contains the custom attention layer, so it is handed to
# load_model explicitly; y_pred is then recomputed with the reloaded model.
modelload = load_model(
    'sentiment_classifier.h5',
    custom_objects = {
        "AttentionWithContext" : AttentionWithContext,
        "backend" : backend,
    },
)
y_pred = modelload.predict(X_test_padded)

In [None]:
# Build the submission before inspecting it: the original cell order read
# my_submission one cell before it was created, which fails on a fresh kernel.
# reshape(-1) flattens to however many rows the test set has instead of
# assuming exactly 5103.
my_submission = pd.DataFrame({
    'RevId': np.asarray(df["RevId"]).reshape(-1),
    'Rating': np.asarray(y_pred).reshape(-1),
})
my_submission.to_csv('submitTrain.csv', index=False)
# Quick sanity check: the sum of the predicted scores.
my_submission['Rating'].sum()