In [2]:
import numpy as np
import pandas as pd
import re
from string import digits

from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split

from tensorflow import keras
from tensorflow.keras import layers
from keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import LSTM, Dense, Embedding, Flatten, Dropout, Input, Bidirectional
from tensorflow.keras import Model
from tensorflow.keras.utils import to_categorical

In [3]:
maxlen = 50 

In [4]:
def read_data(file_name="IMDB Dataset.csv"):
    data_path = f"{file_name}"
    df = pd.read_csv(data_path)
    return df

In [5]:
df = read_data()
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [6]:
def preprocess(df):
    # convert source and target text to Lowercase
    df.review = df.review.apply(lambda x: x.lower())
    
    # creating a space between a word and the punctuation following it
    df.review = df.review.apply(lambda x: re.sub(r"([?.!,¿])", r" \1 ", x))
    df.review = df.review.apply(lambda x: re.sub(r'[" "]+', " ", x))
    
    # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
    df.review = df.review.apply(lambda x: re.sub(r"[^a-zA-Z?.!,¿]+", " ", x))
    
    # Remove digits from source and target sentences
    num_digits = str.maketrans('', '', digits)
    df.review = df.review.apply(lambda x: x.translate(num_digits))
    
    # Remove extra spaces
    df.review = df.review.apply(lambda x: x.strip())

    df.review = df.review.apply(lambda x: re.sub(" +", " ", x))

    return df

In [7]:
df = preprocess(df)
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production . br br the film...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there s a family where a little boy ...,negative
4,petter mattei s love in the time of money is a...,positive


In [8]:
#labeling the sentient data
lb=LabelBinarizer()
#transformed sentiment data
df['sentiment']=lb.fit_transform(df['sentiment'])
print(df.shape)

(50000, 2)


In [9]:
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,1
1,a wonderful little production . br br the film...,1
2,i thought this was a wonderful way to spend ti...,1
3,basically there s a family where a little boy ...,0
4,petter mattei s love in the time of money is a...,1


In [10]:
def tokenization(data, maxlength = 100):
    token = Tokenizer(lower=True, oov_token='oov')
    token.fit_on_texts(data)
    
    data_seq = token.texts_to_sequences(data)
    data_pad = pad_sequences(data_seq, maxlen=maxlength, padding='post')
    
    return token, data_pad

In [11]:
token, X = tokenization(df['review'], maxlength=maxlen)

In [12]:
print(X.shape, df['review'].shape)

(50000, 50) (50000,)


In [13]:
vocab_size = len(token.word_index) 
vocab_size

99428

In [14]:
reverse_word_index = {v: k for k, v in token.word_index.items()}

# Glove

In [15]:
# creating glove vectors
def get_glove_vector():
    glove_vectors = {}

    with open("glove/glove.6B.200d.txt", "r", encoding="UTF-8") as glove:
        for line in glove:
            values = line.split()
            word = values[0]
            vectors = np.asarray(values[1:])
            glove_vectors[word] = vectors
    return glove_vectors

In [16]:
glove_vectors = get_glove_vector()
total_words = len(glove_vectors.keys()) 
total_words

400000

In [23]:
emb_dim = 200


# create word vector matrix with glove vectors
def create_word_vector_matrix(token, glove_vectors, vocab_size, emb_dim):
    word_vector_matrix = np.zeros((vocab_size+1, emb_dim))
    
    count = 0
    for word, index in token.word_index.items():
        vector = glove_vectors.get(word)
        if vector is not None:
            word_vector_matrix[index] = vector
        else:
            count += 1
    print(f"Vector not found for {count} words")
    return word_vector_matrix

In [18]:
emb_matrix = create_word_vector_matrix(token, glove_vectors, vocab_size, emb_dim)

Vector not found for 25116 words


In [19]:
emb_matrix.shape

(99429, 200)

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, df['sentiment'].to_numpy(), test_size=0.33, random_state=42)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(33500, 50) (33500,) (16500, 50) (16500,)


In [24]:
model = Sequential()
model.add(Embedding(input_dim=vocab_size+1, 
                    output_dim=emb_dim, 
                    input_length=maxlen,
                    weights=[emb_matrix],
                    trainable=True))
# model.add(Dropout(0.2))
model.add(LSTM(128))
model.add(Dense(units=256, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(units=1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 50, 200)           19885800  
_________________________________________________________________
lstm (LSTM)                  (None, 128)               168448    
_________________________________________________________________
dense (Dense)                (None, 256)               33024     
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 257       
Total params: 20,087,529
Trainable params: 20,087,529
Non-trainable params: 0
_________________________________________________________________


In [25]:
model.fit(X_train, y_train, batch_size=64, epochs=1, validation_split=0.2)

KeyboardInterrupt: 

In [None]:
ans = model.predict(X_val)
print(ans.shape)
ans