In [110]:
import numpy as np
import pandas as pd
import re

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, Embedding
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\anupa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [111]:
data = pd.read_csv('IMDB Dataset.csv')

In [112]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [113]:
data = data.loc[:1000]

In [114]:
data['sentiment'].value_counts()

positive    501
negative    500
Name: sentiment, dtype: int64

In [115]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1001 entries, 0 to 1000
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     1001 non-null   object
 1   sentiment  1001 non-null   object
dtypes: object(2)
memory usage: 15.8+ KB


In [116]:
#Finding out if there is any blank reviews

blanks = []

for i,rv,snt in data.itertuples():
    if rv.isspace():
        blanks.append(i)

In [117]:
blanks

[]

In [118]:
# Replacing the Sentiment into 1 and 0

map_dict = {'positive': 1, 'negative': 0}

data['sentiment'] = data['sentiment'].map(map_dict)

In [119]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [120]:
# Preprocessing the data

corpus = []

for i in range(0, len(data)):
    review = re.sub('[^a-zA-z]', ' ', data['review'][i])
    review = review.lower()
    review = review.split()
    
    review = [word for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [121]:
vocab_size = 5000

In [122]:
# Creating onehot representation

onehot_repr = [one_hot(word, vocab_size) for word in corpus]
onehot_repr

[[525,
  4483,
  4260,
  4573,
  1109,
  2253,
  4852,
  990,
  1693,
  3949,
  3731,
  3731,
  2571,
  460,
  2839,
  1109,
  3110,
  1157,
  3040,
  2758,
  3975,
  990,
  1346,
  2126,
  3138,
  370,
  2660,
  169,
  4371,
  370,
  1548,
  4799,
  1321,
  2990,
  2765,
  2758,
  702,
  2322,
  2785,
  1346,
  3731,
  3731,
  2836,
  1109,
  2552,
  2700,
  167,
  4630,
  280,
  1801,
  4401,
  4864,
  3607,
  3944,
  514,
  2342,
  384,
  4521,
  1036,
  2850,
  1242,
  4074,
  285,
  332,
  4092,
  990,
  3269,
  514,
  1780,
  4216,
  1495,
  4996,
  3158,
  1688,
  4533,
  3211,
  4983,
  1137,
  2615,
  1510,
  512,
  2760,
  3737,
  1670,
  2017,
  1550,
  4922,
  3731,
  3731,
  2641,
  302,
  118,
  3986,
  370,
  439,
  1915,
  4637,
  2288,
  3527,
  1251,
  3127,
  4337,
  3473,
  4473,
  2471,
  1251,
  3585,
  1251,
  4191,
  1109,
  927,
  2502,
  2571,
  2253,
  2110,
  3450,
  2839,
  3689,
  568,
  302,
  1284,
  2663,
  1619,
  288,
  1109,
  2261,
  768,
  4092,
  

### Embedding

In [123]:
review_max_length = 500
embedded_docs = pad_sequences(sequences= onehot_repr, maxlen= review_max_length, dtype='int32', padding='pre')
embedded_docs

array([[   0,    0,    0, ..., 4329, 3985, 4996],
       [   0,    0,    0, ..., 4966, 2351, 3643],
       [   0,    0,    0, ..., 2126, 4234, 2668],
       ...,
       [   0,    0,    0, ..., 3761,  506, 4960],
       [   0,    0,    0, ...,  391, 1594, 4818],
       [   0,    0,    0, ..., 3198, 3032, 4978]])

### Creating the model

In [124]:
embedding_features = 32
model = Sequential()

model.add(Embedding(vocab_size, embedding_features, input_length= review_max_length))
#model.add(Dropout(0.25))
model.add(LSTM(100, activation = 'relu'))
#model.add(Dropout(0.25))

model.add(Dense(1, activation = 'sigmoid'))
model.compile(optimizer='adam', loss= 'binary_crossentropy', metrics= ['accuracy'])

### Separating the data into featues and labels

In [125]:
X = np.array(embedded_docs).astype(np.float32)
y = np.array(data['sentiment']).astype(np.float32)

In [126]:
X.shape, y.shape

((1001, 500), (1001,))

In [127]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

### Training and evaluating the model

In [128]:
model.fit(X_train, y_train, batch_size= 64, epochs= 10, validation_data= (X_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1b90dff3308>

In [129]:
# Saving the model

model.save('review_sentiment.h5')