In [1]:
!pip install kaggle



In [2]:
import os
import json

from zipfile import ZipFile
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
# Load the Kaggle API credentials from the local kaggle.json file.
# The context manager closes the file handle as soon as parsing finishes;
# a bare open() passed to json.load() would leave that to the garbage collector.
with open("kaggle.json", encoding="utf-8") as credentials_file:
    kaggle_dictionary = json.load(credentials_file)

In [4]:
# Export the Kaggle credentials as environment variables.
for env_name, credential_field in (
    ("KAGGLE_USERNAME", "username"),
    ("KAGGLE_KEY", "key"),
):
    os.environ[env_name] = kaggle_dictionary[credential_field]

In [6]:
# Download the IMDB 50k movie-review archive with the Kaggle CLI.
# The CLI skips the download when a more recently modified local copy exists
# (use --force to force a fresh download).
!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

Dataset URL: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
License(s): other
imdb-dataset-of-50k-movie-reviews.zip: Skipping, found more recently modified local copy (use --force to force download)


In [7]:
# List the working directory to confirm the dataset archive is present.
!ls

imdb-dataset-of-50k-movie-reviews.zip  kaggle.json  sample_data


In [8]:
# Unpack every member of the downloaded archive into the current directory.
with ZipFile("imdb-dataset-of-50k-movie-reviews.zip") as dataset_archive:
    dataset_archive.extractall()

In [9]:
# List the working directory again to confirm the CSV was extracted.
!ls

'IMDB Dataset.csv'			 kaggle.json
 imdb-dataset-of-50k-movie-reviews.zip	 sample_data


In [10]:
# Read the reviews from the CSV that was extracted from the archive.
# extractall() wrote it into the current working directory, so a relative path
# works wherever the notebook runs; the "/content/" prefix only exists on Colab.
df = pd.read_csv("IMDB Dataset.csv")

In [11]:
# Number of rows and columns in the loaded frame.
row_count, column_count = df.shape
(row_count, column_count)

(50000, 2)

In [19]:
# Preview the first rows of the dataset.
first_rows = df.head()
print(first_rows)

                                              review  sentiment
0  One of the other reviewers has mentioned that ...          1
1  A wonderful little production. <br /><br />The...          1
2  I thought this was a wonderful way to spend ti...          1
3  Basically there's a family where a little boy ...          0
4  Petter Mattei's "Love in the Time of Money" is...          1


In [20]:
# Preview the last rows of the dataset.
last_rows = df.tail()
print(last_rows)

                                                  review  sentiment
49995  I thought this movie did a down right good job...          1
49996  Bad plot, bad dialogue, bad acting, idiotic di...          0
49997  I am a Catholic taught in parochial elementary...          0
49998  I'm going to have to disagree with the previou...          0
49999  No one expects the Star Trek movies to be high...          0


In [21]:
# Count how many reviews carry each sentiment label.
sentiment_counts = df['sentiment'].value_counts()
print(sentiment_counts)

sentiment
1    25000
0    25000
Name: count, dtype: int64


In [18]:
# Encode the text labels as integers: positive -> 1, negative -> 0.
#
# Series.map() turns every value that is missing from the dictionary into NaN,
# so re-running this cell on an already-encoded column would wipe out all
# labels. Only map while the column is still non-numeric, which makes the cell
# safe to execute more than once and in any order relative to the preview cells.
if not pd.api.types.is_numeric_dtype(df['sentiment']):
    encoded_sentiment = df['sentiment'].map({'positive': 1, 'negative': 0})
    # Fail loudly instead of silently training on NaN labels.
    if encoded_sentiment.isna().any():
        raise ValueError(
            "sentiment column contains labels other than 'positive'/'negative'"
        )
    df['sentiment'] = encoded_sentiment

In [22]:
# Hold out 20% of the rows as the test set; the fixed random_state keeps the
# split the same on every run.
train_data, test_data = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [23]:
# Report the (rows, columns) shape of each split.
train_shape, test_shape = train_data.shape, test_data.shape
print(train_shape, test_shape)

(40000, 2) (10000, 2)


# Tokenizing the Text

In [26]:
# Fit the tokenizer on the training reviews only, so the test set does not
# influence the vocabulary.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_data['review'])

# Convert each review into a sequence of word indices.
train_sequences = tokenizer.texts_to_sequences(train_data['review'])
test_sequences = tokenizer.texts_to_sequences(test_data['review'])

# Bring every sequence to a fixed length of 200 indices.
X_train = pad_sequences(train_sequences, maxlen=200)
X_test = pad_sequences(test_sequences, maxlen=200)

In [30]:
# Show the padded index matrices, followed by their shapes.
for item in (X_train, X_test, X_train.shape, X_test.shape):
    print(item)

[[1935    1 1200 ...  205  351 3856]
 [   3 1651  595 ...   89  103    9]
 [   0    0    0 ...    2  710   62]
 ...
 [   0    0    0 ... 1641    2  603]
 [   0    0    0 ...  245  103  125]
 [   0    0    0 ...   70   73 2062]]
[[   0    0    0 ...  995  719  155]
 [  12  162   59 ...  380    7    7]
 [   0    0    0 ...   50 1088   96]
 ...
 [   0    0    0 ...  125  200 3241]
 [   0    0    0 ... 1066    1 2305]
 [   0    0    0 ...    1  332   27]]
(40000, 200)
(10000, 200)


In [31]:
# Target labels for the training and test splits.
Y_train, Y_test = train_data['sentiment'], test_data['sentiment']

# Building the LSTM

In [40]:
# Embedding -> LSTM -> one sigmoid unit for binary sentiment classification.
# input_dim is 5000, the same value passed to the tokenizer's num_words.
model = Sequential([
    Embedding(input_dim=5000, output_dim=128),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])

In [41]:
# Build the model for batches of 200-index sequences, then print its summary.
model.build((None, 200))
model.summary()

In [42]:
# Training configuration: Adam optimizer, binary cross-entropy loss, and
# accuracy reported as the metric.
compile_options = {
    "optimizer": 'adam',
    "loss": 'binary_crossentropy',
    "metrics": ["accuracy"],
}
model.compile(**compile_options)

In [44]:
# Train for 5 epochs in batches of 64, with validation_split=0.2 reserving a
# fifth of the training data for validation.
history = model.fit(
    X_train,
    Y_train,
    epochs=5,
    batch_size=64,
    validation_split=0.2,
)

Epoch 1/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m196s[0m 392ms/step - accuracy: 0.7798 - loss: 0.4790 - val_accuracy: 0.8393 - val_loss: 0.3746
Epoch 2/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m199s[0m 387ms/step - accuracy: 0.8404 - loss: 0.3758 - val_accuracy: 0.8654 - val_loss: 0.3216
Epoch 3/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 388ms/step - accuracy: 0.8834 - loss: 0.2943 - val_accuracy: 0.8671 - val_loss: 0.3111
Epoch 4/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m203s[0m 391ms/step - accuracy: 0.8973 - loss: 0.2601 - val_accuracy: 0.8771 - val_loss: 0.3091
Epoch 5/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m225s[0m 437ms/step - accuracy: 0.9024 - loss: 0.2556 - val_accuracy: 0.8769 - val_loss: 0.3173


In [46]:
# Score the trained model on the held-out test set and report both numbers.
loss, acc = model.evaluate(X_test, Y_test)
for label, value in (("loss: ", loss), ("acc: ", acc)):
    print(label, value)

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 112ms/step - accuracy: 0.8799 - loss: 0.3070
loss:  0.3073607385158539
acc:  0.8798999786376953


In [47]:
def predict_sentiment(review, maxlen=200, threshold=0.5):
    """Classify a single review as "positive" or "negative".

    The review is converted to word indices with the notebook-level
    ``tokenizer``, padded with ``pad_sequences`` and scored by the
    notebook-level ``model``.

    Args:
        review: Raw review text.
        maxlen: Length passed to ``pad_sequences``. It should match the length
            used for the training data (200 in this notebook).
        threshold: Score above which the review is labelled "positive".

    Returns:
        "positive" if the model's score is strictly greater than
        ``threshold``, otherwise "negative".
    """
    sequence = tokenizer.texts_to_sequences([review])
    padded_sequence = pad_sequences(sequence, maxlen=maxlen)
    prediction = model.predict(padded_sequence)
    # A single review was passed in, so its score is read from the first row
    # and first column of the prediction.
    score = prediction[0][0]
    return "positive" if score > threshold else "negative"

In [48]:
# Try the classifier on a short review.
new_review = "The movies lead actress is really beautiful"
sentiment = predict_sentiment(review=new_review)
print(sentiment)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
positive


In [51]:
# Try the classifier on a second short review.
new_review = "The movie is bad "
sentiment = predict_sentiment(review=new_review)
print(sentiment)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 126ms/step
negative
