In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
data = pd.read_csv("IMDB Dataset.csv")
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
data.shape

(50000, 2)

In [4]:
data['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [5]:
data.replace({'sentiment':{'positive':1,'negative':0}}, inplace=True)

  data.replace({'sentiment':{'positive':1,'negative':0}}, inplace=True)


In [6]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [7]:
## spliting
train_data, test_data = train_test_split(data, test_size = 0.2,random_state=42)

In [8]:
train_data.shape, test_data.shape

((40000, 2), (10000, 2))

In [9]:
### data preprocessing
tokenizer = Tokenizer(num_words = 5000)
tokenizer.fit_on_texts(train_data['review'])

In [12]:
X_train = pad_sequences(tokenizer.texts_to_sequences(train_data['review']), maxlen=200)
X_test = pad_sequences(tokenizer.texts_to_sequences(test_data['review']), maxlen=200)

In [13]:
X_train

array([[1935,    1, 1200, ...,  205,  351, 3856],
       [   3, 1651,  595, ...,   89,  103,    9],
       [   0,    0,    0, ...,    2,  710,   62],
       ...,
       [   0,    0,    0, ..., 1641,    2,  603],
       [   0,    0,    0, ...,  245,  103,  125],
       [   0,    0,    0, ...,   70,   73, 2062]], dtype=int32)

In [14]:
X_train.shape, X_test.shape

((40000, 200), (10000, 200))

In [15]:
y_train = train_data['sentiment']
y_test = test_data['sentiment']

In [16]:
y_train.shape, y_test.shape

((40000,), (10000,))

In [17]:
### LSTM
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128, input_length=200))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))



In [18]:
model.summary()

In [19]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [21]:
model.fit(X_train,y_train, epochs=5, batch_size=64, validation_split=0.2)

Epoch 1/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 274ms/step - accuracy: 0.7037 - loss: 0.5400 - val_accuracy: 0.8154 - val_loss: 0.4045
Epoch 2/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m108s[0m 215ms/step - accuracy: 0.8503 - loss: 0.3571 - val_accuracy: 0.8574 - val_loss: 0.3407
Epoch 3/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 225ms/step - accuracy: 0.8720 - loss: 0.3129 - val_accuracy: 0.8453 - val_loss: 0.3666
Epoch 4/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m107s[0m 214ms/step - accuracy: 0.8857 - loss: 0.2906 - val_accuracy: 0.8656 - val_loss: 0.3404
Epoch 5/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m109s[0m 218ms/step - accuracy: 0.8979 - loss: 0.2557 - val_accuracy: 0.8716 - val_loss: 0.3198


<keras.src.callbacks.history.History at 0x1d7af468a50>

In [23]:
model.evaluate(X_test, y_test)

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 53ms/step - accuracy: 0.8742 - loss: 0.3062


[0.30594632029533386, 0.8772000074386597]

In [25]:
def predict_review(reveiw):
    ##tokenise and pad review
    sequence = tokenizer.texts_to_sequences(reveiw)
    pad_sequence = pad_sequences(sequence, maxlen=200)
    prediction = model.predict(pad_sequence)
    Sentiment = "positive" if prediction[0][0] >0.5 else "Negative"
    return Sentiment

In [43]:
review ='''Let me start out by saying: I love body horror and if you don't, or if you're squeamish, you might want to pass on this film. That said, I thought the balance between disturbing, impactful, intriguing, and disgusting was absolutely right on the money.

Every scene of this film wowed me at TIFF. The casting, atmosphere, visual and sound design, music, and, of course, the symbolism of it all.

Demi Moore and Margaret Qualley couldn't have been more perfectly cast as leads. Their devotion to the process and trust in their director was clear to me. Their vulnerability pulls this off and leads to a very compelling fever dream.

Everyone involved should be really proud of this film. To me, it's a future cult classic.

It is body/psychological horror, so don't expect jump scares. It's deeper than that. I found myself getting lost in reflection during the film (as a 47 year old woman it really spoke to me).

But it's also extremely fun to watch!

Take it seriously as a true work of art, but don't at the same time, and you'll have a blast.'''

In [44]:
predict_review(review)

[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 37ms/step


'positive'