In [38]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [39]:
df = pd.read_csv("./IMDB Dataset.csv")
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [40]:
df.tail()

Unnamed: 0,review,sentiment
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative
49999,No one expects the Star Trek movies to be high...,negative


In [41]:
df.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [42]:
df.shape

(50000, 2)

In [43]:
df.size

100000

In [44]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [45]:
df.dropna(inplace=True) # only if dataset has some null values

In [46]:
df['sentiment'] = df['sentiment'].map({'positive':1,'negative':0})

In [47]:
reviews = df['review'].astype(str).tolist()
sentiments = df['sentiment'].astype(int).tolist()

In [48]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(reviews)
sequences = tokenizer.texts_to_sequences(reviews)
padded = pad_sequences(sequences,maxlen=100)

In [49]:
x = np.array(padded)
y = np.array(sentiments)

In [50]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [51]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding,Dropout,GlobalAveragePooling1D

In [52]:
model = Sequential()

model.add(Embedding(10000,128,input_length=100))
model.add(GlobalAveragePooling1D())
model.add(Dense(16,activation='relu',name="dense_1"))
model.add(Dropout(0.2))
model.add(Dense(16,activation="relu",name="dense_2"))
model.add(Dropout(0.2))
model.add(Dense(1,activation="sigmoid",name="output"))
model.compile(optimizer="adam",loss="binary_crossentropy",metrics=['accuracy'])




In [53]:
history = model.fit(x_train,y_train,epochs=10,validation_data=(x_test,y_test))

Epoch 1/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 6ms/step - accuracy: 0.7075 - loss: 0.5324 - val_accuracy: 0.8588 - val_loss: 0.3288
Epoch 2/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step - accuracy: 0.8867 - loss: 0.2919 - val_accuracy: 0.8610 - val_loss: 0.3207
Epoch 3/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step - accuracy: 0.9038 - loss: 0.2451 - val_accuracy: 0.8505 - val_loss: 0.3564
Epoch 4/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step - accuracy: 0.9155 - loss: 0.2147 - val_accuracy: 0.8533 - val_loss: 0.3715
Epoch 5/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 5ms/step - accuracy: 0.9227 - loss: 0.1883 - val_accuracy: 0.8466 - val_loss: 0.4168
Epoch 6/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 6ms/step - accuracy: 0.9300 - loss: 0.1647 - val_accuracy: 0.8491 - val_loss: 0.4183
Epoch 7/10
[1m1

In [55]:
loss,acc = model.evaluate(x_test,y_test)
print(f"Accuracy:{acc}, Loss: {loss}")

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 859us/step - accuracy: 0.8381 - loss: 0.6556
Accuracy:0.8396999835968018, Loss: 0.6627138257026672


In [None]:
texts = [
    "This movie was absolutely wonderful and inspiring!",
    "I have never seen a worse film in my life.",
    "It was not good but okay",
    "This movie was absolutely wonderful and inspiring!",
    "I have never seen a worse film in my life.",
    "It was not good but okay",
    "The plot was engaging and the acting was top-notch.",
    "What a complete waste of time and money.",
    "I loved every minute of it - highly recommended!",
    "Nothing special, just another average movie.",
    "The script was terrible and the jokes fell flat.",
    "Absolutely stunning visuals and a heartwarming story.",
    "It had potential, but the execution was poor.",
    "A delightful surprise with great character development.",
    "Too slow and boring, I nearly fell asleep.",
    "It was decent, not the best but not the worst either."
]


In [59]:
for text in texts:
  seq = tokenizer.texts_to_sequences([text])
  pad = pad_sequences(seq,maxlen=100)
  pred = model.predict(pad)[0][0]
  print(text)
  if pred>0.5:
    print("positive")
  else:
    print("negative")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
This movie was absolutely wonderful and inspiring!
positive
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
I have never seen a worse film in my life.
negative
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
It was not good but okay
negative
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
This movie was absolutely wonderful and inspiring!
positive
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
I have never seen a worse film in my life.
negative
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
It was not good but okay
negative
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
The plot was engaging and the acting was top-notch.
positive
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
What a complete waste of time and money.
negative
[1m1/1[0m [32m━━━━