In [5]:
import numpy as np
from keras import models, layers, optimizers
from keras.preprocessing.text import Tokenizer
# from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import pad_sequences, to_categorical
import pandas as pd
from sklearn import preprocessing
# Read the IMDB reviews CSV into a DataFrame (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer="IMDB_Dataset.csv")



In [6]:
# Randomly sample 80% of the rows for training (fixed seed for a repeatable split);
# every row whose index label was not sampled becomes the test set.
train_df = df.sample(frac=0.8, random_state=25)
held_out_mask = ~df.index.isin(train_df.index)
test_df = df.loc[held_out_mask]

for split in (train_df, test_df):
    print(split)

                                                  review sentiment
13920  Kudos to Fawcett to taking on roles that, at t...  positive
23439  If you are a fan of early Duke movies, this Lo...  positive
11851  I'd love to give this movie a 10/10, but in it...  positive
8278   The credits at the end read "ALL directed by S...  negative
31258  This young filmmaker has a talent for capturin...  positive
...                                                  ...       ...
38608  I just got back from a screening a couple of h...  positive
48009  OK, I don't want to upset anyone who enjoyed t...  negative
9184   i just watched the movie i was afraid it's gon...  positive
49448  There are few films that leave me with the fee...  positive
11381  I watched this movie with my boyfriend, an avi...  negative

[40000 rows x 2 columns]
                                                  review sentiment
20     After the success of Die Hard and it's sequels...  positive
22     What an absolutely stunning m

In [None]:
# Fit the word index on the training reviews only, so the test reviews stay unseen.
train_reviews = train_df['review'].tolist()
test_reviews = test_df['review'].tolist()
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(train_reviews)

# Turn each review into a list of integer word indices.
train_sequences = tokenizer.texts_to_sequences(train_reviews)
test_sequences = tokenizer.texts_to_sequences(test_reviews)

# Pad or truncate every sequence to max_length so they stack into one array.
max_length = 100
train_data, test_data = (
    pad_sequences(seqs, maxlen=max_length)
    for seqs in (train_sequences, test_sequences)
)

# Encode the string sentiment labels as integer class ids (not one-hot).
label_encoder = preprocessing.LabelEncoder()
train_labels = label_encoder.fit_transform(train_df['sentiment'])


In [None]:
# Binary sentiment classifier: embedding -> flatten -> small dense head with a
# single sigmoid output.
model = models.Sequential([
    # input_dim is read from the tokenizer instead of repeating the literal, so
    # the embedding table always matches the vocabulary cap used when encoding.
    layers.Embedding(tokenizer.num_words, 64, input_length=max_length),
    layers.Flatten(),
    layers.Dropout(0.2),
    layers.Dense(32, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(1, activation='sigmoid'),
])


In [None]:
 model.compile(optimizer='adam',  loss='binary_crossentropy', metrics=['accuracy'])


In [None]:
# Train for 5 epochs; validation_split=0.2 holds back a fifth of the training
# data for the per-epoch validation metrics recorded in `history`.
history = model.fit(
    x=train_data,
    y=train_labels,
    batch_size=32,
    epochs=5,
    validation_split=0.2,
)


In [None]:
# Reuse the encoder that was fitted on the training labels: transform() keeps the
# string -> integer mapping identical to the one the model was trained with,
# whereas re-fitting on the test labels could silently change that mapping.
test_labels = label_encoder.transform(test_df['sentiment'])
print(test_labels[2])  # quick sanity check of one encoded label
test_loss, test_acc = model.evaluate(test_data, test_labels)
print('Test accuracy:', test_acc)


In [None]:
import matplotlib.pyplot as plt

# Training vs. validation loss per epoch, drawn on an explicit Axes.
fig, ax = plt.subplots()
for key, label in (('loss', 'Training Loss'), ('val_loss', 'Validation Loss')):
    ax.plot(history.history[key], label=label)
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.legend()
plt.show()


In [None]:
# Training vs. validation accuracy per epoch, drawn on an explicit Axes.
fig, ax = plt.subplots()
for key, label in (('accuracy', 'Training Accuracy'), ('val_accuracy', 'Validation Accuracy')):
    ax.plot(history.history[key], label=label)
ax.set_xlabel('Epoch')
ax.set_ylabel('Accuracy')
ax.legend()
plt.show()

In [None]:
# Score every padded test review with the trained model (sigmoid output layer).
predictions = model.predict(x=test_data)

In [None]:
# Decode the padded index sequences back into strings. This is rebuilt from the
# tokenized, length-limited input, so it is presumably not identical to the raw
# review text -- compare against test_df['review'] if the original is needed.
text = tokenizer.sequences_to_texts(sequences=test_data)

In [None]:
# Round the sigmoid scores to 0/1 in one vectorized step. `predictions` is
# expected to be (n_samples, 1) because the model ends in a single-unit layer,
# so it is flattened first. The previous per-row loop assigned a 1-element
# array into a scalar slot, which NumPy >= 1.25 deprecates.
pred = np.round(np.asarray(predictions, dtype=float).reshape(-1))

# Relies on LabelEncoder's sorted class order: 'negative' -> 0, 'positive' -> 1.
predicted_sentiments = ['positive' if label == 1 else 'negative' for label in pred]

In [None]:
# Show one decoded test review next to the sentiment predicted for it.
sample_idx = 5
decoded_review = text[sample_idx]
predicted_label = predicted_sentiments[sample_idx]
print(f"Review text: {decoded_review}\n")
print(f"Review : {predicted_label}")
