In [3]:
import os
import json

from zipfile import ZipFile
import pandas as pd
import tensorflow as tf
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [5]:
# Load the Kaggle API credentials from kaggle.json in the working directory.
# The context manager closes the file handle deterministically instead of
# leaving an open handle behind for the garbage collector.
with open("kaggle.json", encoding="utf-8") as credentials_file:
    kaggle_dictionary = json.load(credentials_file)

In [6]:
# Export the loaded credentials as environment variables, one per JSON field.
for env_name, json_field in (("KAGGLE_USERNAME", "username"), ("KAGGLE_KEY", "key")):
    os.environ[env_name] = kaggle_dictionary[json_field]

In [9]:
# IPython shell escape: download the IMDB 50k movie-review dataset archive with the Kaggle CLI.
# NOTE(review): presumably the CLI authenticates via the KAGGLE_USERNAME / KAGGLE_KEY
# environment variables exported in the previous cell — confirm.
!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

Dataset URL: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
License(s): other
Downloading imdb-dataset-of-50k-movie-reviews.zip to C:\Users\theon\Jupyter Projects\IMDB Sentimental Analysis




  0%|          | 0.00/25.7M [00:00<?, ?B/s]
  4%|3         | 1.00M/25.7M [00:01<00:40, 646kB/s]
  8%|7         | 2.00M/25.7M [00:01<00:19, 1.30MB/s]
 12%|#1        | 3.00M/25.7M [00:01<00:11, 2.08MB/s]
 19%|#9        | 5.00M/25.7M [00:02<00:05, 3.93MB/s]
 27%|##7       | 7.00M/25.7M [00:02<00:03, 5.98MB/s]
 35%|###5      | 9.00M/25.7M [00:02<00:02, 7.96MB/s]
 43%|####2     | 11.0M/25.7M [00:02<00:01, 10.1MB/s]
 51%|#####     | 13.0M/25.7M [00:02<00:01, 11.2MB/s]
 58%|#####8    | 15.0M/25.7M [00:02<00:00, 12.8MB/s]
 66%|######6   | 17.0M/25.7M [00:02<00:00, 14.0MB/s]
 74%|#######3  | 19.0M/25.7M [00:03<00:00, 14.9MB/s]
 82%|########1 | 21.0M/25.7M [00:03<00:00, 14.9MB/s]
 89%|########9 | 23.0M/25.7M [00:03<00:00, 15.5MB/s]
 97%|#########7| 25.0M/25.7M [00:03<00:00, 16.3MB/s]
100%|##########| 25.7M/25.7M [00:03<00:00, 7.82MB/s]


In [10]:
# Unpack every member of the downloaded archive into the current working directory.
with ZipFile("imdb-dataset-of-50k-movie-reviews.zip", mode="r") as archive:
    archive.extractall()

In [11]:
# Read the extracted CSV into a DataFrame; later cells use its
# "review" and "sentiment" columns.
data = pd.read_csv("IMDB Dataset.csv")

In [12]:
# Opt in to pandas' future no-silent-downcasting behaviour before replacing values.
pd.set_option("future.no_silent_downcasting", True)

# Encode the string labels as integers, in place: positive -> 1, negative -> 0.
data.replace(
    to_replace={"sentiment": {"positive": 1, "negative": 0}},
    inplace=True,
)

In [13]:
# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
train_data, test_data = train_test_split(
    data,
    test_size=0.25,
    random_state=43,
)

In [15]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources used by the tokenizer, lemmatizer and stop-word filter.
for resource in ("wordnet", "punkt", "punkt_tab", "stopwords"):
    nltk.download(resource)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\theon\AppData\Roaming\nltk_data...
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\theon\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\theon\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\theon\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [16]:
# Single shared WordNet lemmatizer instance, reused by the lemmatize_review helpers below.
lemmatizer = WordNetLemmatizer()

In [17]:
# NOTE: plain assignment binds new names to the SAME DataFrame objects (no copy),
# so any column added through train_data_l / test_data_l is also visible on
# train_data / test_data.
train_data_l, test_data_l = train_data, test_data

In [18]:
def lemmatize_review(review):
    """Tokenize ``review`` with ``word_tokenize`` and lemmatize each token as a verb.

    Returns:
        The lemmatized tokens joined back into one space-separated string.
    """
    lemmas = []
    for token in word_tokenize(review):
        lemmas.append(lemmatizer.lemmatize(token, pos='v'))
    return ' '.join(lemmas)

# Add a 'lemmatized_review' column to both splits, derived from the raw 'review' text.
for frame in (train_data_l, test_data_l):
    frame['lemmatized_review'] = frame['review'].apply(lemmatize_review)

In [19]:
from nltk.corpus import stopwords

# Ensure the stop-word corpus is present, then keep the English list as a set
# for fast membership checks in remove_stopwords.
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

def remove_stopwords(text):
    """Drop whitespace-separated tokens whose lowercase form is in ``stop_words``.

    Returns:
        The remaining tokens, in their original order and casing, joined by
        single spaces.
    """
    kept_tokens = filter(lambda token: token.lower() not in stop_words, text.split())
    return ' '.join(kept_tokens)

# Strip stop words from BOTH splits so the held-out reviews go through exactly
# the same preprocessing as the reviews the tokenizer and model are fitted on.
train_data_l['lemmatized_review'] = train_data_l['lemmatized_review'].apply(remove_stopwords)
test_data_l['lemmatized_review'] = test_data_l['lemmatized_review'].apply(remove_stopwords)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\theon\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [20]:
# num_words=5000 caps the vocabulary used when texts are converted to sequences;
# it matches the Embedding layer's input_dim further down.
tokenizer = Tokenizer(num_words=5000)
# Learn the vocabulary from the training reviews only.
tokenizer.fit_on_texts(train_data_l['lemmatized_review'])

In [21]:
# Convert each preprocessed review to a sequence of word indices and pad it to
# a fixed length of 200. Read the column from train_data_l / test_data_l, the
# frames it was written to, rather than relying on train_data / test_data
# happening to be the same objects.
x_train = pad_sequences(tokenizer.texts_to_sequences(train_data_l['lemmatized_review']), maxlen=200)
x_test = pad_sequences(tokenizer.texts_to_sequences(test_data_l['lemmatized_review']), maxlen=200)

In [22]:
# Pull the encoded sentiment labels out of each split.
y_train, y_test = train_data["sentiment"], test_data["sentiment"]

In [23]:
# Cast the labels to int and materialise them as NumPy arrays for Keras.
y_train, y_test = (np.array(labels.astype(int)) for labels in (y_train, y_test))

In [24]:
# Embedding -> single LSTM -> sigmoid unit for binary sentiment classification.
model = Sequential(
    [
        # input_dim matches the tokenizer's num_words (5000).
        Embedding(input_dim=5000, output_dim=128),
        LSTM(128, dropout=0.2, recurrent_dropout=0.2),
        Dense(1, activation="sigmoid"),
    ]
)

In [25]:
# Build the model for batches of length-200 sequences (the pad length used for
# x_train / x_test), then print the layer/parameter summary.
model.build(input_shape=(None, 200))
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 128)         640000    
                                                                 
 lstm (LSTM)                 (None, 128)               131584    
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 771,713
Trainable params: 771,713
Non-trainable params: 0
_________________________________________________________________


In [26]:
# Binary cross-entropy pairs with the single sigmoid output; track accuracy.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Train for 10 epochs, holding back 20% of the training data for validation.
history = model.fit(
    x_train,
    y_train,
    epochs=10,
    batch_size=64,
    validation_split=0.2,
)

Epoch 1/10

In [None]:

# Score the trained model on the held-out test split and report both metrics.
loss, accuracy = model.evaluate(x_test, y_test)
print(f"Test Loss: {loss}\nTest Accuracy: {accuracy}")

In [None]:
import matplotlib.pyplot as plt

# Side-by-side training curves: accuracy on the left, loss on the right.
fig, (accuracy_ax, loss_ax) = plt.subplots(1, 2, figsize=(12, 6))

# (axes, training key, validation key, training label, validation label, title, y label)
panels = (
    (accuracy_ax, 'accuracy', 'val_accuracy',
     'Training Accuracy', 'Validation Accuracy', 'Model Accuracy', 'Accuracy'),
    (loss_ax, 'loss', 'val_loss',
     'Training Loss', 'Validation Loss', 'Model Loss', 'Loss'),
)

for ax, train_key, val_key, train_label, val_label, title, y_label in panels:
    ax.plot(history.history[train_key], label=train_label)
    ax.plot(history.history[val_key], label=val_label)
    ax.set_title(title)
    ax.set_xlabel('Epoch')
    ax.set_ylabel(y_label)
    ax.legend(loc='best')

fig.tight_layout()
plt.show()

In [None]:
from sklearn.metrics import classification_report

# Threshold the sigmoid outputs at 0.5 to get hard 0/1 class predictions.
predicted_scores = model.predict(x_test)
y_pred = (predicted_scores > 0.5).astype("int32")

# Per-class precision / recall / F1; label 0 is 'negative', label 1 is 'positive'.
report = classification_report(
    y_test,
    y_pred,
    target_names=['negative', 'positive'],
)
print(report)

In [None]:
def lemmatize_review(review):
    """Tokenize ``review`` and lemmatize every token as a verb.

    Uses ``word_tokenize`` and ``pos='v'``, the same tokenization and
    part-of-speech setting that produced the ``lemmatized_review`` column the
    tokenizer was fitted on, so text scored at inference time is normalised the
    same way as the training text.

    NOTE: this definition shadows the earlier ``lemmatize_review`` in this
    notebook. That one returns a space-joined string; this one returns a list.

    Args:
        review: Raw review text.

    Returns:
        List of lemmatized tokens, in order.
    """
    return [lemmatizer.lemmatize(token, pos='v') for token in word_tokenize(review)]

In [None]:
def predict_sentiment(review):
    """Classify one raw review string as "positive" or "negative".

    The review goes through the same preprocessing steps as the training text
    (lemmatize, remove stop words, convert to a padded length-200 sequence)
    before being scored by ``model``. The raw model score is printed as a side
    effect.

    Args:
        review: Raw review text.

    Returns:
        "positive" if the predicted score is above 0.5, otherwise "negative".
    """
    lemmatized = lemmatize_review(review)
    # lemmatize_review is defined twice in this notebook: one version returns a
    # list of tokens, the other an already-joined string. Accept both so the
    # result does not depend on which cell was run last (joining a string
    # would insert a space between every character).
    if not isinstance(lemmatized, str):
        lemmatized = ' '.join(lemmatized)

    # The tokenizer was fitted on stop-word-free text, so strip them here too.
    cleaned_review = remove_stopwords(lemmatized)

    sequence = tokenizer.texts_to_sequences([cleaned_review])
    padded_sequence = pad_sequences(sequence, maxlen=200)
    predictions = model.predict(padded_sequence)

    score = predictions[0][0]
    print(score)
    return "positive" if score > 0.5 else "negative"