## Imports

In [67]:
import torch
import pandas as pd
import matplotlib.pyplot as plt
import re
from torch.utils.data import Dataset, DataLoader, random_split
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
nltk.download('punkt')  # For tokenization
nltk.download('stopwords')  # For stopwords
nltk.download('averaged_perceptron_tagger')  # For POS tagging
nltk.download('wordnet')  # For lemmatization
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Akshay\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Akshay\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Akshay\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Akshay\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Akshay\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

## Preprocessing 

In [19]:
df=pd.read_csv('IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [23]:
labels=[]
i=0
for index,row in df.iterrows():
    #print(row['sentiment'])
    if row['sentiment']=='positive':
        labels.append(1)
    else:
        labels.append(0)
    
    


In [25]:
labels.count(0)

25000

In [26]:
custom_stopwords = set([
    "html", "head", "body", "title", "div", "span", "p", "br", "href", "class", "id", "script", "style", "meta",
    "form", "input", "button", "ul", "li", "ol", "a", "img", "src", "alt", "strong", "table", "tr", "td", "th",
    "link", "rel", "type", "nav", "footer", "header", "article", "section", "aside", "main", "label", "textarea"
    ,"<" , "." , ">" ,"'s" ,"," , "(" , ")" ,"``"
])



In [27]:
stop_words = set(stopwords.words('english')).union(custom_stopwords)

In [28]:
df["word_tokens"] = df["review"].apply(lambda x: word_tokenize(str(x)))

df["filtered_tokens"] = df["word_tokens"].apply(lambda x: [word.lower() for word in x if word.lower() not in stop_words])


In [30]:
print(df[["filtered_tokens"]].head())


                                     filtered_tokens
0  [one, reviewers, mentioned, watching, 1, oz, e...
1  [wonderful, little, production, /, /, filming,...
2  [thought, wonderful, way, spend, time, hot, su...
3  [basically, family, little, boy, jake, thinks,...
4  [petter, mattei, love, time, money, '', visual...


In [31]:
lemmatizer = WordNetLemmatizer()

df["lemmatized_tokens"] = df["filtered_tokens"].apply(lambda tokens: [lemmatizer.lemmatize(word) for word in tokens])

In [32]:
print(df[["lemmatized_tokens"]].head())

                                   lemmatized_tokens
0  [one, reviewer, mentioned, watching, 1, oz, ep...
1  [wonderful, little, production, /, /, filming,...
2  [thought, wonderful, way, spend, time, hot, su...
3  [basically, family, little, boy, jake, think, ...
4  [petter, mattei, love, time, money, '', visual...


In [33]:
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>") # top 10000, oov replace
tokenizer.fit_on_texts(df["filtered_tokens"].astype(str))  #
sequences = tokenizer.texts_to_sequences(df["filtered_tokens"].astype(str))


In [43]:
max_length = 200  
data = pad_sequences(sequences, maxlen=max_length, padding="post", truncating="post")


In [71]:
labels =  np.array(labels)

In [72]:
train_data , test_data , train_labels , test_labels = train_test_split(data,labels,test_size = 0.2,shuffle = True , random_state = 69) 

## Model

In [73]:
model = Sequential([
    Embedding(input_dim=10000, output_dim=128),
    LSTM(64,return_sequences = False),
    Dense(1,activation = 'sigmoid')
])

In [74]:
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])


## Training the model


In [77]:
model.fit(train_data , train_labels , epochs=2, batch_size=32, validation_split=0.2) # total 5 times

Epoch 1/2
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m118s[0m 118ms/step - accuracy: 0.9184 - loss: 0.2258 - val_accuracy: 0.8781 - val_loss: 0.2961
Epoch 2/2
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m120s[0m 120ms/step - accuracy: 0.9516 - loss: 0.1506 - val_accuracy: 0.8773 - val_loss: 0.3293


<keras.src.callbacks.history.History at 0x22d89676ba0>

In [78]:
loss, accuracy = model.evaluate(test_data, test_labels, batch_size=32)
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 43ms/step - accuracy: 0.8848 - loss: 0.3259
Test Loss: 0.3228
Test Accuracy: 0.8827
