In [19]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, GlobalMaxPool1D
from sklearn.utils.class_weight import compute_class_weight

Load the Data

In [2]:
data = pd.read_csv("/Users/suhasmathey/Library/CloudStorage/OneDrive-UniversityofDelaware-o365/Summer Projects/Movie_Trend_Analysis/Movie-Trend-Analysis/imdb-movies-dataset.csv")

nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/suhasmathey/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/suhasmathey/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

Preprocessing the Data - Removing the stop words and lemmatizing the words in the movie descriptions. 

In [3]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    words = text.split() #Splitting text into words
    filtered_words = [word for word in words if word.lower() not in stop_words] #If the word is not in stop_words, it is included in filtered_words
    text = ''.join(filtered_words)
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return ''.join(lemmatized_words)

data['Description'] = data['Description'].apply(preprocess)

Tokenizing the words in the description 

Tokenizer - Breaking down text into smaller units, "tokens"

texts_to_sequence - Converting text into a sequence of integers

In [6]:
tokenizer = Tokenizer(num_words=7000)
tokenizer.fit_on_texts(data['Description'])
X = tokenizer.texts_to_sequences(data['Description'])
X = pad_sequences(X, maxlen=50)


Encoding genre labels into a binary format

.fillna('') - Fills cells with NaN values with empty strings, preventing from encoutering float objects

In [11]:
data['Genre'] = data['Genre'].fillna('')
data['Genre'] = data['Genre'].apply(lambda x: x.split(' , '))
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(data['Genre'])

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [21]:
model = Sequential()
model.add(Embedding(input_dim=7000, output_dim=150, input_length=50))
model.add(LSTM(128, return_sequences=True))
model.add(GlobalMaxPool1D())
model.add(Dense(128, activation='relu'))
model.add(Dense(y.shape[1], activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.3)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x302fc9c50>

In [22]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Loss: {loss}')
print(f'Test Accuracy: {accuracy}')

Test Loss: 0.01267897430807352
Test Accuracy: 0.04500000178813934
