In [8]:
import numpy as np
import pandas as pd
import os
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, GlobalMaxPool1D
from sklearn.utils.class_weight import compute_class_weight

Load the Data

In [9]:
current_directory = os.getcwd()

file_path = os.path.join(current_directory, "imdb-movies-dataset.csv")
data = pd.read_csv(file_path)

nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/suhasmathey/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/suhasmathey/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

Preprocessing the Data - Removing the stop words and lemmatizing the words in the movie descriptions. 

In [10]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    words = text.split() #Splitting text into words
    filtered_words = [word for word in words if word.lower() not in stop_words] #If the word is not in stop_words, it is included in filtered_words
    lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_words]
    return ' '.join(lemmatized_words)

data['Description'] = data['Description'].apply(preprocess)

In [11]:
print(data['Description'])

0       Solène, 40-year-old single mom, begin unexpect...
1       Many year reign Caesar, young ape go journey l...
2       1963 Michigan, business rival Kellogg's Post c...
3       down-and-out stuntman must find missing star e...
4       Tashi, former tennis prodigy turned coach, tur...
                              ...                        
9995    dramatic life trapeze artists, clown, elephant...
9996    lone sellsword named Guts get recruited mercen...
9997    couple twelve-year-old Norwegian girl struggle...
9998    journalist strike romantic relationship notori...
9999    widow widower find relationship developing lov...
Name: Description, Length: 10000, dtype: object


Tokenizing the words in the description 

Tokenizer - Breaking down text into smaller units, "tokens"

texts_to_sequence - Converting text into a sequence of integers

In [12]:
tokenizer = Tokenizer(num_words=7000)
tokenizer.fit_on_texts(data['Description'])
word_index = tokenizer.word_index
X = tokenizer.texts_to_sequences(data['Description'])
X = pad_sequences(X, maxlen=50)
print(X.shape)

(10000, 50)


In [13]:
def load_glove_embeddings(filepath):
    embeddings_index = {}
    with open(filepath, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

glove_filepath = os.path.join(current_directory, "glove.42B.300d.txt")
glove_embeddings = load_glove_embeddings(glove_filepath)



embedding_dim = 300
num_words = min(7000, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embedding_dim))

for word, i in word_index.items():
    if i >= num_words:
        continue
    embedding_vector = glove_embeddings.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        

Encoding genre labels into a binary format

.fillna('') - Fills cells with NaN values with empty strings, preventing from encoutering float objects

In [14]:
data['Genre'] = data['Genre'].fillna('')
data['Genre'] = data['Genre'].apply(lambda x: x.split(' , '))
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(data['Genre'])

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [16]:
model = Sequential()
model.add(Embedding(input_dim=num_words, output_dim=embedding_dim, weights = [embedding_matrix], input_length=50))
model.add(LSTM(128, return_sequences=True))
model.add(GlobalMaxPool1D())
model.add(Dense(128, activation='relu'))
model.add(Dense(y.shape[1], activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.3)

2024-06-17 23:20:57.594316: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M3 Pro
2024-06-17 23:20:57.594423: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 18.00 GB
2024-06-17 23:20:57.594444: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 6.00 GB
2024-06-17 23:20:57.594942: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-06-17 23:20:57.595515: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Epoch 1/20


2024-06-17 23:20:58.978937: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x3fe680890>

In [17]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Loss: {loss}')
print(f'Test Accuracy: {accuracy}')

Test Loss: 0.011853113770484924
Test Accuracy: 0.07400000095367432
