In [1]:
import numpy as np
import pandas as pd
import torch
import os
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from transformers import BertTokenizer
from transformers import AutoTokenizer, AutoModelForMaskedLM
from tensorflow.keras.models import Sequential
# from tensorflow.keras.preprocessing.sequence import pad_sequences
# from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, GlobalMaxPool1D, Bidirectional
# from sklearn.utils.class_weight import compute_class_weight

Load the Data

In [2]:
current_directory = os.getcwd()

file_path = os.path.join(current_directory, "imdb-movies-dataset.csv")
data = pd.read_csv(file_path)

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/suhasmathey/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/suhasmathey/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/suhasmathey/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

Preprocessing the data in 'Description' Column - Removing the stop words and lemmatizing the words in the movie descriptions. 

In [3]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    
    tokens = nltk.word_tokenize(text.lower())
    filtered_tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words and token.isalnum()]
    return ' '.join(filtered_tokens)

data['Description'] = data['Description'].apply(preprocess)

In [4]:
print(data['Description'].tolist())



Pre-Processing Genre Labels

In [5]:
data['Genre'] = data['Genre'].fillna('')
data['Genre'] = data['Genre'].apply(lambda x: x.split(' , '))
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(data['Genre'])

Tokenizing the words in the description 

Tokenizer - Breaking down text into smaller units, "tokens"


In [6]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
texts = data['Description'].tolist()
encoded_inputs = tokenizer(texts, padding=True, truncation=True, max_length=200, return_tensors='pt')
print(encoded_inputs['input_ids'].shape)

torch.Size([10000, 44])


In [7]:
def load_glove_embeddings(filepath):
    embeddings_index = {}
    with open(filepath, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

glove_filepath = os.path.join(current_directory, "glove.42B.300d.txt")
glove_embeddings = load_glove_embeddings(glove_filepath)



embedding_dim = 300
vocab_size = tokenizer.vocab_size
embedding_matrix = np.zeros((vocab_size, embedding_dim))

#Function to reconstruct_word_and_look up embeddings

def reconstruct_word_and_lookup(token, current_word, glove_embeddings):
    if token.startswith("##"):
        current_word += token[2:]
    else:
        if current_word:
            embedding_vector = glove_embeddings.get(current_word)
            return current_word, embedding_vector
        current_word = token
    return current_word, None

current_word = ""

for token, index in tokenizer.vocab.items():
    token_str = tokenizer.convert_ids_to_tokens(index)
    current_word, embedding_vector = reconstruct_word_and_lookup(token_str, current_word, glove_embeddings)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

if current_word:
    embedding_vector = glove_embeddings.get(current_word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

print(f"Embedding matrix shape: {embedding_matrix.shape}")

Embedding matrix shape: (30522, 300)


Encoding genre labels into a binary format

.fillna('') - Fills cells with NaN values with empty strings, preventing from encoutering float objects

In [8]:
model = Sequential()
model.add(Embedding(input_dim=vocab_size,
                    output_dim = embedding_dim,
                    weights = [embedding_matrix],
                    input_length = 200,
                    trainable=False))

model.add(Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2, return_sequences=True)))
model.add(Bidirectional(LSTM(64, dropout=0.2, recurrent_dropout=0.2)))
model.add(Dense(len(mlb.classes_), activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

2024-07-04 15:55:57.827502: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M3 Pro
2024-07-04 15:55:57.827531: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 18.00 GB
2024-07-04 15:55:57.827543: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 6.00 GB
2024-07-04 15:55:57.827557: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-07-04 15:55:57.827566: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [9]:
X_train, X_test, y_train, y_test = train_test_split(encoded_inputs['input_ids'], y, test_size=0.3, random_state=42)

In [10]:
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.3)

Epoch 1/10


2024-07-04 15:55:58.976656: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m482s[0m 3s/step - accuracy: 0.0157 - loss: 0.2500 - val_accuracy: 0.0557 - val_loss: 0.0136
Epoch 2/10
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m484s[0m 3s/step - accuracy: 0.0475 - loss: 0.0133 - val_accuracy: 0.0557 - val_loss: 0.0129
Epoch 3/10
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m485s[0m 3s/step - accuracy: 0.0503 - loss: 0.0128 - val_accuracy: 0.0557 - val_loss: 0.0127
Epoch 4/10
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m481s[0m 3s/step - accuracy: 0.0510 - loss: 0.0127 - val_accuracy: 0.0557 - val_loss: 0.0127
Epoch 5/10
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m483s[0m 3s/step - accuracy: 0.0544 - loss: 0.0126 - val_accuracy: 0.0557 - val_loss: 0.0126
Epoch 6/10
[1m  2/154[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m7:46[0m 3s/step - accuracy: 0.0000e+00 - loss: 0.0130

KeyboardInterrupt: 

In [None]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Loss: {loss}')
print(f'Test Accuracy: {accuracy}')