In [None]:
import numpy as np
import pandas as pd
import os
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, GlobalMaxPool1D
from sklearn.utils.class_weight import compute_class_weight

Load the Data

In [None]:
# Resolve the dataset relative to the directory the notebook is run from.
current_directory = os.getcwd()
file_path = os.path.join(current_directory, "imdb-movies-dataset.csv")
data = pd.read_csv(file_path)

# NLTK resources used by the preprocessing cell: the stop-word list, WordNet
# for the lemmatizer, and the Punkt models behind nltk.word_tokenize.
# Recent NLTK releases load the tokenizer from 'punkt_tab' rather than the
# pickled 'punkt' package, so both are fetched to work on old and new versions.
for nltk_resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

Preprocessing the data in 'Description' Column - Removing the stop words and lemmatizing the words in the movie descriptions. 

In [None]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


def preprocess(text):
    """Normalise one movie description for tokenization.

    The text is lower-cased and tokenized with ``nltk.word_tokenize``; tokens
    that are English stop words or not purely alphanumeric are dropped, and
    the remaining tokens are lemmatized.

    Args:
        text: The raw description. Non-string values (for example the float
            NaN that pandas uses for missing cells) are treated as empty.

    Returns:
        The surviving lemmas joined by single spaces; ``''`` when nothing
        survives or ``text`` is not a string.
    """
    # Missing cells arrive as float NaN, which has no .lower(); map them to an
    # empty description instead of raising AttributeError.
    if not isinstance(text, str):
        return ''

    tokens = nltk.word_tokenize(text.lower())
    filtered_tokens = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token.isalnum() and token not in stop_words
    ]
    return ' '.join(filtered_tokens)


data['Description'] = data['Description'].apply(preprocess)

In [None]:
# Spot-check the cleaned descriptions. A bare last expression is rendered by
# the notebook's rich display, and .head() keeps the preview bounded.
data['Description'].head(10)

Pre-Processing Genre Labels

In [None]:
def _split_genres(raw_genres):
    """Turn one 'Genre' cell into a clean list of genre names.

    Args:
        raw_genres: Either a comma-separated string, or a list of genre names
            (which is what the column holds if this cell is run a second time).

    Returns:
        A list of genre names with surrounding whitespace removed and empty
        entries dropped, so a blank cell yields ``[]`` rather than ``['']``.
    """
    if isinstance(raw_genres, str):
        # Split on the bare comma and strip afterwards so that "A,B", "A, B"
        # and "A , B" all produce the same labels.
        raw_genres = raw_genres.split(',')
    return [genre.strip() for genre in raw_genres if genre.strip()]


# Missing genres become '' first so every cell can be parsed uniformly.
data['Genre'] = data['Genre'].fillna('').apply(_split_genres)

# One binary indicator column per distinct genre (multi-hot targets).
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(data['Genre'])

Tokenizing the words in the description 

Tokenizer - Breaking down text into smaller units, "tokens"

texts_to_sequences - Converting each text into a sequence of integer word indices

In [None]:
# Learn the vocabulary from the preprocessed descriptions.
descriptions = data['Description']
tokenizer = Tokenizer()
tokenizer.fit_on_texts(descriptions)
word_index = tokenizer.word_index

# Encode every description as integer word indices, then pad/truncate each
# sequence to a fixed length of 200 so they stack into one 2-D array.
X = tokenizer.texts_to_sequences(descriptions)
PS = pad_sequences(X, maxlen=200)
print(PS.shape)

In [None]:
def load_glove_embeddings(filepath, vocabulary=None):
    """Load word vectors from a GloVe-format text file.

    Each non-blank line is read as a word followed by its whitespace-separated
    vector components.

    Args:
        filepath: Path to the embeddings text file; it is read as UTF-8.
        vocabulary: Optional container of words (anything supporting ``in``).
            When given, only words found in it are kept, which avoids holding
            the whole embeddings file in memory. ``None`` keeps every word.

    Returns:
        A dict mapping each kept word to a 1-D ``float32`` numpy array.

    Raises:
        ValueError: If a vector component cannot be parsed as a float.
    """
    embeddings_index = {}
    with open(filepath, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            # Blank or whitespace-only lines (e.g. a trailing newline) carry
            # no word; skip them rather than failing on values[0].
            if not values:
                continue
            word = values[0]
            if vocabulary is not None and word not in vocabulary:
                continue
            embeddings_index[word] = np.asarray(values[1:], dtype='float32')
    return embeddings_index

# Read the pre-trained GloVe vectors that sit next to the notebook.
glove_filepath = os.path.join(current_directory, "glove.42B.300d.txt")
glove_embeddings = load_glove_embeddings(glove_filepath)

# One row per tokenizer index; the "+ 1" leaves room for index 0, which the
# tokenizer's word_index never assigns to a word.
embedding_dim = 300
vocab_size = len(word_index) + 1
embedding_matrix = np.zeros((vocab_size, embedding_dim))

# Copy the GloVe vector into the row of each known word. Words without a
# GloVe entry keep their all-zero row.
for word, i in word_index.items():
    embedding_vector = glove_embeddings.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[i] = embedding_vector
        

Encoding genre labels into a binary format

.fillna('') - Replaces NaN cells with empty strings, so that later string operations do not encounter float (NaN) objects

In [None]:
# Multi-label genre classifier:
#   frozen GloVe embedding -> LSTM -> one sigmoid unit per genre.
model = Sequential([
    # Rows of embedding_matrix are the pre-trained vectors; trainable=False
    # keeps them fixed during training.
    Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], input_length=200, trainable=False),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    # Independent sigmoid per genre, since a movie can have several genres.
    Dense(len(mlb.classes_), activation='sigmoid'),
])

# Each genre is scored as its own yes/no decision, hence binary cross-entropy.
# The metric is named explicitly as 'binary_accuracy': with a multi-unit
# output, the generic 'accuracy' string is resolved by Keras to categorical
# (argmax) accuracy, which does not measure multi-hot label predictions.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['binary_accuracy'])

In [None]:
# Hold out 30% of the padded sequences and their labels for testing; the
# fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    PS,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Train for 10 epochs in mini-batches of 32, setting aside 30% of the
# training data for validation.
fit_options = dict(epochs=10, batch_size=32, validation_split=0.3)
model.fit(X_train, y_train, **fit_options)

In [None]:
# Score the trained model on the held-out test split; the result unpacks into
# the loss followed by the single compiled metric.
test_scores = model.evaluate(X_test, y_test)
loss, accuracy = test_scores

print(f'Test Loss: {loss}')
print(f'Test Accuracy: {accuracy}')