In [39]:
# Import libraries
import pandas as pd
from sklearn.metrics import precision_score, recall_score, accuracy_score, classification_report
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import warnings

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense

import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
 
warnings.filterwarnings(action = 'ignore')

In [53]:
stop_words = set(stopwords.words('english') + ['reuter', '\x03'])
lemmatizer = WordNetLemmatizer()
# stemmer = PorterStemmer()

def preprocessor(text: str):
    text = text.lower()

    table = str.maketrans('', '', string.punctuation)
    text = text.translate(table)

    text = re.sub(r'\d+', 'num', text)

    text = [word for word in text.split() if word not in stop_words]

    text = [lemmatizer.lemmatize(word) for word in text]
    
    # text = [stemmer.stem(word) for word in text]

    return " ".join(text)

df = pd.read_csv("Training-dataset.csv")

comedy_df = df.loc[df["comedy"] == 1]
cult_df = df.loc[df["cult"] == 1]
flashback_df = df.loc[df["flashback"] == 1]
historical_df = df.loc[df["historical"] == 1]
murder_df = df.loc[df["murder"] == 1]
revenge_df = df.loc[df["revenge"] == 1]
romantic_df = df.loc[df["romantic"] == 1]
scifi_df = df.loc[df["scifi"] == 1]
violence_df = df.loc[df["violence"] == 1]

sep_label_df = [comedy_df, cult_df, flashback_df,
    historical_df,
    murder_df,
    revenge_df,
    romantic_df,
    scifi_df,
    violence_df
]
col_val = 3
for i in sep_label_df:
    print(f"Number of '{i.columns[col_val]}' plots: {i.shape[0]}")
    col_val += 1

df['text'] = df['title'] + ' ' + df['plot_synopsis']
training_data = df[['text', 'comedy', 'cult', 'flashback', 'historical', 'murder', 'revenge', 'romantic', 'scifi', 'violence']]
training_data.head()
 
def training_rows(data, perc=0.8):
    return data.head(int(len(data)*(perc)))
def testing_rows(data, train):    
    return data.iloc[len(train):]
train_id_set = []
test_id_set = []
for i in sep_label_df:
    i_train = training_rows(i)
    i_test = testing_rows(i, i_train)
    train_id_set.extend(i_train.index.unique())
    test_id_set.extend(i_test.index.unique())
    
train_id_set = set(train_id_set)
test_id_set = set(test_id_set)
training_data['preprocessed_text'] = training_data['text'].apply(preprocessor)
training_data.head()

Number of 'comedy' plots: 1262
Number of 'cult' plots: 1801
Number of 'flashback' plots: 1994
Number of 'historical' plots: 186
Number of 'murder' plots: 4019
Number of 'revenge' plots: 1657
Number of 'romantic' plots: 2006
Number of 'scifi' plots: 204
Number of 'violence' plots: 3064


Unnamed: 0,text,comedy,cult,flashback,historical,murder,revenge,romantic,scifi,violence,preprocessed_text
0,Si wang ta After a recent amount of challenges...,0,0,0,0,1,1,0,0,1,si wang ta recent amount challenge billy lo br...
1,Shattered Vengeance In the crime-ridden city o...,0,0,0,0,1,1,1,0,1,shattered vengeance crimeridden city tremont r...
2,L'esorciccio Lankester Merrin is a veteran Cat...,0,1,0,0,0,0,0,0,0,lesorciccio lankester merrin veteran catholic ...
3,"Serendipity Through Seasons ""Serendipity Throu...",0,0,0,0,0,0,1,0,0,serendipity season serendipity season heartwar...
4,The Liability Young and naive 19-year-old slac...,0,0,1,0,0,0,0,0,0,liability young naive numyearold slacker adam ...


In [63]:
X_train = training_data.loc[train_id_set, "preprocessed_text"].values
y_train = training_data.loc[train_id_set, ['comedy', 'cult', 'flashback', 'historical', 'murder', 'revenge', 'romantic', 'scifi', 'violence']].values

X_test = training_data.loc[test_id_set, "preprocessed_text"].values
y_test = training_data.loc[test_id_set, ['comedy', 'cult', 'flashback', 'historical', 'murder', 'revenge', 'romantic', 'scifi', 'violence']].values


# # Tokenize and pad the input sequences
max_words = 15000  # Adjust as needed
max_len = 4000  # Adjust as needed
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)



In [64]:
embedding_dim = 100  # Adjust as needed
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_len))
model.add(Bidirectional(LSTM(64)))
model.add(Dense(9, activation='sigmoid'))


In [65]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

epochs = 5  # Adjust as needed
batch_size = 16  # Adjust as needed
model.fit(X_train_pad, y_train, epochs=epochs, batch_size=batch_size, validation_split=0.1)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x33cf24df0>

In [66]:
y_pred = model.predict(X_test_pad)

# Convert predicted probabilities to binary labels
y_pred_binary = (y_pred > 0.5).astype(int)

precision = precision_score(y_test, y_pred_binary, average='weighted')
recall = recall_score(y_test, y_pred_binary, average='weighted')
f1 = f1_score(y_test, y_pred_binary, average='weighted')

print(f'Weighted Precision: {precision:.4f}')
print(f'Weighted Recall: {recall:.4f}')
print(f'Weighted F1 Score: {f1:.4f}')

# Output the predicted labels as an ndarray
print('Predicted Labels:')
print(y_pred_binary)

Weighted Precision: 0.4671
Weighted Recall: 0.3900
Weighted F1 Score: 0.4213
Predicted Labels:
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 1]
 [0 0 1 ... 0 0 0]
 ...
 [0 0 0 ... 1 0 0]
 [0 0 1 ... 0 0 1]
 [0 0 0 ... 0 0 1]]
