In [78]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
from torch.nn import BCEWithLogitsLoss
import torch

In [79]:
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

stop_words = set(stopwords.words('english') + ['reuter', '\x03'])
lemmatizer = WordNetLemmatizer()
# stemmer = PorterStemmer()

def preprocessor(text: str):
    text = text.lower()

    table = str.maketrans('', '', string.punctuation)
    text = text.translate(table)

    text = re.sub(r'\d+', 'num', text)

    text = [word for word in text.split() if word not in stop_words]

    text = [lemmatizer.lemmatize(word) for word in text]
    
    # text = [stemmer.stem(word) for word in text]

    return " ".join(text)


In [80]:
df = pd.read_csv("Training-dataset.csv")

comedy_df = df.loc[df["comedy"] == 1]
cult_df = df.loc[df["cult"] == 1]
flashback_df = df.loc[df["flashback"] == 1]
historical_df = df.loc[df["historical"] == 1]
murder_df = df.loc[df["murder"] == 1]
revenge_df = df.loc[df["revenge"] == 1]
romantic_df = df.loc[df["romantic"] == 1]
scifi_df = df.loc[df["scifi"] == 1]
violence_df = df.loc[df["violence"] == 1]

sep_label_df = [comedy_df, cult_df, flashback_df,
    historical_df,
    murder_df,
    revenge_df,
    romantic_df,
    scifi_df,
    violence_df
]
col_val = 3
for i in sep_label_df:
    print(f"Number of '{i.columns[col_val]}' plots: {i.shape[0]}")
    col_val += 1

df['text'] = df['title'] + ' ' + df['plot_synopsis']
training_data = df[['text', 'comedy', 'cult', 'flashback', 'historical', 'murder', 'revenge', 'romantic', 'scifi', 'violence']]
training_data.head() 

Number of 'comedy' plots: 1262
Number of 'cult' plots: 1801
Number of 'flashback' plots: 1994
Number of 'historical' plots: 186
Number of 'murder' plots: 4019
Number of 'revenge' plots: 1657
Number of 'romantic' plots: 2006
Number of 'scifi' plots: 204
Number of 'violence' plots: 3064


Unnamed: 0,text,comedy,cult,flashback,historical,murder,revenge,romantic,scifi,violence
0,Si wang ta After a recent amount of challenges...,0,0,0,0,1,1,0,0,1
1,Shattered Vengeance In the crime-ridden city o...,0,0,0,0,1,1,1,0,1
2,L'esorciccio Lankester Merrin is a veteran Cat...,0,1,0,0,0,0,0,0,0
3,"Serendipity Through Seasons ""Serendipity Throu...",0,0,0,0,0,0,1,0,0
4,The Liability Young and naive 19-year-old slac...,0,0,1,0,0,0,0,0,0


In [81]:
def training_rows(data, perc=0.8):
    return data.head(int(len(data)*(perc)))
def testing_rows(data, train):    
    return data.iloc[len(train):]

In [82]:
train_id_set = []
test_id_set = []
for i in sep_label_df:
    i_train = training_rows(i)
    i_test = testing_rows(i, i_train)
    train_id_set.extend(i_train.index.unique())
    test_id_set.extend(i_test.index.unique())
    
train_id_set = set(train_id_set)
test_id_set = set(test_id_set)

In [83]:
training_data['preprocessed_text'] = training_data['text'].apply(preprocessor)
training_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_data['preprocessed_text'] = training_data['text'].apply(preprocessor)


Unnamed: 0,text,comedy,cult,flashback,historical,murder,revenge,romantic,scifi,violence,preprocessed_text
0,Si wang ta After a recent amount of challenges...,0,0,0,0,1,1,0,0,1,si wang ta recent amount challenge billy lo br...
1,Shattered Vengeance In the crime-ridden city o...,0,0,0,0,1,1,1,0,1,shattered vengeance crimeridden city tremont r...
2,L'esorciccio Lankester Merrin is a veteran Cat...,0,1,0,0,0,0,0,0,0,lesorciccio lankester merrin veteran catholic ...
3,"Serendipity Through Seasons ""Serendipity Throu...",0,0,0,0,0,0,1,0,0,serendipity season serendipity season heartwar...
4,The Liability Young and naive 19-year-old slac...,0,0,1,0,0,0,0,0,0,liability young naive numyearold slacker adam ...


In [84]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
encoded_data = tokenizer.batch_encode_plus(
    training_data['preprocessed_text'].values,
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length=True,
    max_length=512,  # adjust as needed
    return_tensors='pt'
)


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [85]:
labels = torch.tensor(training_data[['comedy', 'cult', 'flashback', 'historical', 'murder', 'revenge', 'romantic', 'scifi', 'violence']].values, dtype=torch.float32)

# Split the data
train_inputs, val_inputs, train_labels, val_labels = train_test_split(
    encoded_data['input_ids'],
    labels,
    random_state=42,
    test_size=0.2
)

In [86]:
# Create DataLoader for training and validation sets
train_dataset = torch.utils.data.TensorDataset(train_inputs, train_labels)
val_dataset = torch.utils.data.TensorDataset(val_inputs, val_labels)

train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=8, shuffle=False)

# Load pre-trained RoBERTa model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=9)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [87]:
# Set up optimizer and loss function
optimizer = AdamW(model.parameters(), lr=2e-5)
loss_function = torch.nn.BCEWithLogitsLoss()




In [88]:
# Training loop
num_epochs = 1  # adjust as needed

for epoch in range(num_epochs):
    model.train()
    for batch in train_dataloader:
        optimizer.zero_grad()
        inputs, labels = batch
        outputs = model(inputs, labels=labels)
        loss = loss_function(outputs.logits, labels)
        loss.backward()
        optimizer.step()


In [91]:
import numpy as np

In [93]:
# Evaluation
from sklearn.metrics import accuracy_score, f1_score, multilabel_confusion_matrix

model.eval()
predictions = []
true_labels = []

with torch.no_grad():
    for batch in val_dataloader:
        inputs, labels = batch
        outputs = model(inputs)
        predictions.extend(torch.sigmoid(outputs.logits).cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

# Convert predictions to binary (0 or 1)
predictions_binary = (np.array(predictions) > 0.5).astype(int)

# Evaluate metrics (e.g., F1 score)
f1 = f1_score(true_labels, predictions_binary, average='micro')
print(f"F1 Score: {f1}")

F1 Score: 0.4489214489214489


In [95]:
accuracy = accuracy_score(true_labels, predictions_binary)
print(f'Accuracy: {accuracy:.5f}')

Accuracy: 0.13559
