
## 1. Imports

In [1]:
import numpy as np
import pandas as pd
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import Dataset


  from .autonotebook import tqdm as notebook_tqdm
  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


## 2. Dataset load

In [2]:
#TEXT_FILE = "train.txt"
TEXT_FILE = "train_subset_1000.txt"
file = open(TEXT_FILE, "r", encoding="utf-8")
data = file.readlines()

data = [i.split("\t") for i in data]

df = [[row[2], row[4]] for row in data]  # Extract 2nd and 4th elements
df = pd.DataFrame(df, columns = ['genre', 'plot'])
print(df.head())

     genre                                               plot
0  romance  Sekhar (Tarun) is a graduate from IIM and work...
1   horror  Kris Fowles (Katie Cassidy) goes to the Spring...
2   horror  Cynthia is traumatized by the death of her bab...
3    crime  Four friends, Gangu (Jackie Shroff), Abdul (Na...
4    drama  Crisis in a middle-class family when the son f...


## 3. Pre-processing

In [3]:
# 3. Preprocessing
random_state=42
#MODEL_NAME = 'distilbert-base-cased'
MODEL_NAME= 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)

# Define a function to tokenize the input data
def tokenize_function(examples):
    return tokenizer(examples["plot"], padding="max_length", truncation=True)

# Map genres to integers for classification
genre_labels = {genre: idx for idx, genre in enumerate(df['genre'].unique())}
df['genre_label'] = df['genre'].map(genre_labels)

# Convert the dataframe to Hugging Face dataset format
dataset = Dataset.from_pandas(df[['plot', 'genre_label']])

# Tokenize the dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)

100%|██████████| 1/1 [00:06<00:00,  6.92s/ba]


## Train/test split


In [4]:
# 4. Train/Test Split
train_test_split = tokenized_dataset.train_test_split(test_size=0.2, seed=random_state)
train_dataset = train_test_split["train"].map(lambda examples: {'labels': examples['genre_label']})
test_dataset = train_test_split["test"].map(lambda examples: {'labels': examples['genre_label']})

100%|██████████| 800/800 [00:00<00:00, 1828.84ex/s]
100%|██████████| 200/200 [00:00<00:00, 1699.45ex/s]


## Load DistilBERT Model for Sequence Classification


In [5]:
num_labels = len(genre_labels)
model = DistilBertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_labels)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Define Metrics

In [6]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')
    acc = accuracy_score(labels, predictions)
    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }

## Train The Model

In [7]:
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=5e-5,
    weight_decay=0.01,
    evaluation_strategy="steps",
    save_steps=500,
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    fp16=False,
    seed=random_state
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

  1%|▏         | 2/150 [01:40<2:01:48, 49.38s/it]

KeyboardInterrupt: 

## Train The Model

In [None]:
trainer.evaluate()

## Kfold Validation on Train Set

In [None]:
from sklearn.model_selection import KFold
from torch.utils.data import DataLoader

k = 5  # Set number of default folds
def k_fold_cross_val(model, dataset, k):
    kf = KFold(n_splits=k, shuffle=True, random_state=random_state)
    accuracies = []
    
    for train_idx, test_idx in kf.split(dataset):
        train_split = dataset.select(train_idx)
        test_split = dataset.select(test_idx)
        
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_split,
            eval_dataset=test_split,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics,
        )
        
        trainer.train()
        metrics = trainer.evaluate()
        accuracies.append(metrics['eval_accuracy'])
    
    return np.mean(accuracies), np.std(accuracies)

mean_acc, std_acc = k_fold_cross_val(model, tokenized_dataset, k)
print(f"K-Fold Cross Validation Accuracy: {mean_acc:.4f} ± {std_acc:.4f}")

Accuracy: 0.44399999999999995 ± 0.03826225293941797
Precision: 0.5202823689800092 ± 0.05904099104616777
Recall: 0.44399999999999995 ± 0.03826225293941797
F1-Score: 0.4258776856515696 ± 0.041111269926876796


## Predict on Test Set

In [None]:
# 7. Load test set (without genre labels)
test_file = open("test_no_labels.txt", "r", encoding="utf-8")
test_data = test_file.readlines()

# Preprocess the test data similar to the training data
test_plots = [plot.strip() for plot in test_data]
test_df = pd.DataFrame(test_plots, columns=["plot"])
test_dataset = Dataset.from_pandas(test_df)

# Tokenize test data
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

# Make predictions
predictions = trainer.predict(tokenized_test_dataset)
predicted_labels = np.argmax(predictions.logits, axis=1)

# Map predicted labels back to genres
predicted_genres = [list(genre_labels.keys())[label] for label in predicted_labels]

# Save the results
with open("results.txt", "w", encoding="utf-8") as results_file:
    for genre in predicted_genres:
        results_file.write(genre + "\n")

Predictions saved to results.txt
