# ü§ñ Fine-tune BERT for Genre Classification (Google Colab GPU)

In [1]:
# ‚úÖ Install required packages (Google Colab only)
!pip install -q transformers datasets accelerate scikit-learn


[0m

In [2]:
import torch
print("CUDA Available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))
else:
    print("‚ö†Ô∏è GPU not available.")


CUDA Available: False
‚ö†Ô∏è GPU not available.


## üìä Load Dataset

In [3]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [4]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Training_NLP_Model/for_training_no_nulls.csv')
df.dropna(subset=['title', 'description', 'genre'], inplace=True)

df['genre_list'] = df['genre'].apply(lambda x: [g.strip().lower() for g in x.split(',')])
df['text'] = df['title'] + " " + df['description']

mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['genre_list'])
X = df['text'].tolist()

# Optional: Print a sample
print("‚úÖ Data loaded successfully!")
print("Sample input:", X[0])
print("Sample multi-label vector:", y[0])


‚úÖ Data loaded successfully!
Sample input: The Hunger Games WINNING MEANS FAME AND FORTUNE.LOSING MEANS CERTAIN DEATH.THE HUNGER GAMES HAVE BEGUN. . . .In the ruins of a place once known as North America lies the nation of Panem, a shining Capitol surrounded by twelve outlying districts. The Capitol is harsh and cruel and keeps the districts in line by forcing them all to send one boy and once girl between the ages of twelve and eighteen to participate in the annual Hunger Games, a fight to the death on live TV.Sixteen-year-old Katniss Everdeen regards it as a death sentence when she steps forward to take her sister's place in the Games. But Katniss has been close to dead before‚Äîand survival, for her, is second nature. Without really meaning to, she becomes a contender. But if she is to win, she will have to start making choices that weight survival against humanity and life against love.
Sample multi-label vector: [0 0 0 ... 0 0 0]


## ‚úèÔ∏è Tokenization and Dataset Prep

In [5]:
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

class BookDataset(Dataset):
    def __init__(self, texts, labels):
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=512)
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.encodings['input_ids'][idx]),
            'attention_mask': torch.tensor(self.encodings['attention_mask'][idx]),
            'labels': torch.tensor(self.labels[idx], dtype=torch.float)
        }

train_dataset = BookDataset(X_train, y_train)
val_dataset = BookDataset(X_val, y_val)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

## üß† Load BERT for Multi-Label Classification

In [6]:
import torch

from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=y.shape[1],
    problem_type="multi_label_classification"
).to("cuda" if torch.cuda.is_available() else "cpu")


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## üèãÔ∏è Train the Model

In [None]:
### pip install -q --upgrade transformers


In [None]:
### !pip uninstall -y transformers
### !pip install -U transformers datasets accelerate --quiet


In [11]:
import transformers
print(transformers.__version__)


4.52.4


In [None]:
!pip install sentencepiece

In [15]:
from torch.utils.data import Dataset

class BookDataset(Dataset):
    def __init__(self, texts, labels):
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=512)
        self.labels = labels

    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.encodings['input_ids'][idx]),
            'attention_mask': torch.tensor(self.encodings['attention_mask'][idx]),
            'labels': torch.tensor(self.labels[idx], dtype=torch.float)
        }

    def __len__(self):
        return len(self.labels)

train_dataset = BookDataset(X_train, y_train)
val_dataset = BookDataset(X_val, y_val)


In [21]:
from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=y.shape[1],  # number of genres
    problem_type="multi_label_classification"
).to("cuda" if torch.cuda.is_available() else "cpu")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:

import importlib
importlib.reload(__import__("transformers"))


<module 'transformers' from '/usr/local/lib/python3.11/dist-packages/transformers/__init__.py'>

In [25]:
import torch
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./bert_genre_classifier",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    logging_steps=50,
    save_steps=200,
    save_total_limit=1,
    fp16=torch.cuda.is_available(),
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

trainer.train()


Step,Training Loss
50,0.4568
100,0.1247
150,0.0568
200,0.0389
250,0.0303
300,0.0264
350,0.0236
400,0.0217
450,0.0204
500,0.0197


TrainOutput(global_step=10228, training_loss=0.01870388358738164, metrics={'train_runtime': 2400.4157, 'train_samples_per_second': 34.087, 'train_steps_per_second': 4.261, 'total_flos': 2.197937756238643e+16, 'train_loss': 0.01870388358738164, 'epoch': 2.0})

## üíæ Save Model

### After training finishes:

Evaluate the model

In [26]:
# üìä Evaluate the model on validation set
metrics = trainer.evaluate()
print(metrics)


{'eval_loss': 0.014372926205396652, 'eval_runtime': 78.7642, 'eval_samples_per_second': 129.856, 'eval_steps_per_second': 16.238, 'epoch': 2.0}


Save your fine-tuned model

In [28]:
model.save_pretrained("./bert_genre_classifier")
tokenizer.save_pretrained("./bert_genre_classifier")


('./bert_genre_classifier/tokenizer_config.json',
 './bert_genre_classifier/special_tokens_map.json',
 './bert_genre_classifier/vocab.txt',
 './bert_genre_classifier/added_tokens.json')

In [None]:
### Reusing the training data