In [None]:
!pip install -qU torch torchvision torchaudio transformers accelerate peft datasets scikit-learn pandas tqdm

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.9/89.9 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m865.2/865.2 MB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m393.1/393.1 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m134.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m102.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.7/897.7 kB[0m [31m61.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m571.0/571.0 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.2/200.2 MB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from google.colab import drive
# Attach Google Drive to the Colab VM; the dataset is read from it and the
# trained checkpoint is written back to it further down.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)


Mounted at /content/drive


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Number of most frequent primary genres kept as classification targets.
TOP_N = 100

# Load the raw table, drop rows missing lyrics or tags, and take the first
# ';'-separated tag of each row as that row's primary genre.
df = (
    pd.read_csv("/content/drive/MyDrive/NLP/Datasets/lyrics_song_info.csv")
    .dropna(subset=["lyrics", "tags"])
    .assign(primary_genre=lambda frame: frame["tags"].str.split(";").str[0])
)

# Keep only rows whose primary genre is among the TOP_N most common ones.
genre_counts = df["primary_genre"].value_counts()
top_genres = genre_counts.nlargest(TOP_N).index
in_top_genres = df["primary_genre"].isin(top_genres)
df = df.loc[in_top_genres].reset_index(drop=True)

# 90/10 split, stratified on genre so both parts share the class distribution;
# the fixed random_state makes the split repeatable.
train_df, val_df = train_test_split(
    df,
    test_size=0.1,
    stratify=df["primary_genre"],
    random_state=42,
)
print(f"Train: {len(train_df)} rows,  Val: {len(val_df)} rows")


Train: 32055 rows,  Val: 3562 rows


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from collections import Counter
import re

def tokenize(text):
    """Split ``text`` into lowercase tokens.

    A token is either a run of word characters (optionally containing one
    apostrophe, e.g. ``don't``) or a single punctuation mark out of ``.,!?;``.
    Everything else (whitespace, other symbols) is discarded.

    Returns:
        list[str]: the tokens in order of appearance.
    """
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r"\w+'?\w*|[.,!?;]", lowered)]

# Token frequencies over the training lyrics only (validation text is not
# used to build the vocabulary).
counter = Counter(
    tok
    for lyric in train_df["lyrics"]
    for tok in tokenize(lyric)
)

# The 20,000 most frequent tokens get ids 2, 3, ...; ids 0 and 1 are reserved
# for the padding and unknown-token symbols.
vocab = {
    tok: idx
    for idx, (tok, _) in enumerate(counter.most_common(20_000), start=2)
}
vocab.update({"<pad>": 0, "<unk>": 1})

# Genre names in sorted order, and the genre -> class-index lookup.
labels = sorted(train_df["primary_genre"].unique())
label2id = dict(zip(labels, range(len(labels))))

class LyricsDataset(Dataset):
    """Map-style dataset yielding ``(token_ids, label_id)`` pairs.

    Reads the ``lyrics`` and ``primary_genre`` columns of ``df``. Genres are
    converted to class indices once, up front, via the module-level
    ``label2id``; lyrics are tokenized lazily on access using the module-level
    ``tokenize`` and ``vocab``.

    Args:
        df: frame with ``lyrics`` and ``primary_genre`` columns.
        max_len: maximum number of tokens kept per lyric (longer ones are
            truncated from the end).
    """

    def __init__(self, df, max_len=512):
        self.max_len = max_len
        self.texts = df["lyrics"].tolist()
        # Raises KeyError if a genre is missing from label2id.
        self.targets = [label2id[genre] for genre in df["primary_genre"]]

    def __len__(self):
        """Number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(LongTensor of token ids, int class index)`` for sample ``idx``."""
        clipped = tokenize(self.texts[idx])[: self.max_len]
        token_ids = []
        for tok in clipped:
            # Tokens outside the vocabulary fall back to the <unk> id.
            token_ids.append(vocab.get(tok, vocab["<unk>"]))
        label = self.targets[idx]
        return torch.tensor(token_ids, dtype=torch.long), label

def collate_batch(batch):
    """Collate ``(token_ids, label)`` samples into padded batch tensors.

    Args:
        batch: sequence of ``(1-D tensor of token ids, int label)`` pairs.

    Returns:
        tuple: ``(padded, labels)`` where ``padded`` is a LongTensor of shape
        ``(len(batch), longest_sequence)`` right-padded with 0, and ``labels``
        is a tensor built from the labels in batch order.
    """
    sequences, targets = zip(*batch)
    width = max(len(seq) for seq in sequences)
    padded = torch.zeros((len(sequences), width), dtype=torch.long)
    # Each `row` is a view into `padded`, so the slice assignment fills it in place.
    for row, seq in zip(padded, sequences):
        row[: len(seq)] = seq
    return padded, torch.tensor(targets)

# Wrap both splits in datasets and batch them; only the training loader
# shuffles, so validation batches come in a fixed order.
train_ds, val_ds = LyricsDataset(train_df), LyricsDataset(val_df)

loader_kwargs = dict(batch_size=32, collate_fn=collate_batch)
train_loader = DataLoader(train_ds, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_ds, shuffle=False, **loader_kwargs)


In [None]:
import torch.nn as nn

class BiLSTM_GRU_Classifier(nn.Module):
    """Embedding -> BiLSTM -> GRU -> mean-pool -> linear classifier.

    Padding positions are excluded from both the recurrent computation and the
    mean pooling, so the logits for a sequence do not depend on how much
    padding its batch happens to contain.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: embedding width.
        hidden_dim: hidden size of the LSTM (per direction) and of the GRU.
        num_classes: number of output logits.
        padding_idx: token id used for padding (passed to ``nn.Embedding``).
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes, padding_idx=0):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=padding_idx)
        self.bilstm    = nn.LSTM(embed_dim, hidden_dim,
                                 bidirectional=True, batch_first=True)
        self.gru       = nn.GRU(2*hidden_dim, hidden_dim, batch_first=True)
        self.classifier= nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        """Return logits of shape ``(batch, num_classes)``.

        Args:
            x: LongTensor ``(batch, seq_len)`` of token ids. Padding is
                assumed to be trailing (right-padded), since the sequence
                length is taken as the count of non-padding ids per row.
        """
        pad_id = self.embedding.padding_idx
        if pad_id is None:
            # No padding id configured: every position counts as a real token.
            lengths = x.new_full((x.size(0),), x.size(1))
        else:
            # clamp(min=1): packing rejects zero-length sequences, so a row
            # that is all padding is treated as a single (padding) step.
            lengths = (x != pad_id).sum(dim=1).clamp(min=1)

        # Packing makes the RNNs stop at each sequence's true end; without it
        # the backward LSTM direction starts inside the padding and the pooled
        # features include padded time steps. `lengths` must live on the CPU.
        packed = nn.utils.rnn.pack_padded_sequence(
            self.embedding(x), lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        packed, _ = self.bilstm(packed)
        packed, _ = self.gru(packed)
        out, _ = nn.utils.rnn.pad_packed_sequence(packed, batch_first=True)

        # Positions past each sequence's length come back as zeros, so
        # sum / length is the mean over real tokens only.
        feat = out.sum(dim=1) / lengths.unsqueeze(1).to(out.dtype)
        return self.classifier(feat)

# Use the GPU when the runtime has one, otherwise fall back to the CPU instead
# of crashing on a hard-coded "cuda" device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Vocabulary size and number of classes come from the structures built from the
# training split; embedding/hidden widths are fixed hyper-parameters.
model_cls = BiLSTM_GRU_Classifier(
    vocab_size=len(vocab), embed_dim=128, hidden_dim=256, num_classes=len(labels)
).to(device)


In [None]:
import torch.optim as optim
from tqdm import tqdm

# Multi-class objective applied to the raw logits.
criterion = nn.CrossEntropyLoss()

# Adam over every parameter of the classifier, learning rate 1e-3.
optimizer = optim.Adam(params=model_cls.parameters(), lr=1e-3)

def run_epoch(loader, train=True):
    """Run one full pass over ``loader`` with the module-level ``model_cls``.

    Args:
        loader: DataLoader yielding ``(token_ids, labels)`` batches.
        train: when True the model is put in training mode and the
            module-level ``optimizer`` is stepped after each batch; when False
            the model is put in eval mode and no gradients are tracked.

    Returns:
        tuple[float, float]: ``(mean loss per sample, accuracy)`` over
        ``len(loader.dataset)`` samples.
    """
    model_cls.train(train)
    # Follow whatever device the model lives on rather than assuming "cuda".
    device = next(model_cls.parameters()).device
    total_loss, total_correct = 0.0, 0
    # Autograd is only needed for training; disabling it during validation
    # avoids building a graph and holding activations in memory.
    with torch.set_grad_enabled(train):
        for x, y in tqdm(loader, desc="Train" if train else "Val"):
            x, y = x.to(device), y.to(device)
            logits = model_cls(x)
            loss = criterion(logits, y)
            if train:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            # criterion returns the batch mean; weight by batch size so the
            # final figure is a per-sample mean even with a short last batch.
            total_loss += loss.item() * x.size(0)
            total_correct += (logits.argmax(1) == y).sum().item()
    n = len(loader.dataset)
    return total_loss / n, total_correct / n

# Train for a fixed number of epochs, but remember the weights from the epoch
# with the lowest validation loss: once the model starts to overfit, the final
# epoch is not the best one, and whatever is in `model_cls` afterwards is what
# gets saved.
N_EPOCHS = 5
best_val_loss, best_epoch, best_state = float("inf"), None, None

for epoch in range(1, N_EPOCHS + 1):
    tr_loss, tr_acc = run_epoch(train_loader, train=True)
    vl_loss, vl_acc = run_epoch(val_loader,   train=False)
    print(f"Epoch {epoch} → train {tr_loss:.3f}/{tr_acc:.3f},  val {vl_loss:.3f}/{vl_acc:.3f}")
    if vl_loss < best_val_loss:
        best_val_loss, best_epoch = vl_loss, epoch
        # clone(): state_dict() returns references to the live parameters,
        # which later optimizer steps would otherwise overwrite.
        best_state = {k: v.detach().clone() for k, v in model_cls.state_dict().items()}

# Roll the model back to its best validation checkpoint (skipped if no epoch
# produced a finite validation loss).
if best_state is not None:
    model_cls.load_state_dict(best_state)
    print(f"Restored weights from epoch {best_epoch} (val loss {best_val_loss:.3f})")


Train: 100%|██████████| 1002/1002 [00:56<00:00, 17.69it/s]
Val: 100%|██████████| 112/112 [00:03<00:00, 34.92it/s]


Epoch 1 → train 2.260/0.501,  val 2.064/0.520


Train: 100%|██████████| 1002/1002 [00:55<00:00, 17.93it/s]
Val: 100%|██████████| 112/112 [00:03<00:00, 34.81it/s]


Epoch 2 → train 1.936/0.538,  val 1.913/0.543


Train: 100%|██████████| 1002/1002 [00:55<00:00, 17.91it/s]
Val: 100%|██████████| 112/112 [00:03<00:00, 34.83it/s]


Epoch 3 → train 1.734/0.566,  val 1.852/0.546


Train: 100%|██████████| 1002/1002 [00:56<00:00, 17.86it/s]
Val: 100%|██████████| 112/112 [00:03<00:00, 34.60it/s]


Epoch 4 → train 1.510/0.605,  val 1.898/0.538


Train: 100%|██████████| 1002/1002 [00:56<00:00, 17.84it/s]
Val: 100%|██████████| 112/112 [00:03<00:00, 34.78it/s]

Epoch 5 → train 1.225/0.668,  val 2.024/0.531





In [None]:
# Persist only the parameters (state_dict), not the pickled module, to Drive.
CLASSIFIER_CKPT_PATH = "/content/drive/MyDrive/NLP/bilstm_gru_classifier_100.pt"
classifier_weights = model_cls.state_dict()
torch.save(classifier_weights, CLASSIFIER_CKPT_PATH)