In [51]:
import pandas as pd
import re
train_df = pd.read_csv('datasets\\annotations\\train.csv')
test_df = pd.read_csv('datasets\\annotations\\test.csv')

print(train_df.head())

                       id             artist                   name  \
0  6vNcgPTLPyXflD733MmkAO           Warpaint               Undertow   
1  3ugQJQiDJwVKM8szrpAYNa  The Exploding Boy      Cracked / Reasons   
2  5utkTv4TSvgh9fjG386a84           Sanctity    Beneath The Machine   
3  48jyqLwgvWTQcJrlBB27nO        Gipsy Kings            Petite noya   
4  5xmqcLBNjrzOIancAp3bfX        Saxon Shore  This Shameless Moment   

                                album   type  genre  \
0                            The Fool  train   rock   
1                                Four  train   rock   
2                          Once Again  train   rock   
3                    The Very Best Of  train  latin   
4  The Exquisite Death of Saxon Shore  train   rock   

                             subgenres lyrics  
0                      rock---shoegaze    NaN  
1   rock---goth rock, rock---post-punk    NaN  
2  rock---heavy metal, rock---nu metal    NaN  
3                     latin---flamenco    NaN  
4 

In [52]:
train_df = train_df.drop(columns=['id', 'artist', 'name', 'album', 'type'])
test_df = test_df.drop(columns=['id', 'artist', 'name', 'album', 'type'])

train_df = train_df.drop(columns=["subgenres"])
test_df = test_df.drop(columns=["subgenres"])

print(train_df.head())

   genre lyrics
0   rock    NaN
1   rock    NaN
2   rock    NaN
3  latin    NaN
4   rock    NaN


In [53]:
train_df = train_df.dropna(subset=['lyrics'])
test_df = test_df.dropna(subset=['lyrics'])

print(train_df.head())
print(test_df.head())

         genre                                             lyrics
5        blues  By lorries along sir John Rogerson's quay Mr B...
6   electronic  \r\nBlue skies and green fields\r\nI'm thinkin...
8         rock  Fly, fly high my Black Eagle\r\nLet golden thr...
13     hip hop  Why is it ladies only out for money?\r\nBrothe...
18  electronic   Like...\r\n Turn it on, light it up, we gon s...
        genre                                             lyrics
0  electronic  \r\nEveryday After Work\r\nI Go To A Book Stor...
1  electronic  \r\nMy little girl, drive anywhere\r\nDo what ...
2  electronic  \r\nI was trapped under concrete\r\nBuilt from...
3  electronic  \r\nNa, na, na, na, na, na\r\nNa, na, na, na, ...
4  electronic  :\r\nF-A-I-L-U-R-E\r\nWoo\r\n:\r\nWe might as ...


In [54]:
genres = train_df['genre'].unique()

print(genres)
print(len(genres))

['blues' 'electronic' 'rock' 'hip hop' 'funk / soul' 'pop' 'latin' 'jazz'
 'classical']
9


In [None]:
# Converting genres to numerical values
label_dict = {}
for index, possible_label in enumerate(genres):
    label_dict[possible_label] = index
label_dict

print(label_dict)

train_df['genre'] = train_df["genre"].replace(label_dict)
test_df['genre'] = test_df["genre"].replace(label_dict)

print(train_df.head())

{'blues': 0, 'electronic': 1, 'rock': 2, 'hip hop': 3, 'funk / soul': 4, 'pop': 5, 'latin': 6, 'jazz': 7, 'classical': 8}
    genre                                             lyrics
5       0  By lorries along sir John Rogerson's quay Mr B...
6       1  \r\nBlue skies and green fields\r\nI'm thinkin...
8       2  Fly, fly high my Black Eagle\r\nLet golden thr...
13      3  Why is it ladies only out for money?\r\nBrothe...
18      1   Like...\r\n Turn it on, light it up, we gon s...


In [61]:
# Remove special characters
def remove_special_characters(text):
    return re.sub(r'[^A-Za-z0-9 ]+', '', text)

train_df['lyrics'] = train_df['lyrics'].apply(remove_special_characters)
test_df['lyrics'] = test_df['lyrics'].apply(remove_special_characters)

print(train_df.head())

    genre                                             lyrics
5       0  By lorries along sir John Rogersons quay Mr Bl...
6       1  Blue skies and green fieldsIm thinking of the ...
8       2  Fly fly high my Black EagleLet golden thread b...
13      3  Why is it ladies only out for moneyBrothers on...
18      1   Like Turn it on light it up we gon set this o...


In [None]:
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

train_embeddings = tokenizer(train_df['lyrics'].tolist(), padding=True, truncation=True, return_tensors='pt')
test_embeddings = tokenizer(test_df['lyrics'].tolist(), padding=True, truncation=True, return_tensors='pt')



In [63]:
from torch.utils.data import Dataset, DataLoader
import torch

class LyricsDataset(Dataset):
    def __init__(self, encodings, labels):
        super().__init__()
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item


train_labels = train_df["genre"].values
train_dataset = LyricsDataset(train_embeddings, train_labels)
test_labels = test_df["genre"].values
test_dataset = LyricsDataset(test_embeddings, test_labels)

In [None]:
from transformers import BertForSequenceClassification
from torch import optim
from transformers import Trainer, TrainingArguments
from sklearn.metrics import precision_recall_fscore_support, accuracy_score


def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }
num_epochs = 10
num_classes = 9
learning_rate = 0.001

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_classes)


optimizer = optim.AdamW(model.parameters(), lr=learning_rate)
criterion = torch.nn.CrossEntropyLoss()

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

training_arguments = TrainingArguments(
    output_dir='./lyrics_results',
    do_eval=True,
    per_device_train_batch_size=8,
    per_gpu_eval_batch_size=8,
    num_train_epochs=num_epochs,
    load_best_model_at_end=True,
    logging_steps=400,               # log & save weights each logging_steps
    save_steps=400,
    evaluation_strategy="steps",     # evaluate each `logging_steps'   
)

trainer = Trainer(
    model=model,
    args=training_arguments,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
trainer.train()

  0%|          | 0/4120 [00:00<?, ?it/s]

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.


{'loss': 1.7577, 'grad_norm': 6.282198429107666, 'learning_rate': 4.514563106796117e-05, 'epoch': 0.97}


  0%|          | 0/74 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.


{'eval_loss': 2.0115902423858643, 'eval_accuracy': 0.2893401015228426, 'eval_f1': 0.20547277722553925, 'eval_precision': 0.2211280754261468, 'eval_recall': 0.2893401015228426, 'eval_runtime': 2127.1912, 'eval_samples_per_second': 0.278, 'eval_steps_per_second': 0.035, 'epoch': 0.97}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.


{'loss': 1.5902, 'grad_norm': 8.180068969726562, 'learning_rate': 4.029126213592233e-05, 'epoch': 1.94}


  0%|          | 0/74 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.


{'eval_loss': 1.7999926805496216, 'eval_accuracy': 0.36209813874788493, 'eval_f1': 0.32106060873382875, 'eval_precision': 0.31911960845600956, 'eval_recall': 0.36209813874788493, 'eval_runtime': 389.74, 'eval_samples_per_second': 1.516, 'eval_steps_per_second': 0.19, 'epoch': 1.94}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


In [None]:
trainer.evaluate()