In [None]:
import pandas as pd
import io

from google.colab import drive
drive.mount('/content/gdrive')

# Locations of the training and test CSV files on the mounted Drive.
train_path = '/content/gdrive/MyDrive/deep_learning/train.csv'
test_path = '/content/gdrive/MyDrive/deep_learning/test_all.csv'

# Read both splits into DataFrames in one pass.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

# 1. Collect the raw (still unparsed) subgenre entry of every training row;
#    these entries are parsed and de-duplicated further down.
subgenre_list = [train_df['subgenres'][row] for row in range(len(train_df))]

def clean_subgenre(subgenres):
    """Split a raw subgenre string into a list of cleaned subgenre names.

    The string is split on commas, and every square bracket, single quote
    and space character is removed from each piece.

    Args:
        subgenres: Raw label string, e.g. "['rock---poprock', 'rock---progrock']".

    Returns:
        List of cleaned label strings, one per comma-separated piece.
    """
    # Deletion table for the four characters stripped from every piece.
    unwanted_chars = str.maketrans('', '', "[]' ")
    return [piece.translate(unwanted_chars) for piece in subgenres.split(',')]

# cleaning the dataframe
# Parse every raw label string into a list of subgenre names. The whole
# column is assigned in a single step: the previous element-wise chained
# assignment (`df['subgenres'][i] = ...`) triggers a pandas warning and stops
# updating the DataFrame once Copy-on-Write is enabled.
train_df['subgenres'] = train_df['subgenres'].apply(clean_subgenre)
test_df['subgenres'] = test_df['subgenres'].apply(clean_subgenre)

# Drop the metadata columns from both splits; the printed preview shows the
# columns that remain.
metadata_columns = ['id', 'artist', 'name', 'album', 'type']
train_df = train_df.drop(columns=metadata_columns)
test_df = test_df.drop(columns=metadata_columns)

print(train_df.head())



# Parse each collected raw label string into its list of subgenre names.
subgenre_list = [clean_subgenre(raw_labels) for raw_labels in subgenre_list]

def make_unique_subgenres(subgenre_list):
    """Flatten per-track label lists into one list of distinct labels.

    Args:
        subgenre_list: Iterable of label lists (one list per track). Labels
            must be hashable (they are strings in this notebook).

    Returns:
        List of distinct labels in order of first appearance.
    """
    # dict.fromkeys keeps first-seen order and de-duplicates in linear time,
    # replacing a `not in list` scan for every single label.
    return list(dict.fromkeys(
        label for track_labels in subgenre_list for label in track_labels
    ))

# Replace the per-track label lists with the flat vocabulary of distinct labels.
subgenre_list = make_unique_subgenres(subgenre_list)

# Report the vocabulary size, then each label on its own line.
print(len(subgenre_list))

for subgenre in subgenre_list:
    print(subgenre)

Mounted at /content/gdrive


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  train_df['subgenres'][i] = subgenres
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Se

                                           subgenres  \
0          [rock---alternativerock, rock---gothrock]   
1                                [rock---deathmetal]   
2                                   [rock---poprock]   
3                  [rock---poprock, rock---progrock]   
4  [electronic---downtempo, electronic---triphop,...   

                                              lyrics  
0  The words have been drained from this pencil S...  
1  And was not man created from the blood of Dago...  
2  You packed in the morning, I stared out the wi...  
3  Farewell Plymouth, your morning cold and grey ...  
4   Save me Save me Save me, oh  I've gotta stop ...  
53
rock---alternativerock
rock---gothrock
rock---deathmetal
rock---poprock
rock---progrock
electronic---downtempo
electronic---triphop
rock---hardrock
rock---vikingmetal
blues---countryblues
latin---reggaeton
electronic---newwave
electronic---synth-pop
blues---electricblues
latin---samba
rock---newwave
rock---artrock
rock---post-pu

In [None]:
# 2. Convert the labels into a binary matrix

from sklearn.preprocessing import MultiLabelBinarizer

# One output column per entry of subgenre_list, in that order.
mlb = MultiLabelBinarizer(classes=subgenre_list)

# Fit on the training labels, then encode the test labels with the same binarizer.
y_train = mlb.fit_transform(train_df['subgenres'])
y_test = mlb.transform(test_df['subgenres'])

# Show the (rows, labels) shape of each label matrix.
for label_matrix in (y_train, y_test):
    print(label_matrix.shape)

(11187, 53)
(3361, 53)


In [None]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as stop_words



from gensim.models import Word2Vec
import gensim
# from nltk.tokenize import sent_tokenize, word_tokenize
# import nltk
import gensim.downloader as api

# Fetch the pretrained "word2vec-google-news-300" vectors through gensim's
# downloader. The recorded run reports a download of roughly 1.6 GB, so this
# cell is slow the first time it runs.
# NOTE(review): the name `model` is rebound to a BERT model in later cells;
# the embedding cell below needs the Word2Vec object, so run it before those.
print("Loading Word2Vec model...")
model = api.load("word2vec-google-news-300")
print("Word2Vec model loaded.")

Loading Word2Vec model...
[--------------------------------------------------] 0.0% 0.3/1662.8MB downloaded

KeyboardInterrupt: 

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import hamming_loss
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.multioutput import ClassifierChain
# print("Loading nltk...")
# nltk.download('punkt')
# nltk.download('punkt_tab')
# print("nltk loaded.")

import numpy as np

def get_average_word2vec(lyrics, model, vector_size=300):
    """Return the mean embedding of the in-vocabulary words of one lyric.

    Args:
        lyrics: Lyric text; it is tokenised by splitting on whitespace.
        model: Word-vector lookup supporting `word in model` and `model[word]`.
        vector_size: Length of the zero vector returned when no token is known.

    Returns:
        The element-wise mean of the vectors of all known tokens, or
        `np.zeros(vector_size)` if none of the tokens is in the vocabulary.
    """
    known_vectors = []
    for token in lyrics.split():
        # Tokens missing from the vocabulary are skipped.
        if token in model:
            known_vectors.append(model[token])

    # Nothing recognised: fall back to an all-zero vector.
    if not known_vectors:
        return np.zeros(vector_size)

    return np.mean(known_vectors, axis=0)

# Embed every lyric as the average of its Word2Vec word vectors.
print("Converting lyrics to Word2Vec vectors...")
train_vectors = [get_average_word2vec(text, model) for text in train_df['lyrics']]
test_vectors = [get_average_word2vec(text, model) for text in test_df['lyrics']]
X_train_w2v = np.array(train_vectors)
X_test_w2v = np.array(test_vectors)
print("Lyrics converted to Word2Vec vectors.")
print("")

# Row counts of features and labels, printed side by side for each split.
print("Word2Vec Train data:", len(X_train_w2v), len(y_train))
print("Word2Vec Test data:", len(X_test_w2v), len(y_test))

Converting lyrics to Word2Vec vectors...
Lyrics converted to Word2Vec vectors.

Word2Vec Train data: 11187 11187
Word2Vec Test data: 3361 3361


In [None]:
# Chain of logistic-regression classifiers, one per label column.
# The `sag` solver shuffles the training data, so its random_state is pinned
# to make the reported metrics reproducible from run to run.
lr_model = ClassifierChain(
    LogisticRegression(solver='sag', max_iter=1000, random_state=42)
)

print("Training Logistic Regression model...")
lr_model.fit(X_train_w2v, y_train)
print("Logistic Regression model trained.")

# Predict the full label matrix for the test lyrics.
y_pred = lr_model.predict(X_test_w2v)

print("")
print("Logistic Regression:")
# accuracy_score on a multi-label matrix is subset accuracy: a row only counts
# as correct when every label in it matches.
print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Precision: ", precision_score(y_test, y_pred, average='micro'))
print("Recall: ", recall_score(y_test, y_pred, average='micro'))
print("Hamming Loss: ", hamming_loss(y_test, y_pred))

Training Logistic Regression model...
Logistic Regression model trained.

Logistic Regression:
Accuracy:  0.012496280868789051
Precision:  0.3739130434782609
Recall:  0.023089314480042956
Hamming Loss:  0.031852604514604255


In [None]:
from transformers import BertForSequenceClassification
from transformers import BertTokenizerFast

# Load the BERT tokenizer (later handed to LyricsDataModule)
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Load the BERT model with one output per subgenre label
# NOTE(review): this rebinds `model`, which previously held the Word2Vec
# vectors; re-running the Word2Vec embedding cell after this one would pass a
# BERT model to get_average_word2vec.
# NOTE(review): no later cell in this notebook reads this BERT `model` --
# LyricsClassifier loads its own copy -- so this load looks redundant; confirm
# before removing.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(subgenre_list))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from torch.utils.data import DataLoader, Dataset
import torch
from transformers import AdamW
from torch import nn
from transformers import BertForSequenceClassification
from transformers import BertTokenizerFast
%pip install pytorch-lightning
import pytorch_lightning as pl

# Load the BERT tokenizer
# NOTE(review): same call as in the preceding cell; kept here so this cell is
# self-contained.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Load the BERT model
# NOTE(review): rebinds `model` (originally the Word2Vec vectors) and is not
# read by any later cell in this notebook -- LyricsClassifier instantiates its
# own BertForSequenceClassification. Presumably safe to drop; confirm first.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(subgenre_list))

class LyricsDataset(Dataset):
    """Torch dataset pairing each lyric with its label vector.

    Args:
        lyrics: Iterable of lyric strings (list, pandas Series, ...).
        subgenres: Per-row label vectors, indexable by row position
            (e.g. the matrix produced by MultiLabelBinarizer).
        tokenizer: Tokenizer exposing `encode_plus`.
        max_len: Length every encoded lyric is padded or truncated to.
    """

    def __init__(self, lyrics, subgenres, tokenizer, max_len):
        self.tokenizer = tokenizer
        # DataLoader hands __getitem__ positions 0..len-1, whereas indexing a
        # pandas Series with an integer looks the value up by index label.
        # Materialising the lyrics as a list makes the lookup positional even
        # when the source Series does not carry a default 0..n-1 index.
        self.text = list(lyrics)
        self.labels = subgenres
        self.max_len = max_len

    def __len__(self):
        """Return the number of lyrics in the dataset."""
        return len(self.text)

    def __getitem__(self, item_idx):
        """Tokenise one lyric and return it together with its label vector.

        Returns:
            Dict with 1-D `input_ids` and `attention_mask` tensors and a
            float `label` tensor.
        """
        text = self.text[item_idx]
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=False,
            return_attention_mask=True,
            truncation=True,
            return_tensors='pt'
        )

        # The tokenizer returns tensors with a leading batch axis; flatten to
        # 1-D so the DataLoader can stack the items into a batch.
        input_ids = inputs['input_ids'].flatten()
        attn_mask = inputs['attention_mask'].flatten()

        return {
            'input_ids': input_ids,
            'attention_mask': attn_mask,
            # Float labels, as required by the BCE-with-logits loss used for training.
            'label': torch.tensor(self.labels[item_idx], dtype=torch.float)
        }

class LyricsDataModule(pl.LightningDataModule):
    """Lightning data module serving the lyrics train and test splits.

    The validation and test dataloaders both iterate over the test split.
    """

    def __init__(self, train_lyrics, test_lyrics, train_subgenres, test_subgenres, tokenizer, max_len, batch_size):
        super().__init__()
        # Raw inputs and labels for each split.
        self.train_lyrics, self.train_subgenres = train_lyrics, train_subgenres
        self.test_lyrics, self.test_subgenres = test_lyrics, test_subgenres
        # Tokenisation and batching settings shared by both splits.
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.batch_size = batch_size

    def _build_dataset(self, lyrics, subgenres):
        """Wrap one split in a LyricsDataset using the shared tokenizer settings."""
        return LyricsDataset(
            lyrics=lyrics,
            subgenres=subgenres,
            tokenizer=self.tokenizer,
            max_len=self.max_len,
        )

    def _test_loader(self):
        """Unshuffled loader over the test split."""
        return DataLoader(self.test_dataset, batch_size=self.batch_size)

    def setup(self, stage=None):
        """Create the train and test datasets; `stage` is ignored."""
        self.train_dataset = self._build_dataset(self.train_lyrics, self.train_subgenres)
        self.test_dataset = self._build_dataset(self.test_lyrics, self.test_subgenres)

    def train_dataloader(self):
        """Shuffled loader over the training split."""
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True)

    def val_dataloader(self):
        """Loader used for validation; it serves the test split."""
        return self._test_loader()

    def test_dataloader(self):
        """Loader used for testing; it serves the test split."""
        return self._test_loader()

Collecting pytorch-lightning
  Downloading pytorch_lightning-2.4.0-py3-none-any.whl.metadata (21 kB)
Collecting torchmetrics>=0.7.0 (from pytorch-lightning)
  Downloading torchmetrics-1.6.0-py3-none-any.whl.metadata (20 kB)
Collecting lightning-utilities>=0.10.0 (from pytorch-lightning)
  Downloading lightning_utilities-0.11.9-py3-none-any.whl.metadata (5.2 kB)
Downloading pytorch_lightning-2.4.0-py3-none-any.whl (815 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m815.2/815.2 kB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading lightning_utilities-0.11.9-py3-none-any.whl (28 kB)
Downloading torchmetrics-1.6.0-py3-none-any.whl (926 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m926.4/926.4 kB[0m [31m52.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: lightning-utilities, torchmetrics, pytorch-lightning
Successfully installed lightning-utilities-0.11.9 pytorch-lightning-2.4.0 torchmetrics-1.6.0


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Wire the lyric texts and the binarized label matrices into the data module.
lyrics_data_module = LyricsDataModule(
    train_lyrics = train_df['lyrics'],
    test_lyrics = test_df['lyrics'],
    train_subgenres = y_train,
    test_subgenres = y_test,
    tokenizer = tokenizer,
    max_len = 512,  # every lyric is padded/truncated to this many tokens by LyricsDataset
    batch_size = 8
)
# Build the train/test datasets right away.
lyrics_data_module.setup()

In [None]:
class LyricsClassifier(pl.LightningModule):
    """BERT sequence classifier trained with a BCE-with-logits loss.

    Args:
        n_classess: Number of output labels of the classification head.
        steps_per_epoch: Stored on the module; not used by the code below.
        n_epochs: Stored on the module; not used by the code below.
        lr: Learning rate of the Adam optimizer.
    """

    def __init__(self, n_classess=53, steps_per_epoch=None, n_epochs=3, lr=2e-5):
        super().__init__()
        self.bert = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=n_classess)
        self.steps_per_epoch = steps_per_epoch
        self.n_epochs = n_epochs
        self.lr = lr
        self.criterion = nn.BCEWithLogitsLoss()

    def forward(self, input_ids, attention_mask):
        """Run BERT and return its output object (logits are in `.logits`)."""
        return self.bert(input_ids, attention_mask=attention_mask)

    def _compute_loss(self, batch):
        """Forward one batch and return the BCE-with-logits loss."""
        output = self(batch['input_ids'], batch['attention_mask'])
        return self.criterion(output.logits, batch['label'])

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        loss = self._compute_loss(batch)
        self.log('train_loss', loss, prog_bar=True, logger=True)
        # Only the loss is returned; handing back the full model output as
        # well would keep every batch's tensors referenced for no consumer.
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation loss for one batch.

        Without this hook Lightning skips the validation loop and 'val_loss'
        is never logged, so a checkpoint callback monitoring it has nothing
        to compare.
        """
        # NOTE(review): LyricsDataModule.val_dataloader serves the test split,
        # so this loss is measured on test data.
        loss = self._compute_loss(batch)
        self.log('val_loss', loss, prog_bar=True, logger=True)
        return loss

    def test_step(self, batch, batch_idx):
        """Compute and log the test loss for one batch."""
        loss = self._compute_loss(batch)
        # Logged under its own key; it was previously reported as 'train_loss'.
        self.log('test_loss', loss, prog_bar=True, logger=True)
        return loss

    def configure_optimizers(self):
        """Optimise all parameters with Adam at the configured learning rate."""
        return torch.optim.Adam(self.parameters(), lr=self.lr)

In [None]:
from pytorch_lightning.callbacks import ModelCheckpoint

# Saves files like:
#   /content/gdrive/MyDrive/deep_learning/LyricsModel-epoch=02-val_loss=0.32.ckpt
# The target directory belongs in `dirpath`; `filename` is only the name
# pattern. Monitoring 'val_loss' requires the LightningModule to log that key
# (the recorded run warned that it was missing from the logged metrics).
checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',  # monitored quantity
    dirpath='/content/gdrive/MyDrive/deep_learning',
    filename='LyricsModel-{epoch:02d}-{val_loss:.2f}',
    save_top_k=3,  # save the top 3 models
    mode='min',  # lower val_loss is better
)

In [None]:
# Size the classification head from the label vocabulary instead of relying
# on the constructor's hard-coded default, so it always matches the width of
# the binarized label matrices.
lyrics_classifier = LyricsClassifier(n_classess=len(subgenre_list))
# Instantiate the Model Trainer
trainer = pl.Trainer(max_epochs=3, callbacks=[checkpoint_callback])
# Train the Classifier Model
trainer.fit(lyrics_classifier, lyrics_data_module)

# This is the code, but I'm training on colab

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/configuration_validator.py:68: You passed in a `val_dataloader` but have no `validation_step`. Skipping val loop.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name      | Type                          | Params | Mode 
--------------------------------------------------------------------
0 | bert      | Ber

Training: |          | 0/? [00:00<?, ?it/s]

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/callbacks/model_checkpoint.py:384: `ModelCheckpoint(monitor='val_loss')` could not find the monitored key in the returned metrics: ['train_loss', 'epoch', 'step']. HINT: Did you call `log('val_loss', value)` in the `LightningModule`?
INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=3` reached.


In [None]:
# Evaluate the trained classifier on the data module's test dataloader.
# `trainer` and `lyrics_classifier` are created by the training cell, which
# must have run in the same kernel session; the recorded NameError shows it
# had not.
trainer.test(lyrics_classifier, lyrics_data_module, verbose=True)

NameError: name 'trainer' is not defined