In [None]:
!pip install -q transformers datasets scikit-learn

In [None]:
# Colab-only helper: prompts for local files and stores them in the session.
# The cell output shows the training CSV and the test-set CSV being saved here.
from google.colab import files
uploaded = files.upload()

Saving music_labels_balanced_20x12.csv to music_labels_balanced_20x12.csv
Saving music_labels_test_set.csv to music_labels_test_set.csv


In [None]:
import pandas as pd
# Training data; the code further down reads its "text" and "label" columns.
df = pd.read_csv("music_labels_balanced_20x12.csv")
# NOTE(review): this is not the last expression of the cell, so the notebook
# does not render the preview.
df.head()


# Wrap the DataFrame as a Hugging Face Dataset so it can be mapped over and
# handed to the Trainer.
from datasets import Dataset
dataset = Dataset.from_pandas(df)


from transformers import AutoTokenizer
# Base checkpoint name; also used later to load the classification model.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(example):
    """Tokenize the ``text`` field of a batch with the notebook-level tokenizer.

    Sequences are truncated and padded to the tokenizer's maximum length, so
    every encoded row has the same shape.
    """
    encoded = tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
    )
    return encoded


# batched=True passes many rows per call instead of one row at a time.
tokenized_dataset = dataset.map(tokenize_function, batched=True)



# Sort the distinct labels so the label <-> id mapping is identical on every run.
# Iterating a bare set of strings depends on per-process hash randomization,
# which would silently reshuffle the class ids between kernel restarts.
labels = sorted(set(df["label"]))
label2id = {label: i for i, label in enumerate(labels)}
id2label = {i: label for label, i in label2id.items()}

def encode_labels(example):
    """Swap the string in ``example["label"]`` for its integer id from ``label2id``.

    The row is updated in place and the same object is returned.
    """
    label_name = example["label"]
    example["label"] = label2id[label_name]
    return example


# No batched=True here, so the function receives one row per call.
tokenized_dataset = tokenized_dataset.map(encode_labels)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/239 [00:00<?, ? examples/s]

Map:   0%|          | 0/239 [00:00<?, ? examples/s]

In [None]:

from transformers import AutoModelForSequenceClassification

# Load the base checkpoint for sequence classification with one output per label.
# The id2label / label2id mappings are handed to from_pretrained; a later cell
# reads model.config.id2label to turn class ids back into label names.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id
)


import os

from transformers import TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

# Training configuration.
# report_to="none" replaces the deprecated WANDB_DISABLED environment variable
# (the library warns it will be removed in v5) and keeps Weights & Biases from
# being used. It also switches off every other reporting integration, so
# logging_dir only matters if a reporter such as TensorBoard is enabled again.
training_args = TrainingArguments(
    output_dir="music_model",       # checkpoints are written here
    per_device_train_batch_size=8,
    num_train_epochs=8,
    logging_dir="logs",
    logging_steps=10,               # training loss is logged every 10 steps
    save_strategy="epoch",          # one checkpoint per epoch
    report_to="none",
)

def compute_metrics(p):
    """Compute accuracy and weighted F1 from a prediction object.

    Args:
        p: Object exposing ``predictions`` (per-class scores; the argmax over
            axis 1 is taken as the predicted class) and ``label_ids`` (the
            reference class ids).

    Returns:
        dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    predicted_ids = np.argmax(p.predictions, axis=1)
    accuracy = accuracy_score(p.label_ids, predicted_ids)
    weighted_f1 = f1_score(p.label_ids, predicted_ids, average="weighted")
    return {"accuracy": accuracy, "f1": weighted_f1}

# Build the Trainer that fine-tunes `model` on the full tokenized dataset.
# NOTE(review): no eval_dataset is passed here and training_args sets no
# evaluation strategy, so compute_metrics presumably never runs during train().
# Confirm whether the uploaded test-set CSV was meant to be evaluated.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    compute_metrics=compute_metrics,
)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
# Fine-tune the model; the logged training loss per step is shown in the cell output.
trainer.train()


# Save the fine-tuned weights and the tokenizer into the same directory so the
# pair can be reloaded by path.
model.save_pretrained("music_model")
tokenizer.save_pretrained("music_model")

from transformers import pipeline
# Reload the saved artifacts through a text-classification pipeline as a smoke test.
clf = pipeline("text-classification", model="music_model", tokenizer="music_model")

example = "The knight draws his blade as the enemy approaches."
# Last expression of the cell, so the prediction is displayed as the cell result.
clf(example)


Step,Training Loss
10,2.4996
20,2.4057
30,2.343
40,2.057
50,1.8704
60,1.7372
70,1.4301
80,1.3037
90,1.0862
100,0.9315


Device set to use cuda:0


[{'label': 'battle', 'score': 0.9266848564147949}]

In [None]:

from transformers import pipeline

# Load the fine-tuned classifier saved in the "music_model" directory.
clf = pipeline("text-classification", model="music_model", tokenizer="music_model")

examples = [
    "The knight draws his blade as the enemy approaches.",
    "They sat under the stars, enjoying a calm evening.",
    "Blood covered the arena as the champion stood victorious.",
    "She heard a whisper coming from the dark cellar.",
    "The lovers embraced before he went to war.",
    "The old wizard raised his staff and chanted in an ancient tongue.",
    "Villagers sang and danced during the harvest festival.",
    "He marched alone through the fog toward the looming castle."
]

# Print the top prediction for each example, separated by a blank line.
for text in examples:
    prediction = clf(text)[0]
    print(f"📝 Text: {text}")
    # Single backslash: the previous "\\n" printed a literal backslash-n after
    # every score instead of emitting a blank separator line.
    print(f"🎵 Predicted label: {prediction['label']} (score: {prediction['score']:.2f})\n")


Device set to use cuda:0


📝 Text: The knight draws his blade as the enemy approaches.
🎵 Predicted label: battle (score: 0.93)\n
📝 Text: They sat under the stars, enjoying a calm evening.
🎵 Predicted label: romance (score: 0.44)\n
📝 Text: Blood covered the arena as the champion stood victorious.
🎵 Predicted label: victory (score: 0.59)\n
📝 Text: She heard a whisper coming from the dark cellar.
🎵 Predicted label: horror (score: 0.58)\n
📝 Text: The lovers embraced before he went to war.
🎵 Predicted label: romance (score: 0.83)\n
📝 Text: The old wizard raised his staff and chanted in an ancient tongue.
🎵 Predicted label: magic (score: 0.88)\n
📝 Text: Villagers sang and danced during the harvest festival.
🎵 Predicted label: happy (score: 0.84)\n
📝 Text: He marched alone through the fog toward the looming castle.
🎵 Predicted label: travel (score: 0.45)\n


In [None]:


import torch.nn.functional as F
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch


# Reload the saved model and tokenizer from the "music_model" directory. This
# rebinds the notebook-level `model` and `tokenizer` names.
model = AutoModelForSequenceClassification.from_pretrained("music_model")
tokenizer = AutoTokenizer.from_pretrained("music_model")


def predict_top_labels(text, top_n=2):
    """Return the ``top_n`` most probable labels for ``text``.

    Uses the notebook-level ``tokenizer`` and ``model``.

    Args:
        text: Input string to classify.
        top_n: Number of labels to return. Capped at the number of classes the
            model outputs, so an oversized value returns every label instead
            of raising inside ``torch.topk``.

    Returns:
        List of ``(label, probability)`` tuples, highest probability first.
    """
    # Truncate so text longer than the model's maximum sequence length does not
    # crash the forward pass.
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    with torch.no_grad():
        logits = model(**inputs).logits
        # Single input, so take row 0 of the (1, num_classes) probabilities.
        probs = F.softmax(logits, dim=-1)[0]

    k = min(top_n, probs.shape[-1])
    values, indices = torch.topk(probs, k=k)
    return [
        (model.config.id2label[class_id], prob)
        for class_id, prob in zip(indices.tolist(), values.tolist())
    ]


# Sample sentences for eyeballing the two strongest predictions per text.
test_texts = [
    "They sat under the stars, enjoying a calm evening.",
    "The knight draws his blade as the enemy approaches.",
    "She boarded the airship as the city shrank below her.",
    "He stepped carefully over bones that cracked underfoot.",
    "They held hands in silence, both smiling at nothing.",
]

for text in test_texts:
    print(f"📝 {text}")
    top_two = predict_top_labels(text, top_n=2)
    for label, score in top_two:
        print(f"🎵 {label} (score: {score:.2f})")
    # Blank line between sentences.
    print()

📝 They sat under the stars, enjoying a calm evening.
🎵 romance (score: 0.44)
🎵 calm (score: 0.35)

📝 The knight draws his blade as the enemy approaches.
🎵 battle (score: 0.93)
🎵 tense (score: 0.02)

📝 She boarded the airship as the city shrank below her.
🎵 travel (score: 0.93)
🎵 tense (score: 0.01)

📝 He stepped carefully over bones that cracked underfoot.
🎵 dark (score: 0.89)
🎵 horror (score: 0.02)

📝 They held hands in silence, both smiling at nothing.
🎵 romance (score: 0.87)
🎵 sad (score: 0.03)

