In [1]:
import pandas as pd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from huggingface_hub import login
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer
from datasets import load_dataset

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cuda


In [16]:
path = "../data/musiccaps_with_concept_tags/train.csv"
df = pd.read_csv(path) 
df

Unnamed: 0,caption,aspect_list,tempo_tags,genre_tags,mood_tags,instrument_tags
0,The low quality recording features a live perf...,"low quality, mono, noisy, live performance, ba...",uptempo,country,"passionate, energetic, energetic drums","electric guitar solo melody, banjo solo melody..."
1,This audio contains someone playing a big acou...,"amateur recording, acoustic drums, e-bass, ele...",slower tempo,funky,funky,"acoustic drums, electric organ"
2,A female singer sings this animated melody. Th...,"church bells, melancholic, percussion shakers,...","dance rhythm, medium tempo, groovy rhythm, ste...","animated movie soundtrack, folk music","romantic, melancholic, emotional, passionate s...","keyboard harmony, percussion shakers, church b..."
3,A male singer sings this cheerful children’s s...,"reggae music, male singer, children’s song, gr...",dance rhythm,"reggae music, reggae band, movie soundtrack, r...","exciting, cheerful",guitar accompaniment
4,A female vocalist sings this enigmatic electro...,"female vocalist, medium tempo, electronic musi...",medium tempo,"ambient, synth pop, edm, world music","charming, meditative","synthesiser articulation, synth pop"
...,...,...,...,...,...,...
813,A female vocalist sings this groovy pop song. ...,"female singer, fast tempo, lively accordion, s...",fast tempo,"teen pop, contemporary pop, pop hits",energetic,"synthesiser arpeggio, lively accordion, keyboa..."
814,This rock and roll song features a male voice ...,"rock and roll, male voice, overdriven guitar, ...",moderate tempo,rock and roll,happy mood,"percussion, overdriven guitar"
815,This music is instrumental. The tempo is mediu...,"instrumental, medium fast tempo, propulsive be...",medium fast tempo,"synth pop, dance music, techno music, house","energetic, upbeat","synthesiser arrangement, synth pop, dj mixer, ..."
816,This is a Russian lullaby piece with classical...,"russian, lullaby, classical, male vocal, mediu...",medium-to-high pitch singing,classical,"haunting, melancholic","piano, strings"


In [11]:

# Format function for instruction dataset
def formatting_func(row):
    text = f"""
    <|begin_of_text|><|start_header_id|>system<|end_header_id|>You are a music description expert that creates detailed, technical descriptions of songs based on their musical characteristics.<|eot_id|>
    <|start_header_id|>user<|end_header_id|>Generate a detailed song description based on the following tags:{row['aspect_list']}<|eot_id|>
    <|start_header_id|>assistant<|end_header_id|>{row['caption']}<|eot_id|>
    """
    return text

In [18]:
def prepare_dataset():
    dataset = load_dataset('csv', data_files={'train': "../data/musiccaps_with_concept_tags/train.csv", 'validation': "../data/musiccaps_with_concept_tags/valid.csv"})
    dataset = dataset.map(lambda row: {'text': formatting_func(row)}, remove_columns=dataset['train'].column_names)
    return dataset

In [5]:
# Configure 4-bit quantization for memory efficiency
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,  # Double quantization for extra memory savings
    bnb_4bit_quant_type="nf4",  # NormalFloat 4-bit for better performance
    bnb_4bit_compute_dtype=torch.bfloat16  # Computation in bfloat16
)

# Load model with quantization
# model_name = "meta-llama/Llama-3.1-8B-Instruct"
model_name = "mistralai/Mistral-7B-Instruct-v0.3"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
# LoRA configuration - these are proven hyperparameters
lora_config = LoraConfig(
    r=16,  # LoRA rank - controls trainable parameters (8, 16, 32, or 64)
    lora_alpha=32,  # Scaling factor - typically 2*r
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",  # Attention layers
        "gate_proj", "up_proj", "down_proj"  # MLP layers
    ],
    lora_dropout=0.05,  # Regularization
    bias="none",
    task_type="CAUSAL_LM"
)

# Prepare model for k-bit training
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

# Print trainable parameters
model.print_trainable_parameters()
# Expected output: trainable params: ~4M (0.05%) || all params: ~8B


trainable params: 41,943,040 || all params: 7,289,966,592 || trainable%: 0.5754


In [8]:
training_args = TrainingArguments(
    per_device_train_batch_size = 8,  # Controls the batch size per device
    gradient_accumulation_steps = 2,  # Accumulates gradients to simulate a larger batch
    warmup_steps = 5,
    learning_rate = 2e-4,             # Sets the learning rate for optimization
    num_train_epochs = 3,
    fp16 = not torch.cuda.is_bf16_supported(),
    bf16 = torch.cuda.is_bf16_supported(),
    optim = "adamw_8bit",
    weight_decay = 0.01,              # Regularization term for preventing overfitting
    lr_scheduler_type = "linear",     # Chooses a linear learning rate decay
    seed = 3407,                        
    output_dir = "outputs",             
    report_to = "wandb",              # Enables Weights & Biases (W&B) logging
    logging_steps = 1,                # Sets frequency of logging to W&B
    logging_strategy = "steps",       # Logs metrics at each specified step
    save_strategy = "no",               
    load_best_model_at_end = True,    # Loads the best model at the end
    save_only_model = False           # Saves entire model, not only weights
)

In [23]:
# Initialize trainer
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=prepare_dataset()["train"],
    eval_dataset=prepare_dataset()["validation"],
    peft_config=lora_config,
    processing_class=tokenizer,
)

# Start training
trainer.train()

# Save final model
trainer.save_model()




Adding EOS to train dataset:   0%|          | 0/818 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/818 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/818 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/91 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/91 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/91 [00:00<?, ? examples/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 2}.
[34m[1mwandb[0m: Currently logged in as: [33mofficialdespacito420[0m ([33mofficialdespacito420-politechnika-warszawska[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


  return fn(*args, **kwargs)


OutOfMemoryError: CUDA out of memory. Tried to allocate 224.00 MiB. GPU 0 has a total capacity of 7.60 GiB of which 85.12 MiB is free. Including non-PyTorch memory, this process has 6.70 GiB memory in use. Of the allocated memory 6.18 GiB is allocated by PyTorch, and 383.25 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [35]:
import evaluate

# Load metrics
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")

def evaluate_model(model, tokenizer, test_dataset):
    predictions = []
    references = []
    
    for example in test_dataset:
        print(example)
        # Generate prediction
        input_text = f"You are a music description expert that creates detailed, technical descriptions of songs based on their musical characteristics.\n{example['aspect_list']}"
        inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
        outputs = model.generate(**inputs, max_new_tokens=256, temperature=0.7)
        pred = tokenizer.decode(outputs[0], skip_special_tokens=True)
        predictions.append(pred)
        references.append(example['caption'])
    
    # Calculate metrics
    results = {
        "bleu": bleu.compute(predictions=predictions, references=[[r] for r in references]),
        "rouge": rouge.compute(predictions=predictions, references=references),
        "bertscore": bertscore.compute(predictions=predictions, references=references, lang="en")
    }
    
    return results


In [36]:
# Evaluate the model
df = pd.read_csv("../data/musiccaps_with_concept_tags/valid.csv")
metrics = evaluate_model(model, tokenizer, df.to_dict(orient='records'))
print("Evaluation Metrics:")
for metric_name, metric_value in metrics.items():
    print(f"{metric_name}: {metric_value}")

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
Caching is incompatible with gradient checkpointing in MistralDecoderLayer. Setting `past_key_values=None`.


{'caption': 'A male vocalist sings this lilting melody. The tempo is medium with keyboard harmony, groovy bass line, steady drumming and mandolin harmony. The song is emotional, romantic, soulful, sentimental and soothing. This song is Regional Pop,', 'aspect_list': 'male singer, medium tempo, foreign language, romantic, soulful, mandolin, steady drumming, soulfulromantic, soulfu, soothing, calm, soulful, emotional, love song, folk song, world music, funky bass line, keyboard harmony, acoustic guitar', 'tempo_tags': 'medium tempo', 'genre_tags': 'soulfulromantic, world music, folk song, soulfu, funky bass line, soulful', 'mood_tags': 'romantic, soulfulromantic, emotional, soothing, funky bass line, soulful', 'instrument_tags': 'keyboard harmony, mandolin, acoustic guitar'}


  return fn(*args, **kwargs)


OutOfMemoryError: CUDA out of memory. Tried to allocate 224.00 MiB. GPU 0 has a total capacity of 7.60 GiB of which 128.81 MiB is free. Including non-PyTorch memory, this process has 6.70 GiB memory in use. Of the allocated memory 6.19 GiB is allocated by PyTorch, and 376.55 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)