# Fine tune large language model

Within this challenge, we are tasked with the development of AI systems capable of receiving textual descriptions as input and generating high-quality audio wave files as output. These AI systems will craft customized background music, considering various elements such as melody, hits, styles, and more to evoke the intended emotional and contextual resonance.

This notebook adapts the fine-tuning example from the Ray documentation:
https://docs.ray.io/en/latest/train/examples/lightning/vicuna_13b_lightning_deepspeed_finetune.html

# Installation and Setup Environment

In [None]:
%%capture
!pip3 install ray
!pip3 install datasets
!pip3 install transformer
!pip3 install numpy datasets "transformers>=4.19.1" "pytorch_lightning>=1.6.5"
!pip3 install accelerate
!pip3 install lightning
!pip3 install deepspeed
!pip3 install fad_pytorch

In [5]:
import ray
import re
import ray
import json
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForQuestionAnswering
from datasets import concatenate_datasets, load_dataset

# Run-wide configuration.
# NOTE(review): NUM_WORKERS and BATCH_SIZE_PER_WORKER are not referenced by any
# other cell visible in this notebook (the DataLoaders and TrainingArguments
# hard-code their own batch sizes) — presumably left over from the Ray example;
# confirm before relying on them.
NUM_WORKERS = 2
BATCH_SIZE_PER_WORKER = 8
# Hugging Face checkpoint name; used to load both the description tokenizer and
# the seq2seq model.
MODEL_NAME = "Helsinki-NLP/opus-mt-en-fr"

In [43]:
# Stop any Ray instance left over from a previous run of this cell, so the
# init below always starts from a clean state.
ray.shutdown()

# Pip requirements passed to Ray through the runtime environment.
WORKER_PIP_PACKAGES = [
    "datasets==2.13.1",
    "torch>=1.13.0",
    "deepspeed==0.9.4",
    "accelerate>=0.20.3",
    "transformers==4.30.2",
    "lightning==2.0.3",
]

# Kept as the last expression so the notebook displays the Ray context summary.
ray.init(
    runtime_env={"pip": WORKER_PIP_PACKAGES},
    ignore_reinit_error=True,
)

2023-11-12 17:47:44,876	INFO worker.py:1673 -- Started a local Ray instance.


0,1
Python version:,3.11.1
Ray version:,2.8.0


# Get Dataset and preprocessing

Load the dataset from a JSON file.

In [16]:
from datasets import Dataset, DatasetDict

# Read the raw samples: a JSON object whose values each hold a 'description'
# and an 'audio' entry.
with open("test_train.json", 'r') as json_file:
    data = json.load(json_file)

# Pivot the per-sample records into one list per column, the layout that
# Dataset.from_dict takes.
samples = list(data.values())
dict_dataset = {
    "description": [sample['description'] for sample in samples],
    "audio": [sample['audio'] for sample in samples],
}

# Build the datasets.Dataset; extra metadata such as features can be passed
# here as well.
my_dataset = Dataset.from_dict(dict_dataset, split='train')

# Optional: persist the dataset and load it back later.
# my_dataset.save_to_disk("my_dataset")
# loaded_dataset = Dataset.load_from_disk("my_dataset")



In [17]:
my_dataset

Dataset({
    features: ['description', 'audio'],
    num_rows: 20
})

# Preprocess inputs and labels into the appropriate format

In [6]:
from miditok import REMI, TokenizerConfig  # here we choose to use REMI
from pathlib import Path

# Our parameters: keyword arguments unpacked into miditok's TokenizerConfig below.
# NOTE(review): these key names must match the TokenizerConfig signature of the
# installed miditok version, and the install cell does not pin miditok — confirm.
TOKENIZER_PARAMS = {
    "pitch_range": (21, 109),
    "beat_res": {(0, 4): 8, (4, 12): 4},
    "nb_velocities": 32,
    "special_tokens": ["PAD", "BOS", "EOS", "MASK"],
    "use_chords": True,
    "use_rests": False,
    "use_tempos": True,
    "use_time_signatures": False,
    "use_programs": False,
    "nb_tempos": 32,  # nb of tempo bins
    "tempo_range": (40, 250),  # (min, max)
}
# Configuration object later passed to REMI(...) to build the music tokenizer.
config = TokenizerConfig(**TOKENIZER_PARAMS)

In [8]:
# TODO: include the tokenizer for input and output, and code to get url
from miditoolkit import MidiFile
from basic_pitch.inference import predict, predict_and_save
from basic_pitch import ICASSP_2022_MODEL_PATH
import os
# Text tokenizer for the descriptions, loaded from the same checkpoint name
# (MODEL_NAME) as the model.
tokenizer_description = AutoTokenizer.from_pretrained(MODEL_NAME)
# REMI tokenizer built from `config`; used to turn MIDI files into token ids.
tokenizer_music = REMI(config)
# Directories (relative to the working directory) holding the transcribed MIDI
# files and the source MP3 files.
midi_paths = "audio/mid/"
mp3_paths = "audio/mp3/"


def preprocess_description_music(description: str, music: str):
    '''
    Turn one (description, audio) sample into token ids for training.

    The description is tokenized with ``tokenizer_description``. The audio is
    transcribed to MIDI with basic-pitch (skipped when the MIDI file already
    exists in ``midi_paths``) and that MIDI is tokenized with
    ``tokenizer_music``.

    args:
      - description(str): text describing the music
      - music(str): path of the sample's MP3 file, e.g. "audio/mp3/<name>.mp3";
        only the file name is used, the file itself is read from ``mp3_paths``

    returns:
      - dict with two entries:
          "input_ids": list of integer, token ids of the description
          "labels": list of integer, ids of the first token sequence of the MIDI

    raises:
      - FileNotFoundError: the MP3 file is missing, or the transcription did
        not produce the expected MIDI file
    '''
    # NOTE(review): tokenize() + convert_tokens_to_ids() appends no special
    # tokens (e.g. no end-of-sequence id) — confirm that is what the model expects.
    description_token = tokenizer_description.tokenize(description)
    description_preprocess = tokenizer_description.convert_tokens_to_ids(
        description_token)

    # "audio/mp3/<name>.mp3" -> "<name>"
    base_name = Path(music).stem
    mp3_file = Path(mp3_paths) / (base_name + ".mp3")
    # basic-pitch names its output "<name>_basic_pitch.mid"
    midi_file = Path(midi_paths) / (base_name + "_basic_pitch.mid")

    # Transcribe only once; later runs reuse the MIDI file already on disk.
    if not midi_file.exists():
        if not mp3_file.exists():
            raise FileNotFoundError(
                "MP3 file not found for sample: " + str(mp3_file))
        os.makedirs(midi_paths, exist_ok=True)
        predict_and_save([mp3_file], Path(midi_paths), save_midi=True,
                         sonify_midi=False, model_path=ICASSP_2022_MODEL_PATH,
                         save_model_outputs=False, save_notes=False)
        if not midi_file.exists():
            raise FileNotFoundError(
                "Transcription did not produce: " + str(midi_file))

    midiFile = MidiFile(str(midi_file))
    tokens = tokenizer_music(midiFile)
    music_preprocess = tokens[0].ids
    return {"input_ids": description_preprocess, "labels": music_preprocess}

Downloading (…)okenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

Downloading (…)olve/main/source.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

Downloading (…)olve/main/target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]



# Process dataset

In [19]:
def _encode_example(example):
    """Tokenize one row's description and audio into input_ids / labels."""
    return preprocess_description_music(
        example['description'], example['audio'])


# Swap the raw columns for token ids, then hold out 20% of the rows.
processed_dataset = (
    my_dataset
    .map(_encode_example, remove_columns=['description', 'audio'])
    .train_test_split(train_size=0.8, seed=20)
)
# The held-out split is used for validation, so store it under that name.
processed_dataset["validation"] = processed_dataset.pop("test")

Map:   0%|          | 0/20 [00:00<?, ? examples/s]


Predicting MIDI for audio/mp3/1699168556.1432111.mp3...


  Creating midi...
  💅 Saved to audio/mid/1699168556.1432111_basic_pitch.mid

Predicting MIDI for audio/mp3/1699168565.7955616.mp3...


  Creating midi...
  💅 Saved to audio/mid/1699168565.7955616_basic_pitch.mid

Predicting MIDI for audio/mp3/1699168589.6105175.mp3...


  Creating midi...
  💅 Saved to audio/mid/1699168589.6105175_basic_pitch.mid

Predicting MIDI for audio/mp3/1699168567.8103771.mp3...


  Creating midi...
  💅 Saved to audio/mid/1699168567.8103771_basic_pitch.mid

Predicting MIDI for audio/mp3/1699168575.4547818.mp3...


  Creating midi...
  💅 Saved to audio/mid/1699168575.4547818_basic_pitch.mid

Predicting MIDI for audio/mp3/1699168583.339253.mp3...


  Creating midi...
  💅 Saved to audio/mid/1699168583.339253_basic_pitch.mid

Predicting MIDI for audio/mp3/1699168581.0166345.mp3...


  Creating midi...
  💅 Saved to audio/mid/1699168581.0166345_basic_pitch.mid

Predicting MIDI for audio/mp3/1699168579.4417906.

# Model

In [41]:
from transformers import AutoModelForSeq2SeqLM

# Load the pretrained seq2seq checkpoint named by MODEL_NAME; this is the
# model the training cells below fine-tune.
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

Define a collate function that applies the correct amount of padding to the dataset items we want to batch together.

In [42]:
from transformers import DataCollatorForSeq2Seq

# Batch collator built from the description tokenizer and the model.
# NOTE(review): the "labels" in this dataset are REMI music-token ids, not ids
# from this tokenizer's vocabulary — confirm the model's output vocabulary
# covers them.
data_collator = DataCollatorForSeq2Seq(tokenizer_description, model=model)

Create the training and evaluation DataLoaders.

In [21]:
from torch.utils.data import DataLoader

# Options shared by the training and evaluation loaders.
_loader_options = dict(
    batch_size=4,              # samples per batch
    collate_fn=data_collator,  # pads each batch
    num_workers=4,             # data-loading subprocesses
    pin_memory=True,           # pinned host memory for faster transfer to GPU
)

# Training batches are reshuffled at every epoch.
train_dataloader = DataLoader(
    processed_dataset["train"],
    shuffle=True,
    **_loader_options,
)

# Evaluation batches keep the dataset order.
eval_dataloader = DataLoader(
    processed_dataset["validation"],
    **_loader_options,
)


# Evaluation

https://github.com/msight-tech/research-fad
https://github.com/LAION-AI/CLAP

# Training

In [43]:
from transformers import Trainer, TrainingArguments

# One interval (in optimizer steps) drives evaluation, logging and checkpointing.
STEP_INTERVAL = 5_000

# NOTE(review): the saved output of the training cell shows a single step
# (0/1) for this dataset, far fewer than warmup_steps and STEP_INTERVAL —
# presumably these values were copied from a much larger run; confirm.
args = TrainingArguments(
    output_dir="result",
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=8,
    learning_rate=5e-4,
    lr_scheduler_type="cosine",
    warmup_steps=1_000,
    weight_decay=0.1,
    evaluation_strategy="steps",
    eval_steps=STEP_INTERVAL,
    logging_steps=STEP_INTERVAL,
    save_steps=STEP_INTERVAL,
)

# Trainer over the tokenized splits, batching through the shared collator.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=processed_dataset["train"],
    eval_dataset=processed_dataset["validation"],
    data_collator=data_collator,
)

In [44]:
trainer.train()

  0%|          | 0/1 [00:00<?, ?it/s]

KeyboardInterrupt: 

Alternative training code: a manual training loop using Accelerate

In [None]:
# transformers.AdamW is deprecated in favour of the PyTorch implementation and
# is not importable from recent transformers releases, so use torch's AdamW.
from torch.optim import AdamW

# weight_decay and eps are set explicitly to the defaults of the transformers
# optimizer this replaces; torch's own defaults differ.
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.0, eps=1e-6)

In [None]:
from accelerate import Accelerator

# Accelerate wrapper used by the manual training loop below.
accelerator = Accelerator()
# prepare() hands back the wrapped objects in the same order they were passed
# in; the original names are rebound to the prepared versions.
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

In [None]:
from transformers import get_scheduler

# Total number of optimizer steps: the loop below steps once per batch, so one
# epoch contributes len(train_dataloader) updates.
num_train_epochs = 3
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

# Linear schedule without warmup, spanning all training steps.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [None]:
from tqdm.auto import tqdm
import torch

# Where the fine-tuned model and tokenizer are written; the same directory the
# Trainer-based cell above uses as its output_dir.
output_dir = "result"


def postprocess(predictions, labels):
    """Turn gathered id tensors into plain Python lists without padding.

    args:
      - predictions: tensor of generated token ids, one row per sample, padded
        with the description tokenizer's pad id
      - labels: tensor of reference token ids, one row per sample, padded
        with -100

    returns:
      - (prediction_ids, label_ids): two lists of lists of integer
    """
    pad_id = tokenizer_description.pad_token_id
    prediction_ids = [
        [token for token in row if token != pad_id]
        for row in predictions.cpu().tolist()
    ]
    label_ids = [
        [token for token in row if token != -100]
        for row in labels.cpu().tolist()
    ]
    return prediction_ids, label_ids


progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Training: one optimizer and scheduler step per batch
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation: generate for every validation batch and keep the ids.
    # TODO: score eval_predictions against eval_references once a metric is
    # chosen (none is defined anywhere in the visible notebook).
    model.eval()
    eval_predictions = []
    eval_references = []
    for batch in tqdm(eval_dataloader):
        with torch.no_grad():
            generated_tokens = accelerator.unwrap_model(model).generate(
                batch["input_ids"],
                attention_mask=batch["attention_mask"],
                max_length=128,
            )
        labels = batch["labels"]

        # Necessary to pad predictions and labels for being gathered
        generated_tokens = accelerator.pad_across_processes(
            generated_tokens, dim=1, pad_index=tokenizer_description.pad_token_id
        )
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

        predictions_gathered = accelerator.gather(generated_tokens)
        labels_gathered = accelerator.gather(labels)

        decoded_preds, decoded_labels = postprocess(predictions_gathered, labels_gathered)
        eval_predictions.extend(decoded_preds)
        eval_references.extend(decoded_labels)

    # Save after every epoch; only the main process writes the tokenizer.
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer_description.save_pretrained(output_dir)

# Demonstration

In [None]:
from transformers import AutoModelForSeq2SeqLM

# Load the pretrained checkpoint named by MODEL_NAME.
# NOTE(review): this reloads the original checkpoint, not the weights saved by
# the training cells above — presumably the demonstration should load the
# fine-tuned model from the training output directory; confirm.
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

# Scratch (for testing code)

In [2]:
import librosa
from IPython.display import Audio, display, clear_output
import ipywidgets as widgets

# Path of the audio file to analyse (relative to the working directory)
filename = "we_wish_you.mp3"

# Load the audio as a waveform `y`
# Store the sampling rate as `sr`
y, sr = librosa.load(filename)

# Run the default beat tracker
tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr)

# Convert the frame indices of beat events into timestamps
beat_times = librosa.frames_to_time(beat_frames, sr=sr)

# Show an inline audio player for the loaded waveform (no button or widget is
# involved; `tempo` and `beat_times` are computed but not displayed here)
clear_output(wait=True)  # Clear the previous output, if any
display(Audio(y, rate=sr))  # Play the audio

Estimated tempo: 184.57 beats per minute


https://www.analyticsvidhya.com/blog/2023/09/text-to-sound-train-your-large-language-models/

In [7]:
from miditok import REMI, TokenizerConfig  # here we choose to use REMI

# Our parameters
# NOTE: this repeats, value for value, the TOKENIZER_PARAMS defined in the
# preprocessing section and rebinds both TOKENIZER_PARAMS and config.
TOKENIZER_PARAMS = {
    "pitch_range": (21, 109),
    "beat_res": {(0, 4): 8, (4, 12): 4},
    "nb_velocities": 32,
    "special_tokens": ["PAD", "BOS", "EOS", "MASK"],
    "use_chords": True,
    "use_rests": False,
    "use_tempos": True,
    "use_time_signatures": False,
    "use_programs": False,
    "nb_tempos": 32,  # nb of tempo bins
    "tempo_range": (40, 250),  # (min, max)
}
config = TokenizerConfig(**TOKENIZER_PARAMS)

# Creates the tokenizer (bound to the generic name `tokenizer`, which later
# scratch cells rebind to other tokenizers)
tokenizer = REMI(config)

In [28]:
from miditoolkit import MidiFile

# Tokenize a MIDI file
# NOTE(review): absolute, machine-specific path — this cell only runs where
# that file exists; consider a path relative to the repository.
midi = MidiFile("/Users/khoavo2003/Documents/GitHub/muze/midi_dataset/we_wish_you.mid")
tokens = tokenizer(midi)  # automatically detects MidiFile, paths

In [32]:
tokens[0].ids

[4,
 200,
 58,
 105,
 127,
 206,
 15,
 111,
 143,
 27,
 109,
 133,
 63,
 105,
 129,
 211,
 63,
 108,
 127,
 216,
 63,
 106,
 127,
 219,
 62,
 107,
 127,
 4,
 189,
 60,
 105,
 130,
 191,
 20,
 112,
 138,
 193,
 8,
 109,
 134,
 195,
 60,
 108,
 129,
 200,
 60,
 106,
 134,
 205,
 65,
 106,
 130,
 206,
 17,
 114,
 139,
 211,
 65,
 108,
 126,
 214,
 60,
 105,
 128,
 67,
 111,
 126,
 219,
 63,
 107,
 126,
 4,
 189,
 22,
 112,
 140,
 62,
 109,
 128,
 192,
 10,
 109,
 136,
 195,
 58,
 113,
 129,
 200,
 58,
 109,
 137,
 205,
 19,
 111,
 140,
 206,
 67,
 110,
 129,
 210,
 7,
 107,
 133,
 211,
 67,
 110,
 126,
 214,
 68,
 106,
 128,
 218,
 12,
 102,
 127,
 219,
 65,
 107,
 126,
 4,
 189,
 12,
 112,
 144,
 63,
 105,
 129,
 190,
 24,
 113,
 143,
 194,
 60,
 110,
 130,
 200,
 58,
 112,
 127,
 203,
 58,
 109,
 130,
 205,
 17,
 115,
 135,
 60,
 105,
 134,
 211,
 65,
 108,
 129,
 212,
 58,
 103,
 131,
 216,
 22,
 111,
 129,
 62,
 106,
 127,
 4,
 189,
 15,
 112,
 139,
 190,
 27,
 108,
 138,
 201,
 39,
 

In [33]:
# Map the ids of the first token sequence back to their tokens.
# NOTE(review): `_ids_to_tokens` has a leading underscore, i.e. it is a private
# miditok method that may change between versions.
real_tokens = tokenizer._ids_to_tokens(tokens[0].ids)

In [35]:
# Call the tokenizer on the token list; the result is dumped to a .mid file in
# the next cell, so this presumably decodes the tokens back into a MIDI
# object — confirm against the miditok version in use.
out_midi = tokenizer([real_tokens])

In [37]:
# Write the reconstructed MIDI to the working directory.
out_midi.dump('we_wish_you123.mid')

In [None]:
from miditok import REMI, TokSequence
from copy import deepcopy

tokenizer = REMI()  # using defaults parameters (constants.py)
# NOTE(review): 'path/to/dataset' is a placeholder, so this list is empty unless
# such a directory exists. `Path` is not imported in this cell; it relies on the
# pathlib import in the tokenizer-config cell earlier in the notebook.
tokens_no_bpe_paths = list(Path('path', 'to', 'dataset').glob('**/*.json'))


In [26]:
tokens

[TokSequence(tokens=['Bar_None', 'Position_0', 'Tempo_121.29', 'Position_11', 'Pitch_74', 'Velocity_51', 'Duration_0.3.8', 'Position_17', 'Pitch_31', 'Velocity_75', 'Duration_2.3.8', 'Pitch_43', 'Velocity_67', 'Duration_1.1.8', 'Pitch_79', 'Velocity_51', 'Duration_0.5.8', 'Position_22', 'Pitch_79', 'Velocity_63', 'Duration_0.3.8', 'Position_27', 'Pitch_79', 'Velocity_55', 'Duration_0.3.8', 'Position_30', 'Pitch_78', 'Velocity_59', 'Duration_0.3.8', 'Bar_None', 'Position_0', 'Pitch_76', 'Velocity_51', 'Duration_0.6.8', 'Position_2', 'Pitch_36', 'Velocity_79', 'Duration_1.6.8', 'Position_4', 'Pitch_24', 'Velocity_67', 'Duration_1.2.8', 'Position_6', 'Pitch_76', 'Velocity_63', 'Duration_0.5.8', 'Position_11', 'Pitch_76', 'Velocity_55', 'Duration_1.2.8', 'Position_16', 'Pitch_81', 'Velocity_55', 'Duration_0.6.8', 'Position_17', 'Pitch_33', 'Velocity_87', 'Duration_1.7.8', 'Position_22', 'Pitch_81', 'Velocity_63', 'Duration_0.2.8', 'Position_25', 'Pitch_76', 'Velocity_51', 'Duration_0.4.8',

In [26]:
from transformers import GPT2Tokenizer, GPT2Model
import torch

# Scratch: rebinds the shared names `tokenizer` and `model` to GPT-2, replacing
# whatever earlier cells assigned to them.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')

# Encode a sample sentence as PyTorch tensors
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")

Downloading model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

In [27]:
inputs

{'input_ids': tensor([[15496,    11,   616,  3290,   318, 13779]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}

In [28]:
outputs = model(**inputs)

last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple

In [33]:
outputs

BaseModelOutputWithPastAndCrossAttentions(last_hidden_state=tensor([[[-9.4714e-06, -1.4021e-01, -2.0845e-01,  ..., -1.5329e-01,
          -6.7827e-02, -1.9630e-01],
         [ 4.1949e-01,  2.3525e-01,  3.4816e-01,  ...,  4.5321e-02,
           1.5447e-01,  1.9547e-02],
         [-7.0056e-02,  2.6082e-01, -2.9146e-01,  ...,  9.0978e-02,
           4.9659e-01, -4.1824e-01],
         [-1.9695e-01, -2.9247e-01, -1.4120e-01,  ..., -8.9255e-02,
          -2.2392e-01,  1.2212e-01],
         [-6.4193e-01, -1.0236e-01, -4.2129e-01,  ...,  6.8696e-02,
          -5.1117e-01,  5.0044e-01],
         [ 4.1286e-03, -3.1455e-02, -1.0823e+00,  ..., -5.0159e-02,
          -3.0879e-02,  4.3480e-01]]], grad_fn=<ViewBackward0>), past_key_values=((tensor([[[[-1.2526,  2.3200,  0.1722,  ..., -1.0076, -0.1897,  1.3219],
          [-1.6482,  3.0222,  1.2789,  ..., -0.9078, -1.7395,  2.4237],
          [-1.8892,  2.4222,  2.5229,  ..., -1.4062, -1.9514,  1.7598],
          [-1.4859,  3.7323,  1.5158,  ..., -1.4

In [37]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Scratch: rebinds `tokenizer` and `model` to GPT-2 with a language-modeling head.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

inputs = tokenizer("Hello", return_tensors="pt")
# The input ids are passed as labels as well, and the first two outputs are
# unpacked as the loss and the logits.
outputs = model(**inputs, labels=inputs["input_ids"])
loss, logits = outputs[:2]

In [48]:
inputs

{'input_ids': tensor([[15496]]), 'attention_mask': tensor([[1]])}

In [50]:
model.device

device(type='mps', index=0)

In [59]:
inputs['input_ids'] = inputs['input_ids'].to('mps')

In [60]:
inputs['attention_mask'] = inputs['attention_mask'].to('mps')

In [61]:
inputs

{'input_ids': tensor([[15496]], device='mps:0'), 'attention_mask': tensor([[1]], device='mps:0')}

In [63]:
from transformers import AutoProcessor, MusicgenForConditionalGeneration
from datasets import load_dataset

# Scratch: MusicGen generation conditioned on both audio and text prompts.
# Rebinds the shared names `model`, `inputs` and `dataset`.
processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")

# Stream the dataset and take the audio of its first example
dataset = load_dataset("sanchit-gandhi/gtzan", split="train", streaming=True)
sample = next(iter(dataset))["audio"]

# take the first quarter of the audio sample
sample_1 = sample["array"][: len(sample["array"]) // 4]

# take the first half of the audio sample
sample_2 = sample["array"][: len(sample["array"]) // 2]

# One text prompt per audio prompt; padding=True because the two clips differ in length
inputs = processor(
    audio=[sample_1, sample_2],
    sampling_rate=sample["sampling_rate"],
    text=["80s blues track with groovy saxophone", "90s rock song with loud guitars and heavy drums"],
    padding=True,
    return_tensors="pt",
)
audio_values = model.generate(**inputs, do_sample=True, guidance_scale=3, max_new_tokens=256)

# post-process to remove padding from the batched audio
audio_values = processor.batch_decode(audio_values, padding_mask=inputs.padding_mask)

Downloading (…)rocessor_config.json:   0%|          | 0.00/275 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.37k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/7.87k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/2.36G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/224 [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/703 [00:00<?, ?B/s]

In [65]:
# Iterate through the whole streaming dataset doing nothing; the only effect
# is that `sample` is left bound to the last example, which later cells read.
for sample in dataset:
  pass

In [66]:
dataset

<datasets.iterable_dataset.IterableDataset at 0x32e30e410>

In [69]:
sample

{'file': '/home/sanchit/.cache/datasets/downloads/extracted/f729783d70a4541cc4c9d5649655490a9c660280bdbecddfe38a8a806c73f60e/genres/rock/rock.00099.wav',
 'audio': {'path': 'rock.00099.wav',
  'array': array([-0.02034685, -0.03268729, -0.03488842, ...,  0.0017763 ,
         -0.0013658 , -0.00144724]),
  'sampling_rate': 32000},
 'genre': 9}

In [84]:
# Text-only prompts (no audio is passed); the second prompt is an empty string.
# The sampling rate is read from the last dataset example left in `sample`.
inputs = processor(
    sampling_rate=sample['audio']["sampling_rate"],
    text=["The recording features a mellow piano melody, synth pad chords and sustained strings melody. It sounds emotional, passionate and the recording is noisy", ""],
    padding=True,
    return_tensors="pt",
)

In [85]:
# Sample audio for the text prompts prepared in the previous cell.
audio_values = model.generate(**inputs, do_sample=True, guidance_scale=3, max_new_tokens=256)

# post-process to remove padding from the batched audio
# NOTE(review): the saved output of this cell is an AttributeError, and these
# inputs were built without audio, so there is no padding mask to apply —
# presumably this decode step is unnecessary for text-only prompts; confirm.
audio_values = processor.batch_decode(audio_values)

AttributeError: 

In [86]:
# post-process to remove padding from the batched audio
# NOTE(review): this repeats the decode call from the previous cell on the
# same `audio_values` name; running both cells would decode twice — confirm
# which one is intended and drop the other.
audio_values = processor.batch_decode(audio_values)

In [81]:
audio_values[0]

array([[ 0.0044059 ,  0.00513742,  0.0054669 , ..., -0.04087205,
        -0.04015592, -0.03886146]])

In [87]:
from IPython.display import Audio

# Imported here so the cell runs on a fresh kernel: numpy is otherwise only
# imported by a cell further down the notebook.
import numpy as np

# Play the first generated clip at the sampling rate of the model's audio encoder.
sampling_rate = model.config.audio_encoder.sampling_rate
Audio(np.array(audio_values[0]), rate=sampling_rate)

In [78]:
import numpy as np