# Fine tune large language model

Within this challenge, we are tasked with the development of AI systems capable of receiving textual descriptions as input and generating high-quality audio wave files as output. These AI systems will craft customized background music, considering various elements such as melody, hits, styles, and more to evoke the intended emotional and contextual resonance.

Using example from this website:
https://docs.ray.io/en/latest/train/examples/lightning/vicuna_13b_lightning_deepspeed_finetune.html

# Installation and Setup Environment

In [1]:
%%capture
!pip3 install ray
!pip3 install datasets
!pip3 install transformer
!pip3 install numpy datasets "transformers>=4.19.1" "pytorch_lightning>=1.6.5"
!pip3 install accelerate
!pip3 install lightning
!pip3 install deepspeed

!pip install miditok
!pip install miditoolkit
!pip install torch
!pip install torchtoolkit

from pathlib import Path
from copy import deepcopy

from torch import Tensor, argmax
from torch.utils.data import DataLoader
from torch.cuda import is_available as cuda_available, is_bf16_supported
from torch.backends.mps import is_available as mps_available
from torchtoolkit.data import create_subsets
from transformers import GPT2LMHeadModel, GPT2Config, Trainer, TrainingArguments, GenerationConfig
from evaluate import load as load_metric
from miditok import REMI, TokenizerConfig
from miditok.pytorch_data import DatasetTok, DataCollator
from tqdm import tqdm

In [2]:
import ray
import re
import ray
import json
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForQuestionAnswering
from datasets import concatenate_datasets, load_dataset

NUM_WORKERS = 2
BATCH_SIZE_PER_WORKER = 8
MODEL_NAME = "gpt2"

2023-11-11 16:02:05,986	INFO util.py:159 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [None]:
ray.shutdown()
ray.init(
    runtime_env={
        "pip": [
            "datasets==2.13.1",
            "torch>=1.13.0",
            "deepspeed==0.9.4",
            "accelerate>=0.20.3",
            "transformers==4.30.2",
            "lightning==2.0.3",
        ],
    },
    ignore_reinit_error=True,
)

# Get Dataset and preprocessing

get dataset from json file

In [None]:
from datasets import Dataset, DatasetDict

# Create a list of dictionaries where each dictionary represents a sample in the dataset
with open("/Users/khoavo2003/Documents/GitHub/muze/new_train.json", 'r') as json_file:
    data = json.load(json_file)

# reformat data
dict_dataset = {"description":[], "url": []}
for info in data.values():
  dict_dataset['description'].append(info['description'])
  dict_dataset['url'].append(info['url'])

# Create a datasets.Dataset instance
# You can also specify additional metadata such as features and split
my_dataset = Dataset.from_dict(dict_dataset, split='train')

# Save the dataset to a file (optional)
# dataset_dict.save_to_disk("my_dataset")

# Load the dataset from the saved file
# loaded_dataset = DatasetDict.load_from_disk("my_dataset")


In [27]:
my_dataset

Dataset({
    features: ['description', 'url'],
    num_rows: 10000
})

preprocess input and label to appropriate format

MidiTok config

In [5]:
# tokenizer's configuration
PITCH_RANGE = (21, 109)
BEAT_RES = {(0, 1): 8, (1, 2): 4, (2, 4): 2, (4, 8): 1}
NB_VELOCITIES = 24
SPECIAL_TOKENS = ["PAD", "MASK", "BOS", "EOS"]
USE_CHORDS = False
USE_RESTS = False
USE_TEMPOS = True
USE_TIME_SIGNATURE = False
USE_PROGRAMS = False
NB_TEMPOS = 32
TEMPO_RANGE = (50, 200)  # (min_tempo, max_tempo)
TOKENIZER_PARAMS = {
    "pitch_range": PITCH_RANGE,
    "beat_res": BEAT_RES,
    "nb_velocities": NB_VELOCITIES,
    "special_tokens": SPECIAL_TOKENS,
    "use_chords": USE_CHORDS,
    "use_rests": USE_RESTS,
    "use_tempos": USE_TEMPOS,
    "use_time_signatures": USE_TIME_SIGNATURE,
    "use_programs": USE_PROGRAMS,
    "nb_tempos": NB_TEMPOS,
    "tempo_range": TEMPO_RANGE,
}
config = TokenizerConfig(**TOKENIZER_PARAMS)

tokenizer = REMI(config)  # REMI tokenizer
tokenizer.tokenize_midi_dataset('we_wish_you.mid', 'tmp/')

Tokenizing MIDIs (tmp): 0it [00:00, ?it/s]


In [10]:
from miditoolkit import MidiFile
midi = MidiFile('we_wish_you.mid')
tokens = tokenizer(midi)

In [6]:
tokenizer.save_params(out_path='tmp/', filename='tokenizer_config.json')

In [8]:
tokenizer = REMI(params=Path('tmp/tokenizer_config.json'))

In [None]:
#TODO: include the tokenizer for input and output, and code to get url
tokenizer_input = AutoTokenizer.from_pretrained('gpt2')
tokenizer_output = AutoTokenizer.from_pretrained
def preprocess_input_output(input: str, output: str):
  '''
  function to preprocess input based on input and output tokenizer
  args:
    - input(str): string need to encode (description in this case)
    - output(str): url to the music
  '''
  input_token = tokenizer_input.tokenize(input)
  input_preprocess = tokenizer_input.convert_tokens_to_ids(input_token)
  #TODO: include code to get data from url, change it to appropriate format, and get preprocess output
  output_token = tokenizer_output.tokenize(output)
  output_preprocess = tokenizer_output.convert_tokens_to_ids(output_token)
  return input_preprocess, output_preprocess

In [None]:
processed_dataset = my_dataset.map(
    lambda example: preprocess_input_output(example['description'], example['url']),
    batched=True,
    remove_columns=['description', 'url'],  # Remove the original columns
)
# TODO: Rename columns to 'input_ids' and 'labels'
processed_dataset = processed_dataset.rename_column("0", "input_ids")
processed_dataset = processed_dataset.rename_column("1", "labels")

In [None]:
processed_dataset = processed_dataset.train_test_split(train_size=0.8, seed=20)
processed_dataset["validation"] = processed_dataset.pop("test")

split into train and evaluation set

# Preprocessing

change audio data into format that can be used to train and evaluate the model 

tokenize music 


# Model

In [None]:
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

 define a collate function that will apply the correct amount of padding to the items of the dataset we want to batch together.

In [None]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer_input, model=model)

In [None]:
from transformers import Seq2SeqTrainingArguments

args = Seq2SeqTrainingArguments(
    f"marian-finetuned-kde4-en-to-fr",
    evaluation_strategy="no",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=True,
)

create dataloader

In [None]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    processed_dataset["train"], shuffle=True, batch_size=8, collate_fn=data_collator, num_workers=4, pin_memory=True
)
eval_dataloader = DataLoader(
    processed_dataset["validation"], batch_size=8, collate_fn=data_collator, num_workers=4, pin_memory=True
)

# DeepSpeed Configurations

define hyperparameter

# Training

training loop

In [None]:
from transformers import Trainer, TrainingArguments

args = TrainingArguments(
    output_dir="result",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    evaluation_strategy="steps",
    eval_steps=5_000,
    logging_steps=5_000,
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    weight_decay=0.1,
    warmup_steps=1_000,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=5_000,
    fp16=True,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=processed_dataset["train"],
    eval_dataset=processed_dataset["valid"],
)

In [None]:
trainer.train()

# Demonstration

In [None]:
import torch
from transformers import pipeline

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
pipe = pipeline(
    "text-generation", model="huggingface-course/codeparrot-ds", device=device
)

In [None]:
import torch
import ray
import lightning.pytorch as pl
from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM
from accelerate import (
    init_empty_weights,
    infer_auto_device_map,
    load_checkpoint_and_dispatch,
)

# Initialize a model on meta device
with init_empty_weights():
    config = AutoConfig.from_pretrained(MODEL_NAME)
    meta_model = AutoModelForCausalLM.from_config(config)
meta_model.tie_weights()

# Define the device mapping
device_map = infer_auto_device_map(
    meta_model,
    max_memory={0: "15GB", "cpu": "60GB"},
    no_split_module_classes=["LlamaDecoderLayer"],
)

# Load the model parameters
model = load_checkpoint_and_dispatch(
    meta_model,
    checkpoint=full_model_ckpt_path,
    device_map=device_map,
)

In [None]:
from transformers import pipeline

generator = pipeline(
    "text-generation",
    model=model,
    device_map=device_map,
    tokenizer=AutoTokenizer.from_pretrained(
        MODEL_NAME, padding_side="left", use_fast=False
    ),
)

# Strach (for testing code)

In [2]:
import librosa
from IPython.display import Audio, display, clear_output
import ipywidgets as widgets

# Get the file path to an included audio example
filename = "we_wish_you.mp3"

# Load the audio as a waveform `y`
# Store the sampling rate as `sr`
y, sr = librosa.load(filename)

# Run the default beat tracker
tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr)

# Convert the frame indices of beat events into timestamps
beat_times = librosa.frames_to_time(beat_frames, sr=sr)

# Function to play audio when the button is clicked
clear_output(wait=True)  # Clear the previous output, if any
display(Audio(y, rate=sr))  # Play the audio

Estimated tempo: 184.57 beats per minute


https://www.analyticsvidhya.com/blog/2023/09/text-to-sound-train-your-large-language-models/