> *This notebook was created by following [lyraaa's tutorial on Stable Audio fine-tuning](https://www.youtube.com/live/ex4OBD_lrds).*

### Prepare the MusicCaps dataset

In [1]:
import gdown
import os
import zipfile
from typing import Dict, List, Tuple, Any

In [2]:
def unzip_file(file_path: str, extract_dir: str = ".") -> None:
    """Extract every member of a zip archive.

    Args:
        file_path: Path of the ``.zip`` archive to read.
        extract_dir: Directory that receives the extracted members. The
            default, ``"."``, unpacks into the current working directory,
            which is what a call without this argument has always done.
    """
    with zipfile.ZipFile(file_path, "r") as archive:
        archive.extractall(path=extract_dir)

In [3]:
def preprare_musicaps_dataset(target_file_path: str, gdown_link: str) -> None:
    """Download the zipped dataset, unpack it, and delete the archive.

    Args:
        target_file_path: Path where the downloaded archive is stored until
            it has been extracted.
        gdown_link: URL handed to ``gdown.download``.

    Raises:
        RuntimeError: If ``gdown.download`` returns ``None`` instead of a path.
    """
    downloaded_path = gdown.download(gdown_link, output=target_file_path, quiet=True)
    # NOTE(review): depending on the installed gdown release, a failed download
    # is reported by returning None rather than raising. Stop here with a clear
    # message instead of failing later while opening a missing archive.
    if downloaded_path is None:
        raise RuntimeError(
            f"Download failed for {gdown_link!r}; nothing was written to {target_file_path!r}"
        )
    unzip_file(target_file_path)
    # The archive is only a transport container; drop it once extracted.
    os.remove(target_file_path)

In [4]:
# Google Drive URL of the zipped dataset.
musicaps_gdown_link: str = (
    "https://drive.google.com/uc?id=1FA9mzep-UkamVnk4GA_6wpgu_77Qy6c2"
)
# Despite its name this is a file path: the archive is downloaded to it.
output_dir: str = "musicaps.zip"

In [5]:
# Arguments in order: archive path first, then the download link.
preprare_musicaps_dataset(output_dir, musicaps_gdown_link)

### Prepare config files

In [6]:
# exist_ok keeps this cell re-runnable: without it a second run raises FileExistsError.
os.makedirs("conf", exist_ok=True)

In [None]:
from huggingface_hub import notebook_login

# Log in to Hugging Face from inside the notebook.
# NOTE(review): presumably needed so the Hub downloads in later cells are
# authorized — confirm whether the model repository requires it.
notebook_login()

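Once `model_config.json` has been downloaded into `conf/` (see the download cells below), remember to change the following line in that file before starting training: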

```json
"sample_size": 2097152,
```
to

```json
"sample_size": 262144,
```

In [None]:
from huggingface_hub import hf_hub_download

# Fetch the pretrained weights into the current directory; the returned local
# path is the cell's displayed result.
hf_hub_download("stabilityai/stable-audio-open-1.0", "model.ckpt", local_dir="./")

In [None]:
# Fetch the model configuration next to the other config files in ./conf.
hf_hub_download("stabilityai/stable-audio-open-1.0", "model_config.json", local_dir="./conf")

In [22]:
# Dataset description written to conf/dataset.json. Its paths start with "../"
# because training is launched from inside the stable-audio-tools checkout,
# one level below the directory this notebook works in.
dataset_config: str = """
{
    "dataset_type": "audio_dir",
    "datasets": [
        {
            "id": "musicaps",
            "path": "../musicaps/audio/",
            "custom_metadata_module": "../custom_metadata.py"
        }
    ],
    "random_crop": false
}
"""


# open() does not create missing directories, so make sure conf/ exists even
# when this cell is run on its own or out of order.
os.makedirs("conf", exist_ok=True)
with open("conf/dataset.json", "w", encoding="utf-8") as f:
    f.write(dataset_config)

In [11]:
%%writefile /content/custom_metadata.py

import pandas as pd


# Per-process cache of parsed caption tables, keyed by CSV path. Without it the
# whole CSV would be re-read from disk for every single audio file.
_CAPTIONS_BY_CSV = {}


def _load_captions(csv_path: str) -> pd.DataFrame:
    """Return the table stored at ``csv_path``, parsing the file on first use only."""
    if csv_path not in _CAPTIONS_BY_CSV:
        _CAPTIONS_BY_CSV[csv_path] = pd.read_csv(csv_path)
    return _CAPTIONS_BY_CSV[csv_path]


def get_prompt(file_path: str) -> str:
    """Look up the caption belonging to one audio file.

    The file name is expected to start with ``[<ytid>]`` and to live below a
    directory named ``audio``; the captions are read from
    ``musiccaps-public.csv`` in the sibling ``metadata`` directory.

    Args:
        file_path: "/"-separated path of the audio file, e.g.
            ``../musicaps/audio/[<ytid>]-[...].wav``.

    Returns:
        The ``caption`` value of the first CSV row whose ``ytid`` matches.

    Raises:
        ValueError: If ``file_path`` contains no ``"/["`` separator.
        IndexError: If the CSV has no row for the file's ytid.
    """
    dataset_path: str
    filename: str
    # Split at the LAST "/[" so a bracket in a parent directory cannot break
    # the two-way unpacking.
    dataset_path, filename = file_path.rsplit("/[", 1)
    # Swap only whole path components named "audio". A plain str.replace would
    # also rewrite substrings such as the one in "stable-audio-tools".
    metadata_path: str = "/".join(
        "metadata" if part == "audio" else part for part in dataset_path.split("/")
    )
    file_dataset_id: str = filename.split("]")[0]

    df: pd.DataFrame = _load_captions(f"{metadata_path}/musiccaps-public.csv")
    caption_value = df.loc[df["ytid"] == file_dataset_id, "caption"]
    if caption_value.empty:
        # Same exception type as the bare .iloc[0] would raise, but with a
        # message that names the offending file.
        raise IndexError(
            f"no caption found for ytid {file_dataset_id!r} (file: {file_path!r})"
        )
    return caption_value.iloc[0]


def get_custom_metadata(info, audio):
    """Build the extra metadata for one audio file.

    Args:
        info: Mapping describing the file; only ``info["path"]`` is read.
        audio: Not used by this function.

    Returns:
        A dict with a single ``"prompt"`` key holding the value returned by
        ``get_prompt`` for the file's path.
    """
    return {"prompt": get_prompt(info["path"])}


Writing /content/custom_metadata.py


Install and log in to Weights & Biases, then clone and install `stable-audio-tools`

In [None]:
!pip install wandb -q

In [None]:
# Log in to Weights & Biases through its command-line tool.
# NOTE(review): presumably the training run reports to wandb — confirm.
!wandb login

In [None]:
!git clone https://github.com/Stability-AI/stable-audio-tools
%cd stable-audio-tools
!pip install -e .
%cd ..

In [None]:
!pip install protobuf==4.21.0

### Fine tune

In [24]:
# Enter the cloned repository: the training command and the "../" paths in the
# config files are all written relative to this directory.
%cd stable-audio-tools

/content/stable-audio-tools


In [None]:
# Launch fine-tuning from inside the stable-audio-tools checkout. Every "../"
# argument points back to the directory the notebook started in: the two config
# files under conf/, the downloaded model.ckpt, and the checkpoints/ output dir.
!python3 train.py \
    --dataset-config ../conf/dataset.json \
    --model-config ../conf/model_config.json \
    --name stable_audio_open_finetune \
    --save-dir ../checkpoints \
    --checkpoint-every 1000 \
    --batch-size 32 \
    --seed 128 \
    --pretrained-ckpt-path ../model.ckpt