In [3]:
import pandas as pd
from pathlib import Path
from datasets import load_dataset
from huggingface_hub import login

In [4]:
login()

In [5]:
input_dir = Path("../outputs/caption_inference")
df_train = pd.read_csv(input_dir / "train.csv")
df_valid = pd.read_csv(input_dir / "validation.csv")
df_test = pd.read_csv(input_dir / "test.csv")
df_train.head()

Unnamed: 0,aspect_list,prediction
0,"acoustic guitar, guitar, hand claps, ethereal,...",[INST] You are a music description expert that...
1,"string section, powerful, melancholic, energet...",[INST] You are a music description expert that...
2,"bass drum, tranquil, reggae, pop, medium to up...",[INST] You are a music description expert that...
3,"dj mixer, mellow, energetic, melancholic, folk...",[INST] You are a music description expert that...
4,"string section, saxophone, meditative, edm, me...",[INST] You are a music description expert that...


In [6]:
# Remove instructions in [INST] and [/INST] from captions
def clean_caption(caption):
    if "[INST]" in caption and "[/INST]" in caption:
        caption = caption.split("[/INST]")[1].strip()
    return caption

In [7]:
df_train["caption"] = df_train["prediction"].apply(clean_caption)
df_valid["caption"] = df_valid["prediction"].apply(clean_caption)
df_test["caption"] = df_test["prediction"].apply(clean_caption)
df_train = df_train.drop(columns=["prediction"])
df_valid = df_valid.drop(columns=["prediction"])
df_test = df_test.drop(columns=["prediction"])
df_train.head()

Unnamed: 0,aspect_list,caption
0,"acoustic guitar, guitar, hand claps, ethereal,...",This song features an acoustic guitar playing ...
1,"string section, powerful, melancholic, energet...",This song features a powerful and energetic st...
2,"bass drum, tranquil, reggae, pop, medium to up...",This song is a Reggae-Pop fusion with a modera...
3,"dj mixer, mellow, energetic, melancholic, folk...","This song is a mellow, energetic, melancholic,..."
4,"string section, saxophone, meditative, edm, me...",This song begins with a meditative string sect...


In [8]:
# Create output directory
output_dir = Path("../data/generated_captions")
output_dir.mkdir(parents=True, exist_ok=True)

df_train.to_csv(output_dir / "train.csv", index=False)
df_valid.to_csv(output_dir / "validation.csv", index=False)
df_test.to_csv(output_dir / "test.csv", index=False)
all_df = pd.concat([df_train, df_valid, df_test])
all_df.to_csv(output_dir / "all.csv", index=False)

In [9]:
data_files = {
    "train": str(output_dir / "train.csv"),
    "validation": str(output_dir / "validation.csv"),
    "test": str(output_dir / "test.csv")
}
dataset = load_dataset("csv", data_files=data_files)
dataset.push_to_hub("bsienkiewicz/random-tags-to-caption", private=True)

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

CommitInfo(commit_url='https://huggingface.co/datasets/bsienkiewicz/random-tags-to-caption/commit/6beabdece803fd873dc8cdbcabe45842e1787517', commit_message='Upload dataset', commit_description='', oid='6beabdece803fd873dc8cdbcabe45842e1787517', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/bsienkiewicz/random-tags-to-caption', endpoint='https://huggingface.co', repo_type='dataset', repo_id='bsienkiewicz/random-tags-to-caption'), pr_revision=None, pr_num=None)