# MedASR Dataset Generation

This notebook generates the audio files used to fine-tune MedASR on military medical-style language. It prompts the user to upload a CSV file of statements and a set of background noise recordings. <br>
For this run, I added background noises of helicopters and radio static at various noise levels.

In [None]:

!apt-get install -y espeak-ng
!pip install piper-tts datasets transformers soundfile librosa tqdm huggingface_hub


In [None]:
!wget https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US/ryan/high/en_US-ryan-high.onnx
!wget https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US/ryan/high/en_US-ryan-high.onnx.json

In [None]:
from google.colab import files
# Upload the prompt sheet here: the next cell reads "medasr_prompts.csv"
# from the working directory.
uploaded = files.upload()

In [None]:
import pandas as pd

# Load the uploaded prompt sheet; every row is one statement to synthesize.
# NOTE(review): a later dataset printout lists a column named ' text' (with a
# leading space), so the CSV header probably has a space after its comma.
df = pd.read_csv("medasr_prompts.csv")
print(f"Total samples: {len(df)}")
# Show the first five rows as the cell's rich output.
df.head(5)

In [None]:
import os
import subprocess
from tqdm import tqdm
import pandas as pd

AUDIO_DIR = "medasr_dataset/audio"
PIPER_MODEL = "en_US-ryan-high.onnx"
# Set to True to keep clips that are already on disk (e.g. to resume an
# interrupted run). Leave False to re-synthesize every prompt, which is what
# you want whenever the prompt CSV has changed.
SKIP_EXISTING = False

os.makedirs(AUDIO_DIR, exist_ok=True)


def synthesize(text, out_path, model=PIPER_MODEL):
    """Synthesize one prompt with the Piper CLI and write it to `out_path`.

    The prompt is sent on stdin and the command is passed as an argument list
    (no shell), so quotes, `$`, backticks or other shell metacharacters in a
    prompt can neither break the command nor be executed.

    Args:
        text: The sentence to speak.
        out_path: Destination WAV path.
        model: Path to the Piper ``.onnx`` voice model.

    Returns:
        ``(ok, error_text)`` — ``ok`` is True when Piper exited with status 0
        and the output file exists; otherwise ``error_text`` holds Piper's
        stderr. A partial output file is removed on failure or interruption.

    Raises:
        FileNotFoundError: if the ``piper`` executable is not installed.
    """
    try:
        result = subprocess.run(
            ["piper", "--model", model, "--output_file", out_path],
            input=f"{text}\n",  # trailing newline, as `echo` would have sent
            text=True,
            encoding="utf-8",
            capture_output=True,
        )
    except BaseException:
        # Interrupted (e.g. KeyboardInterrupt): don't leave a truncated clip behind.
        if os.path.exists(out_path):
            os.remove(out_path)
        raise

    if result.returncode != 0 or not os.path.exists(out_path):
        if os.path.exists(out_path):
            os.remove(out_path)
        return False, result.stderr.strip()
    return True, ""


metadata = []
failed = []  # (row index, reason) for every prompt that produced no audio

# NOTE(review): one Piper process is started per prompt, so the voice model is
# loaded again for every clip; that is likely why the recorded run took ~22 s
# per item. Loading the voice once through Piper's Python API would avoid it.
for i, row in tqdm(df.iterrows(), total=len(df)):
    # Second column holds the prompt. `.iloc` is the explicit positional lookup;
    # plain `row[1]` on a label-indexed Series is deprecated in pandas.
    text = row.iloc[1]
    out_path = f"{AUDIO_DIR}/{i:05d}.wav"

    if pd.isna(text):
        failed.append((i, "empty prompt"))
        continue

    already_done = (
        SKIP_EXISTING
        and os.path.exists(out_path)
        and os.path.getsize(out_path) > 0
    )
    if not already_done:
        ok, error_text = synthesize(text, out_path)
        if not ok:
            failed.append((i, error_text[-200:]))
            continue

    # Only clips that really exist are listed, so metadata never points at a missing file.
    metadata.append({
        "audio": out_path,
        "text": text
    })

pd.DataFrame(metadata, columns=["audio", "text"]).to_csv(
    "medasr_dataset/metadata.csv", index=False
)

if failed:
    print(f"⚠️ {len(failed)} prompt(s) produced no audio. First few: {failed[:5]}")
print("Clean dataset built with Piper (Ryan).")

  text = row[1] # this might need to be 1 or "text"
100%|██████████| 1260/1260 [7:50:19<00:00, 22.40s/it]

Clean dataset built with Piper (Ryan).





In [None]:
# Upload the background-noise recordings. The mixing cell below loads
# "helicopter.wav", "radio_static.wav" and "copter_pass.wav" by relative path,
# so those file names must be uploaded here.
uploaded = files.upload()

Saving helicopter.wav to helicopter.wav
Saving radio_static.wav to radio_static.wav
Saving copter_pass.wav to copter_pass.wav


In [None]:
import librosa
import numpy as np
import soundfile as sf
import random
import os
from tqdm import tqdm

# ✅ Battlefield noise files
# Relative paths: each file is loaded from the current working directory,
# and one of them is picked at random for every clip.
noise_files = [
    "helicopter.wav",
    "radio_static.wav",
    "copter_pass.wav"
]

# ✅ Function to mix noise into clean audio at a given SNR
def mix_noise(clean, noise, snr_db):
    """Add `noise` to `clean`, scaled so the mix has the requested SNR.

    The noise is multiplied by a gain chosen so that
    ``mean(clean**2) / mean((gain * noise)**2) == 10 ** (snr_db / 10)``.

    Args:
        clean: 1-D array of clean samples.
        noise: Array of noise samples with the same length as `clean`.
        snr_db: Target signal-to-noise ratio in decibels.

    Returns:
        ``clean + gain * noise``. When the noise carries no energy (all zeros)
        or either input is empty, no SNR can be defined, so an unmodified copy
        of `clean` is returned instead of an array of NaNs.
    """
    clean = np.asarray(clean)
    noise = np.asarray(noise)
    if clean.size == 0 or noise.size == 0:
        return clean.copy()

    clean_power = np.mean(clean**2)
    noise_power = np.mean(noise**2)
    # `not (x > 0)` is also true for NaN, so a corrupt noise file is caught here too.
    if not (noise_power > 0):
        return clean.copy()

    scale = np.sqrt(clean_power / (10**(snr_db/10) * noise_power))
    return clean + noise * scale

# ✅ Make sure the audio folder exists
import json

audio_folder = "medasr_dataset/audio"
os.makedirs(audio_folder, exist_ok=True)

# Clips are overwritten in place, so re-running this cell would otherwise stack
# a second layer of noise on already-noisy audio and push the SNR below the
# intended 5–15 dB. The manifest remembers, per clip, the size/mtime of the file
# we wrote; a clip whose signature still matches is skipped, while a clip that
# has been regenerated since (different signature) is processed again.
manifest_path = "medasr_dataset/noise_manifest.json"
if os.path.exists(manifest_path):
    with open(manifest_path, encoding="utf-8") as manifest_file:
        noise_manifest = json.load(manifest_file)
else:
    noise_manifest = {}

_noise_cache = {}


def _load_noise(path, sr):
    """Return the noise recording at `path` resampled to `sr`, loading it only once per rate."""
    key = (path, sr)
    if key not in _noise_cache:
        samples, _ = librosa.load(path, sr=sr)
        if len(samples) == 0:
            raise ValueError(f"Noise file {path} contains no audio")
        _noise_cache[key] = samples
    return _noise_cache[key]


def _file_signature(path):
    """Return [size, mtime_ns] of `path`, used to recognise files this cell wrote."""
    info = os.stat(path)
    return [info.st_size, info.st_mtime_ns]


newly_mixed = 0
already_mixed = 0

# ✅ Process audio by CSV index to maintain alignment
try:
    for i in tqdm(df.index, total=len(df)):
        file_name = f"{i:05d}.wav"
        audio_path = os.path.join(audio_folder, file_name)

        # Check if file exists
        if not os.path.exists(audio_path):
            print(f"⚠️ Skipping {audio_path}, file not found")
            continue

        # Skip clips that already carry noise from a previous run of this cell
        previous = noise_manifest.get(file_name)
        if previous is not None and previous.get("signature") == _file_signature(audio_path):
            already_mixed += 1
            continue

        # Load clean audio
        try:
            clean, sr = librosa.load(audio_path, sr=None)
        except Exception as e:
            print(f"⚠️ Skipping {audio_path}, error: {e}")
            continue

        if len(clean) == 0:
            print(f"⚠️ Skipping {audio_path}, file contains no audio")
            continue

        # Choose random noise (resampled to the clip's rate, cached across clips)
        noise_path = random.choice(noise_files)
        noise = _load_noise(noise_path, sr)

        # Repeat noise if too short
        if len(noise) < len(clean):
            noise = np.tile(noise, int(np.ceil(len(clean)/len(noise))))
        noise = noise[:len(clean)]

        # Random SNR between 5–15 dB
        snr = random.uniform(5, 15)
        mixed = mix_noise(clean, noise, snr)

        # Samples outside [-1, 1] cannot be represented in integer PCM; scaling
        # the whole mix down keeps the SNR intact and avoids clipping distortion.
        peak = float(np.max(np.abs(mixed)))
        if peak > 1.0:
            mixed = mixed / peak

        # Overwrite original audio with noisy version
        sf.write(audio_path, mixed, sr)
        noise_manifest[file_name] = {
            "noise": noise_path,
            "snr_db": snr,
            "signature": _file_signature(audio_path),
        }
        newly_mixed += 1
finally:
    # Persist even if the loop is interrupted, so finished clips are not noised twice.
    with open(manifest_path, "w", encoding="utf-8") as manifest_file:
        json.dump(noise_manifest, manifest_file, indent=2)

print(f"Noise added to {newly_mixed} clip(s); {already_mixed} already noisy clip(s) left untouched.")
print("✅ Battlefield noise added safely. All audio files remain aligned with CSV.")

100%|██████████| 1260/1260 [02:20<00:00,  8.96it/s]

✅ Battlefield noise added safely. All audio files remain aligned with CSV.





In [None]:
# run
import os
import shutil

# Clips are named after their row number when they are synthesized, so the file
# name is the only link between a recording and its transcript. Renumbering the
# files to close gaps would move every later clip onto another row's text (and
# would also rename any stray non-wav entry in the folder), so no file is
# renamed here. Rows whose clip is missing are left out of metadata.csv instead.
audio_files = sorted(os.listdir("medasr_dataset/audio"))

# Update CSV paths to match the row-numbered filenames
df["audio"] = [f"medasr_dataset/audio/{i:05d}.wav" for i in range(len(df))]

has_audio = df["audio"].map(os.path.exists)
missing_count = int((~has_audio).sum())
if missing_count:
    print(
        f"⚠️ {missing_count} row(s) have no audio file and are excluded from metadata.csv. "
        f"First few: {df.loc[~has_audio, 'audio'].head().tolist()}"
    )

unexpected = sorted(set(audio_files) - {os.path.basename(p) for p in df["audio"]})
if unexpected:
    print(f"⚠️ Ignoring {len(unexpected)} unexpected entr(y/ies) in the audio folder: {unexpected[:5]}")

df[has_audio].to_csv("medasr_dataset/metadata.csv", index=False)

In [None]:
import random
import IPython.display as ipd
import librosa

# Rebuild the expected clip path for every row of `df`
audio_files = []
for clip_number in range(len(df)):
    audio_files.append(f"medasr_dataset/audio/{clip_number:05d}.wav")

# Draw up to three distinct clips for a quick listening check
preview_count = min(3, len(audio_files))
samples_to_play = random.sample(audio_files, preview_count)

print("Playing a few random samples from the dataset:")

for clip_path in samples_to_play:
    print(f"▶️ {clip_path}")
    # sr=None keeps the file's native sampling rate instead of resampling
    waveform, sample_rate = librosa.load(clip_path, sr=None)
    player = ipd.Audio(waveform, rate=sample_rate)
    display(player)  # renders an inline audio player


Playing a few random samples from the dataset:
▶️ medasr_dataset/audio/00015.wav


▶️ medasr_dataset/audio/00056.wav


▶️ medasr_dataset/audio/00731.wav


In [None]:
# run
from datasets import Dataset, Audio

# Load the metadata table and mark its "audio" column as an Audio feature,
# in one chained expression.
dataset = Dataset.from_csv("medasr_dataset/metadata.csv").cast_column("audio", Audio())
# Bare expression: shows the dataset summary as the cell output.
dataset

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['id', ' text', 'audio'],
    num_rows: 1260
})

In [None]:
from huggingface_hub import notebook_login
# Shows the Hugging Face login widget; log in before the final cell, which
# pushes the dataset to the Hub.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Quick sanity check: list the first few entries of the audio folder.
!ls medasr_dataset/audio | head

In [None]:
# cleans and corrects format, run

import pandas as pd

# Re-read the prompt sheet while ignoring whatever its header row says.
# NOTE(review): the printed preview shows row labels 801, 802, … rather than
# 0, 1, …, which suggests the CSV has two columns and pandas turns the extra
# first one into the index because only one name is supplied — confirm.
df_text = pd.read_csv(
    "medasr_prompts.csv",
    names=["text"],    # the single column name we impose
    header=None,       # do not parse any line as a header
    skiprows=1,        # drop the broken "Id, text" line
)

# Remove curly ("smart") quotes one kind at a time, then trim whitespace
prompt_text = df_text["text"]
for curly_quote in ("“", "”"):
    prompt_text = prompt_text.str.replace(curly_quote, "")
df_text["text"] = prompt_text.str.strip()

print(df_text.head())
print("Total prompts:", len(df_text))

                                                  text
801  AI, this is Echo-26 medic in TFC, casualty wit...
802  AI, this is Falcon-38 medic at CCP, casualty w...
803  AI, this is Ranger-38 medic in TFC, casualty w...
804  AI, this is Sentinel-31 medic at CCP, casualty...
805  AI, this is Wolf-11 medic in TFC, casualty wit...
Total prompts: 1260


In [None]:
# corrects formatting, run

import glob
import os

audio_files = sorted(glob.glob("medasr_dataset/audio/*.wav"))

print("Audio files found:", len(audio_files))
print("First 5 audio files:", audio_files[:5])

assert len(audio_files) <= len(df_text), "More audio than text rows – something is wrong."

# Pair every clip with its transcript through the row number in the file name
# ("00042.wav" -> prompt row 42), not through its position in the sorted list.
# Positional pairing mislabels every clip after the first missing file.
# NOTE(review): this assumes the clips still carry the row-number names they
# were given at synthesis time (i.e. they were not renumbered afterwards).
metadata = []
unmatched = []
for audio_path in audio_files:
    stem = os.path.splitext(os.path.basename(audio_path))[0]
    if not stem.isdigit() or int(stem) >= len(df_text):
        unmatched.append(audio_path)
        continue
    metadata.append({
        "audio": os.path.abspath(audio_path),
        "text": df_text.iloc[int(stem)]["text"]
    })

if unmatched:
    print(f"⚠️ {len(unmatched)} file(s) have no matching prompt row and were left out: {unmatched[:5]}")

metadata_df = pd.DataFrame(metadata, columns=["audio", "text"])
metadata_df.to_csv("medasr_dataset/metadata.csv", index=False)

print("metadata.csv rebuilt successfully.")

Audio files found: 1260
First 5 audio files: ['medasr_dataset/audio/00000.wav', 'medasr_dataset/audio/00001.wav', 'medasr_dataset/audio/00002.wav', 'medasr_dataset/audio/00003.wav', 'medasr_dataset/audio/00004.wav']
metadata.csv rebuilt successfully.


In [None]:
#checking data, run

# Reload the rebuilt metadata and verify that every listed clip is on disk.
df = pd.read_csv("medasr_dataset/metadata.csv")
print(df.head())
# Guarded so an empty metadata file reports False instead of raising IndexError.
print("File exists:", len(df) > 0 and os.path.exists(df["audio"].iloc[0]))

# Checking only the first row would let a missing clip further down go
# unnoticed until the upload, so every path is checked.
missing_audio = [
    path for path in df["audio"]
    if not (isinstance(path, str) and os.path.exists(path))
]
print("Missing audio files:", len(missing_audio), missing_audio[:5])

                                     audio  \
0  /content/medasr_dataset/audio/00000.wav   
1  /content/medasr_dataset/audio/00001.wav   
2  /content/medasr_dataset/audio/00002.wav   
3  /content/medasr_dataset/audio/00003.wav   
4  /content/medasr_dataset/audio/00004.wav   

                                                text  
0  AI, this is Echo-26 medic in TFC, casualty wit...  
1  AI, this is Falcon-38 medic at CCP, casualty w...  
2  AI, this is Ranger-38 medic in TFC, casualty w...  
3  AI, this is Sentinel-31 medic at CCP, casualty...  
4  AI, this is Wolf-11 medic in TFC, casualty wit...  
File exists: True


In [None]:
# last step before upload, run

from datasets import Dataset, Audio

# Build the dataset that will be uploaded from the rebuilt metadata table.
metadata_csv = "medasr_dataset/metadata.csv"
dataset = Dataset.from_csv(metadata_csv)

# Declare the "audio" column as an Audio feature so HF knows it holds audio paths.
audio_feature = Audio()
dataset = dataset.cast_column("audio", audio_feature)
dataset

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['audio', 'text'],
    num_rows: 1260
})

In [None]:
# run this last for upload

# Upload `dataset` to this Hugging Face Hub dataset repository.
# Presumably relies on the credentials entered in the notebook_login() cell — confirm.
dataset.push_to_hub("CharlieKingOfTheRats/medasr-military-1300")

Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ? shards/s]

Map:   0%|          | 0/630 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/7 [00:00<?, ?ba/s]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

                              :   0%|          |  526kB /  485MB            

Map:   0%|          | 0/630 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/7 [00:00<?, ?ba/s]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

                              :   1%|          | 3.68MB /  510MB            

CommitInfo(commit_url='https://huggingface.co/datasets/CharlieKingOfTheRats/medasr-military-1300/commit/aa7e63b70d6dee715260cdb120f6522693f35794', commit_message='Upload dataset', commit_description='', oid='aa7e63b70d6dee715260cdb120f6522693f35794', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/CharlieKingOfTheRats/medasr-military-1300', endpoint='https://huggingface.co', repo_type='dataset', repo_id='CharlieKingOfTheRats/medasr-military-1300'), pr_revision=None, pr_num=None)