In [9]:
from pathlib import Path
import random
import json
import pandas as pd
import os

In [10]:
# Reproducibility
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# Project root (folder that contains "data/")
PROJECT_ROOT = Path(r"C:\Users\tengc\Downloads\metrics-eval\metrics_eval")

# Input: existing CS scoring samples
CS_SCORING_JSONL = PROJECT_ROOT / "data" / "cs_scoring" / "val" / "gpt_50.jsonl"

# Output: transcription synthetic data
OUT_DIR = PROJECT_ROOT / "data" / "transcription" / "val"
AUDIO_DIR = OUT_DIR / "audio"
OUT_METADATA = OUT_DIR / "metadata.csv"

AUDIO_DIR.mkdir(parents=True, exist_ok=True)

print("PROJECT_ROOT:", PROJECT_ROOT)
print("CS_SCORING_JSONL exists:", CS_SCORING_JSONL.exists())
print("Audio dir:", AUDIO_DIR)

PROJECT_ROOT: C:\Users\tengc\Downloads\metrics-eval\metrics_eval
CS_SCORING_JSONL exists: True
Audio dir: C:\Users\tengc\Downloads\metrics-eval\metrics_eval\data\transcription\val\audio


In [11]:
from dotenv import load_dotenv

ENV_PATH = Path(r"C:\Users\tengc\Downloads\metrics-eval") / ".env"
load_dotenv(dotenv_path=ENV_PATH)

print("Loaded .env:", ENV_PATH)
print("OPENAI_API_KEY found:", bool(os.getenv("OPENAI_API_KEY")))

Loaded .env: C:\Users\tengc\Downloads\metrics-eval\.env
OPENAI_API_KEY found: True


In [12]:
def extract_message_text(user_content: str) -> str:
    if not isinstance(user_content, str):
        return ""
    s = user_content.strip()
    if s.lower().startswith("message:"):
        s = s.split(":", 1)[1].strip()
    return " ".join(s.split())  # normalize whitespace


rows = []
with CS_SCORING_JSONL.open("r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        obj = json.loads(line)
        messages = obj.get("messages", [])
        user_msg = next((m for m in messages if m.get("role") == "user"), None)
        if not user_msg:
            continue

        text = extract_message_text(user_msg.get("content", ""))
        if text:
            rows.append({"source_idx": i, "text": text})

df = pd.DataFrame(rows).drop_duplicates(subset=["text"]).reset_index(drop=True)

df.head(), len(df)

(   source_idx                                               text
 0           0  Pipe casting quality is excellent, no defects ...
 1           1  Material specs sometimes do not match the draw...
 2           2  Sales team is very responsive on WhatsApp and ...
 3           3  Customer service can be defensive when we rais...
 4           4  Delivery is usually on time but sometimes the ...,
 41)

In [13]:
N_SAMPLES = 30

if len(df) < N_SAMPLES:
    sample_df = df.copy()
else:
    sample_df = df.sample(n=N_SAMPLES, random_state=RANDOM_SEED).copy()

sample_df = sample_df.reset_index(drop=True)
sample_df["id"] = [f"tx_{i:03d}" for i in range(len(sample_df))]

metadata = sample_df[["id", "text"]].copy()
metadata["audio_path"] = ""

OUT_DIR.mkdir(parents=True, exist_ok=True)
metadata.to_csv(OUT_METADATA, index=False, encoding="utf-8")

print("Saved metadata:", OUT_METADATA)
metadata.head()

Saved metadata: C:\Users\tengc\Downloads\metrics-eval\metrics_eval\data\transcription\val\metadata.csv


Unnamed: 0,id,text,audio_path
0,tx_000,Product quality is okay overall but the surfac...,
1,tx_001,Sometimes the delivery driver cannot find the ...,
2,tx_002,"Stock availability for standard sizes is good,...",
3,tx_003,The sales engineer knows our project requireme...,
4,tx_004,Delivery is usually on time but sometimes the ...,


In [14]:
from openai import OpenAI
import time

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
assert OPENAI_API_KEY, "OPENAI_API_KEY not found"

client = OpenAI(api_key=OPENAI_API_KEY)

TTS_MODEL = "gpt-4o-mini-tts"
TTS_VOICE = "alloy"
AUDIO_FORMAT = "mp3"

In [16]:
metadata = pd.read_csv(OUT_METADATA)

def tts_to_file(text: str, out_path: Path) -> float:
    start = time.time()

    # Correct OpenAI TTS call (no `format=` kwarg)
    response = client.audio.speech.create(
        model=TTS_MODEL,
        voice=TTS_VOICE,
        input=text,
    )

    # Write raw audio bytes
    out_path.write_bytes(response.read())

    return time.time() - start


latencies = []

for idx, row in metadata.iterrows():
    audio_path = AUDIO_DIR / f"{row['id']}.{AUDIO_FORMAT}"

    if not audio_path.exists():
        latency = tts_to_file(row["text"], audio_path)
        latencies.append(latency)

    metadata.at[idx, "audio_path"] = str(audio_path).replace("\\", "/")

metadata.to_csv(OUT_METADATA, index=False, encoding="utf-8")

print("Audio generation complete")
if latencies:
    print(f"Avg latency: {sum(latencies)/len(latencies):.2f}s")

  metadata.at[idx, "audio_path"] = str(audio_path).replace("\\", "/")


Audio generation complete
Avg latency: 1.68s


In [17]:
from IPython.display import Audio, display

sample = metadata.sample(1, random_state=RANDOM_SEED).iloc[0]
print("ID:", sample["id"])
print("Text:", sample["text"])
display(Audio(sample["audio_path"]))

ID: tx_027
Text: Product dimensions are accurate and we have no issues fitting them on site.
