In [1]:
%load_ext autoreload
%autoreload 2

In [3]:
from utils import *
import os, openai
# Read the hackathon key from the environment so it is never stored in the notebook.
API_KEY = os.getenv('BOSON_API_KEY')
if not API_KEY:
    # Fail fast: with api_key=None the OpenAI client falls back to the
    # OPENAI_API_KEY environment variable, which would send an unrelated
    # credential to the hackathon endpoint (or fail later with a vaguer error).
    raise RuntimeError(
        "BOSON_API_KEY is not set; export it before running this notebook."
    )

# OpenAI client pointed at the Boson hackathon endpoint instead of the default API.
client = openai.Client(
    api_key=API_KEY,
    base_url="https://hackathon.boson.ai/v1",
)


In [28]:
import kagglehub
import numpy as np
import pandas as pd
from pathlib import Path
from pydub import AudioSegment

# Fetch the RAVDESS speech dataset via kagglehub; `path` is used as the root
# directory for the recursive .wav search below.
path = kagglehub.dataset_download("uwrfkaggler/ravdess-emotional-speech-audio")

# Sorted so the row order (and therefore the index) is the same on every run.
fs = sorted(Path(path).glob('**/*.wav'))
df = pd.DataFrame({'f': [str(f) for f in fs]})

# Use Path.stem rather than a regex on the path string: the previous pattern
# required a literal backslash separator, so it only matched Windows paths and
# produced NaN stems everywhere else.
df['stem'] = df.f.map(lambda f: Path(f).stem)

# Each stem is seven dash-separated integer codes, one per column below.
df2 = pd.DataFrame(
    df.stem.str.split('-').tolist(),
    columns=['modality', 'vocal', 'emotion', 'intensity', 'statement', 'repetition', 'actor'],
).astype(int)
df_total = df.merge(df2, left_index=True, right_index=True)

# Lookup tables for the 1-based `statement` and `emotion` codes.
statements = ["Kids are talking by the door", "Dogs are sitting by the door"]
emotions = ['neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised']

# Replace the numeric codes with their human-readable labels.
df_total['emotion'] = df_total.emotion.map(lambda code: emotions[code - 1])
df_total['statement'] = df_total.statement.map(lambda code: statements[code - 1])

### Emotion Tagged Audio 

In [30]:
def generate_audio(transcript, system_prompt=None, additional_messages=None, temperature=0.9,
                   top_p=0.95, top_k=50, max_tokens=2048, out_name='out.wav', **kwargs):
    """Request speech for `transcript` from the audio-generation model and save it.

    Args:
        transcript: Text sent as the final user message of the conversation.
        system_prompt: System message; any falsy value selects the built-in
            default prompt.
        additional_messages: List of chat messages placed between the system
            prompt and the transcript (for example the turns returned by
            `extract_samples`). `None` means no extra messages.
        temperature: Sampling temperature forwarded to the API call.
        top_p: Nucleus-sampling value forwarded to the API call.
        top_k: Sent to the server through `extra_body`.
        max_tokens: Forwarded as `max_completion_tokens`.
        out_name: Passed to `save_audio` together with the response.
        **kwargs: Extra keyword arguments for `client.chat.completions.create`.

    Returns:
        None. The response is handed to `save_audio`.
    """
    if not system_prompt:
        system_prompt = 'Generate speech based on the provided sample and transcript. <|scene_desc_start|>The audio is recorded in a quiet room with no noise. The speech is clearly audible and loud.<|scene_desc_end|>'
    if additional_messages is None:
        additional_messages = []

    # Conversation layout: system prompt, optional reference turns, then the text to speak.
    system_turn = {"role": "system", "content": system_prompt}
    transcript_turn = {'role': 'user', 'content': transcript}
    conversation = [system_turn] + additional_messages + [transcript_turn]

    response = client.chat.completions.create(
        model="higgs-audio-generation-Hackathon",
        messages=conversation,
        modalities=["text", "audio"],
        max_completion_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        stream=False,
        stop=["<|eot_id|>", "<|end_of_text|>", "<|audio_eos|>"],
        extra_body={"top_k": top_k},
        **kwargs,
    )
    save_audio(response, out_name)

def extract_samples(n_samples=None, random_state=None, **kwargs):
    """Build reference chat turns from the rows of `df_total` that match the filters.

    Every selected row contributes two messages: a user message holding the
    row's statement text and an assistant message holding the row's audio file
    converted with `to_audio`.

    Args:
        n_samples: Maximum number of rows to use. `None` uses every matching
            row; larger values are capped at the number of matching rows.
        random_state: Forwarded to `DataFrame.sample`. Pass an int to make the
            selection and its order reproducible; `None` (the default) keeps
            the draw unseeded.
        **kwargs: `column=value` equality filters applied to `df_total`
            (e.g. `actor=1, emotion='angry'`).

    Returns:
        A flat list of message dicts, two per selected row.
    """
    mask = df_total.f.notna()
    for column, value in kwargs.items():
        mask &= (df_total[column] == value)
    matches = df_total[mask]

    if n_samples is None:
        n_samples = len(matches)
    else:
        n_samples = min(len(matches), n_samples)

    samples = []
    # sample() also shuffles, so the order is random even when all rows are used.
    for row in matches.sample(n_samples, random_state=random_state).itertuples(index=False):
        samples.append({'role': 'user', 'content': row.statement})
        samples.append({'role': 'assistant', 'content': [to_audio(row.f, min_vol=-30)]})
    return samples

# Monologue that is synthesized once per emotion below.
transcript = r"""Okay look, I know last round looked bad, but hear me out. I voted early because I didn’t want to look suspicious just sitting there waiting for everyone else to decide. I figured if I threw in a vote fast, we’d get some momentum going and actually talk about something. Then everyone started jumping on the same person, and by the time I thought about changing, the round was basically over.
And yeah, I was quiet after that, but that’s because everyone was talking over each other. I didn’t wanna add noise. I’m paying attention, though. I’ve got a few guesses now that I’ve seen who defended who. If I was mafia, do you really think I’d have played it that sloppy?"""

# (output file, reference-sample filters) pairs, rendered in this order.
for _wav_name, _filters in (
    ('neutral.wav', dict(actor=1, emotion='neutral')),                    # neutral
    ('really_angry.wav', dict(actor=1, emotion='angry', intensity=2)),    # strong angry
):
    generate_audio(transcript, out_name=_wav_name,
                   additional_messages=extract_samples(**_filters))

### Audio to Semantic Info

In [31]:
def semantic_distillation(audio, verbose=False, max_tokens=4096, temperature=0.2, top_p=0.95):
    """Ask the Qwen3-Omni model for a short description of the emotion in an audio clip.

    Args:
        audio: Passed to `upload_temp`; its result is used as the audio URL of the request.
        verbose: Forwarded to `process_resp`.
        max_tokens: Completion token limit forwarded to the API call.
        temperature: Sampling temperature forwarded to the API call.
        top_p: Nucleus-sampling value forwarded to the API call.

    Returns:
        Whatever `process_resp` returns for the completion.
    """
    audio_part = {"type": "audio_url", "audio_url": {"url": upload_temp(audio)}}
    instruction_part = {
        "type": "text",
        "text": "Write a short description about the emotional information in the audio.",
    }
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": [audio_part, instruction_part]},
    ]
    completion = client.chat.completions.create(
        model="Qwen3-Omni-30B-A3B-Thinking-Hackathon",
        messages=conversation,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        stream=False,
    )
    return process_resp(completion, verbose=verbose)

# Describe the emotional content of 'really_angry.wav', the file name used for the angry rendition above.
semantic_distillation('really_angry.wav')

"The speaker's tone is defensive and anxious as they earnestly try to justify their actions from the previous round. They explain their reasoning for voting early, expressing regret that they were unable to change their mind as the discussion became chaotic. The emotional undercurrent is one of pleading and sincerity, culminating in a rhetorical question that seeks to convince the listener of their competence and innocence, revealing a deep-seated worry about being perceived as a poor player."

### ASR

In [32]:
def asr(audio, verbose=False, max_tokens=4096, temperature=0.2, top_p=0.95):
    """Ask the Qwen3-Omni model to transcribe an audio clip.

    Args:
        audio: Passed to `upload_temp`; its result is used as the audio URL of the request.
        verbose: Forwarded to `process_resp`.
        max_tokens: Completion token limit forwarded to the API call.
        temperature: Sampling temperature forwarded to the API call.
        top_p: Nucleus-sampling value forwarded to the API call.

    Returns:
        Whatever `process_resp` returns for the completion.
    """
    audio_part = {"type": "audio_url", "audio_url": {"url": upload_temp(audio)}}
    instruction_part = {"type": "text", "text": "Transcribe this audio."}
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": [audio_part, instruction_part]},
    ]
    completion = client.chat.completions.create(
        model="Qwen3-Omni-30B-A3B-Thinking-Hackathon",
        messages=conversation,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        stream=False,
    )
    return process_resp(completion, verbose=verbose)

# Transcribe 'really_angry.wav' so the text can be compared with the input transcript.
asr('really_angry.wav')

"Okay, look, I know last round looked bad, but hear me out. I voted early because I didn't want to look suspicious just sitting there waiting for everyone else to decide. I figured if I threw in a vote fast, we'd get some momentum going and actually talk about something. Then everyone started jumping on the same person, and by the time I thought about changing, the round was basically over. And yeah, I was quiet after that, but that's because everyone was talking over each other. I didn't want to add noise. I'm paying attention though. I've got a few guesses now that I've seen who defended who. If I was mafia, do you really think I'd have played it that sloppy?"