In [1]:
import os
import csv
import random
import subprocess
from collections import defaultdict, Counter
import pandas as pd

In [2]:
RANDOM_SEED = 42
random.seed(RANDOM_SEED)


In [None]:
# === CONFIG ===
CSV_PATH = "D:\Data Science\Multi Model Sentiment\\train_sent_emo.csv"
MP4_FOLDER = "D:\Data Science\Multi Model Sentiment\\train_splits"
OUT_AUDIO_FOLDER = "audio_output_4_final"
OUT_METADATA_CSV = "audio_metadata_filtered_4_final.csv"
TOTAL_TO_SELECT = 3000 

def safe_make_dir(path):
    if not os.path.exists(path):
        os.makedirs(path)

def check_ffmpeg():
    from shutil import which
    return which("ffmpeg") is not None

def extract_audio_ffmpeg(mp4_path, wav_path):
    import subprocess
    cmd = ["ffmpeg", "-i", mp4_path, "-vn", "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1", wav_path, "-y", "-loglevel", "quiet"]
    try:
        subprocess.run(cmd, check=True)
        return True
    except subprocess.CalledProcessError:
        return False

def build_mp4_filename(dialogue_id, utterance_id):
    return f"dia{dialogue_id}_utt{utterance_id}.mp4"

# === MAIN FUNCTION ===
def main():
    df = pd.read_csv(CSV_PATH)

    required_cols = {'Dialogue_ID', 'Utterance_ID', 'Utterance', 'Emotion'}
    if not required_cols.issubset(df.columns):
        raise ValueError(f"CSV must have columns: {required_cols}")

    allowed_emotions = {'anger','joy', 'sadness','neutral'}
    df = df[df['Emotion'].isin(allowed_emotions)]

    rows_by_emotion = defaultdict(list)
    for _, row in df.iterrows():
        mp4_name = build_mp4_filename(row['Dialogue_ID'], row['Utterance_ID'])
        mp4_path = os.path.join(MP4_FOLDER, mp4_name)
        if os.path.isfile(mp4_path):
            rows_by_emotion[row['Emotion']].append({
                'dialogue_id': row['Dialogue_ID'],
                'utterance_id': row['Utterance_ID'],
                'text': row['Utterance'],
                'emotion': row['Emotion'],
                'mp4': mp4_name,
                'mp4_path': mp4_path
            })

    emotions = sorted(rows_by_emotion.keys())
    print(f"Found {sum(len(v) for v in rows_by_emotion.values())} samples across {len(emotions)} emotions: {emotions}")

    per_emotion = TOTAL_TO_SELECT // len(emotions)
    remainder = TOTAL_TO_SELECT % len(emotions)
    selection = []

    for emo in emotions:
        random.shuffle(rows_by_emotion[emo])
        take = min(per_emotion, len(rows_by_emotion[emo]))
        selection.extend(rows_by_emotion[emo][:take])

    remaining_pool = []
    for emo in emotions:
        remaining_pool.extend(rows_by_emotion[emo][per_emotion:])
    random.shuffle(remaining_pool)
    selection.extend(remaining_pool[:remainder])

    print(f"Selected {len(selection)} samples. Counts per emotion:")
    cnt = Counter([s['emotion'] for s in selection])
    for emo, c in cnt.items():
        print(f"  {emo}: {c}")

    safe_make_dir(OUT_AUDIO_FOLDER)

    ffmpeg_ok = check_ffmpeg()
    metadata_rows = []

    for item in selection:
        mp4_path = item['mp4_path']
        wav_name = item['mp4'].replace('.mp4', '.wav')
        wav_path = os.path.join(OUT_AUDIO_FOLDER, wav_name)
        if not os.path.isfile(wav_path):
            success = extract_audio_ffmpeg(mp4_path, wav_path) if ffmpeg_ok else False
            if not success:
                print(f"Failed to extract {mp4_path}")
                continue
        metadata_rows.append({
            'wav_filename': wav_name,
            'mp4_filename': item['mp4'],
            'dialogue_id': item['dialogue_id'],
            'utterance_id': item['utterance_id'],
            'emotion': item['emotion'],
            'text': item['text']
        })
        print(f"Extracted {wav_name}")

    keys = ['wav_filename', 'mp4_filename', 'dialogue_id', 'utterance_id', 'emotion', 'text']
    with open(OUT_METADATA_CSV, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=keys)
        writer.writeheader()
        for r in metadata_rows:
            writer.writerow(r)
    print(f"✅ Metadata CSV written: {OUT_METADATA_CSV} ({len(metadata_rows)} rows)")


if __name__ == "__main__":
    main()


Found 8245 samples across 4 emotions: ['anger', 'joy', 'neutral', 'sadness']
Selected 2933 samples. Counts per emotion:
  anger: 750
  joy: 750
  neutral: 750
  sadness: 683
Extracted dia907_utt4.wav
Extracted dia188_utt5.wav
Extracted dia10_utt2.wav
Extracted dia32_utt12.wav
Extracted dia875_utt6.wav
Extracted dia206_utt13.wav
Extracted dia277_utt3.wav
Extracted dia888_utt3.wav
Extracted dia275_utt1.wav
Extracted dia1025_utt9.wav
Extracted dia272_utt2.wav
Extracted dia873_utt5.wav
Extracted dia771_utt16.wav
Extracted dia39_utt5.wav
Extracted dia615_utt5.wav
Extracted dia28_utt1.wav
Extracted dia82_utt2.wav
Extracted dia615_utt9.wav
Extracted dia81_utt14.wav
Extracted dia819_utt5.wav
Extracted dia592_utt13.wav
Extracted dia148_utt17.wav
Extracted dia887_utt6.wav
Extracted dia523_utt21.wav
Extracted dia336_utt7.wav
Extracted dia833_utt4.wav
Extracted dia395_utt4.wav
Extracted dia69_utt6.wav
Extracted dia255_utt11.wav
Extracted dia835_utt5.wav
Extracted dia189_utt8.wav
Extracted dia151_u