Setup

In [None]:
# Check GPU
# NOTE: in the recorded run below this printed "nvidia-smi: command not found",
# i.e. no NVIDIA driver was visible to the runtime at that time.
!nvidia-smi

# Install necessary libraries
# Versions are not pinned, so a later re-run may resolve to different releases.
!pip install transformers accelerate timm torchvision torchaudio pandas tqdm opencv-python
# Whisper is installed straight from the head of its GitHub repository.
!pip install git+https://github.com/openai/whisper.git


/bin/bash: line 1: nvidia-smi: command not found
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuf

In [None]:
import os
import torch
import torchaudio
import whisper
import pandas as pd
import cv2
import json
from PIL import Image
from tqdm import tqdm
from transformers import BlipProcessor, BlipForConditionalGeneration


In [None]:
# Pick the compute device once: use the GPU when PyTorch can see one,
# otherwise fall back to the CPU so the notebook still runs end to end.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load BLIP (base image-captioning checkpoint) for visual captioning.
# `processor` prepares images / decodes token ids; `blip_model` generates captions.
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

# Load Whisper ("small" checkpoint) for audio transcription
whisper_model = whisper.load_model("small")


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

In [None]:
# AU Intensity Mapping
def map_au_intensity(value):
    """Translate a numeric AU intensity into an English adverb.

    Bands (upper bounds are exclusive):
        < 0.2 -> "barely", < 1.0 -> "slightly", < 2.5 -> "moderately",
        < 5.0 -> "strongly", anything else -> "very strongly".
    """
    intensity_bands = (
        (0.2, "barely"),
        (1.0, "slightly"),
        (2.5, "moderately"),
        (5.0, "strongly"),
    )
    # Bands are sorted ascending, so the first bound that exceeds the value wins.
    for upper_bound, adverb in intensity_bands:
        if value < upper_bound:
            return adverb
    return "very strongly"

# Loudness Mapping
def map_loudness(rms):
    """Translate an RMS amplitude into an English loudness adverb.

    Bands (upper bounds are exclusive):
        < 0.01 -> "very softly", < 0.03 -> "softly", < 0.06 -> "normally",
        < 0.1 -> "loudly", anything else -> "very loudly".
    """
    loudness_bands = (
        (0.01, "very softly"),
        (0.03, "softly"),
        (0.06, "normally"),
        (0.1, "loudly"),
    )
    # Bands are sorted ascending, so the first bound that exceeds the value wins.
    for upper_bound, adverb in loudness_bands:
        if rms < upper_bound:
            return adverb
    return "very loudly"

# AU to facial phrase mapping
# Keys are action-unit codes; values are verb phrases that get prefixed with an
# intensity adverb when a description is assembled.
AU_PHRASES = dict(
    AU01="raises the inner eyebrows",
    AU02="raises the outer eyebrows",
    AU04="furrows the brow",
    AU07="tightens the eyelids",
    AU12="smiles with mouth corners pulled",
    AU15="lowers the mouth corners",
    AU17="tightens the chin",
    AU25="opens the lips",
    AU26="drops the jaw",
)

def extract_middle_frame(video_path, save_path):
    """Save the middle frame of a video as an image file.

    Args:
        video_path: Path of the video to read.
        save_path: Path the extracted frame is written to.

    Returns:
        True if a frame was read and written, False otherwise. Failures are
        reported with an "[ERROR]" message and never raised.
    """
    cap = None
    try:
        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            print(f"[ERROR] Could not extract frame from {video_path}: video could not be opened")
            return False

        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        # Guard against a negative frame count reported by the backend.
        middle_frame = max(frame_count // 2, 0)
        cap.set(cv2.CAP_PROP_POS_FRAMES, middle_frame)

        ret, frame = cap.read()
        if not ret:
            print(f"[ERROR] Could not extract frame from {video_path}: frame {middle_frame} could not be read")
            return False

        if not cv2.imwrite(save_path, frame):
            print(f"[ERROR] Could not extract frame from {video_path}: writing {save_path} failed")
            return False
        return True
    except Exception as e:
        print(f"[ERROR] Could not extract frame from {video_path}: {e}")
        return False
    finally:
        # Always release the capture handle, including on early return or error.
        if cap is not None:
            cap.release()

def caption_image(image_path):
    """Generate a caption for an image with the module-level BLIP model.

    Args:
        image_path: Path of the image file to caption.

    Returns:
        The decoded caption, or "Visual description unavailable." if anything
        fails (the error is printed, not raised).
    """
    try:
        # Close the file handle as soon as the RGB copy has been made.
        with Image.open(image_path) as image_file:
            raw_image = image_file.convert('RGB')
        # Send the inputs to whatever device the model lives on instead of
        # assuming a GPU is present.
        inputs = processor(raw_image, return_tensors="pt").to(blip_model.device)
        out = blip_model.generate(**inputs)
        caption = processor.decode(out[0], skip_special_tokens=True)
        return caption
    except Exception as e:
        print(f"[ERROR] Visual captioning failed for {image_path}: {e}")
        return "Visual description unavailable."

def whisper_transcribe(audio_path):
    """Transcribe an audio file with the module-level Whisper model.

    Returns the "text" entry of the transcription result, or
    "Audio transcription unavailable." if transcription raises (the error is
    printed, not propagated).
    """
    try:
        return whisper_model.transcribe(audio_path)['text']
    except Exception as err:
        print(f"[ERROR] Audio transcription failed for {audio_path}: {err}")
    return "Audio transcription unavailable."

def analyze_loudness(audio_path):
    """Describe how loud an audio file is, based on its overall RMS amplitude.

    Args:
        audio_path: Path of the audio file to analyse.

    Returns:
        The adverb produced by `map_loudness` for the file's RMS, or
        "Unknown loudness." if the file cannot be loaded, contains no samples,
        or yields a non-finite RMS (the error is printed, not raised).
    """
    import math  # local import keeps this cell self-contained

    try:
        waveform, _sample_rate = torchaudio.load(audio_path)
        # The mean of an empty tensor is NaN, and NaN fails every `<` check in
        # map_loudness, which would mislabel an empty clip as "very loudly".
        if waveform.numel() == 0:
            raise ValueError("audio contains no samples")
        rms = waveform.pow(2).mean().sqrt().item()
        if not math.isfinite(rms):
            raise ValueError(f"RMS is not finite ({rms})")
        return map_loudness(rms)
    except Exception as e:
        print(f"[ERROR] Loudness analysis failed for {audio_path}: {e}")
        return "Unknown loudness."

def parse_au_intensity(openface_csv_path, peak_index):
    """Describe the active facial action units at one frame of an OpenFace CSV.

    Args:
        openface_csv_path: Path of the per-frame OpenFace output CSV.
        peak_index: Positional row index of the frame to describe. If it is
            past the end of the file, the middle row is used instead.

    Returns:
        A tuple `(au_phrases, peak_aus)`: the phrases
        "<intensity adverb> <AU phrase>" for every AU in `AU_PHRASES` whose
        `<AU>_r` value at that row exceeds 0.1, and the matching AU codes.
        Returns `([], [])` if parsing fails (the error is printed, not raised).
    """
    try:
        df = pd.read_csv(openface_csv_path)
        # OpenFace separates header fields with ", ", so pandas reads names
        # such as " AU01_r". Strip them, otherwise every lookup below misses
        # and all AUs are silently skipped.
        df.columns = df.columns.str.strip()

        if peak_index >= len(df):
            peak_index = len(df) // 2  # fallback to middle if peak invalid
        row = df.iloc[peak_index]

        au_phrases = []
        peak_aus = []

        for au, phrase in AU_PHRASES.items():
            column = f"{au}_r"
            if column not in row:
                continue
            value = row[column]
            # Values at or below 0.1 are treated as "not active"; NaN also
            # fails this comparison and is skipped.
            if value > 0.1:
                intensity = map_au_intensity(value)
                au_phrases.append(f"{intensity} {phrase}")
                peak_aus.append(au)

        return au_phrases, peak_aus
    except Exception as e:
        print(f"[ERROR] AU parsing failed for {openface_csv_path}: {e}")
        return [], []

def merge_modalities(visual_phrases, audio_phrase, transcript):
    """Assemble the multi-line prompt that combines all modality clues.

    Args:
        visual_phrases: Iterable of strings; joined with single spaces.
        audio_phrase: Loudness description inserted verbatim.
        transcript: Utterance text; wrapped in double quotes.

    Returns:
        A four-line string: visual description, audio description, quoted
        transcript, and a closing instruction.
    """
    prompt_lines = [
        "Visual Description: " + " ".join(visual_phrases),
        f"Audio Description: {audio_phrase}",
        f'Transcript: "{transcript}"',
        "Describe what is happening objectively based on these clues.",
    ]
    return "\n".join(prompt_lines)


Upload MELD Data

In [None]:
# Colab-only upload helper (this import is unavailable outside Google Colab).
from google.colab import files
# `uploaded` is not referenced again in the visible cells.
# NOTE(review): later cells also read 'train_labels.csv' from the working
# directory — presumably it has to be uploaded here as well; confirm.
uploaded = files.upload()

# After upload, unzip
# Each archive is extracted (-q: quiet) into a directory named after it (-d).
!unzip -q train_video.zip -d train_video
!unzip -q train_audio.zip -d train_audio
!unzip -q train_subtitles.zip -d train_subtitles
!unzip -q openface_outputs.zip -d openface_outputs


Load Labels and Balance Subset

In [None]:
# Read the MELD label table from the working directory.
df = pd.read_csv('train_labels.csv')

# Draw the same number of rows from every 'Emotion' class. The fixed
# random_state makes the draw reproducible across runs.
samples_per_emotion = 80
balanced_df = df.groupby(by='Emotion').sample(
    n=samples_per_emotion,
    random_state=42,
)

# Sanity check: every emotion should be listed with `samples_per_emotion` rows.
print(balanced_df['Emotion'].value_counts())


Full Batch Cloning

In [None]:
# NOTE(review): this cell repeats the label loading / balancing done under
# "Load Labels and Balance Subset"; it rebuilds `df` and `balanced_df` with the
# same parameters, so one of the two cells is redundant.
df = pd.read_csv('train_labels.csv')

# Same per-class sample size and seed as before, so the subset is identical.
samples_per_emotion = 80
balanced_df = (
    df
    .groupby(by='Emotion')
    .sample(n=samples_per_emotion, random_state=42)
)

# Per-emotion row counts of the balanced subset.
print(balanced_df['Emotion'].value_counts())


In [None]:
# Build one record per utterance of the balanced subset and dump them to JSON.
results = {}
failed_samples = []  # base names of utterances that raised during processing

for idx, row in tqdm(balanced_df.iterrows(), total=len(balanced_df)):
    utt_id = row['Utterance_ID']
    dia_id = row['Dialogue_ID']

    # All per-utterance inputs share the "dia<D>_utt<U>" base name.
    base_name = f"dia{dia_id}_utt{utt_id}"
    video_path = f"train_video/{base_name}.mp4"
    audio_path = f"train_audio/{base_name}.wav"
    subtitle_path = f"train_subtitles/{base_name}.txt"
    openface_csv_path = f"openface_outputs/{base_name}.csv"
    frame_path = f"frame_{base_name}.jpg"

    try:
        extract_middle_frame(video_path, frame_path)

        visual_caption = caption_image(frame_path)
        audio_text = whisper_transcribe(audio_path)
        audio_loudness = analyze_loudness(audio_path)

        # Use a context manager so the subtitle file handle is closed.
        with open(subtitle_path, 'r', encoding='utf-8') as subtitle_file:
            transcript = subtitle_file.read().strip()

        df_openface = pd.read_csv(openface_csv_path)
        # OpenFace header fields are separated by ", ", which leaves a leading
        # space on column names; strip so lookups by name work.
        df_openface.columns = df_openface.columns.str.strip()

        # Peak frame = row with the highest summed intensity over the AUs we
        # can describe. (Taking idxmax of the 'frame' column would always pick
        # the last frame, because frame numbers increase monotonically.)
        intensity_columns = [
            f"{au}_r" for au in AU_PHRASES if f"{au}_r" in df_openface.columns
        ]
        if intensity_columns and len(df_openface) > 0:
            peak_index = int(
                df_openface[intensity_columns].sum(axis=1).to_numpy().argmax()
            )
        else:
            # No usable AU columns: fall back to the middle row.
            peak_index = len(df_openface) // 2

        visual_prior_list, peak_AU_list = parse_au_intensity(openface_csv_path, peak_index)

        merged_caption = merge_modalities(visual_prior_list, audio_loudness, transcript)

        sample_id = f"sample_{idx:08d}"
        results[sample_id] = {
            "AU_list": list(df_openface.columns[df_openface.columns.str.contains('AU')]),
            "visual_prior_list": visual_prior_list,
            "audio_prior_list": audio_loudness,
            "peak_index": int(peak_index),
            "peak_AU_list": peak_AU_list,
            "text": transcript,
            "smp_reason_caption": merged_caption,
            # The BLIP caption and Whisper transcript were previously computed
            # and then discarded; keep them as additional fields so the work is
            # not wasted. Existing keys are unchanged.
            "visual_caption": visual_caption,
            "audio_transcript": audio_text,
        }
    except Exception as e:
        failed_samples.append(base_name)
        print(f"[ERROR] Failed processing {base_name}: {e}")

# Save output
with open('meld_rich_captions.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, indent=2)

print(f"Saved {len(results)} samples to meld_rich_captions.json!")
if failed_samples:
    # Surface skipped utterances so a partial result is not mistaken for a full one.
    print(f"[WARN] {len(failed_samples)} samples failed and were skipped: {failed_samples}")
