In [None]:
# Install dependencies
!pip install transformers torchaudio tqdm opencv-python
!pip install git+https://github.com/openai/whisper.git
!pip install git+https://github.com/Salesforce/LAVIS.git  # BLIP-2 via LAVIS

import torch
print("CUDA available?", torch.cuda.is_available())

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.wh

In [None]:
import os
import json
import cv2
import pandas as pd
import torch
import torchaudio
import whisper
from PIL import Image
from tqdm import tqdm
from transformers import Blip2Processor, Blip2ForConditionalGeneration

# Load models
# Pick the GPU when one is visible and fall back to CPU otherwise, so this cell
# does not crash on a CPU-only runtime the way an unconditional .to("cuda") does.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Single source of truth for the BLIP-2 checkpoint so the processor and the
# model can never drift apart.
BLIP2_CHECKPOINT = "Salesforce/blip2-opt-2.7b"

blip_processor = Blip2Processor.from_pretrained(BLIP2_CHECKPOINT)
blip_model = Blip2ForConditionalGeneration.from_pretrained(BLIP2_CHECKPOINT).to(DEVICE)
whisper_model = whisper.load_model("large", device=DEVICE)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/432 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/882 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.56M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/548 [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/68.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.03k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/122k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/10.0G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

100%|█████████████████████████████████████| 2.88G/2.88G [01:57<00:00, 26.2MiB/s]


In [None]:
# Action-unit code -> verb phrase. parse_au_intensity looks each code up as a
# "<code>_r" column and prefixes the phrase with an intensity adverb.
AU_PHRASES = dict(
    AU01="raises the inner eyebrows",
    AU02="raises the outer eyebrows",
    AU04="furrows the brow",
    AU07="tightens the eyelids",
    AU12="smiles with mouth corners pulled",
    AU15="lowers the mouth corners",
    AU17="tightens the chin",
    AU25="opens the lips",
    AU26="drops the jaw",
)

def extract_middle_frame(video_path, save_path):
    """Save the middle frame of a video as an image file.

    Args:
        video_path: Path of the video to read with OpenCV.
        save_path: Destination path handed to ``cv2.imwrite``.

    Returns:
        True if a frame was read and written, False if the video could not be
        opened, the frame could not be read, or the write failed. Callers that
        ignore the return value behave as before.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        # A missing/unsupported file yields an unopened capture; bail out early
        # instead of seeking and reading on it.
        if not cap.isOpened():
            return False

        # Clamp at zero: the reported frame count is not guaranteed to be positive.
        frame_count = max(int(cap.get(cv2.CAP_PROP_FRAME_COUNT)), 0)
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_count // 2)

        ret, frame = cap.read()
        if not ret or frame is None:
            return False

        # imwrite reports failure through its return value rather than raising.
        return bool(cv2.imwrite(save_path, frame))
    finally:
        # Always give the capture handle back, even if reading/writing raised.
        cap.release()

def caption_image(image_path):
    """Generate a caption for an image file with the loaded BLIP-2 model.

    Args:
        image_path: Path of the image to caption.

    Returns:
        The decoded caption with special tokens removed and surrounding
        whitespace stripped.
    """
    # Image.open keeps the file handle open until the image is closed; convert()
    # returns an independent copy, so the handle can be released right away.
    with Image.open(image_path) as source_image:
        raw_image = source_image.convert("RGB")

    # Send the inputs to wherever the model actually lives instead of assuming "cuda".
    inputs = blip_processor(images=raw_image, return_tensors="pt").to(blip_model.device)
    out = blip_model.generate(**inputs)
    return blip_processor.decode(out[0], skip_special_tokens=True).strip()

def whisper_transcribe(audio_path):
    """Transcribe an audio file with the loaded Whisper model.

    Args:
        audio_path: Path of the audio file passed to ``whisper_model.transcribe``.

    Returns:
        The value stored under the ``'text'`` key of the transcription result.
    """
    return whisper_model.transcribe(audio_path)['text']

def extract_prosody(audio_path):
    """Compute three summary prosody statistics for an audio file.

    Args:
        audio_path: Path of the audio file loaded with ``torchaudio.load``.

    Returns:
        A dict with:
          * ``pitch_mean``: mean of ``detect_pitch_frequency`` over the signal,
          * ``energy_rms``: root-mean-square of the samples,
          * ``zcr_rhythm``: mean absolute difference between the signs of
            consecutive samples.
    """
    signal, rate = torchaudio.load(audio_path)

    # Average the channels together when the file is not already mono.
    channel_count = signal.shape[0]
    if channel_count > 1:
        signal = signal.mean(dim=0, keepdim=True)

    pitch_track = torchaudio.functional.detect_pitch_frequency(signal, rate)
    rms = signal.pow(2).mean().sqrt().item()
    sign_steps = signal.sign().diff(dim=1).abs()

    features = {}
    features['pitch_mean'] = pitch_track.mean().item()
    features['energy_rms'] = rms
    features['zcr_rhythm'] = sign_steps.mean().item()
    return features

def parse_au_intensity(openface_csv_path, peak_index, au_phrases):
    """Describe the active action units of one row of an OpenFace CSV.

    Args:
        openface_csv_path: Path of the CSV to read.
        peak_index: Positional row index to describe. When it is past the end
            of the file the middle row is used instead.
        au_phrases: Mapping of action-unit code (e.g. ``"AU12"``) to the phrase
            describing it; each code is looked up as the ``"<code>_r"`` column.

    Returns:
        A list of ``"<intensity> <phrase>"`` strings, in ``au_phrases`` order,
        for every action unit whose value on the chosen row exceeds 0.1.
        Empty when the CSV has no rows.
    """
    df = pd.read_csv(openface_csv_path)
    # OpenFace writes its header as "frame, face_id, ..., AU01_r, ...", so the
    # parsed column names carry a leading space. Without stripping them every
    # "<code>_r" lookup below misses and the result is always empty.
    df.columns = df.columns.str.strip()

    # Nothing to describe; indexing an empty frame would raise IndexError.
    if df.empty:
        return []

    if peak_index >= len(df):
        peak_index = len(df) // 2
    row = df.iloc[peak_index]

    output_phrases = []
    for au, phrase in au_phrases.items():
        column = f"{au}_r"
        if column not in row:
            continue
        val = row[column]
        # Values at or below 0.1 (and NaN) are treated as "not active".
        if val > 0.1:
            intensity = (
                "barely" if val < 0.2 else
                "slightly" if val < 1.0 else
                "moderately" if val < 2.5 else
                "strongly" if val < 5.0 else
                "very strongly"
            )
            output_phrases.append(f"{intensity} {phrase}")
    return output_phrases

def merge_modalities(visual_phrases, audio_prosody, transcript):
    """Assemble the visual, prosodic and textual signals into one prompt string.

    Args:
        visual_phrases: Iterable of phrase strings, joined with single spaces.
        audio_prosody: Mapping providing ``pitch_mean``, ``energy_rms`` and
            ``zcr_rhythm`` numeric values.
        transcript: Transcript text, embedded between double quotes.

    Returns:
        A four-line string: visual cues, audio prosody, quoted transcript and
        a closing instruction sentence.
    """
    cues = " ".join(visual_phrases)

    # Each statistic is rendered with its own fixed precision.
    pitch = format(audio_prosody['pitch_mean'], ".2f")
    energy = format(audio_prosody['energy_rms'], ".3f")
    rhythm = format(audio_prosody['zcr_rhythm'], ".4f")
    prosody_summary = f"Pitch={pitch}, Energy={energy}, Rhythm={rhythm}"

    sections = [
        f"Visual Cues: {cues}",
        f"Audio Prosody: {prosody_summary}",
        f'Transcript: "{transcript}"',
        "Describe the emotional context objectively using these signals.",
    ]
    return "\n".join(sections)

In [None]:
def _find_peak_au_row(openface_csv_path, au_phrases):
    """Return the positional index of the row with the largest summed AU intensity."""
    df_au = pd.read_csv(openface_csv_path)
    # OpenFace headers are written with a space after each comma (" AU01_r").
    df_au.columns = df_au.columns.str.strip()
    au_columns = [f"{au}_r" for au in au_phrases if f"{au}_r" in df_au.columns]
    if df_au.empty or not au_columns:
        # No intensity data to rank rows by: use the middle row (0 when empty).
        return len(df_au) // 2
    return int(df_au[au_columns].sum(axis=1).to_numpy().argmax())


def process_sample(video_path, audio_path, transcript_path, openface_csv_path):
    """Run every modality extractor on one sample and merge the results.

    Args:
        video_path: Video whose middle frame is captioned.
        audio_path: Audio file that is transcribed and analysed for prosody.
        transcript_path: Text file holding the reference transcript.
        openface_csv_path: OpenFace CSV with action-unit intensity columns.

    Returns:
        A dict with the keys ``caption``, ``whisper_text``, ``prosody``,
        ``au_phrases``, ``transcript`` and ``final_merged``.

    Raises:
        FileNotFoundError: If no frame could be extracted from ``video_path``.

    Side effects:
        Writes ``frame_<video basename>.jpg`` into the current directory.
    """
    base_name = os.path.splitext(os.path.basename(video_path))[0]
    frame_path = f"frame_{base_name}.jpg"

    # Remove a frame left behind by an earlier run first; otherwise a failed
    # extraction would go unnoticed and the stale image would be captioned.
    if os.path.exists(frame_path):
        os.remove(frame_path)
    extract_middle_frame(video_path, frame_path)
    if not os.path.exists(frame_path):
        raise FileNotFoundError(
            f"Could not extract a frame from {video_path!r} into {frame_path!r}"
        )

    visual_caption = caption_image(frame_path)
    audio_transcript = whisper_transcribe(audio_path)
    prosody_features = extract_prosody(audio_path)

    # Explicit encoding so the result does not depend on the platform default.
    with open(transcript_path, 'r', encoding='utf-8') as f:
        text = f.read().strip()

    # The previous df['frame'].idxmax() always picked the row with the highest
    # frame number (the last frame), not a peak of expression. Rank rows by
    # their summed action-unit intensity instead.
    peak_frame = _find_peak_au_row(openface_csv_path, AU_PHRASES)
    visual_phrases = parse_au_intensity(openface_csv_path, peak_frame, AU_PHRASES)
    merged = merge_modalities(visual_phrases, prosody_features, text)
    return {
        "caption": visual_caption,
        "whisper_text": audio_transcript,
        "prosody": prosody_features,
        "au_phrases": visual_phrases,
        "transcript": text,
        "final_merged": merged
    }

In [None]:
# Upload small test files through Colab's interactive upload prompt.
# `uploaded` keeps whatever files.upload() returns for the selected files.
# NOTE(review): this cell needs user interaction, so the notebook cannot be
# re-run unattended from a fresh kernel.
from google.colab import files
uploaded = files.upload()

Saving Ses01F_impro01.avi to Ses01F_impro01.avi


In [None]:
# Mount Google Drive at /content/drive.
# NOTE(review): no visible cell reads from the mounted path — confirm the mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Example usage — replace the placeholder paths with real files, then uncomment:
# sample = process_sample(
#     video_path="your_video.mp4",
#     audio_path="your_audio.wav",
#     transcript_path="your_transcript.txt",
#     openface_csv_path="your_openface_output.csv"
# )
# print(sample["final_merged"])

In [None]:
# Sanity check: confirm OpenCV can open the uploaded clip.
# NOTE(review): `cap` is not released in the visible cells; call cap.release()
# once it is no longer needed so the file handle is not held by the kernel.
cap = cv2.VideoCapture("/content/Ses01F_impro01.avi")
print("Is video opened?", cap.isOpened())

Is video opened? True
