# Count all the German files in the dataset

In [4]:
import pandas as pd

# Location of the language-identification results table
csv_file_path = "/ceph/shared/ALL/datasets/voxceleb2-V2/voxceleb2-language-identification.csv"

# Load the whole table into memory
df = pd.read_csv(csv_file_path)

# Tally the rows whose detected_language equals 'de' by summing a boolean mask
is_german = df['detected_language'].eq('de')
german_count = int(is_german.sum())

print(f"Total rows: {len(df)}")
print(f"German ('de') files found: {german_count}")

Total rows: 1092009
German ('de') files found: 89700


# Generate a CSV file of German paths

In [5]:
# Keep only the rows whose detected language is German ('de')
german_mask = df['detected_language'] == 'de'
german_df = df.loc[german_mask]

# Destination of the German-only subset
output_csv_path = "/ceph/shared/ALL/datasets/voxceleb2-V2/voxceleb2-german-only-whisperLarge.csv"

# Write the subset without the pandas row index column
german_df.to_csv(output_csv_path, index=False)

print(f"Saved {len(german_df)} German entries to: {output_csv_path}")

Saved 89700 German entries to: /ceph/shared/ALL/datasets/voxceleb2-V2/voxceleb2-german-only-whisperLarge.csv


# Extract the German audio/video pairs

In [6]:
import pandas as pd
import shutil
import os
from tqdm import tqdm

# --- Configuration ---
# CSV of German-only entries; this is the same path the "generate a csv file" cell writes to.
csv_file_path = "/ceph/shared/ALL/datasets/voxceleb2-V2/voxceleb2-german-only-whisperLarge.csv"

# Source Roots: dev-split trees holding the audio (aac) and video (mp4) files
src_aac_root = "/ceph/shared/ALL/datasets/voxceleb2-V2/dev/aac"
src_mp4_root = "/ceph/shared/ALL/datasets/voxceleb2-V2/dev/mp4"

# Destination Roots: German-only copies, using the same aac/mp4 split as the source roots
dst_aac_root = "/ceph/shared/ALL/datasets/voxceleb2-V2/VoxCeleb2-German/dev/aac"
dst_mp4_root = "/ceph/shared/ALL/datasets/voxceleb2-V2/VoxCeleb2-German/dev/mp4"

def _copy_if_absent(src_path, dst_path, kind, stats):
    """Copy ``src_path`` to ``dst_path`` and record the outcome in ``stats``.

    Nothing is copied when the source does not exist (counted as
    ``<kind>_missing``) or when the destination already exists (counted as
    ``<kind>_skipped``). ``kind`` is ``"audio"`` or ``"video"``; it selects
    the counters to update and is echoed in the error message.
    """
    if not os.path.exists(src_path):
        stats[f"{kind}_missing"] += 1
        return
    if os.path.exists(dst_path):
        stats[f"{kind}_skipped"] += 1
        return

    # Create the destination folder only when a file will really be written,
    # so a missing source does not leave an empty directory behind.
    os.makedirs(os.path.dirname(dst_path), exist_ok=True)
    try:
        shutil.copy2(src_path, dst_path)
    except OSError as e:
        # Best effort: report the failure and carry on with the next file.
        print(f"Error copying {kind} {src_path}: {e}")
        stats[f"{kind}_failed"] += 1
    else:
        stats[f"{kind}_copied"] += 1


def copy_german_files(csv_path, src_aac, src_mp4, dst_aac, dst_mp4):
    """Copy every audio file listed in a CSV, plus its matching video, into new roots.

    Each value of the CSV's ``file_path`` column is expected to end with
    ``.../<id>/<seq>/<filename>`` (e.g. ``.../aac/id00015/vUAbwL9omyM/00452.m4a``).
    The ``<id>/<seq>`` structure is preserved under both destination roots.

    Args:
        csv_path: CSV file with a ``file_path`` column of audio paths.
        src_aac: Source audio root. Not used for lookups, because the audio
            path is taken verbatim from the CSV; kept so the signature stays
            stable for existing callers.
        src_mp4: Source video root; the video is looked up at
            ``<src_mp4>/<id>/<seq>/<stem>.mp4``.
        dst_aac: Destination root for audio files.
        dst_mp4: Destination root for video files.

    Returns:
        dict: counters ``{audio,video}_{copied,skipped,missing,failed}`` and
        ``malformed_rows``. The same numbers are printed as a summary, so
        missing sources are no longer silently ignored.
    """
    print(f"Reading CSV: {csv_path}")
    df = pd.read_csv(csv_path)

    print(f"Found {len(df)} files to copy.")

    outcomes = ("copied", "skipped", "missing", "failed")
    stats = {f"{kind}_{outcome}": 0 for kind in ("audio", "video") for outcome in outcomes}
    stats["malformed_rows"] = 0

    # Only the 'file_path' column is needed, so iterate it directly instead of
    # materialising a Series per row with iterrows().
    for full_audio_path in tqdm(df['file_path'], total=len(df), desc="Copying Files"):
        # Empty cells are read as NaN (a float); treat them like unparsable paths.
        parts = full_audio_path.split('/') if isinstance(full_audio_path, str) else []
        if len(parts) < 3:
            print(f"Warning: cannot parse path, skipping row: {full_audio_path!r}")
            stats["malformed_rows"] += 1
            continue

        # e.g. id00015 / vUAbwL9omyM / 00452.m4a
        id_folder, seq_folder, filename = parts[-3:]

        # Swap only the extension; str.replace would also rewrite a '.m4a'
        # occurring earlier in the name.
        video_filename = os.path.splitext(filename)[0] + '.mp4'

        src_video = os.path.join(src_mp4, id_folder, seq_folder, video_filename)
        dst_audio_file = os.path.join(dst_aac, id_folder, seq_folder, filename)
        dst_video_file = os.path.join(dst_mp4, id_folder, seq_folder, video_filename)

        _copy_if_absent(full_audio_path, dst_audio_file, "audio", stats)
        _copy_if_absent(src_video, dst_video_file, "video", stats)

    print("Copy process completed.")
    for kind in ("audio", "video"):
        print(
            f"{kind}: copied={stats[f'{kind}_copied']}, "
            f"already present={stats[f'{kind}_skipped']}, "
            f"source missing={stats[f'{kind}_missing']}, "
            f"failed={stats[f'{kind}_failed']}"
        )
    if stats["malformed_rows"]:
        print(f"rows with unparsable paths: {stats['malformed_rows']}")

    return stats

# Run the copy with the configured CSV and source/destination roots
if __name__ == "__main__":
    copy_german_files(
        csv_path=csv_file_path,
        src_aac=src_aac_root,
        src_mp4=src_mp4_root,
        dst_aac=dst_aac_root,
        dst_mp4=dst_mp4_root,
    )

Reading CSV: /ceph/shared/ALL/datasets/voxceleb2-V2/voxceleb2-german-only-whisperLarge.csv
Found 89700 files to copy.


Copying Files: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 89700/89700 [43:31<00:00, 34.34it/s]

Copy process completed.





In [None]:
import os

# --- Configuration ---
# Reference directory (the one holding all 714 items)
ref_path = "/ceph/shared/ALL/datasets/voxceleb2-V2/VoxCeleb2-German/dev/aac"

# Directory under inspection (the one holding 711 items)
# UPDATE THIS PATH to the actual location of 'vox2_german_video_seg16s'
target_path = "/ceph/shared/ALL/datasets/voxceleb2-V2/VoxCeleb2-German/dev/processedVideos/vox2_german/vox2_german_video_seg16s"

# --- Processing ---
# Entry names of each directory, held as sets so the gap is a plain set difference
ref_items = set(os.listdir(ref_path))
target_items = set(os.listdir(target_path))

# Names present in the reference but absent from the target
missing_items = ref_items.difference(target_items)

print(f"Items in Reference (aac): {len(ref_items)}")
print(f"Items in Target (seg16s): {len(target_items)}")
print(f"Number of missing items: {len(missing_items)}")
print("-" * 30)

if not missing_items:
    print("No missing items found! The folders match.")
else:
    print("The following folders are missing in 'vox2_german_video_seg16s':")
    print("\n".join(sorted(missing_items)))

FileNotFoundError: [Errno 2] No such file or directory: '/ceph/shared/ALL/datasets/voxceleb2-V2/vox2_german_video_seg16s'

In [1]:
import torch
import os
# Report the name of CUDA device 0 when a GPU is visible, otherwise say so
gpu_message = (
    f"GPU is available: {torch.cuda.get_device_name(0)}"
    if torch.cuda.is_available()
    else "GPU is not available."
)
print(gpu_message)

# Show the active Conda environment name (prints None when the variable is unset)
print(os.environ.get('CONDA_DEFAULT_ENV'))

GPU is available: NVIDIA RTX PRO 6000 Blackwell Server Edition MIG 4g.96gb
mms-llama-blk-l


In [3]:
import os
import sys
import torch
import torchaudio
import numpy as np

# Repository layout: every import root lives under the MMS-LLaMA checkout
repo_root = "/ceph/home/TUG/olivares-tug/MMS-LLaMA"
fairseq_dir = os.path.join(repo_root, "fairseq")
avhubert_dir = os.path.join(repo_root, "avhubert")
src_dir = os.path.join(repo_root, "src")

# Same effect as the terminal export PYTHONPATH="$(pwd)/fairseq:$(pwd):$PYTHONPATH",
# with the avhubert directory added; whatever was already set is kept at the end.
inherited_pythonpath = os.environ.get("PYTHONPATH", "")
os.environ["PYTHONPATH"] = ":".join([fairseq_dir, repo_root, avhubert_dir, inherited_pythonpath])

# Put the repo and fairseq roots at the front of sys.path before fairseq is imported.
# Each root is inserted at position 0, so the last one listed ends up first.
for import_root in (avhubert_dir, fairseq_dir, repo_root, src_dir):
    if import_root in sys.path:
        continue
    sys.path.insert(0, import_root)

# A bare argv sends AVHubert down its debug import path, which registers the
# model twice; a dummy argument avoids that.
if len(sys.argv) == 1:
    sys.argv += ["run"]

from fairseq import checkpoint_utils, tasks
from fairseq.data.dictionary import Dictionary
from transformers import AutoTokenizer, WhisperProcessor
from omegaconf import OmegaConf

import src.task
from src.modelSpeech import MMS_LLaMA_Speech
from src.utils import Compose, Normalize, CenterCrop, load_video

# Run on the GPU when CUDA is available, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# MMS-LLaMA checkpoint, LLM identifier, and the AV-HuBERT weights file inside the repo
ckpt_path = "/ceph/home/TUG/olivares-tug/MMS-LLaMA/pretrained_models/mms_llama/1759h/ckpt-1759h.pt"
llm_path = "meta-llama/Llama-3.2-3B"
w2v_path = os.path.join(repo_root, "pretrained_models/avhubert/muavic_multilingual_compatible.pt")

# Load config/task like demo.py (uses checkpoint defaults)
# These overrides are passed as `arg_overrides` when the checkpoint is loaded.
# They are grouped by config section: "task", "model" and "common".
model_overrides = {
    "task": {
        # The same manifest directory is used as both data root and label directory
        "data": os.path.join(repo_root, "manifest/germanManifest"),
        "label_dir": os.path.join(repo_root, "manifest/germanManifest"),
        "llm_path": llm_path,
        # NOTE(review): noise settings look like training-time augmentation; confirm they are inert for this forward pass
        "noise_prob": 0.75,
        "noise_wav": os.path.join(repo_root, "noise/babble_noise.wav"),
        "normalize": True,
    },
    "model": {
        "data": os.path.join(repo_root, "manifest/germanManifest"),
        "w2v_path": w2v_path,
        "llm_path": llm_path,
        "dropout_input": 0.0,
        "w2v_args": None,
        "normalize": True,
        "no_pretrained_weights": False,
        "window_level": False,
        # Masking is switched off here; the mask_* values below are still supplied
        "apply_mask": False,
        "mask_selection": "static",
        "mask_length": 10,
        "mask_other": 0,
        "mask_prob": 0.75,
        "no_mask_overlap": False,
        "mask_channel_selection": "static",
        "mask_channel_length": 64,
        "mask_channel_other": 0,
        "mask_channel_prob": 0.5,
        "no_mask_channel_overlap": False,
        "layerdrop": 0.1,
        "activation_dropout": 0.1,
        "attention_dropout": 0.0,
        "dropout": 0.0,
        "feature_grad_mult": 1.0,
        "freeze_finetune_updates": 0,
        "sr_predictor_layers": 2,
        "qformer_layers": 2,
        "qformer_dim": 1024,
        "queries_per_sec": 3,
        "use_qformer": True,
        "use_sr_predictor": True,
        # NOTE(review): embedding widths presumably have to match the loaded Whisper / AV-HuBERT / LLaMA models; verify
        "whisper_embed_dim": 1024,
        "avhubert_embed_dim": 1024,
        "llama_embed_dim": 3072,
        "modality_fuse": "concat",
        "lora_rank": 16,
        "lora_alpha": 32,
        # Module names are joined with '.' in a single string
        "target_modules": "q_proj.k_proj.v_proj.o_proj",
    },
    "common": {
        # Directory added to sys.path earlier in this cell (repo_root/src)
        "user_dir": src_dir,
    },
}

# Load the checkpoint through fairseq to obtain its cfg and task.
# The returned model ensemble is discarded (bound to `_`); the model is rebuilt below.
_, cfg, task = checkpoint_utils.load_model_ensemble_and_task(
    [ckpt_path],
    arg_overrides=model_overrides,
    strict=False,
)

# Merge with training defaults to populate missing config keys
# The YAML defaults are first patched with the same paths/values used in model_overrides.
base_cfg = OmegaConf.load(os.path.join(repo_root, "src/conf/mms-llama.yaml"))
base_cfg.task.data = model_overrides["task"]["data"]
base_cfg.task.label_dir = model_overrides["task"]["label_dir"]
base_cfg.task.llm_path = model_overrides["task"]["llm_path"]
base_cfg.task.noise_prob = model_overrides["task"]["noise_prob"]
base_cfg.task.noise_wav = model_overrides["task"]["noise_wav"]
base_cfg.common.user_dir = src_dir
base_cfg.model.w2v_path = w2v_path
base_cfg.model.llm_path = llm_path

# base_cfg is passed first and cfg second, so keys already present in the
# checkpoint cfg should win and base_cfg only fills in absent ones.
# NOTE(review): confirm this precedence against the OmegaConf.merge documentation.
cfg.task = OmegaConf.merge(base_cfg.task, cfg.task)
cfg.model = OmegaConf.merge(base_cfg.model, cfg.model)
cfg.common = OmegaConf.merge(base_cfg.common, cfg.common)

# Build MMS_LLaMA_Speech from checkpoint config
model = MMS_LLaMA_Speech.build_model(cfg.model, task)
# The checkpoint file is read a second time here, this time to get the raw state dict.
# NOTE(review): safe_globals presumably only matters for weights_only=True loads, so it
# looks redundant with weights_only=False — confirm. weights_only=False unpickles
# arbitrary objects, so this must only be used on a trusted checkpoint.
with torch.serialization.safe_globals([Dictionary]):
    checkpoint = torch.load(ckpt_path, map_location="cpu", weights_only=False)
# strict=False: mismatched keys are not raised as errors, and the return value
# (missing/unexpected keys) is not inspected here.
model.load_state_dict(checkpoint["model"], strict=False)
# Switch to inference mode and move the model to the selected device
model.eval().to(device)

# Prepare processors
# Tokenizer comes from the same LLM identifier used in the model config
tokenizer = AutoTokenizer.from_pretrained(llm_path)
# Only `.input_features` of this processor is used further down in this cell.
# NOTE(review): this is the English ".en" Whisper variant while the manifests are German;
# presumably harmless for feature extraction — confirm.
whisper_processor = WhisperProcessor.from_pretrained("openai/whisper-medium.en")
# Frame preprocessing applied in order: Normalize(0.0, 255.0), 88x88 centre crop, Normalize(0.0, 1.0)
# NOTE(review): Normalize's arguments are presumably (mean, std), making the last step a no-op; verify in src.utils
video_transform = Compose([
    Normalize(0.0, 255.0),
    CenterCrop((88, 88)),
    Normalize(0.0, 1.0),
])

# Example clip: the audio and video files of the same segment
audio_path = "/ceph/home/TUG/olivares-tug/datasets/lrs3/lrs3_video_seg24s/test/0Fi83BHQsMA/00002.wav"
video_path = "/ceph/home/TUG/olivares-tug/datasets/lrs3/lrs3_video_seg24s/test/0Fi83BHQsMA/00002.mp4"

# Load raw audio and compute sample length (pre-Whisper padding)
wav, sr = torchaudio.load(audio_path)
if sr != 16000:
    # Downstream code passes sampling_rate=16000, so resample anything else
    wav = torchaudio.transforms.Resample(sr, 16000)(wav)

# torchaudio.load returns a (channels, samples) tensor. squeeze(0) only drops
# the channel axis for mono input; a multi-channel file would stay 2-D, which
# inflates the sample count below and makes the Whisper processor treat the
# channels as a batch. Downmix to mono in that case.
if wav.size(0) > 1:
    wav = wav.mean(dim=0)
else:
    wav = wav.squeeze(0)
audio_len_samples = torch.tensor([wav.numel()], dtype=torch.long, device=device)

# Whisper input features
audio_features = whisper_processor(wav.cpu().numpy(), sampling_rate=16000, return_tensors="pt").input_features.to(device)

# Video features
# Read the frames and apply the normalise / centre-crop pipeline defined above
frames = load_video(video_path)
frames = video_transform(frames)
# Append a trailing singleton axis, then reorder so that axis comes first.
# assumes frames is a 3-D (T, H, W) array at this point — TODO confirm against load_video
frames = np.expand_dims(frames, axis=-1)
video_tensor = torch.from_numpy(frames.astype(np.float32))
# permute(3, 0, 1, 2) moves the new axis to the front; unsqueeze(0) adds a leading batch axis of size 1
video_tensor = video_tensor.permute(3, 0, 1, 2).unsqueeze(0).to(device)

# Padding mask for video frames
# Dimension 2 of video_tensor is the original first axis of `frames`.
# The mask is all False, i.e. no position is marked as padding.
T = video_tensor.shape[2]
padding_mask = torch.zeros((1, T), dtype=torch.bool, device=device)

# Instruction tokens (empty)
# Tokenise the empty string; the result holds only whatever the tokenizer adds by itself
instruction_tokens = tokenizer("", return_tensors="pt").input_ids[0].to(device)

# Minimal empty labels list for forward_speech
target_list = [torch.tensor([], dtype=torch.long, device=device)]

# Single forward pass with gradient tracking disabled
with torch.no_grad():
    out = model.forward_speech(
        source={
            "audio": audio_features,
            "video": video_tensor,
            # One instruction tensor per batch item (batch of one here)
            "instruction": [instruction_tokens],
            # Raw sample count of the waveform, measured before Whisper feature extraction
            "audio_lengths": audio_len_samples,
        },
        padding_mask=padding_mask,
        target_list=target_list,
    )

# Sanity check: report the shapes of the two outputs read from the result dict
print("melspec shape:", out["melspec"].shape)
print("hidden_states shape:", out["hidden_states"].shape)

  WeightNorm.apply(module, name, dim)
`low_cpu_mem_usage` was None, now default to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

trainable params: 9,175,040 || all params: 3,221,924,864 || trainable%: 0.2848


BertLMHeadModel has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
`low_cpu_mem_usage` was None, now default to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

trainable params: 9,175,040 || all params: 3,221,924,864 || trainable%: 0.2848
conformer encoder, details={'num_blocks': 12, 'attention_dim': 512, 'attention_heads': 8}
melspec shape: torch.Size([1, 83, 80])
hidden_states shape: torch.Size([1, 5, 3072])
