### About Dataset

### Dataset Description

1. CREMA-D

    The CREMA-D database (Cao et al., 2014) comprises 7,442 video clips of 91 actors (48 male, 43 female) aged 20–74 with diverse ethnicities (African American, Asian, Caucasian, Hispanic, Unspecified), speaking 12 phonetically balanced sentences in six basic emotions (anger, disgust, fear, happy, neutral, sad), each rendered at four intensity levels (Low, Medium, High, Unspecified) in English.

#### Filename Annotation
 
* Each of the 7,442 CREMA-D files has a unique filename. The filename consists of a 5-part identifier (e.g., 1001_DFA_ANG_XX_01.mp4). These identifiers define the stimulus characteristics:
* Actor ID (1001 to 1091 for each of 91 actors)

#### Statement:

        IEO = "It's eleven o'clock"
        TIE = "That is exactly what happened"
        IOM = "I'm on my way to the meeting"
        IWW = "I wonder what this is about"
        TAI = "The airplane is almost full"
        MTI = "Maybe tomorrow it will be cold"
        IWL = "I would like a new alarm clock"
        ITH = "I think I have a doctor's appointment"
        DFA = "Don't forget a jacket"
        ITS = "I think I've seen this before"
        TSI = "The surface is slick"
        WSI = "We'll stop in a couple of minutes"

* Emotion (ANG = anger, DIS = disgust, FEA = fear, HAP = happy, NEU = neutral, SAD = sad)
* Emotional intensity (LO = low, MD = medium, HI = high, XX = unspecified)
* Gender (01 = male, 02 = female)



### Tri-modal Preprocessing for the CREMA-D Dataset:

In [None]:
# ============================================
# Trimodal preprocessing (CREMA-D): A/V/P
# Audio/Video SAME as your original
# Pose UPDATED: MediaPipe 8 body + 70 face + gap-fill
# OVERWRITES files in processed_features2 (OUTPUT_DIR)
# ============================================
import os, cv2, time, json, shutil, logging, warnings
import numpy as np
import torch, torchaudio
from tqdm import tqdm

import mediapipe as mp
from torchvision import models
from torchvision.models.efficientnet import EfficientNet_B2_Weights
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model
from facenet_pytorch import MTCNN

# ----------------------------
# Logging
# ----------------------------
# All progress and error messages from this script go to LOG_FILE. The path is
# relative, so the file is created in the current working directory.
LOG_FILE = 'preprocessing.log'
logging.basicConfig(
    filename=LOG_FILE,
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
# Silence FutureWarning messages raised from the facenet_pytorch module.
warnings.filterwarnings("ignore", category=FutureWarning, module="facenet_pytorch")

# ----------------------------
# Config
# ----------------------------
# Dataset root; audio, video, and output folders are all resolved under it.
CREMA_D_ROOT = r"E:\Research_Datasets"
AUDIO_DIR  = os.path.join(CREMA_D_ROOT, "CREMA_D_Audio")
VIDEO_DIR  = os.path.join(CREMA_D_ROOT, "CREMA_D_Videos")
OUTPUT_DIR = os.path.join(CREMA_D_ROOT, "processed_features2")
# Scratch folder inside OUTPUT_DIR: arrays are saved here first and then moved
# into OUTPUT_DIR.
TEMP_DIR   = os.path.join(OUTPUT_DIR, "temp")
# NOTE: both folders are created as a side effect of running/importing this cell.
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(TEMP_DIR, exist_ok=True)

# Fixed sequence lengths every saved sample is truncated or zero-padded to.
TARGET_VIDEO_FRAMES = 90   # 3s * 30fps
TARGET_AUDIO_FRAMES = 150  # 3s * 50Hz wav2vec

# Emotion code (third "_"-separated field of the filename) -> integer class label.
EMOTION_MAP = {"ANG":0, "DIS":1, "FEA":2, "HAP":3, "NEU":4, "SAD":5}

# ----------------------------
# MediaPipe setup (pose + face)
# ----------------------------
# Module-level MediaPipe solution objects, created once and reused by every
# call to extract_pose_features().
# NOTE(review): static_image_mode=False presumably lets MediaPipe track
# landmarks across consecutive frames; because these instances are shared by
# all clips, confirm that state carried from one video into the next is
# acceptable.
mp_pose = mp.solutions.pose
mp_face = mp.solutions.face_mesh
pose = mp_pose.Pose(
    static_image_mode=False,
    model_complexity=1,
    enable_segmentation=False,
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5
)
# Face mesh restricted to a single face per frame.
face_mesh = mp_face.FaceMesh(
    static_image_mode=False,
    max_num_faces=1,
    refine_landmarks=True,
    min_detection_confidence=0.5
)

# ----------------------------
# Pose: 8 body (OpenPose order) + 70 face, with gap-filling
# ----------------------------
POSE_VIS_THR = 0.5  # minimum landmark visibility for a body point to count

def _xy_or_none(lms, idx, min_vis=POSE_VIS_THR):
    """
    Look up landmark ``idx`` in ``lms`` and return its ``(x, y)`` pair.

    Returns None when the landmark list or the index is None, when the index
    is past the end of the list, or when the landmark's ``visibility``
    attribute is below ``min_vis``. Landmarks without a ``visibility``
    attribute are treated as fully visible.
    """
    lookup_possible = lms is not None and idx is not None and idx < len(lms)
    if not lookup_possible:
        return None

    landmark = lms[idx]
    too_faint = getattr(landmark, "visibility", 1.0) < min_vis
    return None if too_faint else (landmark.x, landmark.y)

def _mid(a, b):
    """Return the midpoint of two (x, y) points, or None if either is None."""
    if a is None or b is None:
        return None
    mid_x = (a[0] + b[0]) / 2.0
    mid_y = (a[1] + b[1]) / 2.0
    return (mid_x, mid_y)

def extract_pose_features(frame):
    """
    Build the 156-dim pose vector for one BGR frame.

    Layout (each point contributes x then y):
      - 8 body points: nose, neck (midpoint of the shoulders), right shoulder,
        left shoulder, right eye, left eye, right ear, left ear
      - 70 face points: the first 70 FaceMesh landmarks

    Coordinates are the x/y values reported by MediaPipe. Any point that is
    not detected (or is below the visibility threshold) stays at (0, 0), so a
    frame where nothing is detected yields an all-zero vector.
    """
    rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    # --- body (8 points) ---
    pose_result = pose.process(rgb)
    landmarks = None
    if pose_result and pose_result.pose_landmarks:
        landmarks = pose_result.pose_landmarks.landmark

    left_shoulder = _xy_or_none(landmarks, 11)
    right_shoulder = _xy_or_none(landmarks, 12)
    keypoints = (
        _xy_or_none(landmarks, 0),            # nose
        _mid(left_shoulder, right_shoulder),  # neck
        right_shoulder,
        left_shoulder,
        _xy_or_none(landmarks, 5),            # right eye
        _xy_or_none(landmarks, 2),            # left eye
        _xy_or_none(landmarks, 8),            # right ear
        _xy_or_none(landmarks, 7),            # left ear
    )
    body = np.array(
        [(0.0, 0.0) if kp is None else kp for kp in keypoints],
        dtype=np.float32,
    )

    # --- face (70 points) ---
    face = np.zeros((70, 2), dtype=np.float32)
    mesh_result = face_mesh.process(rgb)
    if mesh_result and mesh_result.multi_face_landmarks:
        mesh = mesh_result.multi_face_landmarks[0].landmark
        # zip() stops at 70 points or at the end of the mesh, whichever is first.
        for row, lm in zip(range(70), mesh):
            face[row] = (lm.x, lm.y)

    return np.concatenate((body.ravel(), face.ravel()), axis=0)  # (156,)

def fill_pose_gaps(pose_arr, max_gap=8):
    """
    Fill missing frames in a pose sequence.

    A frame counts as missing only when its entire row is zero.

    Args:
        pose_arr: (T, D) array; the input is not modified.
        max_gap: Longest run of missing frames that is linearly interpolated
            between its two valid neighbours. Longer runs repeat the last
            valid frame before the gap.

    Returns:
        A copy of ``pose_arr`` where leading missing frames take the first
        valid frame, trailing ones take the last valid frame, and interior
        gaps are filled as described above. If no frame is valid the copy is
        returned unchanged.
    """
    filled = pose_arr.copy()
    valid = np.nonzero(np.any(filled != 0, axis=1))[0]
    if valid.size == 0:
        return filled

    # Edges: extend the first / last valid frame outwards.
    head, tail = valid[0], valid[-1]
    filled[:head] = filled[head]
    filled[tail + 1:] = filled[tail]

    # Interior: walk over consecutive pairs of valid frames.
    for left, right in zip(valid[:-1], valid[1:]):
        missing = right - left - 1
        if missing <= 0:
            continue  # adjacent valid frames, nothing to fill
        if missing > max_gap:
            filled[left + 1:right] = filled[left]
            continue
        for step in range(1, missing + 1):
            alpha = step / (missing + 1)
            filled[left + step] = (1 - alpha) * filled[left] + alpha * filled[right]
    return filled

# ----------------------------
# Face features (EffNet-B2) and audio (Wav2Vec2) — UNCHANGED
# ----------------------------
# Run the models on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logging.info(f"Using device: {device}")

# Shared face detector used by detect_face(); keep_all=False asks for a single
# face per frame.
mtcnn = MTCNN(keep_all=False, min_face_size=20, device=device)

def detect_face(frame_bgr):
    """
    Detect a face in a BGR frame and return it as a model-ready crop.

    Args:
        frame_bgr: Image in BGR channel order with shape (H, W, 3).

    Returns:
        A float tensor of shape (3, 224, 224) with values scaled to [0, 1]
        (RGB order), built from the first box returned by MTCNN; None when no
        face is detected or the box has no area inside the frame.
    """
    frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
    boxes, _ = mtcnn.detect(frame_rgb)
    if boxes is None or len(boxes) == 0:
        return None

    img_h, img_w = frame_rgb.shape[:2]
    x1, y1, x2, y2 = boxes[0].astype(int)

    # Clamp BOTH corners to the image. Clamping only the top-left corner lets
    # a negative x2/y2 reach the slice below, where it would be interpreted as
    # an index from the end and silently crop the wrong region.
    x1, y1 = max(0, x1), max(0, y1)
    x2, y2 = min(img_w, x2), min(img_h, y2)
    if x2 <= x1 or y2 <= y1:
        return None

    roi = frame_rgb[y1:y2, x1:x2]
    if roi.size == 0:
        return None

    face_resized = cv2.resize(roi, (224, 224))
    # HWC uint8 -> CHW float in [0, 1]
    return torch.tensor(face_resized).permute(2, 0, 1).float() / 255.0

# Visual backbone: ImageNet-pretrained EfficientNet-B2 used as a frozen
# feature extractor (eval mode). Dropping the last child module removes the
# classification head, so the network outputs pooled features instead of
# class scores.
efficientnet = models.efficientnet_b2(weights=EfficientNet_B2_Weights.IMAGENET1K_V1)
efficientnet = torch.nn.Sequential(*list(efficientnet.children())[:-1])  # Remove final FC
efficientnet.eval().to(device)

# Audio backbone: the feature extractor prepares raw waveforms (and an
# attention mask) for the Wav2Vec2 model, which is also used in eval mode only.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
    "facebook/wav2vec2-large-robust",
    return_attention_mask=True
)
wav2vec_model = Wav2Vec2Model.from_pretrained(
    "facebook/wav2vec2-large-robust",
    use_safetensors=True,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float32
)
wav2vec_model.eval().to(device)

# ----------------------------
# Helpers
# ----------------------------
def _safe_replace(src, dst):
    """
    Move ``src`` to ``dst``, overwriting ``dst`` if it already exists.

    ``os.replace`` overwrites an existing destination on every platform
    (including Windows), so no prior ``os.remove`` is needed. Removing first
    would open a window in which ``dst`` does not exist and would lose the old
    file if the subsequent move failed.

    Raises:
        OSError: if the move fails (e.g. ``src`` is missing or the two paths
            are on different filesystems).
    """
    os.replace(src, dst)

# ----------------------------
# One video
# ----------------------------
def process_single_video(video_path, audio_dir, output_dir, temp_dir):
    """
    Extract audio, visual and pose features for one clip and save them.

    Args:
        video_path: Path to the video file. Its base name without extension is
            the sample id; the third "_"-separated field of that id must be a
            key of EMOTION_MAP.
        audio_dir: Directory searched for the matching audio file
            (``<sample_id>`` with no extension, ``.wav``, ``.WAV`` or
            ``.wav.wav``).
        output_dir: Directory receiving ``<sample_id>_label.npy``,
            ``_video_frames.npy``, ``_audio_frames.npy`` and ``_pose.npy``.
            Existing files are overwritten.
        temp_dir: Scratch directory the arrays are written to before being
            moved into ``output_dir``.

    Returns:
        True on success. False if the video cannot be opened, the audio file
        is missing, or any exception is raised (the exception is logged, not
        propagated).
    """
    # splitext (rather than replace('.flv', '')) also strips '.FLV', which the
    # orchestrator accepts because it matches the extension case-insensitively.
    sample_id = os.path.splitext(os.path.basename(video_path))[0]
    start_time = time.time()

    try:
        emotion = sample_id.split("_")[2]
        label = EMOTION_MAP[emotion]
        logging.info(f"Processing {sample_id} (emotion: {emotion})")

        amp_device = 'cuda' if device.type == 'cuda' else 'cpu'
        use_amp = device.type == 'cuda'

        # -------------- read frames (uniformly sample to 90) --------------
        cap = cv2.VideoCapture(video_path)
        try:
            if not cap.isOpened():
                logging.error(f"Could not open video: {video_path}")
                return False

            total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT) or 0)
            if total > 0:
                idx_keep = np.linspace(0, total - 1, TARGET_VIDEO_FRAMES, dtype=int)
                want = set(idx_keep.tolist())
            else:
                want = None  # frame count unknown: take the first frames instead

            video_feats, pose_vecs = [], []
            i = 0
            while True:
                ret, frame = cap.read()
                if not ret:
                    break
                if want is not None and i not in want:
                    i += 1
                    continue

                # ---- visual features: face crop -> EfficientNet embedding ----
                face_t = detect_face(frame)
                if face_t is not None:
                    with torch.no_grad(), torch.amp.autocast(amp_device, enabled=use_amp):
                        f = efficientnet(face_t.unsqueeze(0).to(device)).squeeze()
                    video_feats.append(f.cpu().numpy())
                else:
                    # no face found: keep the time axis aligned with a zero vector
                    video_feats.append(np.zeros(1408, dtype=np.float32))

                # ---- pose vector ----
                vec156 = extract_pose_features(frame)
                pose_vecs.append(vec156.astype(np.float32))

                i += 1
                if len(pose_vecs) >= TARGET_VIDEO_FRAMES:
                    break
        finally:
            # Always free the decoder, even if feature extraction raised.
            cap.release()

        # zero-pad short clips up to the fixed length
        while len(video_feats) < TARGET_VIDEO_FRAMES:
            video_feats.append(np.zeros(1408, dtype=np.float32))
        while len(pose_vecs) < TARGET_VIDEO_FRAMES:
            pose_vecs.append(np.zeros(156, dtype=np.float32))

        video_np = np.stack(video_feats).astype(np.float32)      # (90, 1408)
        pose_np_raw = np.stack(pose_vecs).astype(np.float32)     # (90, 156)
        pose_np = fill_pose_gaps(pose_np_raw, max_gap=8)         # gap-filled

        # ---- audio ----
        audio_path = None
        for ext in ('', '.wav', '.WAV', '.wav.wav'):
            candidate = os.path.join(audio_dir, f"{sample_id}{ext}")
            if os.path.exists(candidate):
                audio_path = candidate
                break
        if audio_path is None:
            logging.error(f"Audio file missing for {sample_id}")
            return False

        waveform, sr = torchaudio.load(audio_path)
        # torchaudio returns (channels, time) by default. Downmix multi-channel
        # audio so the truncation below slices samples, not channels.
        if waveform.dim() > 1 and waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0)
        waveform = waveform.squeeze()
        if sr != 16000:
            waveform = torchaudio.transforms.Resample(sr, 16000)(waveform)
        waveform = waveform[:16000 * 3]  # 3 seconds

        inputs = feature_extractor(
            waveform, sampling_rate=16000, return_tensors="pt",
            padding="max_length", max_length=16000*3, truncation=True,
            return_attention_mask=True
        )
        inputs = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}

        with torch.no_grad(), torch.amp.autocast(amp_device, enabled=use_amp):
            out = wav2vec_model(
                input_values=inputs["input_values"], attention_mask=inputs.get("attention_mask", None)
            )
            audio_features = out.last_hidden_state.squeeze().cpu().float().numpy()  # (T,1024)

        # truncate / zero-pad to the fixed number of audio frames
        if len(audio_features) > TARGET_AUDIO_FRAMES:
            audio_np = audio_features[:TARGET_AUDIO_FRAMES]
        else:
            pad = TARGET_AUDIO_FRAMES - len(audio_features)
            audio_np = np.vstack([audio_features, np.zeros((pad, 1024), dtype=np.float32)])

        # ------------- save: write to temp_dir, then move to output_dir -------------
        # Use the directories passed by the caller instead of the module globals.
        os.makedirs(temp_dir, exist_ok=True)
        os.makedirs(output_dir, exist_ok=True)
        arrays = {
            "label": np.array([label], dtype=np.int64),
            "video_frames": video_np,
            "audio_frames": audio_np,
            "pose": pose_np,
        }
        tmp_paths = []
        for suffix, arr in arrays.items():
            tmp_path = os.path.join(temp_dir, f"{sample_id}_{suffix}.npy")
            np.save(tmp_path, arr)
            tmp_paths.append(tmp_path)

        # Move only after every array was written, so a failure above never
        # leaves a partially updated sample in output_dir.
        for src in tmp_paths:
            dst = os.path.join(output_dir, os.path.basename(src))
            _safe_replace(src, dst)

        # quick stats to log: fraction of all-zero pose rows before/after filling
        zero_raw = float(np.mean(np.all(pose_np_raw == 0, axis=1)))
        zero_filled = float(np.mean(np.all(pose_np == 0, axis=1)))
        logging.info(f"{sample_id}: pose zero_rows raw={zero_raw:.3f} -> filled={zero_filled:.3f}")
        logging.info(f"Processed {sample_id} in {time.time()-start_time:.2f}s")
        return True

    except Exception as e:
        # Top-level boundary for one sample: log with traceback and keep the
        # batch running.
        logging.exception(f"Failed {sample_id}: {e}")
        return False
    finally:
        torch.cuda.empty_cache()

# ----------------------------
# Verify
# ----------------------------
def verify_processed_data(output_dir, max_samples=5):
    """
    Spot-check saved feature files for shape and residual pose gaps.

    Samples are discovered through their ``*_video_frames.npy`` file and
    checked in sorted order, so the same samples are verified on every run.

    Args:
        output_dir: Directory containing the ``<sample_id>_*.npy`` files.
        max_samples: Number of samples to check (default 5); None checks all.

    Returns:
        True if every checked sample has all three feature files with the
        expected shapes and at most 1% all-zero pose rows; False otherwise or
        when no video feature file exists. Problems are printed, not raised.
    """
    print("\nVerifying processed data...")
    suffix = "_video_frames.npy"
    vids = sorted(f.replace(suffix, "") for f in os.listdir(output_dir) if f.endswith(suffix))
    if not vids:
        print("No video feature files found.")
        return False
    if max_samples is not None:
        vids = vids[:max_samples]

    # (file suffix, label used in messages, expected shape)
    expected = (
        ("video_frames", "video", (TARGET_VIDEO_FRAMES, 1408)),
        ("audio_frames", "audio", (TARGET_AUDIO_FRAMES, 1024)),
        ("pose", "pose", (TARGET_VIDEO_FRAMES, 156)),
    )

    ok = True
    for sid in vids:
        loaded = {}
        for file_key, label, shape in expected:
            path = os.path.join(output_dir, f"{sid}_{file_key}.npy")
            # Report a missing companion file instead of crashing in np.load.
            if not os.path.exists(path):
                ok = False
                print(f" ! missing {label} file for {sid}: {os.path.basename(path)}")
                continue
            arr = np.load(path)
            if arr.shape != shape:
                ok = False
                print(f" ! {label} shape bad for {sid}: {arr.shape}")
            loaded[label] = arr

        # pose zero rows should be ~0 after fill; only meaningful (and only
        # computable along axis 1) for a non-empty 2-D array
        pose = loaded.get("pose")
        if pose is not None and pose.ndim == 2 and pose.shape[0] > 0:
            if np.mean(np.all(pose == 0, axis=1)) > 0.01:
                ok = False
                print(f" ! residual zero pose rows for {sid}")

    if ok:
        print(" All verification checks passed!")
    return ok

# ----------------------------
# Orchestrator
# ----------------------------
def process_all_videos(video_dir, audio_dir, output_dir, test_mode=False, max_test_samples=10):
    """
    Run process_single_video over every .flv file in ``video_dir``.

    Files are handled in sorted name order. With ``test_mode`` enabled only
    the first ``max_test_samples`` files are processed. Prints a summary and
    returns the number of videos that were processed successfully.
    """
    video_files = sorted(
        name for name in os.listdir(video_dir) if name.lower().endswith(".flv")
    )
    if test_mode:
        video_files = video_files[:max_test_samples]

    processed = 0
    progress = tqdm(video_files, desc="Processing videos (A/V/P)")
    for name in progress:
        succeeded = process_single_video(
            os.path.join(video_dir, name), audio_dir, output_dir, TEMP_DIR
        )
        processed += 1 if succeeded else 0

    print(f"\nDone. processed={processed}/{len(video_files)}")
    return processed

# ----------------------------
# Main
# ----------------------------
if __name__ == "__main__":
    # Entry point: record the configuration, run a 10-sample smoke test, and
    # only process the full dataset when that test passes verification.
    rule = "=" * 50
    print(rule)
    print("TRIMODAL PREPROCESSING SCRIPT (A/V/P, overwrite)")
    print(rule)

    config = dict(
        CREMA_D_ROOT=CREMA_D_ROOT,
        AUDIO_DIR=AUDIO_DIR,
        VIDEO_DIR=VIDEO_DIR,
        OUTPUT_DIR=OUTPUT_DIR,
        TARGET_VIDEO_FRAMES=TARGET_VIDEO_FRAMES,
        TARGET_AUDIO_FRAMES=TARGET_AUDIO_FRAMES,
    )
    config_path = os.path.join(CREMA_D_ROOT, "preprocessing_config.json")
    with open(config_path, "w") as f:
        json.dump(config, f, indent=4)

    print("\nConfiguration:")
    for key, value in config.items():
        print(f"  {key}: {value}")

    # quick test first (10)
    print("\nRunning in TEST MODE (10 samples only)...")
    test_count = process_all_videos(VIDEO_DIR, AUDIO_DIR, OUTPUT_DIR, test_mode=True)

    if test_count <= 0:
        print("\n Test mode failed. Check logs for details.")
    else:
        print("\nTest mode successful! Verifying test data...")
        if not verify_processed_data(OUTPUT_DIR):
            print("\nTest data verification failed. Check logs for details.")
        else:
            print("\nTest data verified successfully!")
            print("Proceeding to full dataset processing...")

            # FULL RUN (all videos)
            process_all_videos(VIDEO_DIR, AUDIO_DIR, OUTPUT_DIR, test_mode=False)

            # Final verification
            print("\nVerifying full dataset...")
            verify_processed_data(OUTPUT_DIR)

            print("\n" + rule)
            print("PREPROCESSING COMPLETE!")
            print(rule)


### Splitting Data:

In [None]:
import os
import numpy as np
import shutil

# Configuration
PROCESSED_DIR = r"E:\Research_Datasets\processed_features2"
SPLIT_DIR = r"E:\Research_Datasets\processed_features_split2"
os.makedirs(SPLIT_DIR, exist_ok=True)

# Get all unique sample bases (e.g., "1001_DFA_ANG_XX"); a sample is
# identified by the presence of its *_video_frames.npy file.
samples = set()
for filename in os.listdir(PROCESSED_DIR):
    if filename.endswith("_video_frames.npy"):
        base_name = filename.replace("_video_frames.npy", "")
        samples.add(base_name)

# Sort before shuffling. The iteration order of a set of strings changes
# between interpreter runs, so shuffling list(samples) with a fixed seed would
# still give a different split every time. Re-running would then mix new files
# with the previous run's files and trip the leakage checks below.
samples = sorted(samples)
print(f"Found {len(samples)} unique samples")

# NOTE(review): splitting is done per sample, so clips that share a filename
# prefix (presumably the actor ID) can land in both train and test. Confirm
# that speaker-dependent evaluation is intended.

# Create 5 different splits
for split_num in range(1, 6):
    print(f"\nCreating Split {split_num}...")
    split_path = os.path.join(SPLIT_DIR, f"split_{split_num}")
    os.makedirs(split_path, exist_ok=True)

    # Shuffle a COPY so each split is independent; the seed depends only on
    # the split number.
    samples_shuffled = samples.copy()
    np.random.seed(42 + split_num)
    np.random.shuffle(samples_shuffled)

    # 70% train, 15% val, 15% test
    n = len(samples_shuffled)
    train_samples = samples_shuffled[:int(0.7 * n)]
    val_samples   = samples_shuffled[int(0.7 * n):int(0.85 * n)]
    test_samples  = samples_shuffled[int(0.85 * n):]

    # Copy files subset by subset. Iterating each subset's own list avoids a
    # list-membership test per sample (quadratic in the number of samples).
    for subset, subset_samples in (("train", train_samples),
                                   ("val",   val_samples),
                                   ("test",  test_samples)):
        subset_dir = os.path.join(split_path, subset)
        os.makedirs(subset_dir, exist_ok=True)
        for base_name in subset_samples:
            for modality in ["video_frames", "audio_frames", "pose", "label"]:
                src_file = os.path.join(PROCESSED_DIR, f"{base_name}_{modality}.npy")
                dst_file = os.path.join(subset_dir, f"{base_name}_{modality}.npy")
                if os.path.exists(src_file):
                    shutil.copy(src_file, dst_file)
                else:
                    print(f" Missing file: {src_file}")

    # Verify no sample leakage, based on what is actually on disk (this also
    # catches files left over from an earlier run).
    def bases_in(sub):
        p = os.path.join(split_path, sub)
        return {f.replace("_video_frames.npy", "")
                for f in os.listdir(p) if f.endswith("_video_frames.npy")}
    train_files = bases_in("train")
    val_files   = bases_in("val")
    test_files  = bases_in("test")

    assert train_files.isdisjoint(val_files), "Sample leakage between train and val"
    assert train_files.isdisjoint(test_files), "Sample leakage between train and test"
    assert val_files.isdisjoint(test_files),   "Sample leakage between val and test"
    print(f"Split {split_num}: No sample leakage")


### Verifying Splits (Leakage and File Completeness):

In [None]:
import os

def verify_splits(split_dir):
    """
    Print a leakage / completeness report for ``split_1`` .. ``split_5``.

    For each split folder under ``split_dir`` the samples of the train, val
    and test subsets are discovered through their ``*_video_frames.npy``
    files. The report lists samples shared between subsets and samples that
    lack any of the four expected ``.npy`` files. Missing split folders are
    reported and skipped. Nothing is returned.
    """
    rule = "=" * 50
    print("\n" + rule)
    print("SPLIT VERIFICATION CONFIRMATION")
    print(rule)

    video_suffix = "_video_frames.npy"
    subset_names = ("train", "val", "test")
    file_kinds = ("video_frames", "audio_frames", "pose", "label")
    leak_pairs = (
        ("Train-Val", "train", "val"),
        ("Train-Test", "train", "test"),
        ("Val-Test", "val", "test"),
    )

    for split_num in range(1, 6):
        split_path = os.path.join(split_dir, f"split_{split_num}")
        if not os.path.isdir(split_path):
            print(f"Split {split_num}: folder not found -> {split_path}")
            continue

        # sample ids per subset (empty when the subset folder is absent)
        found = {}
        for subset in subset_names:
            folder = os.path.join(split_path, subset)
            found[subset] = set()
            if os.path.isdir(folder):
                found[subset] = {
                    entry.replace(video_suffix, "")
                    for entry in os.listdir(folder)
                    if entry.endswith(video_suffix)
                }

        issues = []

        # leakage checks
        for label, first, second in leak_pairs:
            shared = found[first] & found[second]
            if shared:
                issues.append(f"{label} leakage: {len(shared)} samples")

        # file presence check (ALL must be .npy)
        for subset in subset_names:
            folder = os.path.join(split_path, subset)
            for sample in found[subset]:
                missing = []
                for kind in file_kinds:
                    fname = f"{sample}_{kind}.npy"
                    if not os.path.exists(os.path.join(folder, fname)):
                        missing.append(fname)
                if missing:
                    issues.append(f"Missing files for {sample} in {subset}: {missing}")

        print(f"Split {split_num} sizes — train:{len(found['train'])}  val:{len(found['val'])}  test:{len(found['test'])}")
        if issues:
            print(f"Split {split_num}:  Issues found:")
            for issue in issues:
                print(f"  - {issue}")
        else:
            print(f"Split {split_num}:  Perfect — no leakage, files complete")


verify_splits(r'E:\Research_Datasets\processed_features_split2')