# Prétraitement pour DEVA
Ce notebook prépare les données pour l’architecture DEVA : extraction des features audio/visuelles, génération des descriptions émotionnelles, et encodage textuel avancé.

In [36]:
import torch
import torch.nn as nn
import librosa
import numpy as np
import pandas as pd
from transformers import BertTokenizer, BertModel
import cv2
import os
from glob import glob
import re
from mmsdk import mmdatasdk
import pickle
from tqdm import tqdm

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Paths
# Both locations can be overridden with the MOSI_PATH / MOSI_OUTPUT_PATH environment
# variables so the notebook can run on another machine without editing this cell.
# The defaults are the original local folders, so existing runs are unaffected.
mosi_path = os.environ.get(
    "MOSI_PATH",
    "/Users/camillebizeul/Downloads/Projet_Multi-20251201/MOSI",
)
output_path = os.environ.get(
    "MOSI_OUTPUT_PATH",
    "/Users/camillebizeul/Downloads/Projet_Multi-20251201/preprocessed_data",
)
# Create the output folder up front; exist_ok makes re-running the cell harmless.
os.makedirs(output_path, exist_ok=True)

print(f"MOSI path: {mosi_path}")
print(f"Output path: {output_path}")

Using device: cpu
MOSI path: /Users/camillebizeul/Downloads/Projet_Multi-20251201/MOSI
Output path: /Users/camillebizeul/Downloads/Projet_Multi-20251201/preprocessed_data


## 1. Load MOSI Labels from SDK

In [37]:
# Load the CMU-MOSI opinion labels through the multimodal SDK.
# The SDK is pointed at <mosi_path>/SDK_data for the label file.
print("Loading MOSI labels from SDK...")
sdk_data_dir = os.path.join(mosi_path, 'SDK_data')
dataset = mmdatasdk.mmdataset(mmdatasdk.cmu_mosi.labels, sdk_data_dir)
opinion_labels_sdk = dataset.computational_sequences['Opinion Segment Labels']

num_labelled_videos = len(opinion_labels_sdk.data)
print(f"✓ Loaded labels for {num_labelled_videos} videos")

def get_sentiment_label(video_id, segment_id):
    """Retrieve the sentiment label of one segment from the SDK labels.

    Args:
        video_id: Key of the video in ``opinion_labels_sdk.data``.
        segment_id: 1-based segment number (string or int); it is converted
            to the 0-based row ``int(segment_id) - 1`` of the video's
            ``'features'`` entry.

    Returns:
        The first feature value of that row as a ``float``, or ``None`` when
        the video is unknown, the segment number is out of range, or the
        entry cannot be interpreted.
    """
    try:
        if video_id not in opinion_labels_sdk.data:
            return None
        features = opinion_labels_sdk.data[video_id]['features']
        segment_idx = int(segment_id) - 1
        # Explicit range check: a segment id of 0 must not wrap around to the last row.
        if not 0 <= segment_idx < len(features):
            return None
        return float(features[segment_idx][0])
    except (KeyError, IndexError, TypeError, ValueError):
        # Malformed id or entry: report "no label" rather than failing the whole run.
        return None

Loading MOSI labels from SDK...
[94m[1m[2025-12-02 10:09:52.041] | Status  | [0mDownloading from http://immortal.multicomp.cs.cmu.edu/CMU-MOSI/labels/CMU_MOSI_Opinion_Labels.csd to /Users/camillebizeul/Downloads/Projet_Multi-20251201/MOSI/SDK_data/CMU_MOSI_Opinion_Labels.csd...
[94m[1m[2025-12-02 10:09:52.041] | Status  | [0mDownloading from http://immortal.multicomp.cs.cmu.edu/CMU-MOSI/labels/CMU_MOSI_Opinion_Labels.csd to /Users/camillebizeul/Downloads/Projet_Multi-20251201/MOSI/SDK_data/CMU_MOSI_Opinion_Labels.csd...


                                                                

[92m[1m[2025-12-02 10:09:54.066] | Success | [0mDownload complete!
[92m[1m[2025-12-02 10:09:54.070] | Success | [0mComputational sequence read from file /Users/camillebizeul/Downloads/Projet_Multi-20251201/MOSI/SDK_data/CMU_MOSI_Opinion_Labels.csd ...
[94m[1m[2025-12-02 10:09:54.076] | Status  | [0mChecking the integrity of the <Opinion Segment Labels> computational sequence ...
[94m[1m[2025-12-02 10:09:54.076] | Status  | [0mChecking the format of the data in <Opinion Segment Labels> computational sequence ...


                                                                   

[92m[1m[2025-12-02 10:09:54.099] | Success | [0m<Opinion Segment Labels> computational sequence data in correct format.
[94m[1m[2025-12-02 10:09:54.099] | Status  | [0mChecking the format of the metadata in <Opinion Segment Labels> computational sequence ...
[92m[1m[2025-12-02 10:09:54.099] | Success | [0mDataset initialized successfully ... 
✓ Loaded labels for 93 videos




## 2. Initialize Models


In [38]:
# Initialize BERT: tokenizer and model come from the same pretrained checkpoint.
print("Loading BERT model...")
bert_checkpoint = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
# nn.Module.eval() returns the module itself, so it can be chained after .to(device).
bert = BertModel.from_pretrained(bert_checkpoint).to(device).eval()
print("✓ BERT loaded")

# TextEncoder class (matches training notebook architecture)
class TextEncoder(nn.Module):
    """Re-encode a BERT token sequence into a fixed number of output tokens.

    A learnable special token is prepended to the BERT hidden states, the
    result goes through a single Transformer encoder layer, and the first
    ``num_output_tokens`` positions are returned (zero-padded when the
    sequence is shorter).

    Args:
        bert_dim: Width of the BERT hidden states and of the output tokens.
        num_output_tokens: Number of tokens kept in the output.
        nhead: Number of attention heads of the Transformer layer.
    """

    def __init__(self, bert_dim=768, num_output_tokens=8, nhead=8):
        super().__init__()
        self.bert_dim = bert_dim
        self.T = num_output_tokens

        # Learnable special token E_m, stored as a [1, 1, bert_dim] parameter.
        self.special_token = nn.Parameter(torch.randn(1, 1, bert_dim))

        # One Transformer encoder layer working on batch-first tensors.
        layer_config = dict(
            d_model=bert_dim,
            nhead=nhead,
            dim_feedforward=2048,
            dropout=0.1,
            batch_first=True,
        )
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(**layer_config),
            num_layers=1,
        )

    def forward(self, input_ids, attention_mask):
        """Return a ``[batch_size, T, bert_dim]`` tensor for the tokenized inputs.

        NOTE(review): ``attention_mask`` is only passed to BERT; the Transformer
        layer below receives no padding mask — confirm this is intended for
        padded batches.
        """
        # BERT comes from the module-level `bert` and is run without gradients.
        with torch.no_grad():
            hidden_states = bert(
                input_ids=input_ids, attention_mask=attention_mask
            ).last_hidden_state

        n_items = hidden_states.size(0)

        # Put one copy of the special token in front of every sequence.
        prefix = self.special_token.expand(n_items, -1, -1)
        enhanced = self.transformer_encoder(torch.cat([prefix, hidden_states], dim=1))

        # Long enough: keep the first T positions.
        missing = self.T - enhanced.size(1)
        if missing <= 0:
            return enhanced[:, :self.T, :]

        # Too short: append zero vectors until T positions are reached.
        filler = torch.zeros(n_items, missing, self.bert_dim, device=enhanced.device)
        return torch.cat([enhanced, filler], dim=1)

# Initialize TextEncoder
# The encoder is built with randomly initialised weights and this cell loads no
# checkpoint, so the seed is fixed to make the embeddings identical across
# kernel restarts.
torch.manual_seed(42)
text_encoder = TextEncoder().to(device)
# Evaluation mode switches off the dropout inside the Transformer layer.
text_encoder.eval()
print("✓ TextEncoder initialized (matches training architecture)")

# Simple Image Encoder
class ImgEncoder(nn.Module):
    """Minimal CNN that maps one image to an 8-value feature vector.

    The network is a single 3x3 convolution (3 -> 8 channels, stride 1,
    padding 1), a ReLU, and global average pooling.
    """

    def __init__(self):
        super().__init__()
        self.model = nn.Sequential(
            nn.Conv2d(3, 8, 3, 1, 1),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d(1)
        )

    def forward(self, x):
        """Encode a single image.

        Args:
            x: One image laid out as (height, width, 3) with values on a
               0-255 scale (array or tensor).

        Returns:
            A flat tensor with 8 values, on the same device as the module.
        """
        # Scale pixel values to [0, 1].
        x = x / 255.
        # Use the device the weights live on rather than a notebook-level global, so
        # the encoder keeps working if it is moved or the global is not defined.
        target_device = next(self.parameters()).device
        # (H, W, C) -> (1, C, H, W), the layout nn.Conv2d expects.
        x = torch.as_tensor(x).permute(2, 0, 1).unsqueeze(0).float().to(target_device)
        return self.model(x).flatten()

# The convolution weights are randomly initialised and this cell loads no
# checkpoint, so the seed is fixed to make the visual features identical
# across kernel restarts.
torch.manual_seed(42)
img_encoder = ImgEncoder().to(device)
img_encoder.eval()
print("✓ Image encoder initialized")


Loading BERT model...
✓ BERT loaded
✓ TextEncoder initialized (matches training architecture)
✓ Image encoder initialized
✓ BERT loaded
✓ TextEncoder initialized (matches training architecture)
✓ Image encoder initialized


## 3. Feature Extraction Functions

In [39]:
# Sample rate (Hz) used when loading audio and computing features.
sr = 16000

def extract_audio_features(audio, sr=16000):
    """Extract emotional audio features.

    Args:
        audio: Waveform passed to ``librosa.piptrack`` and ``librosa.feature.rms``.
        sr: Sample rate of ``audio`` in Hz.

    Returns:
        dict of Python floats with the keys
        ``'pitch'``    - mean, over frames, of the pitch of the strongest bin,
        ``'loudness'`` - mean of the RMS track,
        ``'jitter'``   - standard deviation of the per-frame pitch values,
        ``'shimmer'``  - standard deviation of the RMS track.
        ``'pitch'`` is 0 when no frame has a positive magnitude, and
        ``'jitter'`` is 0 when fewer than two frames do.
    """
    pitches, magnitudes = librosa.piptrack(y=audio, sr=sr)

    # Build the pitch track once: for each frame take the pitch of the bin with
    # the largest magnitude, and drop frames whose magnitudes are all zero.
    strongest_bin = magnitudes.argmax(axis=0)
    has_energy = magnitudes.max(axis=0) > 0
    frame_index = np.arange(pitches.shape[1])
    pitch_values = pitches[strongest_bin, frame_index][has_energy]

    pitch = pitch_values.mean() if pitch_values.size else 0.0
    jitter = pitch_values.std() if pitch_values.size > 1 else 0.0

    rms = librosa.feature.rms(y=audio)[0]
    loudness = np.mean(rms)
    shimmer = np.std(rms)

    return {
        'pitch': float(pitch),
        'loudness': float(loudness),
        'jitter': float(jitter),
        'shimmer': float(shimmer)
    }

def audio_description(pitch, loudness, jitter, shimmer):
    """Turn four numeric audio features into a two-sentence English description.

    Thresholds (all strict ``>`` comparisons):
        pitch:    > 200 "high", > 100 "low", otherwise "very low"
        loudness: > 0.1 "loud", > 0.05 "moderate", otherwise "quiet"
        jitter:   > 20 "high variation", otherwise "stable"
        shimmer:  > 0.02 "variable amplitude", otherwise "steady amplitude"
    """
    if pitch > 200:
        pitch_level = "high"
    elif pitch > 100:
        pitch_level = "low"
    else:
        pitch_level = "very low"

    if loudness > 0.1:
        loudness_level = "loud"
    elif loudness > 0.05:
        loudness_level = "moderate"
    else:
        loudness_level = "quiet"

    if jitter > 20:
        jitter_level = "high variation"
    else:
        jitter_level = "stable"

    if shimmer > 0.02:
        shimmer_level = "variable amplitude"
    else:
        shimmer_level = "steady amplitude"

    first_sentence = f"The speaker used {pitch_level} pitch with {loudness_level} volume. "
    second_sentence = f"The voice shows {jitter_level} in pitch and {shimmer_level}."
    return first_sentence + second_sentence

# Action Unit code -> short phrase used in the generated descriptions.
AU_DESCRIPTIONS = {
    'AU1': 'raised inner brow',
    'AU2': 'raised outer brow',
    'AU4': 'lowered brow',
    'AU5': 'upper lid raise',
    'AU6': 'cheek raise',
    'AU7': 'lid tightener',
    'AU9': 'nose wrinkle',
    'AU10': 'upper lip raise',
    'AU12': 'lip corner pull',
    'AU15': 'lip corner depress',
    'AU17': 'chin raise',
    'AU20': 'lip stretch',
    'AU23': 'lip tightener',
    'AU25': 'lips part',
    'AU26': 'jaw drop',
    'AU45': 'blink',
}

def visual_description(aus):
    """Build an English sentence describing the active Action Units.

    Args:
        aus: Either a dict mapping AU codes to a truthy/falsy "active" flag,
            or a collection of the active AU codes.

    Returns:
        A neutral-expression sentence when nothing is active; otherwise a
        sentence listing the phrase for each active AU. Codes missing from
        ``AU_DESCRIPTIONS`` are written out as-is.
    """
    # A dict carries activity flags; anything else is taken as the active codes.
    active_aus = [code for code, flag in aus.items() if flag] if isinstance(aus, dict) else aus

    if not active_aus:
        return "The person shows a neutral expression with no significant facial movement."

    phrases = [AU_DESCRIPTIONS.get(code, code) for code in active_aus]
    count = len(phrases)

    # One item stands alone, two are joined by "and", three or more use a
    # comma-separated list ending in ", and <last>".
    if count == 1:
        return f"The person shows signs of: {phrases[0]}."
    if count == 2:
        return f"The person shows signs of: {phrases[0]} and {phrases[1]}."
    desc_list = ", ".join(phrases[:-1]) + f", and {phrases[-1]}"
    return f"The person shows signs of: {desc_list}."

def encode_text_with_text_encoder(text):
    """Embed a text with the module-level tokenizer and TextEncoder.

    The text is tokenized (truncated to at most 128 tokens), passed through
    ``text_encoder``, and the resulting token sequence is averaged over its
    token axis.

    Returns:
        A flat NumPy array holding the mean-pooled embedding.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=128,
    ).to(device)

    with torch.no_grad():
        # Average the encoder's output tokens (dim=1) into a single vector.
        pooled = text_encoder(encoded['input_ids'], encoded['attention_mask']).mean(dim=1)

    return pooled.cpu().numpy().flatten()

def encode_audio_mfcc(audio, sample_rate=None, n_mfcc=20):
    """Extract time-averaged MFCC features from audio.

    Args:
        audio: Waveform passed to ``librosa.feature.mfcc``.
        sample_rate: Sample rate of ``audio`` in Hz. When omitted, the
            module-level ``sr`` is used, as before.
        n_mfcc: Number of MFCC coefficients to compute.

    Returns:
        Array with ``n_mfcc`` values: each coefficient averaged over axis 1
        of the MFCC matrix.
    """
    if sample_rate is None:
        # Fall back to the notebook-wide sample rate so existing calls behave as before.
        sample_rate = sr
    mfcc = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=n_mfcc)
    return mfcc.mean(axis=1)

print("✓ Feature extraction functions defined (using TextEncoder for all text)")


✓ Feature extraction functions defined (using TextEncoder for all text)


## 4. Data Loading & Preprocessing

In [40]:
def parse_transcript_file(filepath):
    """Read a transcript file into a list of ``{'id': ..., 'text': ...}`` dicts.

    Each line containing the ``_DELIM_`` marker is split at its first
    occurrence: the part before it becomes the segment id, the part after it
    the text (both stripped of surrounding whitespace). Lines without the
    marker are ignored. The file is read as UTF-8 and undecodable bytes are
    dropped.
    """
    marker = '_DELIM_'
    with open(filepath, 'r', encoding='utf-8', errors='ignore') as handle:
        split_lines = (raw_line.strip().partition(marker) for raw_line in handle)
        # partition() yields an empty separator when the marker is absent.
        return [
            {'id': before.strip(), 'text': after.strip()}
            for before, found, after in split_lines
            if found
        ]

def extract_video_id(filename):
    """Return the video ID encoded in a segment filename.

    The directory and extension are removed, then everything before the last
    underscore is returned. A name without any underscore is returned whole.
    """
    stem, _ext = os.path.splitext(os.path.basename(filename))
    prefix, underscore, _suffix = stem.rpartition('_')
    # rpartition() leaves the separator empty when there is no underscore.
    return prefix if underscore else stem

def _read_middle_frame(video_file, size=(64, 64)):
    """Return the middle frame of a video as a resized RGB uint8 image, or a black image if none can be read."""
    blank = np.zeros((size[1], size[0], 3), dtype=np.uint8)
    if not os.path.exists(video_file):
        return blank

    cap = cv2.VideoCapture(video_file)
    try:
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if frame_count <= 0:
            return blank
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_count // 2)
        ret, frame = cap.read()
        if not ret:
            return blank
        # OpenCV decodes frames as BGR; convert to RGB after resizing.
        return cv2.cvtColor(cv2.resize(frame, size), cv2.COLOR_BGR2RGB)
    finally:
        # Release the capture even when decoding or resizing raises.
        cap.release()


def _simulate_action_units(image):
    """Pick a placeholder pair of Action Units from the mean pixel intensity of the image."""
    mean_intensity = np.mean(image)
    if mean_intensity > 170:
        return ['AU6', 'AU12']
    if mean_intensity > 85:
        return ['AU1', 'AU2']
    return ['AU4', 'AU15']


def _lookup_segment_text(transcript_dir, video_id, seg_id, cache):
    """Return the transcript text of one segment, parsing each video's transcript file only once."""
    if video_id not in cache:
        transcript_file = os.path.join(transcript_dir, f"{video_id}.annotprocessed")
        texts = {}
        if os.path.exists(transcript_file):
            for seg in parse_transcript_file(transcript_file):
                # Keep the first entry for an id, as a linear search would find it.
                texts.setdefault(seg['id'], seg['text'])
        cache[video_id] = texts
    # A missing or empty transcript falls back to a placeholder text.
    return cache[video_id].get(seg_id) or f"Segment {seg_id}"


def preprocess_mosi_dataset(max_samples=500):
    """Preprocess MOSI dataset and extract all features using TextEncoder.

    Walks the segmented WAV files under ``mosi_path`` in sorted order, keeps
    at most the first ``max_samples`` files, and builds one record for every
    file that has a trailing ``_<number>`` segment id and a sentiment label.

    Args:
        max_samples: Upper bound on the number of audio files visited. Files
            skipped for lack of a segment id or label still count towards it.

    Returns:
        list of dicts, one per processed segment, holding the ids, transcript
        text, text/audio/visual embeddings, raw audio statistics, generated
        descriptions, audio duration and sentiment label. A file whose
        processing raises is reported on stdout and skipped.
    """
    audio_dir = os.path.join(mosi_path, "Audio", "WAV_16000", "Segmented")
    video_dir = os.path.join(mosi_path, "Video", "Segmented")
    transcript_dir = os.path.join(mosi_path, "Transcript", "Segmented")

    audio_files = sorted(glob(os.path.join(audio_dir, "*.wav")))

    data_records = []
    # video_id -> {segment id: text}; avoids re-reading the transcript per segment.
    transcript_cache = {}

    print(f"\nProcessing up to {max_samples} samples...")

    for audio_file in tqdm(audio_files[:max_samples], desc="Processing samples"):
        # Bound before the try block so the error message below can always use it.
        basename = os.path.basename(audio_file)
        try:
            base_no_ext = os.path.splitext(basename)[0]
            video_id = extract_video_id(audio_file)

            # The segment id is the run of digits after the last underscore.
            seg_match = re.search(r'_(\d+)$', base_no_ext)
            if not seg_match:
                continue
            seg_id = seg_match.group(1)

            # Get label
            sentiment_label = get_sentiment_label(video_id, seg_id)
            if sentiment_label is None:
                continue

            # Load and process audio
            audio, _ = librosa.load(audio_file, sr=sr)
            audio_duration = len(audio) / sr

            # Extract audio features
            audio_mfcc = encode_audio_mfcc(audio)
            audio_emotional_features = extract_audio_features(audio, sr)
            audio_desc = audio_description(
                audio_emotional_features['pitch'],
                audio_emotional_features['loudness'],
                audio_emotional_features['jitter'],
                audio_emotional_features['shimmer']
            )
            # Use TextEncoder for audio description
            audio_desc_embedding = encode_text_with_text_encoder(audio_desc)

            # Load and process video (black image when no frame is available)
            video_file = os.path.join(video_dir, f"{video_id}_{seg_id}.mp4")
            image = _read_middle_frame(video_file)

            # Extract visual features
            visual_cnn = img_encoder(image).detach().cpu().numpy()

            # Simulate AUs based on image intensity
            aus = _simulate_action_units(image)
            visual_desc = visual_description(aus)
            # Use TextEncoder for visual description
            visual_desc_embedding = encode_text_with_text_encoder(visual_desc)

            # Load transcript
            text = _lookup_segment_text(transcript_dir, video_id, seg_id, transcript_cache)

            # Encode text using TextEncoder
            text_embedding = encode_text_with_text_encoder(text)

            # Create record
            record = {
                'video_id': video_id,
                'segment_id': seg_id,
                'text': text,
                'text_embedding': text_embedding,
                'audio_mfcc': audio_mfcc,
                'audio_pitch': audio_emotional_features['pitch'],
                'audio_loudness': audio_emotional_features['loudness'],
                'audio_jitter': audio_emotional_features['jitter'],
                'audio_shimmer': audio_emotional_features['shimmer'],
                'audio_description': audio_desc,
                'audio_desc_embedding': audio_desc_embedding,
                'visual_cnn': visual_cnn,
                'visual_description': visual_desc,
                'visual_desc_embedding': visual_desc_embedding,
                'audio_duration': audio_duration,
                'sentiment_label': sentiment_label
            }

            data_records.append(record)

        except Exception as e:
            # Best effort: report the failing file and carry on with the next one.
            print(f"\nError processing {basename}: {e}")
            continue

    return data_records

print("✓ Data preprocessing function defined (uses TextEncoder for all text embeddings)")


✓ Data preprocessing function defined (uses TextEncoder for all text embeddings)


## 5. Process Dataset

In [41]:
# Run the preprocessing pipeline; change sample_limit to process more or fewer files.
banner = "=" * 70
print("\n" + banner)
print("STARTING DATA PREPROCESSING")
print(banner)

sample_limit = 500
data_records = preprocess_mosi_dataset(max_samples=sample_limit)

num_processed = len(data_records)
print(f"\n✓ Processed {num_processed} samples successfully")


STARTING DATA PREPROCESSING

Processing up to 500 samples...


Processing samples: 100%|██████████| 500/500 [00:49<00:00, 10.10it/s]


✓ Processed 500 samples successfully





## 6. Create DataFrame & Save

In [42]:
# Build one DataFrame row per preprocessed record.
df = pd.DataFrame(data_records)

separator = "=" * 70

print("\n" + separator)
print("DATASET STATISTICS")
print(separator)
print(f"Total samples: {len(df)}")
print(f"\nColumns: {list(df.columns)}")
print(f"\nSentiment label statistics:")
# Fetch the label column once and reuse it for every statistic below.
labels = df['sentiment_label']
print(f"  Min: {labels.min():.3f}")
print(f"  Max: {labels.max():.3f}")
print(f"  Mean: {labels.mean():.3f}")
print(f"  Std: {labels.std():.3f}")
print(f"\nLabel distribution:")
print(f"  Positive (>0): {(labels > 0).sum()}")
print(f"  Negative (<0): {(labels < 0).sum()}")
print(f"  Neutral (=0): {(labels == 0).sum()}")

# Show the first rows (at most three) in a readable form.
print("\n" + separator)
print("SAMPLE DATA (first 3 rows)")
print(separator)
for position in range(min(3, len(df))):
    print(f"\nSample {position+1}:")
    sample = df.iloc[position]
    print(f"  Video ID: {sample['video_id']}")
    print(f"  Segment ID: {sample['segment_id']}")
    # Only the first 80 characters of the transcript are shown.
    print(f"  Text: {sample['text'][:80]}...")
    print(f"  Audio Description: {sample['audio_description']}")
    print(f"  Visual Description: {sample['visual_description']}")
    print(f"  Sentiment Label: {sample['sentiment_label']:+.3f}")
    print(f"  Text embedding shape: {sample['text_embedding'].shape}")
    print(f"  Audio MFCC shape: {sample['audio_mfcc'].shape}")
    print(f"  Visual CNN shape: {sample['visual_cnn'].shape}")


DATASET STATISTICS
Total samples: 500

Columns: ['video_id', 'segment_id', 'text', 'text_embedding', 'audio_mfcc', 'audio_pitch', 'audio_loudness', 'audio_jitter', 'audio_shimmer', 'audio_description', 'audio_desc_embedding', 'visual_cnn', 'visual_description', 'visual_desc_embedding', 'audio_duration', 'sentiment_label']

Sentiment label statistics:
  Min: -2.800
  Max: 3.000
  Mean: 0.417
  Std: 1.451

Label distribution:
  Positive (>0): 280
  Negative (<0): 197
  Neutral (=0): 23

SAMPLE DATA (first 3 rows)

Sample 1:
  Video ID: 03bSnISJMiM
  Segment ID: 1
  Text: ANYHOW IT WAS REALLY GOOD...
  Audio Description: The speaker used high pitch with quiet volume. The voice shows high variation in pitch and steady amplitude.
  Visual Description: The person shows signs of: lowered brow and lip corner depress.
  Sentiment Label: +2.400
  Text embedding shape: (768,)
  Audio MFCC shape: (20,)
  Visual CNN shape: (8,)

Sample 2:
  Video ID: 03bSnISJMiM
  Segment ID: 10
  Text: THERE IS S

In [43]:
# Persist the preprocessed data: full pickle, metadata CSV, and a small info pickle.
rule = "=" * 70
print("\n" + rule)
print("SAVING PREPROCESSED DATA")
print(rule)

# Full DataFrame as pickle, which keeps the NumPy array columns intact.
pickle_path = os.path.join(output_path, 'mosi_preprocessed.pkl')
with open(pickle_path, 'wb') as pickle_file:
    pickle.dump(df, pickle_file)
print(f"✓ Saved pickle file: {pickle_path}")

# Scalar / text columns only (no embeddings) as CSV.
metadata_columns = [
    'video_id', 'segment_id', 'text', 'audio_description', 'visual_description',
    'audio_pitch', 'audio_loudness', 'audio_jitter', 'audio_shimmer',
    'audio_duration', 'sentiment_label',
]
metadata_df = df[metadata_columns].copy()
csv_path = os.path.join(output_path, 'mosi_metadata.csv')
metadata_df.to_csv(csv_path, index=False)
print(f"✓ Saved metadata CSV: {csv_path}")

# Feature dimensions are read from the first row of the DataFrame.
first_row = df.iloc[0]
info = {
    'num_samples': len(df),
    'text_embedding_dim': first_row['text_embedding'].shape[0],
    'audio_mfcc_dim': first_row['audio_mfcc'].shape[0],
    'visual_cnn_dim': first_row['visual_cnn'].shape[0],
    'audio_desc_embedding_dim': first_row['audio_desc_embedding'].shape[0],
    'visual_desc_embedding_dim': first_row['visual_desc_embedding'].shape[0],
    'columns': list(df.columns)
}

info_path = os.path.join(output_path, 'dataset_info.pkl')
with open(info_path, 'wb') as info_file:
    pickle.dump(info, info_file)
print(f"✓ Saved dataset info: {info_path}")

print("\n" + rule)
print("PREPROCESSING COMPLETE!")
print(rule)
print(f"\nFiles saved in: {output_path}")
print(f"  - mosi_preprocessed.pkl (full dataset with embeddings)")
print(f"  - mosi_metadata.csv (metadata without embeddings)")
print(f"  - dataset_info.pkl (feature dimensions and info)")


SAVING PREPROCESSED DATA
✓ Saved pickle file: /Users/camillebizeul/Downloads/Projet_Multi-20251201/preprocessed_data/mosi_preprocessed.pkl
✓ Saved metadata CSV: /Users/camillebizeul/Downloads/Projet_Multi-20251201/preprocessed_data/mosi_metadata.csv
✓ Saved dataset info: /Users/camillebizeul/Downloads/Projet_Multi-20251201/preprocessed_data/dataset_info.pkl

PREPROCESSING COMPLETE!

Files saved in: /Users/camillebizeul/Downloads/Projet_Multi-20251201/preprocessed_data
  - mosi_preprocessed.pkl (full dataset with embeddings)
  - mosi_metadata.csv (metadata without embeddings)
  - dataset_info.pkl (feature dimensions and info)


In [47]:
# Extract and generate the AED/VED features for every sample.
# Works even when 'audio' / 'aus' are not present in df.
# NOTE(review): these columns are added to df after the earlier cell that pickles df,
# so the files written there do not contain them — confirm whether a re-save is intended.
preprocessed_audio_desc = []
preprocessed_visual_desc = []

has_audio = 'audio' in df.columns
has_aus = 'aus' in df.columns

# The column checks do not depend on the row, so they are evaluated once.
audio_stat_columns = ['audio_pitch', 'audio_loudness', 'audio_jitter', 'audio_shimmer']
has_audio_stats = all(c in df.columns for c in audio_stat_columns)
has_audio_text = 'audio_description' in df.columns
has_visual_text = 'visual_description' in df.columns


def _audio_desc_for(row):
    """Audio description for one row: raw audio first, then stored statistics, then stored text."""
    if has_audio:
        feats = extract_audio_features(row['audio'], sr=sr)  # returns a dict
        return audio_description(feats['pitch'], feats['loudness'], feats['jitter'], feats['shimmer'])
    if has_audio_stats:
        return audio_description(*(row[c] for c in audio_stat_columns))
    if has_audio_text:
        return row['audio_description']
    return ""


def _visual_desc_for(row):
    """Visual description for one row: from the AUs when available, otherwise the stored text."""
    if has_aus:
        return visual_description(row['aus'])
    if has_visual_text:
        return row['visual_description']
    return ""


for idx, row in df.iterrows():
    preprocessed_audio_desc.append(_audio_desc_for(row))
    preprocessed_visual_desc.append(_visual_desc_for(row))

# Add the harmonised columns.
df['audio_desc'] = preprocessed_audio_desc
df['visual_desc'] = preprocessed_visual_desc
