In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import glob
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import BatchNormalization
import pandas as pd
import numpy as np
from pathlib import Path
import glob
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')



2025-12-16 13:59:08.867175: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
class FinalMultimodalDataProcessor:
    def __init__(self, base_path="../DatasetCercetare"):
        self.base_path = base_path
        self.sessions_data = {}

    def get_all_sessions(self):
        """Get all unique session names from the file structure"""
        # Get session names from AudioFeatures (since they're shared)
        audio_files = glob.glob(f"{self.base_path}/AudioFeatures/*.csv")
        sessions = [Path(file).stem for file in audio_files]
        return sessions

    def get_people_in_session(self, session_name):
        """Get the people involved in a session from ActionUnits files"""
        au_files = glob.glob(f"{self.base_path}/ActionUnits/*_on_{session_name}.csv")
        people = []
        for file in au_files:
            filename = Path(file).stem
            person = filename.split('_on_')[0]
            people.append(person)
        return people

    def load_person_data(self, person, session_name):
        """Load data for a specific person in a session"""
        person_data = {}

        print(f"  Loading data for {person} in session {session_name}")

        # Load Action Units for this person
        au_file = f"{self.base_path}/ActionUnits/{person}_on_{session_name}.csv"
        if Path(au_file).exists():
            au_df = pd.read_csv(au_file)
            au_features = ['AU01', 'AU02', 'AU04', 'AU05', 'AU06', 'AU07', 'AU09',
                          'AU10', 'AU12', 'AU14', 'AU15', 'AU17', 'AU20', 'AU23',
                          'AU25', 'AU26', 'AU28']
            person_data['action_units'] = au_df[['timestamp'] + au_features]
            print(f"    Action Units: {au_df.shape}")

        # Load Hand Gestures for this person
        hg_file = f"{self.base_path}/HandGestures/{person}_on_{session_name}.csv"
        if Path(hg_file).exists():
            hg_df = pd.read_csv(hg_file)
            hg_features = ['left_hand_velocity', 'right_hand_velocity',
                          'gesture_frequency_cumulative', 'face_touches_cumulative']
            person_data['hand_gestures'] = hg_df[['timestamp'] + hg_features]
            print(f"    Hand Gestures: {hg_df.shape}")

        # Load shared Audio Features (same for all people in session)
        audio_file = f"{self.base_path}/AudioFeatures/{session_name}.csv"
        if Path(audio_file).exists():
            audio_df = pd.read_csv(audio_file)
            audio_features = ['energy_db', 'pitch_hz', 'speaking_rate']
            audio_df = audio_df.rename(columns={'time_seconds': 'timestamp'})
            person_data['audio'] = audio_df[['timestamp'] + audio_features]
            print(f"    Audio Features: {audio_df.shape}")

        # Load shared Sentiment Analysis (filter by speaker if available)
        sent_file = f"{self.base_path}/SentimentAnalysis/{session_name}.csv"
        if Path(sent_file).exists():
            sent_df = pd.read_csv(sent_file)
            sent_df = sent_df.rename(columns={'second': 'timestamp'})

            # Filter by speaker if the person name matches
            if 'speaker' in sent_df.columns:
                # Try to match person name with speaker (case insensitive)
                person_sent = sent_df[sent_df['speaker'].str.lower() == person.lower()]
                if len(person_sent) > 0:
                    person_data['sentiment'] = person_sent[['timestamp', 'compound', 'pos', 'neu', 'neg']]
                    print(f"    Sentiment (filtered for {person}): {person_sent.shape}")
                else:
                    # If no match, use aggregated sentiment for all speakers
                    sent_agg = sent_df.groupby('timestamp').agg({
                        'compound': 'mean', 'pos': 'mean', 'neu': 'mean', 'neg': 'mean'
                    }).reset_index()
                    person_data['sentiment'] = sent_agg
                    print(f"    Sentiment (aggregated): {sent_agg.shape}")
            else:
                person_data['sentiment'] = sent_df[['timestamp', 'compound', 'pos', 'neu', 'neg']]
                print(f"    Sentiment: {sent_df.shape}")

        return person_data

    def align_person_data(self, person_data, target_fps=1.0):
        """Align all modalities for a person to the same temporal grid"""

        # Find common time range
        min_time = 0
        max_time = float('inf')

        for modality, data in person_data.items():
            if len(data) > 0:
                min_time = max(min_time, data['timestamp'].min())
                max_time = min(max_time, data['timestamp'].max())

        # Create target timeline
        target_timeline = np.arange(int(min_time), int(max_time) + 1)
        aligned_data = pd.DataFrame({'timestamp': target_timeline})

        # Align each modality
        for modality, data in person_data.items():
            if modality == 'audio':
                # Aggregate high-frequency audio to 1-second intervals
                audio_agg = data.groupby(data['timestamp'].round()).agg({
                    'energy_db': 'mean',
                    'pitch_hz': 'mean',
                    'speaking_rate': 'mean'
                }).reset_index()
                aligned_data = aligned_data.merge(audio_agg, on='timestamp', how='left')

            else:
                # For other modalities, use nearest second matching
                data_rounded = data.copy()
                data_rounded['timestamp'] = data_rounded['timestamp'].round().astype(int)
                data_agg = data_rounded.groupby('timestamp').first().reset_index()
                aligned_data = aligned_data.merge(data_agg, on='timestamp', how='left')

        # Fill missing values
        aligned_data = aligned_data.fillna(method='ffill').fillna(0)

        return aligned_data

    def process_all_data(self):
        """Process all sessions and people"""
        sessions = self.get_all_sessions()
        print(f"Found sessions: {sessions}")

        all_processed_data = {}

        for session in sessions:
            print(f"\n=== Processing Session: {session} ===")
            people = self.get_people_in_session(session)
            print(f"People in session: {people}")

            session_data = {}

            for person in people:
                # Load person's data
                person_data = self.load_person_data(person, session)

                # Align temporal data
                aligned_data = self.align_person_data(person_data)

                # Add person and session info
                aligned_data['person'] = person
                aligned_data['session'] = session

                print(f"    {person} final shape: {aligned_data.shape}")
                print(f"    {person} features: {[col for col in aligned_data.columns if col not in ['timestamp', 'person', 'session']]}")

                session_data[person] = aligned_data
                all_processed_data[f"{session}_{person}"] = aligned_data

            self.sessions_data[session] = session_data

        return all_processed_data

# Process all data
processor = FinalMultimodalDataProcessor()
all_data = processor.process_all_data()

print("\n" + "="*60)
print("FINAL DATA SUMMARY")
print("="*60)
for key, data in all_data.items():
    print(f"{key}:")
    print(f"  Shape: {data.shape}")
    print(f"  Duration: {data['timestamp'].max() - data['timestamp'].min():.0f} seconds")
    print(f"  Features: {len([col for col in data.columns if col not in ['timestamp', 'person', 'session']])}")
    print()

Found sessions: ['NateAlexis', 'EliGianna', 'DaemahniGianna', 'SarahTexas', 'StephenMiette', 'ZahariahErin', 'ChaseGianna', 'StephenKeala', 'MarshallBritney']

=== Processing Session: NateAlexis ===
People in session: ['Alexis', 'Nate']
  Loading data for Alexis in session NateAlexis
    Action Units: (193, 18)
    Hand Gestures: (104, 15)
    Audio Features: (8318, 12)
    Alexis final shape: (177, 27)
    Alexis features: ['AU01', 'AU02', 'AU04', 'AU05', 'AU06', 'AU07', 'AU09', 'AU10', 'AU12', 'AU14', 'AU15', 'AU17', 'AU20', 'AU23', 'AU25', 'AU26', 'AU28', 'left_hand_velocity', 'right_hand_velocity', 'gesture_frequency_cumulative', 'face_touches_cumulative', 'energy_db', 'pitch_hz', 'speaking_rate']
  Loading data for Nate in session NateAlexis
    Action Units: (193, 18)
    Hand Gestures: (28, 15)
    Audio Features: (8318, 12)
    Nate final shape: (119, 27)
    Nate features: ['AU01', 'AU02', 'AU04', 'AU05', 'AU06', 'AU07', 'AU09', 'AU10', 'AU12', 'AU14', 'AU15', 'AU17', 'AU20', 

In [4]:
ground_truth = {
    'session_person': [
        'NateAlexis_Alexis',
        'MarshallBritney_Britney',
        'ChaseGianna_Chase',
        'DaemahniGianna_Daemahni',
        'EliGianna_Eli',
        'ZahariahErin_Erin',
        'ChaseGianna_Gianna',
        'DaemahniGianna_Gianna',
        'EliGianna_Gianna',
        'StephenKeala_Keala',
        'MarshallBritney_Marshall',
        'StephenMiette_Miette',
        'NateAlexis_Nate',
        'SarahTexas_Sarah',
        'StephenKeala_Stephen',
        'StephenMiette_Stephen',
        'SarahTexas_Texas',
        'ZahariahErin_Zahariah'
    ],
    'is_attracted': [
        1,  # Alexis_on_NateAlexis
        1,  # Britney_on_MarshallBritney
        1,  # Chase_on_ChaseGianna
        1,  # Daemahni_on_DaemahniGianna
        1,  # Eli_on_EliGianna
        0,  # Erin_on_ZahariahErin
        0,  # Gianna_on_ChaseGianna
        1,  # Gianna_on_DaemahniGianna
        0,  # Gianna_on_EliGianna
        0,  # Keala_on_StephenKeala
        1,  # Marshall_on_MarshallBritney
        1,  # Miette_on_StephenMiette
        0,  # Nate_on_NateAlexis
        1,  # Sarah_on_SarahTexas
        0,  # Stephen_on_StephenKeala
        1,  # Stephen_on_StephenMiette
        1,  # Texas_on_SarahTexas
        1   # Zahariah_on_ZahariahErin
    ]
}

ground_truth_df = pd.DataFrame(ground_truth)
print("Ground Truth Labels:")
print(ground_truth_df)
print()

Ground Truth Labels:
              session_person  is_attracted
0          NateAlexis_Alexis             1
1    MarshallBritney_Britney             1
2          ChaseGianna_Chase             1
3    DaemahniGianna_Daemahni             1
4              EliGianna_Eli             1
5          ZahariahErin_Erin             0
6         ChaseGianna_Gianna             0
7      DaemahniGianna_Gianna             1
8           EliGianna_Gianna             0
9         StephenKeala_Keala             0
10  MarshallBritney_Marshall             1
11      StephenMiette_Miette             1
12           NateAlexis_Nate             0
13          SarahTexas_Sarah             1
14      StephenKeala_Stephen             0
15     StephenMiette_Stephen             1
16          SarahTexas_Texas             1
17     ZahariahErin_Zahariah             1



In [5]:
class MultimodalDataPreprocessor:
    """Handles data preprocessing and normalization for multimodal attraction data"""

    def __init__(self, sequence_length=15):
        self.sequence_length = sequence_length
        self.feature_scaler = StandardScaler()
        self.feature_names = None
        self.is_fitted = False
        
        # 1. HARDCODED CANONICAL FEATURE LIST
        self.canonical_feature_names = [
            'AU01', 'AU02', 'AU04', 'AU05', 'AU06', 'AU07', 'AU09', 'AU10', 'AU12', 'AU14', 
            'AU15', 'AU17', 'AU20', 'AU23', 'AU25', 'AU26', 'AU28', 'left_hand_velocity', 
            'right_hand_velocity', 'gesture_frequency_cumulative', 'face_touches_cumulative', 
            'energy_db', 'pitch_hz', 'speaking_rate', 'compound', 'pos', 'neu', 'neg'
        ]

    def create_sequences(self, all_data, ground_truth_df):
        """Create sequences for RNN training"""
        X_sequences = []
        y_labels = []
        sequence_info = []

        print("Creating sequences...")
        
        # Use the hardcoded canonical list
        self.feature_names = self.canonical_feature_names
        
        # Define columns to drop for clean feature data
        non_feature_cols = ['timestamp', 'person', 'session']


        for key, data in all_data.items():
            # Get label for this person
            label_row = ground_truth_df[ground_truth_df['session_person'] == key]
            if len(label_row) == 0:
                continue

            label = label_row['is_attracted'].iloc[0]

            # Remove non-feature columns
            # Use errors='ignore' in case some datasets don't have all non-feature columns
            feature_data = data.drop(non_feature_cols, axis=1, errors='ignore')

            # 2. ENFORCE THE CANONICAL FEATURE SET
            # Use .reindex() to ensure all 28 columns are present.
            # If a column is missing (like 'compound' in 24-feature data), it is added and filled with 0.0.
            feature_data = feature_data.reindex(columns=self.feature_names, fill_value=0.0)
            
            # --- Sanity Check (Optional but Recommended) ---
            # Now, every feature_data MUST have exactly len(self.feature_names) columns
            if feature_data.shape[1] != len(self.feature_names):
                 raise RuntimeError(f"Feature count mismatch for {key}. Expected {len(self.feature_names)}, got {feature_data.shape[1]}")
            # ---------------------------------------------


            # Create overlapping sequences
            for i in range(len(feature_data) - self.sequence_length + 1):
                # sequence will now consistently have shape (15, 28)
                sequence = feature_data.iloc[i:i + self.sequence_length].values
                X_sequences.append(sequence)
                y_labels.append(label)
                sequence_info.append({
                    'person': key,
                    'start_time': i,
                    'end_time': i + self.sequence_length - 1
                })

        # X = np.array(X_sequences) should now successfully create a 3D array of shape (N, 15, 28)
        X = np.array(X_sequences) 
        y = np.array(y_labels)

        print(f"Created {len(X)} sequences")
        print(f"Sequence shape: {X.shape}") # Should show (N, 15, 28)
        print(f"Features: {len(self.feature_names)}")
        print(f"Class distribution: {np.bincount(y)}")

        return X, y, sequence_info
        
    def fit_normalizer(self, X_train):
        """Fit the feature normalizer on training data"""
        print("Fitting feature normalizer...")

        # Reshape for normalization (samples*time, features)
        X_train_reshaped = X_train.reshape(-1, X_train.shape[-1])

        # Fit scaler
        self.feature_scaler.fit(X_train_reshaped)
        self.is_fitted = True

        print("Feature normalizer fitted!")
        return self

    def normalize_features(self, X):
        """Normalize features using fitted scaler"""
        if not self.is_fitted:
            raise ValueError("Normalizer not fitted! Call fit_normalizer() first.")

        # Reshape for normalization
        original_shape = X.shape
        X_reshaped = X.reshape(-1, X.shape[-1])

        # Transform
        X_normalized = self.feature_scaler.transform(X_reshaped)
        X_normalized = X_normalized.reshape(original_shape)

        return X_normalized

    def split_by_person(self, X, y, sequence_info, validation_split=0.2):
        """Split data by person to avoid data leakage"""
        print("Splitting data by person...")

        # Group sequences by person
        person_sequences = {}
        for i, info in enumerate(sequence_info):
            person = info['person']
            if person not in person_sequences:
                person_sequences[person] = []
            person_sequences[person].append(i)

        # Split by person
        train_indices = []
        val_indices = []

        for person, indices in person_sequences.items():
            n_val = max(1, int(len(indices) * validation_split))
            val_indices.extend(indices[-n_val:])  # Last sequences for validation
            train_indices.extend(indices[:-n_val])  # Rest for training

        X_train = X[train_indices]
        X_val = X[val_indices]
        y_train = y[train_indices]
        y_val = y[val_indices]

        print(f"Train set: {len(X_train)} sequences")
        print(f"Val set: {len(X_val)} sequences")
        print(f"Train class distribution: {np.bincount(y_train)}")
        print(f"Val class distribution: {np.bincount(y_val)}")

        return X_train, X_val, y_train, y_val, train_indices, val_indices

    def prepare_training_data(self, all_data, ground_truth_df, validation_split=0.2):
        """Complete data preparation pipeline"""
        # Create sequences
        X, y, sequence_info = self.create_sequences(all_data, ground_truth_df)

        # Split by person
        X_train, X_val, y_train, y_val, train_idx, val_idx = self.split_by_person(
            X, y, sequence_info, validation_split
        )

        # Fit normalizer on training data
        self.fit_normalizer(X_train)

        # Normalize both sets
        X_train_norm = self.normalize_features(X_train)
        X_val_norm = self.normalize_features(X_val)

        return {
            'X_train': X_train_norm,
            'X_val': X_val_norm,
            'y_train': y_train,
            'y_val': y_val,
            'train_indices': train_idx,
            'val_indices': val_idx,
            'sequence_info': sequence_info
        }

    def preprocess_new_data(self, person_data):
        """Preprocess new data for prediction"""
        if not self.is_fitted:
            raise ValueError("Preprocessor not fitted! Train model first.")

        # Remove non-feature columns
        feature_data = person_data.drop(['timestamp', 'person', 'session'], axis=1, errors='ignore')

        # Create sequences
        sequences = []
        for i in range(len(feature_data) - self.sequence_length + 1):
            sequence = feature_data.iloc[i:i + self.sequence_length].values
            sequences.append(sequence)

        if len(sequences) == 0:
            raise ValueError(f"Not enough data points. Need at least {self.sequence_length} time steps.")

        X = np.array(sequences)
        X_normalized = self.normalize_features(X)

        return X_normalized


In [6]:
# Initialize preprocessor
print("="*60)
print("DATA PREPROCESSING")
print("="*60)

preprocessor = MultimodalDataPreprocessor(sequence_length=15)
data_dict = preprocessor.prepare_training_data(all_data, ground_truth_df)

print(f"\nPreprocessing complete!")
print(f"Training data shape: {data_dict['X_train'].shape}")
print(f"Validation data shape: {data_dict['X_val'].shape}")
print(f"Feature names: {preprocessor.feature_names[:28]}")

DATA PREPROCESSING
Creating sequences...
Created 1535 sequences
Sequence shape: (1535, 15, 28)
Features: 28
Class distribution: [ 436 1099]
Splitting data by person...
Train set: 1235 sequences
Val set: 300 sequences
Train class distribution: [350 885]
Val class distribution: [ 86 214]
Fitting feature normalizer...
Feature normalizer fitted!

Preprocessing complete!
Training data shape: (1235, 15, 28)
Validation data shape: (300, 15, 28)
Feature names: ['AU01', 'AU02', 'AU04', 'AU05', 'AU06', 'AU07', 'AU09', 'AU10', 'AU12', 'AU14', 'AU15', 'AU17', 'AU20', 'AU23', 'AU25', 'AU26', 'AU28', 'left_hand_velocity', 'right_hand_velocity', 'gesture_frequency_cumulative', 'face_touches_cumulative', 'energy_db', 'pitch_hz', 'speaking_rate', 'compound', 'pos', 'neu', 'neg']


In [7]:
X_train = data_dict['X_train']  # Shape: (n_train, 15, 28)
X_val = data_dict['X_val']      # Shape: (n_val, 15, 28)
y_train = data_dict['y_train']
y_val = data_dict['y_val']

# Average across time dimension (axis=1) to get (n_samples, 28)
X_train_averaged = np.mean(X_train, axis=1)  # Shape: (n_train, 28)
X_val_averaged = np.mean(X_val, axis=1)      # Shape: (n_val, 28)

print(f"X_train_averaged shape: {X_train_averaged.shape}")
print(f"X_val_averaged shape:  {X_val_averaged.shape}")


X_train_averaged shape: (1235, 28)
X_val_averaged shape:  (300, 28)


In [8]:
import pandas as pd
X_train_df = pd.DataFrame(X_train_averaged, columns=preprocessor.feature_names)
X_val_df = pd.DataFrame(X_val_averaged, columns=preprocessor.feature_names)

print("\nTraining DataFrame:")
print(X_train_df.head())
print(f"\nDataFrame shape: {X_train_df.shape}")



Training DataFrame:
       AU01      AU02      AU04      AU05      AU06      AU07      AU09  \
0  0.832833  0.757558 -0.239359  0.540323  0.059784 -0.245942 -0.120319   
1  0.472470  0.444433 -0.239527  0.359324  0.137621 -0.246094 -0.120320   
2  0.666614  0.647726 -0.232520  0.581379  0.155783 -0.239736 -0.120309   
3  0.779392  0.770140 -0.229595  0.725925  0.232981 -0.237083 -0.120307   
4  0.953304  0.959157 -0.227899  0.949713  0.242518 -0.235544 -0.120307   

       AU10      AU12      AU14  ...  right_hand_velocity  \
0 -0.241729  0.059784  0.059784  ...            -0.359705   
1 -0.241862  0.137621  0.137621  ...            -0.356292   
2 -0.236266  0.155783  0.155783  ...            -0.352879   
3 -0.233932  0.232981  0.232981  ...            -0.349466   
4 -0.232579  0.242518  0.242518  ...            -0.349466   

   gesture_frequency_cumulative  face_touches_cumulative  energy_db  pitch_hz  \
0                     -0.063813                -0.435722   0.473676 -0.067573   

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Logistic Regression Baseline
baseline = LogisticRegression(max_iter=1000, class_weight='balanced')
baseline.fit(X_train_averaged, y_train)
y_pred = baseline.predict(X_val_averaged)

# Calculate metrics
baseline_acc = accuracy_score(y_val, y_pred)
baseline_prec = precision_score(y_val, y_pred)
baseline_recall = recall_score(y_val, y_pred)




BASELINE MODEL RESULTS
Baseline Accuracy:   0.7467


ValueError: Format specifier missing precision

In [10]:
print(f"\n{'='*60}")
print("BASELINE MODEL RESULTS")
print(f"{'='*60}")
print(f"Baseline Accuracy:   {baseline_acc:.4f}")
print(f"Baseline Precision: {baseline_prec:.4f}")
print(f"Baseline Recall:    {baseline_recall:.4f}")


BASELINE MODEL RESULTS
Baseline Accuracy:   0.7467
Baseline Precision: 0.9157
Baseline Recall:    0.7103


In [11]:
from sklearn.metrics import classification_report, confusion_matrix

print("\nClassification Report:")
print(classification_report(y_val, y_pred, target_names=['Not Attracted', 'Attracted']))

print("\nConfusion Matrix:")
print(confusion_matrix(y_val, y_pred))


Classification Report:
               precision    recall  f1-score   support

Not Attracted       0.54      0.84      0.65        86
    Attracted       0.92      0.71      0.80       214

     accuracy                           0.75       300
    macro avg       0.73      0.77      0.73       300
 weighted avg       0.81      0.75      0.76       300


Confusion Matrix:
[[ 72  14]
 [ 62 152]]
