### EEG feature generation

In [1]:
import os
import pandas as pd
import numpy as np
from datetime import timedelta
from scipy.signal import welch

# --- STEP 1: Set Working Directory ---
BASE_DIR = '/Users/adityashah/Documents/PhD/PhD/data'
os.chdir(BASE_DIR)
print(f"✅ Working directory set to: {os.getcwd()}")

# --- STEP 2: Get Participant Folders ---
EEG_DIRS = [f for f in os.listdir(BASE_DIR) if f.startswith("P") and os.path.isdir(f)]
print(f"📁 Found {len(EEG_DIRS)} participant folders: {EEG_DIRS}\n")

# --- STEP 3: Settings ---
SAMPLING_RATE = 128
WINDOW_SIZE = 5  # seconds

# --- STEP 4: Output directory ---
OUTPUT_DIR = os.path.join(BASE_DIR, 'preprocessed_data')
os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f"🔧 Saving all preprocessed EEG features to: {OUTPUT_DIR}\n")

# --- Band columns of interest ---
def get_band_cols(df):
    return [col for col in df.columns if any(b in col for b in ['Delta', 'Theta', 'Alpha', 'Beta', 'Gamma'])]

def extract_features(window, band_cols):
    features = {}
    for col in band_cols:
        signal = pd.to_numeric(window[col], errors='coerce').dropna()
        if signal.empty:
            continue
        features[f'{col}_mean'] = signal.mean()
        features[f'{col}_std'] = signal.std()
        features[f'{col}_min'] = signal.min()
        features[f'{col}_max'] = signal.max()
    return features

# --- STEP 5: Process each participant ---
for participant in sorted(EEG_DIRS):
    print(f"\n📦 Processing participant: {participant}")
    eeg_path = os.path.join(BASE_DIR, participant, f'{participant}_eeg.csv')
    ann_path = os.path.join(BASE_DIR, participant, f'{participant}_annotation.csv')
    out_path = os.path.join(OUTPUT_DIR, f'{participant}_eeg_features.csv')

    try:
        if not os.path.exists(eeg_path):
            print(f"❌ Missing EEG file: {eeg_path}")
            continue
        if not os.path.exists(ann_path):
            print(f"❌ Missing annotation file: {ann_path}")
            continue

        print(f"📄 Reading EEG file: {eeg_path}")
        print(f"📄 Reading annotation file: {ann_path}")
        eeg_df = pd.read_csv(eeg_path)
        ann_df = pd.read_csv(ann_path)

        eeg_df = eeg_df[pd.to_numeric(eeg_df['RAW_TP9'], errors='coerce').notnull()].copy()
        eeg_df['TimeStamp'] = pd.to_datetime(eeg_df['TimeStamp'])

        band_cols = get_band_cols(eeg_df)
        if not band_cols:
            print(f"⚠️  No EEG band columns found for {participant}")
            continue

        eeg_start = eeg_df['TimeStamp'].iloc[0]
        ann_df['abs_time'] = [eeg_start + timedelta(seconds=int(t)) for t in ann_df['Timestamp']]

        emotion_labels = ['neutral', 'anger', 'disgust', 'fear', 'happiness', 'sadness', 'surprise']
        records = []

        for i, row in ann_df.iterrows():
            start_time = row['abs_time']
            end_time = start_time + timedelta(seconds=WINDOW_SIZE)
            window = eeg_df[(eeg_df['TimeStamp'] >= start_time) & (eeg_df['TimeStamp'] < end_time)]

            if len(window) == 0:
                print(f"⚠️  Empty EEG segment from {start_time} to {end_time} — skipped")
                continue

            features = extract_features(window, band_cols)
            emotion = next((emo for emo in emotion_labels if row.get(emo, 0) == 1), 'unknown')

            features.update({
                'participant': participant,
                'segment_start': start_time,
                'emotion': emotion,
                'valence': row['Valence'],
                'arousal': row['Arousal']
            })
            records.append(features)

        if records:
            out_df = pd.DataFrame(records)
            out_df.to_csv(out_path, index=False)
            print(f"✅ Saved features: {out_path}")
        else:
            print(f"⚠️  No valid EEG segments found for {participant}")

    except Exception as e:
        print(f"❌ Error processing {participant}: {e}")


✅ Working directory set to: /Users/adityashah/Documents/PhD/PhD/data
📁 Found 16 participant folders: ['P4', 'P3', 'P2', 'P5', 'P10', 'P11', 'P16', 'P7', 'P9', 'P8', 'P6', 'P1', 'P13', 'P14', 'P15', 'P12']

🔧 Saving all preprocessed EEG features to: /Users/adityashah/Documents/PhD/PhD/data/preprocessed_data


📦 Processing participant: P1
📄 Reading EEG file: /Users/adityashah/Documents/PhD/PhD/data/P1/P1_eeg.csv
📄 Reading annotation file: /Users/adityashah/Documents/PhD/PhD/data/P1/P1_annotation.csv
✅ Saved features: /Users/adityashah/Documents/PhD/PhD/data/preprocessed_data/P1_eeg_features.csv

📦 Processing participant: P10
📄 Reading EEG file: /Users/adityashah/Documents/PhD/PhD/data/P10/P10_eeg.csv
📄 Reading annotation file: /Users/adityashah/Documents/PhD/PhD/data/P10/P10_annotation.csv
✅ Saved features: /Users/adityashah/Documents/PhD/PhD/data/preprocessed_data/P10_eeg_features.csv

📦 Processing participant: P11
📄 Reading EEG file: /Users/adityashah/Documents/PhD/PhD/data/P11/P11_eeg

#### Audio Features

In [3]:
import os
import librosa
import numpy as np
import pandas as pd
from moviepy import VideoFileClip
from datetime import timedelta

# --- Config ---
BASE_DIR = '/Users/adityashah/Documents/PhD/PhD/data'
OUTPUT_DIR = os.path.join(BASE_DIR, 'preprocessed_data')
os.makedirs(OUTPUT_DIR, exist_ok=True)

PARTICIPANTS = sorted([d for d in os.listdir(BASE_DIR) if d.startswith('P') and os.path.isdir(os.path.join(BASE_DIR, d))])
print(f"🎯 Found {len(PARTICIPANTS)} participant folders.")

for pid in PARTICIPANTS:
    print(f"\n📦 Processing: {pid}")
    folder = os.path.join(BASE_DIR, pid)

    # Locate video
    video_file = next((f for f in os.listdir(folder) if f.endswith('.mp4')), None)
    if video_file is None:
        print(f"⚠️  No .mp4 video found in {folder} — skipping.")
        continue
    video_path = os.path.join(folder, video_file)
    print(f"🎥 Found video: {video_path}")

    # Annotation file
    ann_file = f"{pid}_annotation.csv"
    ann_path = os.path.join(folder, ann_file)
    if not os.path.exists(ann_path):
        print(f"⚠️  Missing annotation file: {ann_path} — skipping.")
        continue

    # Step 1: Extract audio to temp file
    audio_temp_path = f"{pid}_temp_audio.wav"
    try:
        clip = VideoFileClip(video_path)
        clip.audio.write_audiofile(audio_temp_path)
    except Exception as e:
        print(f"❌ Failed to extract audio: {e}")
        continue

    # Step 2: Load audio and annotation
    try:
        y, sr = librosa.load(audio_temp_path, sr=None)
        duration = librosa.get_duration(y=y, sr=sr)
        ann_df = pd.read_csv(ann_path)

        print(f"🎧 Audio loaded: {duration:.2f}s, sample rate: {sr} Hz")
        print(f"🧾 Annotation rows: {len(ann_df)}")

        records = []
        for i, row in ann_df.iterrows():
            start_sec = int(row["Timestamp"])
            end_sec = start_sec + 5

            if end_sec * sr > len(y):
                print(f"⚠️  Skipping segment {start_sec}-{end_sec}s (beyond audio)")
                continue

            y_seg = y[start_sec * sr : end_sec * sr]

            # Feature extraction
            mfcc = librosa.feature.mfcc(y=y_seg, sr=sr, n_mfcc=13)
            mfcc_mean = mfcc.mean(axis=1)
            mfcc_std = mfcc.std(axis=1)
            rms = librosa.feature.rms(y=y_seg).mean()
            zcr = librosa.feature.zero_crossing_rate(y_seg).mean()

            feat = {f"mfcc_{i}_mean": mfcc_mean[i] for i in range(13)}
            feat.update({f"mfcc_{i}_std": mfcc_std[i] for i in range(13)})
            feat["rms"] = rms
            feat["zcr"] = zcr

            # Emotion label
            emotion_labels = ['neutral', 'anger', 'disgust', 'fear', 'happiness', 'sadness', 'surprise']
            emotion = next((e for e in emotion_labels if row.get(e, 0) == 1), 'unknown')

            feat.update({
                "participant": pid,
                "segment_start": clip.start + start_sec,
                "emotion": emotion,
                "valence": row["Valence"],
                "arousal": row["Arousal"]
            })

            records.append(feat)

        if records:
            out_df = pd.DataFrame(records)
            out_path = os.path.join(OUTPUT_DIR, f"{pid}_audio_features.csv")
            out_df.to_csv(out_path, index=False)
            print(f"✅ Saved: {out_path}")
        else:
            print(f"⚠️  No valid segments extracted.")

    except Exception as e:
        print(f"❌ Error during processing: {e}")

    finally:
        # Cleanup
        if os.path.exists(audio_temp_path):
            os.remove(audio_temp_path)


🎯 Found 16 participant folders.

📦 Processing: P1
🎥 Found video: /Users/adityashah/Documents/PhD/PhD/data/P1/WIN_20220105_16_15_58_Pro.mp4
MoviePy - Writing audio in P1_temp_audio.wav


                                                                        

MoviePy - Done.
🎧 Audio loaded: 1013.92s, sample rate: 44100 Hz
🧾 Annotation rows: 202
⚠️  Skipping segment 1010-1015s (beyond audio)
✅ Saved: /Users/adityashah/Documents/PhD/PhD/data/preprocessed_data/P1_audio_features.csv

📦 Processing: P10
🎥 Found video: /Users/adityashah/Documents/PhD/PhD/data/P10/WIN_20220105_14_56_11_Pro.mp4
MoviePy - Writing audio in P10_temp_audio.wav


                                                                        

MoviePy - Done.
🎧 Audio loaded: 631.71s, sample rate: 44100 Hz
🧾 Annotation rows: 126
⚠️  Skipping segment 630-635s (beyond audio)
✅ Saved: /Users/adityashah/Documents/PhD/PhD/data/preprocessed_data/P10_audio_features.csv

📦 Processing: P11
🎥 Found video: /Users/adityashah/Documents/PhD/PhD/data/P11/WIN_20220209_11_08_09_Pro.mp4
MoviePy - Writing audio in P11_temp_audio.wav


                                                                       

MoviePy - Done.
🎧 Audio loaded: 456.32s, sample rate: 44100 Hz
🧾 Annotation rows: 90
✅ Saved: /Users/adityashah/Documents/PhD/PhD/data/preprocessed_data/P11_audio_features.csv

📦 Processing: P12
🎥 Found video: /Users/adityashah/Documents/PhD/PhD/data/P12/WIN_20220210_17_04_55_Pro.mp4
MoviePy - Writing audio in P12_temp_audio.wav


                                                                         

MoviePy - Done.
🎧 Audio loaded: 649.10s, sample rate: 44100 Hz
🧾 Annotation rows: 130
⚠️  Skipping segment 645-650s (beyond audio)
⚠️  Skipping segment 650-655s (beyond audio)
✅ Saved: /Users/adityashah/Documents/PhD/PhD/data/preprocessed_data/P12_audio_features.csv

📦 Processing: P13
🎥 Found video: /Users/adityashah/Documents/PhD/PhD/data/P13/WIN_20220104_13_08_03_Pro.mp4
MoviePy - Writing audio in P13_temp_audio.wav


                                                                        

MoviePy - Done.
🎧 Audio loaded: 674.86s, sample rate: 44100 Hz
🧾 Annotation rows: 135
⚠️  Skipping segment 670-675s (beyond audio)
⚠️  Skipping segment 675-680s (beyond audio)
✅ Saved: /Users/adityashah/Documents/PhD/PhD/data/preprocessed_data/P13_audio_features.csv

📦 Processing: P14
🎥 Found video: /Users/adityashah/Documents/PhD/PhD/data/P14/WIN_20220111_13_16_04_Pro.mp4
MoviePy - Writing audio in P14_temp_audio.wav


                                                                        

MoviePy - Done.
🎧 Audio loaded: 503.43s, sample rate: 44100 Hz
🧾 Annotation rows: 100
⚠️  Skipping segment 500-505s (beyond audio)
✅ Saved: /Users/adityashah/Documents/PhD/PhD/data/preprocessed_data/P14_audio_features.csv

📦 Processing: P15
🎥 Found video: /Users/adityashah/Documents/PhD/PhD/data/P15/WIN_20220210_16_40_58_Pro.mp4
MoviePy - Writing audio in P15_temp_audio.wav


                                                                        

MoviePy - Done.
🎧 Audio loaded: 695.90s, sample rate: 44100 Hz
🧾 Annotation rows: 139
⚠️  Skipping segment 695-700s (beyond audio)
✅ Saved: /Users/adityashah/Documents/PhD/PhD/data/preprocessed_data/P15_audio_features.csv

📦 Processing: P16
🎥 Found video: /Users/adityashah/Documents/PhD/PhD/data/P16/WIN_20220105_15_48_56_Pro.mp4
MoviePy - Writing audio in P16_temp_audio.wav


                                                                        

MoviePy - Done.
🎧 Audio loaded: 755.22s, sample rate: 44100 Hz
🧾 Annotation rows: 151
⚠️  Skipping segment 755-760s (beyond audio)
✅ Saved: /Users/adityashah/Documents/PhD/PhD/data/preprocessed_data/P16_audio_features.csv

📦 Processing: P2
🎥 Found video: /Users/adityashah/Documents/PhD/PhD/data/P2/WIN_20220105_14_24_49_Pro.mp4
MoviePy - Writing audio in P2_temp_audio.wav


                                                                        

MoviePy - Done.
🎧 Audio loaded: 703.02s, sample rate: 44100 Hz
🧾 Annotation rows: 141
⚠️  Skipping segment 700-705s (beyond audio)
⚠️  Skipping segment 705-710s (beyond audio)
✅ Saved: /Users/adityashah/Documents/PhD/PhD/data/preprocessed_data/P2_audio_features.csv

📦 Processing: P3
🎥 Found video: /Users/adityashah/Documents/PhD/PhD/data/P3/WIN_20220104_14_43_55_Pro.mp4
MoviePy - Writing audio in P3_temp_audio.wav


                                                                        

MoviePy - Done.
🎧 Audio loaded: 645.23s, sample rate: 44100 Hz
🧾 Annotation rows: 129
⚠️  Skipping segment 645-650s (beyond audio)
✅ Saved: /Users/adityashah/Documents/PhD/PhD/data/preprocessed_data/P3_audio_features.csv

📦 Processing: P4
🎥 Found video: /Users/adityashah/Documents/PhD/PhD/data/P4/WIN_20220210_16_24_04_Pro.mp4
MoviePy - Writing audio in P4_temp_audio.wav


                                                                        

MoviePy - Done.
🎧 Audio loaded: 597.97s, sample rate: 44100 Hz
🧾 Annotation rows: 125
⚠️  Skipping segment 595-600s (beyond audio)
⚠️  Skipping segment 600-605s (beyond audio)
⚠️  Skipping segment 605-610s (beyond audio)
⚠️  Skipping segment 610-615s (beyond audio)
⚠️  Skipping segment 615-620s (beyond audio)
⚠️  Skipping segment 620-625s (beyond audio)
⚠️  Skipping segment 625-630s (beyond audio)
✅ Saved: /Users/adityashah/Documents/PhD/PhD/data/preprocessed_data/P4_audio_features.csv

📦 Processing: P5
🎥 Found video: /Users/adityashah/Documents/PhD/PhD/data/P5/WIN_20220107_15_35_24_Pro.mp4
MoviePy - Writing audio in P5_temp_audio.wav


                                                                        

MoviePy - Done.
🎧 Audio loaded: 608.08s, sample rate: 44100 Hz
🧾 Annotation rows: 125
⚠️  Skipping segment 605-610s (beyond audio)
⚠️  Skipping segment 610-615s (beyond audio)
⚠️  Skipping segment 615-620s (beyond audio)
⚠️  Skipping segment 620-625s (beyond audio)
⚠️  Skipping segment 625-630s (beyond audio)
✅ Saved: /Users/adityashah/Documents/PhD/PhD/data/preprocessed_data/P5_audio_features.csv

📦 Processing: P6
🎥 Found video: /Users/adityashah/Documents/PhD/PhD/data/P6/WIN_20220107_11_24_35_Pro.mp4
MoviePy - Writing audio in P6_temp_audio.wav


                                                                        

MoviePy - Done.
🎧 Audio loaded: 614.72s, sample rate: 44100 Hz
🧾 Annotation rows: 125
⚠️  Skipping segment 610-615s (beyond audio)
⚠️  Skipping segment 615-620s (beyond audio)
⚠️  Skipping segment 620-625s (beyond audio)
⚠️  Skipping segment 625-630s (beyond audio)
✅ Saved: /Users/adityashah/Documents/PhD/PhD/data/preprocessed_data/P6_audio_features.csv

📦 Processing: P7
🎥 Found video: /Users/adityashah/Documents/PhD/PhD/data/P7/WIN_20220107_12_47_55_Pro.mp4
MoviePy - Writing audio in P7_temp_audio.wav


                                                                         

MoviePy - Done.
🎧 Audio loaded: 626.54s, sample rate: 44100 Hz
🧾 Annotation rows: 125
⚠️  Skipping segment 625-630s (beyond audio)
✅ Saved: /Users/adityashah/Documents/PhD/PhD/data/preprocessed_data/P7_audio_features.csv

📦 Processing: P8
🎥 Found video: /Users/adityashah/Documents/PhD/PhD/data/P8/WIN_20220210_10_17_52_Pro.mp4
MoviePy - Writing audio in P8_temp_audio.wav


                                                                        

MoviePy - Done.
🎧 Audio loaded: 461.81s, sample rate: 44100 Hz
🧾 Annotation rows: 92
⚠️  Skipping segment 460-465s (beyond audio)
✅ Saved: /Users/adityashah/Documents/PhD/PhD/data/preprocessed_data/P8_audio_features.csv

📦 Processing: P9
🎥 Found video: /Users/adityashah/Documents/PhD/PhD/data/P9/WIN_20220104_16_14_11_Pro.mp4
MoviePy - Writing audio in P9_temp_audio.wav


                                                                         

MoviePy - Done.
🎧 Audio loaded: 619.77s, sample rate: 44100 Hz
🧾 Annotation rows: 125
⚠️  Skipping segment 615-620s (beyond audio)
⚠️  Skipping segment 620-625s (beyond audio)
⚠️  Skipping segment 625-630s (beyond audio)
✅ Saved: /Users/adityashah/Documents/PhD/PhD/data/preprocessed_data/P9_audio_features.csv


#### Video features

In [2]:
import os
import cv2
import cvlib as cv
import numpy as np
import pandas as pd
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.models import Model

# --- Config ---
BASE_DIR = '/Users/adityashah/Documents/PhD/PhD/data'
OUTPUT_DIR = os.path.join(BASE_DIR, 'preprocessed_data')
os.makedirs(OUTPUT_DIR, exist_ok=True)

PARTICIPANTS = sorted([p for p in os.listdir(BASE_DIR) if p.startswith('P') and os.path.isdir(os.path.join(BASE_DIR, p))])
print(f"📦 Found {len(PARTICIPANTS)} participant folders")

# --- Load ResNet50 (no classification layer) ---
print("⏳ Loading ResNet50 model (ImageNet pretrained)...")
base_model = ResNet50(weights='imagenet', include_top=False, pooling='avg')
print("✅ ResNet50 model loaded.\n")

# --- Process each participant ---
for pid in PARTICIPANTS:
    print("🔄" + "="*50)
    print(f"▶️ Starting processing for: {pid}")

    folder = os.path.join(BASE_DIR, pid)
    video_file = next((f for f in os.listdir(folder) if f.endswith('.mp4')), None)
    ann_file = f"{pid}_annotation.csv"
    ann_path = os.path.join(folder, ann_file)

    if not video_file:
        print(f"⚠️  No .mp4 file found in {pid} — skipping.\n")
        continue
    if not os.path.exists(ann_path):
        print(f"⚠️  Missing annotation file: {ann_path} — skipping.\n")
        continue

    video_path = os.path.join(folder, video_file)
    print(f"🎥 Video file: {video_path}")
    print(f"🧾 Annotation file: {ann_path}")

    try:
        ann_df = pd.read_csv(ann_path)
        total_segments = len(ann_df)
        print(f"🔢 Annotation segments: {total_segments}")

        cap = cv2.VideoCapture(video_path)
        fps = cap.get(cv2.CAP_PROP_FPS)
        frame_interval = int(fps * 5)
        print(f"🎞️ Video FPS: {fps:.2f}, Frame interval for 5s: {frame_interval} frames")

        records = []
        frame_count = 0
        ann_index = 0
        face_skips = 0

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret or ann_index >= total_segments:
                break

            if frame_count % frame_interval == 0:
                row = ann_df.iloc[ann_index]
                faces, confidences = cv.detect_face(frame)

                if faces:
                    x1, y1, x2, y2 = faces[0]
                    face_crop = frame[y1:y2, x1:x2]
                    face_crop = cv2.resize(face_crop, (224, 224))
                    face_crop = img_to_array(face_crop)
                    face_crop = np.expand_dims(face_crop, axis=0)
                    face_crop = preprocess_input(face_crop)

                    features = base_model.predict(face_crop, verbose=0)[0]

                    emotion_labels = ['neutral', 'anger', 'disgust', 'fear', 'happiness', 'sadness', 'surprise']
                    emotion = next((e for e in emotion_labels if row.get(e, 0) == 1), 'unknown')

                    record = {f'resnet_{i}': features[i] for i in range(len(features))}
                    record.update({
                        "participant": pid,
                        "segment_start": ann_index * 5,
                        "emotion": emotion,
                        "valence": row["Valence"],
                        "arousal": row["Arousal"]
                    })
                    records.append(record)

                    print(f"✅ [{pid}] Segment {ann_index*5}s — Face detected, features extracted.")
                else:
                    print(f"⚠️ [{pid}] Segment {ann_index*5}s — No face detected. Skipping.")
                    face_skips += 1

                ann_index += 1

            frame_count += 1

        cap.release()

        if records:
            out_df = pd.DataFrame(records)
            out_path = os.path.join(OUTPUT_DIR, f"{pid}_video_features.csv")
            out_df.to_csv(out_path, index=False)
            print(f"\n📁 Saved features: {out_path}")
            print(f"📊 Segments processed: {len(records)} / {total_segments}")
            if face_skips > 0:
                print(f"🙈 Segments skipped (no face): {face_skips}")
        else:
            print(f"⚠️ No features extracted for {pid}.")

    except Exception as e:
        print(f"❌ Error while processing {pid}: {e}")

print("\n🏁 All participants processed.")


📦 Found 16 participant folders
⏳ Loading ResNet50 model (ImageNet pretrained)...
✅ ResNet50 model loaded.

▶️ Starting processing for: P1
🎥 Video file: /Users/adityashah/Documents/PhD/PhD/data/P1/WIN_20220105_16_15_58_Pro.mp4
🧾 Annotation file: /Users/adityashah/Documents/PhD/PhD/data/P1/P1_annotation.csv
🔢 Annotation segments: 202
🎞️ Video FPS: 19.91, Frame interval for 5s: 99 frames
✅ [P1] Segment 0s — Face detected, features extracted.
✅ [P1] Segment 5s — Face detected, features extracted.
✅ [P1] Segment 10s — Face detected, features extracted.
✅ [P1] Segment 15s — Face detected, features extracted.
✅ [P1] Segment 20s — Face detected, features extracted.
✅ [P1] Segment 25s — Face detected, features extracted.
✅ [P1] Segment 30s — Face detected, features extracted.
✅ [P1] Segment 35s — Face detected, features extracted.
✅ [P1] Segment 40s — Face detected, features extracted.
✅ [P1] Segment 45s — Face detected, features extracted.
✅ [P1] Segment 50s — Face detected, features extracted