# FER Data Preparation - Process and Save to Drive
This notebook handles all data preprocessing steps and saves the final dataset to Google Drive for use in training.

**Output:** Processed images and the final dataset JSON are saved to `/content/drive/MyDrive/processed_data/`. Note that the last cell deletes and recreates this directory on every run.

In [None]:
from google.colab import drive
import os
import cv2
import numpy as np
import pandas as pd
from PIL import Image
from google.colab.patches import cv2_imshow

# Mount Google Drive
drive.mount('/content/drive')

# Install Mediapipe for high-accuracy facial landmarks
!pip install mediapipe
import mediapipe as mp

In [None]:
import sys
!{sys.executable} -m pip install mediapipe

# This forced refresh tells the notebook to look at the folders again
import site
from importlib import reload
reload(site)

In [None]:
import cv2
import numpy as np
import os
from google.colab.patches import cv2_imshow

class SimpleRAFPreprocessor:
    """Crop, roll-align and resize the most prominent face in an image.

    Uses OpenCV's bundled Haar cascades: a frontal-face detector locates the
    face box, and an eye detector run inside that box estimates the in-plane
    rotation that is then undone.
    """

    def __init__(self, output_size=(336, 336)):
        """Load the cascades.

        Args:
            output_size: size tuple handed to ``cv2.resize`` for every image
                returned by :meth:`align_and_crop`.

        Raises:
            RuntimeError: if either Haar cascade could not be loaded.
        """
        self.output_size = output_size
        # Load OpenCV's pre-trained face and eye detectors
        self.face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
        self.eye_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_eye.xml')
        # Fail here rather than with an opaque cv2.error on the first detection call.
        if self.face_cascade.empty() or self.eye_cascade.empty():
            raise RuntimeError("Failed to load OpenCV Haar cascades from cv2.data.haarcascades")

    @staticmethod
    def _select_eye_pair(eyes):
        """Pick the two largest eye boxes and return their centres, left first.

        Args:
            eyes: sequence of ``(x, y, w, h)`` boxes.

        Returns:
            ``((lx, ly), (rx, ry))`` ordered by x-coordinate, or ``None`` when
            fewer than two boxes are given.
        """
        if len(eyes) < 2:
            return None
        # Small extra detections are usually false positives, so keep the two
        # largest boxes instead of whichever two happen to be leftmost.
        largest_two = sorted(eyes, key=lambda e: e[2] * e[3], reverse=True)[:2]
        left, right = sorted(largest_two, key=lambda e: e[0])
        return (
            (int(left[0] + left[2] // 2), int(left[1] + left[3] // 2)),
            (int(right[0] + right[2] // 2), int(right[1] + right[3] // 2)),
        )

    def align_and_crop(self, image):
        """Return the largest detected face, rotated level and resized.

        Args:
            image: BGR image array (a single-channel grayscale array is also
                accepted).

        Returns:
            An image resized to ``self.output_size``: the rotated face crop
            when two eyes are found, the plain face crop when they are not,
            or the whole input when no face is detected.

        Raises:
            ValueError: if ``image`` is ``None`` or empty.
        """
        if image is None or getattr(image, 'size', 0) == 0:
            raise ValueError("align_and_crop received an empty image")

        # Grayscale inputs are already in the form the detectors expect.
        gray = image if image.ndim == 2 else cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        faces = self.face_cascade.detectMultiScale(gray, 1.3, 5)

        if len(faces) == 0:
            return cv2.resize(image, self.output_size)

        # Take the largest face found
        x, y, w, h = (int(v) for v in max(faces, key=lambda f: f[2] * f[3]))
        roi_gray = gray[y:y+h, x:x+w]
        roi_color = image[y:y+h, x:x+w]

        # Detect eyes within the face ROI
        eye_pair = self._select_eye_pair(self.eye_cascade.detectMultiScale(roi_gray))

        if eye_pair is not None:
            left_eye_center, right_eye_center = eye_pair

            # Angle of the line joining the eyes relative to horizontal
            dY = right_eye_center[1] - left_eye_center[1]
            dX = right_eye_center[0] - left_eye_center[0]
            angle = np.degrees(np.arctan2(dY, dX))

            # Rotate the crop about its centre so the eye line becomes level
            center = (int(w / 2), int(h / 2))
            M = cv2.getRotationMatrix2D(center, angle, 1.0)
            rotated = cv2.warpAffine(roi_color, M, (w, h))
            return cv2.resize(rotated, self.output_size)

        # Fallback: just use the face box if two eyes aren't found
        return cv2.resize(roi_color, self.output_size)

# --- Initialize ---
# Shared instance: the later alignment cells call preprocessor.align_and_crop().
preprocessor = SimpleRAFPreprocessor()
print("‚úÖ Preprocessor initialized using OpenCV (Failsafe Mode).")

In [None]:
import os

# Drive locations of the three RAF dataset folders, keyed by a display name.
# inspect_data() below walks each of these and reports what it finds.
base_paths = {
    "Compound": "/content/drive/MyDrive/compound",
    "RAF-AU": "/content/drive/MyDrive/RAF-AU",
    "RAF-ML": "/content/drive/MyDrive/RAF-ML"
}

def inspect_data(base_dirs):
    """Print a one-level overview of each dataset folder.

    Args:
        base_dirs: mapping of display name -> directory path.

    For every path that exists, prints the number of entries, up to five
    entry names, and for each sub-directory its entry count plus one sample
    name. Paths that do not exist are reported and skipped. Any error raised
    while listing is printed instead of propagated.
    """
    print("üîç --- RAF Dataset Inspection --- üîç\n")
    divider = "-" * 40

    for label, root in base_dirs.items():
        print(f"üìÅ Folder: {label} ({root})")

        if not os.path.exists(root):
            # Missing folders get no divider line, only the warning.
            print(f"   ‚ùå Path does not exist. Check shortcut name/location.")
        else:
            try:
                # Top level: count plus the first five names
                entries = os.listdir(root)
                print(f"   Items found: {len(entries)}")
                print(f"   Sample contents: {entries[:5]}")

                # One level down: sub-folders such as 'Image' or 'Annotation'
                for child in entries:
                    child_path = os.path.join(root, child)
                    if not os.path.isdir(child_path):
                        continue
                    grandchildren = os.listdir(child_path)
                    print(f"     ‚îî‚îÄ üìÇ {child}/ ({len(grandchildren)} items)")
                    if grandchildren:
                        print(f"        ‚îî‚îÄ Sample: {grandchildren[0]}")
            except Exception as e:
                print(f"   ‚ö†Ô∏è Error reading folder: {e}")
            print(divider)

inspect_data(base_paths)

In [None]:
import zipfile
import os

# Define where to extract
extract_path = '/content/raf_data_unzipped'
os.makedirs(extract_path, exist_ok=True)

# (archive on Drive, sub-folder of extract_path to unpack it into)
zips_to_extract = [
    ('/content/drive/MyDrive/RAF-AU/aligned.zip', 'RAF-AU-aligned'),
    ('/content/drive/MyDrive/RAF-ML/Image/aligned.zip', 'RAF-ML-aligned'),
    # Add paths for Compound zips if they appeared in your 'Image' folders
]

missing_zips = []
for zip_path, folder_name in zips_to_extract:
    if os.path.exists(zip_path):
        print(f"üì¶ Unzipping {zip_path}...")
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(os.path.join(extract_path, folder_name))
    else:
        print(f"‚ö†Ô∏è Could not find {zip_path}")
        missing_zips.append(zip_path)

# Only claim success when every listed archive was actually unpacked.
if missing_zips:
    print(f"Unzipping finished, but {len(missing_zips)} of {len(zips_to_extract)} archives were missing.")
else:
    print("‚úÖ Unzipping complete.")

In [None]:
# Peek at the first lines of the label files to see their column layout.
# NOTE(review): this previews partition_label.txt, whereas the dataset-building
# cell parses EmoLabel/multilabel.txt — confirm which file is meant here.
print("--- Compound/ML Label Preview ---")
!head -n 5 /content/drive/MyDrive/RAF-ML/EmoLabel/partition_label.txt

print("\n--- RAF-AU Label Preview ---")
!head -n 5 /content/drive/MyDrive/RAF-AU/RAFAU_label.txt

In [None]:
import zipfile
import os

# Unzip the Action Unit images and the Annotation zips from your Drive shortcuts.
# Maps each archive to the local directory it is unpacked into.
zips = {
    '/content/drive/MyDrive/RAF-AU/aligned.zip': '/content/raf_images/',
    '/content/drive/MyDrive/RAF-ML/Annotation/manual.zip': '/content/raf_annotations/'
}

for zip_path, target in zips.items():
    # Create the local directory even when the archive is absent, as before.
    os.makedirs(target, exist_ok=True)
    if os.path.exists(zip_path):
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(target)
            print(f"‚úÖ Extracted {zip_path}")
    else:
        # Previously a missing archive was skipped without any message.
        print(f"‚ö†Ô∏è Could not find {zip_path}")

In [None]:
import json

# Label files read by create_dataset_json()
EMO_PATH = '/content/drive/MyDrive/RAF-ML/EmoLabel/multilabel.txt'  # Multi-label emotions (6D)
AU_PATH = '/content/drive/MyDrive/RAF-AU/RAFAU_label.txt'

# Emotion names, index-aligned with the six values parsed from each line of EMO_PATH
EMOTION_NAMES = ['Surprise', 'Fear', 'Disgust', 'Happiness', 'Sadness', 'Anger']

def create_dataset_json(emo_path=None, au_path=None,
                        output_path='dataset_vision_llm.json', emotion_names=None):
    """Merge emotion and Action Unit labels into a conversation-style JSON file.

    Args:
        emo_path: whitespace-separated file with one image id followed by one
            integer per emotion on each line. Defaults to ``EMO_PATH``.
        au_path: whitespace-separated file whose first two columns are an
            image id and its AU string. Defaults to ``AU_PATH``.
        output_path: where the JSON list is written.
        emotion_names: names for the emotion columns, in column order.
            Defaults to ``EMOTION_NAMES``.

    Returns:
        The list of entries that was written. An image is included only when
        it has a parseable emotion vector and an AU value other than "null".
    """
    emo_path = EMO_PATH if emo_path is None else emo_path
    au_path = AU_PATH if au_path is None else au_path
    emotion_names = EMOTION_NAMES if emotion_names is None else emotion_names
    n_emotions = len(emotion_names)

    # 1. Load AUs into a dictionary, e.g. {'0001.jpg': '1+4+25'}
    au_map = {}
    with open(au_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.split()  # split() without args handles repeated spaces
            if len(parts) >= 2:
                au_map[parts[0]] = parts[1]

    # 2. Load multi-label emotions and merge
    final_data = []
    malformed = 0   # lines that are too short or not integers
    without_au = 0  # images that have emotions but no usable AU label
    with open(emo_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.split()

            if len(parts) < n_emotions + 1:  # need img_id + one value per emotion
                if parts:
                    malformed += 1
                continue

            img_id = parts[0]
            raw_vector = parts[1:n_emotions + 1]

            try:
                emotion_vector = [int(value) for value in raw_vector]
            except ValueError:
                print(f"‚ö†Ô∏è Skipping {img_id}: Invalid emotion data - {raw_vector}")
                malformed += 1
                continue

            # Only include if we have AU data ("null" also marks an absent label)
            au_val = au_map.get(img_id, "null")
            if au_val == "null":
                without_au += 1
                continue

            # Identify which emotions are present
            present_emotions = [name for name, flag in zip(emotion_names, emotion_vector) if flag == 1]
            emotion_label = ', '.join(present_emotions) if present_emotions else 'Neutral'

            final_data.append({
                "id": img_id,
                "image": f"aligned_faces/{img_id}",
                "emotion_vector": emotion_vector,  # same order as emotion_names
                "emotions_present": present_emotions,
                "conversations": [
                    {
                        "from": "human",
                        "value": "<image>\nIdentify the emotions present and list the facial Action Units (AUs) involved."
                    },
                    {
                        "from": "gpt",
                        "value": f"This face exhibits {emotion_label}. The observed facial cues correspond to Action Units: {au_val}."
                    }
                ]
            })

    with open(output_path, 'w') as f:
        json.dump(final_data, f, indent=2)
    print(f"üéâ Success! Created {output_path} with {len(final_data)} entries.")
    print(f"Each image now has multi-label emotion identification.")
    if malformed or without_au:
        # Make silent drops visible so label/id mismatches are noticed early.
        print(f"Skipped {malformed} malformed lines and {without_au} images without AU labels.")
    return final_data

create_dataset_json()

In [None]:
import json
import collections
import matplotlib.pyplot as plt

# Load the dataset written by create_dataset_json()
with open('dataset_vision_llm.json', 'r') as f:
    data = json.load(f)

total_entries = len(data)
print(f"Total Dataset Entries: {total_entries}\n")

# Analyze emotion distribution in a single pass over the entries
EMOTION_NAMES = ['Surprise', 'Fear', 'Disgust', 'Happiness', 'Sadness', 'Anger']
emotion_counts = {emotion: 0 for emotion in EMOTION_NAMES}
labels_per_image = collections.Counter()  # number of emotions -> number of images

for entry in data:
    present = entry['emotions_present']
    labels_per_image[len(present)] += 1
    for emotion in present:
        emotion_counts[emotion] += 1

# Also count multi-emotion images
single_emotion = labels_per_image[1]
neutral = labels_per_image[0]
multi_emotion = total_entries - single_emotion - neutral

print("Emotion Distribution:")
print("-" * 50)
for emotion, count in emotion_counts.items():
    # Guard the division so an empty dataset reports 0% instead of crashing.
    percentage = (count / total_entries) * 100 if total_entries else 0.0
    print(f"{emotion:12} : {count:4} occurrences ({percentage:5.1f}%)")

print("\n" + "-" * 50)
print(f"Single Emotion Images  : {single_emotion}")
print(f"Multi-Emotion Images   : {multi_emotion}")
print(f"Neutral/None           : {neutral}")
print("-" * 50)

if total_entries:
    # Visualization
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

    # Bar chart of emotion frequencies
    colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#FFA07A', '#98D8C8', '#F7DC6F']
    ax1.bar(emotion_counts.keys(), emotion_counts.values(), color=colors)
    ax1.set_title('Emotion Distribution in RAF-ML', fontsize=14, fontweight='bold')
    ax1.set_xlabel('Emotions')
    ax1.set_ylabel('Frequency')
    ax1.tick_params(axis='x', rotation=45)
    ax1.grid(axis='y', alpha=0.3)

    # Pie chart of emotion complexity
    complexity = [single_emotion, multi_emotion, neutral]
    labels = [f'Single\n({single_emotion})', f'Multi-label\n({multi_emotion})', f'Neutral\n({neutral})']
    colors_pie = ['#3498db', '#e74c3c', '#95a5a6']
    ax2.pie(complexity, labels=labels, autopct='%1.1f%%', colors=colors_pie, startangle=90)
    ax2.set_title('Emotion Complexity Distribution', fontsize=14, fontweight='bold')

    plt.tight_layout()
    plt.show()
else:
    print("Dataset is empty - skipping charts.")

# Sample entries
print("\nüì∏ Sample Entries:")
print("=" * 70)
for sample_entry in data[:3]:
    print(f"\nImage: {sample_entry['id']}")
    print(f"Emotions: {sample_entry['emotions_present']}")
    print(f"Emotion Vector: {sample_entry['emotion_vector']}")
    print(f"Response: {sample_entry['conversations'][1]['value']}")

In [None]:
import albumentations as A
import cv2

# Augmentation pipeline: random rotation, brightness/contrast change and a
# single dropout patch.
# NOTE: a later cell rebinds `transform` with num_holes_range / hole_height_range /
# hole_width_range, described there as the argument names for current
# Albumentations versions; the max_* names below are the older spelling.
transform = A.Compose([
    A.Rotate(limit=20, p=0.5),             # Rotations
    A.RandomBrightnessContrast(p=0.5),    # Lighting adjustments
    A.CoarseDropout(max_holes=1, max_height=40, max_width=40, p=0.3) # Light occlusions
])

# Example of how to apply to a minority class image
def augment_image(image_path):
    """Load an image and return one randomly augmented RGB copy.

    Args:
        image_path: path of the image file to read.

    Returns:
        The array produced by the module-level ``transform`` pipeline, in RGB
        channel order.

    Raises:
        FileNotFoundError: if OpenCV cannot read ``image_path``.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread signals failure by returning None rather than raising.
        raise FileNotFoundError(f"Could not read image: {image_path}")
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    return transform(image=image)['image']

In [None]:
import random

def balance_dataset(json_data):
    """Oversample class 1 until it matches class 0, then shuffle.

    An entry's class is read from its second conversation turn: the character
    right after the text ``'label is '`` must be ``0`` or ``1``.

    Args:
        json_data: iterable of entries with a ``conversations`` list whose
            second item has a ``value`` string.

    Returns:
        A new shuffled list holding every class-0 and class-1 entry, enough
        randomly repeated class-1 entries to close the gap, and - unchanged -
        any entry whose class could not be determined.
    """
    marker = 'label is '
    class_0, class_1, unclassified = [], [], []

    # Separate the classes; entries without the marker no longer raise IndexError.
    for entry in json_data:
        _, found, label_text = entry['conversations'][1]['value'].partition(marker)
        if found and label_text.startswith('0'):
            class_0.append(entry)
        elif found and label_text.startswith('1'):
            class_1.append(entry)
        else:
            unclassified.append(entry)

    # Simple Oversampling: duplicate Class 1 entries to close the gap.
    # In a real pipeline, these would be the 'augmented' versions.
    shortfall = max(0, len(class_0) - len(class_1))
    # random.choices cannot draw from an empty population.
    extra_class_1 = random.choices(class_1, k=shortfall) if class_1 else []

    balanced_data = class_0 + class_1 + extra_class_1 + unclassified
    random.shuffle(balanced_data)

    print(f"New Dataset Size: {len(balanced_data)}")
    print(f"Class 0: {len(class_0)} | Class 1: {len(class_1) + len(extra_class_1)}")
    if unclassified:
        print(f"Entries without a recognised class label (kept as-is): {len(unclassified)}")
    return balanced_data

# Use your existing data
# balanced_json = balance_dataset(data)

In [None]:
import albumentations as A
import cv2

# Updated to use the correct argument names for current Albumentations versions.
# Rebinds `transform`, replacing the pipeline defined in the earlier cell:
# random rotation within +/-20 degrees, random brightness/contrast, and a single
# dropout patch whose height and width are each drawn from 20-40.
transform = A.Compose([
    A.Rotate(limit=20, p=0.5),             # Rotations (Requirement 4)
    A.RandomBrightnessContrast(p=0.5),    # Lighting adjustments (Requirement 4)
    A.CoarseDropout(
        num_holes_range=(1, 1),
        hole_height_range=(20, 40),
        hole_width_range=(20, 40),
        p=0.3
    ) # Light occlusions (Requirement 4)
])

In [None]:
import zipfile
import os
import cv2

# 1. Extract images from your Drive shortcut
zip_path = '/content/drive/MyDrive/RAF-AU/aligned.zip'
extract_to = '/content/temp_raw_images/'
os.makedirs(extract_to, exist_ok=True)

if os.path.exists(zip_path):
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)
    print("‚úÖ Step A: Images extracted from ZIP.")
else:
    print("‚ùå Error: ZIP file not found. Check your Drive shortcut.")

# 2. Process them into the aligned_faces folder
PROCESSED_DIR = '/content/aligned_faces/'
os.makedirs(PROCESSED_DIR, exist_ok=True)

# Number of images aligned by this quick test run
MAX_TEST_IMAGES = 500

# Find the folder inside the zip (it often creates a subfolder)
raw_folder = extract_to
for root, dirs, files in os.walk(extract_to):
    dirs.sort()  # visit sub-folders in a fixed order so the choice is repeatable
    if any(f.endswith('.jpg') for f in files):
        raw_folder = root
        break

# Sorted so that "the first N" is the same subset on every run
all_imgs = sorted(f for f in os.listdir(raw_folder) if f.endswith('.jpg'))
print(f"‚úÖ Step B: Found {len(all_imgs)} images. Starting alignment...")

unreadable = 0
for img_name in all_imgs[:MAX_TEST_IMAGES]:
    img = cv2.imread(os.path.join(raw_folder, img_name))
    if img is None:
        unreadable += 1
        continue
    # Using your OpenCV preprocessor
    aligned_face = preprocessor.align_and_crop(img)
    cv2.imwrite(os.path.join(PROCESSED_DIR, img_name), aligned_face)

if unreadable:
    print(f"Skipped {unreadable} files that OpenCV could not read.")
print(f"‚úÖ Step C: {len(os.listdir(PROCESSED_DIR))} faces ready in /content/aligned_faces/")

In [None]:
import matplotlib.pyplot as plt
import random
import cv2
import os

processed_images = [f for f in os.listdir('/content/aligned_faces/') if f.endswith('.jpg')]
# random.sample raises ValueError when asked for more items than exist.
sample = random.sample(processed_images, min(4, len(processed_images)))

if not sample:
    print("No aligned images found in /content/aligned_faces/ - run the alignment cell first.")
else:
    plt.figure(figsize=(12, 6))
    for i, img_name in enumerate(sample):
        img = cv2.imread(f'/content/aligned_faces/{img_name}')
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # matplotlib expects RGB
        plt.subplot(1, 4, i+1)
        plt.imshow(img)
        plt.title(f"Aligned: {img_name}")
        plt.axis('off')
    plt.show()

In [None]:
import json
# Reload the mapping from dataset_vision_llm.json into `data` and print one
# entry so its structure can be checked by eye.
with open('dataset_vision_llm.json', 'r') as f:
    data = json.load(f)

print(f"Total Linked Entries: {len(data)}")
print("Sample Mapping Structure:")
# data[0] raises IndexError if the mapping is empty.
print(json.dumps(data[0], indent=2))

In [None]:
# Final check to process EVERY image in your JSON mapping
processed_count = 0
unreadable_ids = []  # entries whose source image could not be read
for entry in data:
    img_id = entry['id']
    target_path = os.path.join(PROCESSED_DIR, img_id)

    # Only process if the file doesn't already exist in the aligned folder
    if os.path.exists(target_path):
        continue

    source_path = os.path.join(raw_folder, img_id)
    img = cv2.imread(source_path)
    if img is None:
        unreadable_ids.append(img_id)
        continue

    aligned_face = preprocessor.align_and_crop(img)
    cv2.imwrite(target_path, aligned_face)
    processed_count += 1

print(f"‚úÖ Cleanup complete. Total images in aligned folder: {len(os.listdir(PROCESSED_DIR))}")
# Report what this pass did, so a filename mismatch is not hidden.
print(f"Newly aligned: {processed_count} | Source not readable: {len(unreadable_ids)}")
if unreadable_ids:
    print(f"First unreadable ids: {unreadable_ids[:5]}")

In [None]:
# Find the actual directory containing the .jpg files: prints the path of one
# .jpg found under /content/temp_raw_images/.
!find /content/temp_raw_images/ -name "*.jpg" | head -n 1

In [None]:
import os
import cv2
import json

# 1. Correct Paths
SOURCE_FOLDER = '/content/temp_raw_images/aligned/'
PROCESSED_DIR = '/content/aligned_faces/'
os.makedirs(PROCESSED_DIR, exist_ok=True)

# 2. Process all images in that folder (sorted for a repeatable processing order)
all_files = sorted(f for f in os.listdir(SOURCE_FOLDER) if f.endswith('.jpg'))
print(f"üìÇ Found {len(all_files)} source images. Starting final alignment...")

processed_filenames = set()
for img_name in all_files:
    img = cv2.imread(os.path.join(SOURCE_FOLDER, img_name))
    if img is not None:
        # Using your preprocessor with the int-fix
        aligned = preprocessor.align_and_crop(img)
        cv2.imwrite(os.path.join(PROCESSED_DIR, img_name), aligned)
        processed_filenames.add(img_name)

print(f"‚úÖ {len(processed_filenames)} images processed into {PROCESSED_DIR}")

# 3. Update the JSON Mapping to match the new filenames
with open('dataset_vision_llm.json', 'r') as f:
    data = json.load(f)

JPG_SUFFIX = '.jpg'
updated_data = []
for entry in data:
    # Logic: 0001.jpg in JSON becomes 0001_aligned.jpg on disk.
    # Strip only the trailing extension, never a '.jpg' inside the name.
    original_id = entry['id']
    stem = original_id[:-len(JPG_SUFFIX)] if original_id.endswith(JPG_SUFFIX) else original_id

    # Prefer the "_aligned" name; fall back to the id itself when the file on
    # disk carries no suffix.
    new_id = next(
        (name for name in (f"{stem}_aligned.jpg", original_id) if name in processed_filenames),
        None,
    )
    if new_id is not None:
        # Copy instead of mutating, so `data` keeps the ids as loaded.
        updated_data.append({**entry, 'id': new_id, 'image': f"aligned_faces/{new_id}"})

with open('dataset_vision_llm_final.json', 'w') as f:
    json.dump(updated_data, f, indent=2)

print(f"üéâ FINAL JSON CREATED: 'dataset_vision_llm_final.json' with {len(updated_data)} verified links.")

In [None]:
import os
import json
import cv2
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

def audit_phase_one(json_path, image_dir):
    """Print a pass/fail audit of the processed dataset and preview a few samples.

    Args:
        json_path: path of the JSON mapping; each entry needs ``id`` (the
            image file name) and ``emotions_present``.
        image_dir: directory that should contain one ``.jpg`` per entry.

    Checks that every JSON entry has its image on disk, that a sample image
    is square, tallies the emotion labels, and runs a small rotation and
    occlusion smoke test. Returns ``None``; problems with the inputs are
    printed rather than raised.
    """
    import random  # local import: the cell defining this function does not import it

    print("üîç --- COUCHE 1: CRITICAL AUDIT START (Multi-Label) --- üîç\n")

    # 1. JSON & LINKAGE CHECK
    if not os.path.exists(json_path):
        print("‚ùå ERROR: JSON mapping file missing.")
        return

    with open(json_path, 'r') as f:
        data = json.load(f)

    total_expected = len(data)

    # 2. PHYSICAL FILE CHECK
    if not os.path.isdir(image_dir):
        print(f"ERROR: image directory not found: {image_dir}")
        return
    existing_files = sorted(f for f in os.listdir(image_dir) if f.endswith('.jpg'))
    if not existing_files:
        print(f"ERROR: no .jpg files found in {image_dir}")
        return

    # An entry is linked only when its own file exists; comparing bare counts
    # would fail whenever the folder holds extra images.
    on_disk = set(existing_files)
    linked_entries = [entry for entry in data if entry['id'] in on_disk]
    found_count = len(linked_entries)

    # 3. DISTRIBUTION CHECK (Requirement: multi-label class distribution)
    EMOTION_NAMES = ['Surprise', 'Fear', 'Disgust', 'Happiness', 'Sadness', 'Anger']
    emotion_counts = {emotion: 0 for emotion in EMOTION_NAMES}
    single_emotion = 0
    multi_emotion = 0

    for entry in data:
        if len(entry['emotions_present']) == 1:
            single_emotion += 1
        elif len(entry['emotions_present']) > 1:
            multi_emotion += 1
        for emotion in entry['emotions_present']:
            emotion_counts[emotion] += 1

    # 4. NORMALIZATION & CROPPING CHECK (Requirement: detection and cropping)
    sample_img_path = os.path.join(image_dir, existing_files[0])
    sample_img = cv2.imread(sample_img_path)
    if sample_img is None:
        print(f"ERROR: could not read sample image {sample_img_path}")
        return
    height, width = sample_img.shape[:2]

    # 5. DATA AUGMENTATION SIMULATION (Requirement: lighting, rotations, occlusions)
    print("üõ†Ô∏è Testing Augmentation Robustness...")
    try:
        # Test a rotation and occlusion on a sample to ensure logic is ready
        rows, cols = sample_img.shape[:2]
        M = cv2.getRotationMatrix2D((cols/2, rows/2), 15, 1) # 15 degree test
        aug_test = cv2.warpAffine(sample_img, M, (cols, rows))
        cv2.rectangle(aug_test, (20, 20), (60, 60), (0,0,0), -1) # Occlusion test
        aug_success = True
        aug_detail = "Rot/Occ test passed"
    except Exception as exc:
        aug_success = False
        aug_detail = f"Rot/Occ test failed: {exc}"

    # --- REPORTING ---
    print(f"{'Requirement':<30} | {'Status':<15} | {'Details'}")
    print("-" * 75)
    print(f"{'Mapping Integrity':<30} | {'‚úÖ PASS' if total_expected == found_count else '‚ùå FAIL':<15} | {found_count}/{total_expected} linked")
    print(f"{'Face Normalization':<30} | {'‚úÖ PASS' if height == width else '‚ö†Ô∏è WARN':<15} | Size: {width}x{height}")
    print(f"{'Emotion Complexity':<30} | {'‚úÖ DONE':<15} | S:{single_emotion} M:{multi_emotion}")
    print(f"{'Emotion Distribution':<30} | {'‚úÖ DONE':<15} | {dict(emotion_counts)}")
    print(f"{'Augmentation Logic':<30} | {'‚úÖ READY' if aug_success else '‚ùå FAIL':<15} | {aug_detail}")

    # Visual Confirmation
    print("\nüì∏ Displaying 3 random aligned samples for visual inspection...")
    # Draw from linked entries so each title describes the image it sits above.
    preview = random.sample(linked_entries, min(3, len(linked_entries)))
    if not preview:
        print("No JSON entry has a matching image on disk - nothing to display.")
        return

    plt.figure(figsize=(12, 3))
    for i, entry in enumerate(preview):
        img = cv2.imread(os.path.join(image_dir, entry['id']))
        if img is None:
            continue
        plt.subplot(1, 3, i+1)
        plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        emotions_str = ', '.join(entry['emotions_present']) if entry['emotions_present'] else 'Neutral'
        plt.title(f"Emotions: {emotions_str}")
        plt.axis('off')
    plt.show()

# Run the audit
audit_phase_one('dataset_vision_llm_final.json', '/content/aligned_faces/')

In [None]:
# Disabled cell: everything below is inside one string literal, so none of it
# runs. The text is a torch/albumentations reinstall recipe.
'''import sys
!{sys.executable} -m pip install -U albumentations -q

# Clean up sys.modules to force fresh import
modules_to_remove = [key for key in sys.modules.keys() if key.startswith('torch')]
for module in modules_to_remove:
    del sys.modules[module]

# Reinstall packages to ensure clean state
print("üîÑ Reinstalling torch and dependencies...")
!pip uninstall -y torch torchvision torchaudio -q
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 -q
!pip install -U albumentations -q

print("\n‚úÖ Packages reinstalled successfully")

# Import torch first to establish proper module hierarchy
import torch
print(f"‚úÖ PyTorch version: {torch.__version__}")
print(f"‚úÖ CUDA available: {torch.cuda.is_available()}")

# Now import albumentations
import albumentations as A
print(f"‚úÖ Albumentations version: {A.__version__}")

import cv2
import json
import os

print("\n‚úÖ All imports successful - ready to proceed with augmentation")'''

In [None]:
import random
import cv2
import json
import os
import shutil

EMOTION_NAMES = ['Surprise', 'Fear', 'Disgust', 'Happiness', 'Sadness', 'Anger']

print("üîÑ Preparing final balanced dataset and saving to Drive...\n")

# Define Drive paths
DATA_DIR = '/content/drive/MyDrive/processed_data'
IMAGES_DIR = os.path.join(DATA_DIR, 'aligned_faces')
DATASET_JSON = os.path.join(DATA_DIR, 'dataset_vision_llm_balanced.json')

# Load and validate the inputs BEFORE deleting anything, so a missing file or
# folder cannot leave Drive wiped with nothing to replace it.
with open('dataset_vision_llm_final.json', 'r') as f:
    final_data = json.load(f)

if not final_data:
    raise RuntimeError("dataset_vision_llm_final.json contains no entries; Drive data left untouched.")
if not os.path.isdir(PROCESSED_DIR):
    raise FileNotFoundError(f"Aligned image folder not found: {PROCESSED_DIR}; Drive data left untouched.")

# Clean ALL existing data on Drive. DATASET_JSON lives inside DATA_DIR, so
# removing the directory removes the old JSON as well.
if os.path.exists(DATA_DIR):
    print(f"üóëÔ∏è  Removing existing Drive directory (includes old JSON + images)...")
    shutil.rmtree(DATA_DIR)
    print(f"‚úÖ Old Drive data cleaned")

# Create fresh directories
os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(IMAGES_DIR, exist_ok=True)
print(f"‚úÖ Created fresh Drive directories:")
print(f"   - {DATA_DIR}")
print(f"   - {IMAGES_DIR}")

print(f"\nüìä Dataset Summary:")
print(f"   - Total entries: {len(final_data)}")
# Convert to simplified Q&A format for final balanced dataset
def convert_to_qa_format(entry):
    """Flatten one conversation-style entry into a question/answer record.

    The emotion text comes from ``entry['emotions_present']`` ('Neutral' when
    empty). The AU text is whatever follows the last 'Action Units: ' in the
    second conversation turn, minus trailing periods; it is 'null' when that
    turn does not mention 'Action Units:' at all.
    """
    reply_text = entry['conversations'][1]['value']

    emotions = entry['emotions_present']
    if emotions:
        emotion_label = ', '.join(emotions)
    else:
        emotion_label = 'Neutral'

    if 'Action Units:' in reply_text:
        # rpartition yields the text after the last separator, or the whole
        # string when the separator (with its trailing space) is absent.
        action_units = reply_text.rpartition('Action Units: ')[2].rstrip('.')
    else:
        action_units = 'null'

    record = {}
    record["id"] = entry['id']
    record["image"] = f"aligned_faces/{entry['id']}"
    record["question"] = "Identify the emotions present and list the facial Action Units (AUs) involved."
    record["answer"] = f"Emotion: {emotion_label}. Action Units: {action_units}"
    return record

# Copy aligned images to Drive and convert each entry whose image was copied.
# Entries without an image on disk are left out, so the saved JSON never points
# at a file that is missing from IMAGES_DIR.
print(f"\nüìÇ Copying aligned images to Drive...")
converted_data = []
missing_images = []
copied_count = 0
for entry in final_data:
    img_id = entry['id']
    src_path = os.path.join(PROCESSED_DIR, img_id)
    dst_path = os.path.join(IMAGES_DIR, img_id)

    if not os.path.exists(src_path):
        missing_images.append(img_id)
        continue

    shutil.copy2(src_path, dst_path)
    converted_data.append(convert_to_qa_format(entry))
    copied_count += 1
    if copied_count % 100 == 0:
        print(f"   Copied {copied_count} images...")

print(f"‚úÖ Copied {copied_count} images to {IMAGES_DIR}")
if missing_images:
    print(f"   Skipped {len(missing_images)} entries with no image in {PROCESSED_DIR} (e.g. {missing_images[:3]})")

# Save final JSON to Drive
with open(DATASET_JSON, 'w') as f:
    json.dump(converted_data, f, indent=2)

print(f"\n‚úÖ Saved dataset JSON to {DATASET_JSON}")
print(f"   - Converted entries: {len(converted_data)}")

# Display sample entries
print("\nüì∏ Sample Entries from Balanced Dataset (Q&A Format):")
print("=" * 80)
for i, qa_entry in enumerate(converted_data[:3]):
    print(f"\nEntry {i+1}:")
    print(f"  ID: {qa_entry['id']}")
    print(f"  Image: {qa_entry['image']}")
    print(f"  Question: {qa_entry['question']}")
    print(f"  Answer: {qa_entry['answer']}")
print("=" * 80)

print("\nüéâ ALL DATA SAVED TO GOOGLE DRIVE!")
print(f"üìÅ Data Location: {DATA_DIR}")
print(f"   - Images: {IMAGES_DIR}/ ({copied_count} files)")
print(f"   - Dataset: dataset_vision_llm_balanced.json ({len(converted_data)} entries)")
print(f"\n‚ö†Ô∏è  Note: Old Drive directory (with JSON + images) was completely removed and replaced")