In [None]:
# Install all required libraries for the image pipeline
# Run this cell first before running any other cells

print("Installing required libraries...")
print("="*60)

# Core image processing libraries
!pip install -q pillow pillow-heif
print("✓ Installed: Pillow, pillow-heif (for HEIC/HEIF support)")

# Scientific computing
!pip install -q numpy pandas
print("✓ Installed: NumPy, Pandas")

# Image processing and computer vision
!pip install -q scikit-image
print("✓ Installed: scikit-image (for HOG features)")

# Visualization
!pip install -q matplotlib
print("✓ Installed: Matplotlib")

# Google Drive API (for Colab)
try:
    from google.colab import drive
    print("✓ Google Colab libraries available")
except:
    print("⚠ Not running in Google Colab - Drive mounting may not work")

print("\n" + "="*60)
print("All libraries installed successfully!")
print("="*60)
print("\nYou can now proceed to the next cells.")


# Image Pipeline

This notebook visualizes sample images, augmentations, and HOG features, and then exports features to `data/processed/image_features.csv`.

Instructions:
- Place your images in the `images/` folder at the project root (e.g., `member1_neutral.jpg`, `member1_smile.jpg`, `member1_surprised.jpg`).
- Run the cells below to preview and generate features.


In [None]:
# Mount Google Drive and access images from SHARED folder
from google.colab import drive
import shutil
import os

# Make the user's Drive visible under /content/drive
drive.mount('/content/drive')

# ID of the shared folder, taken from its sharing link
FOLDER_ID = '1nTIBD3R7RbhgOIv4bw8EtJTBOB2rcQUA'

# Section banner: rule / title / rule, one per line
print("="*60, "ACCESSING SHARED FOLDER FROM GOOGLE DRIVE", "="*60, sep="\n")

# Method 1: Use Drive API to access folder by ID (BEST FOR SHARED FOLDERS)
# The outcome is reported through `folder_path`: the sentinel "API_SUCCESS" when
# the folder was listed and contained items, None otherwise. The file-system
# fallback further down checks this value.
print("\n[Method 1] Using Drive API to access folder by ID...")
try:
    from google.colab import auth
    from googleapiclient.discovery import build
    from googleapiclient.http import MediaIoBaseDownload
    import io

    # Authenticate and create Drive service
    auth.authenticate_user()
    drive_service = build('drive', 'v3')

    # Get folder metadata
    folder_metadata = drive_service.files().get(fileId=FOLDER_ID, fields='name, id').execute()
    folder_name = folder_metadata.get('name', 'Unknown')
    print(f"✓ Found folder: '{folder_name}' (ID: {FOLDER_ID})")

    # List all files in the folder. files.list is paginated, so keep following
    # nextPageToken; otherwise only the first page of results would be seen.
    query = f"'{FOLDER_ID}' in parents and trashed=false"
    files = []
    page_token = None
    while True:
        results = drive_service.files().list(
            q=query,
            fields="nextPageToken, files(id, name, mimeType)",
            pageToken=page_token,
        ).execute()
        files.extend(results.get('files', []))
        page_token = results.get('nextPageToken')
        if not page_token:
            break

    if files:
        print(f"✓ Found {len(files)} items in folder")

        # Create images directory
        BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), ".."))
        IMAGES_DIR = os.path.join(BASE_DIR, "images")
        os.makedirs(IMAGES_DIR, exist_ok=True)

        # Extensions are compared against the lower-cased file name,
        # so only lower-case entries are needed here.
        image_extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.heic', '.heif')
        copied_count = 0

        for file in files:
            file_name = file.get('name', '')
            file_id = file.get('id', '')
            file_mime = file.get('mimeType', '')

            # An item counts as an image by extension or by its Drive MIME type
            is_image = (file_name.lower().endswith(image_extensions) or
                        file_mime.startswith('image/'))

            if is_image:
                file_path = os.path.join(IMAGES_DIR, file_name)
                try:
                    # Download the file chunk by chunk
                    request = drive_service.files().get_media(fileId=file_id)

                    with open(file_path, 'wb') as fh:
                        downloader = MediaIoBaseDownload(fh, request)
                        done = False
                        while not done:
                            _, done = downloader.next_chunk()

                    copied_count += 1
                    print(f"  ✓ Downloaded: {file_name}")
                except Exception as e:
                    print(f"  ✗ Error downloading {file_name}: {str(e)}")
                    # Best effort: do not leave a truncated file in the images
                    # directory, where later cells would try to open it.
                    try:
                        if os.path.exists(file_path):
                            os.remove(file_path)
                    except OSError:
                        pass

        print(f"\n✓ Total images downloaded: {copied_count}")
        print(f"✓ Images are now available in: {IMAGES_DIR}")
        folder_path = "API_SUCCESS"

    else:
        print("✗ No files found in the folder")
        folder_path = None

except Exception as e:
    # Any failure (missing libraries, auth, network, permissions) falls through
    # to the file-system method instead of aborting the cell.
    print(f"✗ Drive API method failed: {str(e)}")
    print("Trying file system search method...")
    folder_path = None

# Method 2: File system search (if API method failed)
# Runs whenever Method 1 did not leave the "API_SUCCESS" sentinel in `folder_path`.
if folder_path != "API_SUCCESS":
    print("\n[Method 2] Searching file system for shared folder...")

    # Check "Shared with me" directory
    shared_path = '/content/drive/MyDrive/Shared with me'

    if not os.path.exists(shared_path):
        print(f"✗ 'Shared with me' directory not found at: {shared_path}")
        print("The folder might need to be added to 'My Drive' first.")
    else:
        print(f"✓ Found 'Shared with me' directory")
        print("\nListing folders in 'Shared with me':")

        try:
            # Collect (and echo) every sub-directory of the shared location
            folders_found = []
            for item in os.listdir(shared_path):
                if os.path.isdir(os.path.join(shared_path, item)):
                    folders_found.append(item)
                    print(f"  [FOLDER] {item}")

            if not folders_found:
                print("No folders found in 'Shared with me'")
            else:
                print(f"\nFound {len(folders_found)} folder(s).")
                print("Please check if one of these is your target folder.")
                print("\nTrying to find folder by searching for image files...")

                # Pick the first folder whose tree contains at least one image
                scan_suffixes = ('.jpg', '.jpeg', '.png', '.bmp', '.heic', '.heif')
                for folder_name in folders_found:
                    folder_path_candidate = os.path.join(shared_path, folder_name)
                    image_count = sum(
                        1
                        for _walk_root, _walk_dirs, walk_files in os.walk(folder_path_candidate)
                        for walk_file in walk_files
                        if walk_file.lower().endswith(scan_suffixes)
                    )

                    if image_count > 0:
                        print(f"\n✓ Found folder with images: '{folder_name}' ({image_count} images)")
                        folder_path = folder_path_candidate
                        break

        except Exception as e:
            print(f"Error listing shared folders: {str(e)}")

    # If folder found via file system, copy images
    if folder_path and os.path.exists(folder_path):
        print(f"\n✓ Using folder at: {folder_path}")

        # Create images directory
        BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), ".."))
        IMAGES_DIR = os.path.join(BASE_DIR, "images")
        os.makedirs(IMAGES_DIR, exist_ok=True)

        # Copy images (extension match is case-insensitive)
        image_extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.heic', '.heif', '.JPG', '.JPEG', '.PNG', '.HEIC', '.HEIF')
        lowered_extensions = tuple(ext.lower() for ext in image_extensions)
        copied_count = 0

        for root, dirs, files in os.walk(folder_path):
            for file in files:
                if not file.lower().endswith(lowered_extensions):
                    continue
                src_path = os.path.join(root, file)
                dst_path = os.path.join(IMAGES_DIR, file)
                try:
                    shutil.copy2(src_path, dst_path)
                    copied_count += 1
                    print(f"  Copied: {file}")
                except Exception as e:
                    print(f"  Error copying {file}: {str(e)}")

        print(f"\n✓ Total images copied: {copied_count}")
        print(f"✓ Images are now available in: {IMAGES_DIR}")

    else:
        # Nothing usable was found: print the manual recovery steps
        for help_line in (
            "\n" + "="*60,
            "FOLDER NOT FOUND - MANUAL INSTRUCTIONS",
            "="*60,
            "\nPlease try one of these options:",
            "\n1. ADD FOLDER TO MY DRIVE (Easiest):",
            "   - Go to Google Drive in your browser",
            "   - Open the shared folder",
            "   - Right-click the folder → 'Add shortcut to Drive'",
            "   - Then run this cell again",
            "\n2. MANUAL PATH:",
            "   - After mounting, find the folder path manually",
            "   - Update the code below with the exact path",
            "\n3. Check if folder is accessible:",
            "   - Make sure you're signed in to the correct Google account",
            "   - Verify you have access to the shared folder",
        ):
            print(help_line)




In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image, ImageOps
import pillow_heif
from skimage.feature import hog
from skimage.color import rgb2gray

# Project root is the parent of the current working directory;
# images are read from <root>/images.
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
IMAGES_DIR = os.path.join(BASE_DIR, "images")
print(f"Images directory: {IMAGES_DIR}")

# Let PIL's Image.open handle HEIC/HEIF files
pillow_heif.register_heif_opener()

# Helper functions

def load_image(path, size=(224, 224)):
    """Load an image file as an RGB PIL image, optionally resized.

    Args:
        path: Path of the image file to open.
        size: ``(width, height)`` passed to ``Image.resize``. A falsy value
            (e.g. ``None``) skips resizing.

    Returns:
        A ``PIL.Image.Image`` in RGB mode.
    """
    # The context manager releases the underlying file handle once the pixel
    # data has been decoded into an independent image object.
    with Image.open(path) as src:
        # Apply the EXIF orientation tag so photos taken with a rotated
        # camera/phone are not displayed sideways.
        img = ImageOps.exif_transpose(src).convert("RGB")
    if size:
        img = img.resize(size)
    return img


def augmentations(img):
    """Build the augmented variants of ``img``.

    Returns a dict mapping a label to an image: the original itself,
    rotations by +15 and -15 degrees, a horizontal mirror, a vertical flip,
    and a grayscale copy converted back to RGB.
    """
    variants = {"original": img}
    for angle in (15, -15):
        variants[f"rotate_{angle}"] = img.rotate(angle)
    variants["flip_h"] = ImageOps.mirror(img)
    variants["flip_v"] = ImageOps.flip(img)
    # Grayscale is converted back to RGB so every variant has the same mode
    gray = ImageOps.grayscale(img)
    variants["grayscale"] = gray.convert("RGB")
    return variants


def show_grid(images_dict):
    """Display the images of ``images_dict`` in a grid of at most 3 columns.

    Args:
        images_dict: Mapping of title -> image (anything ``Axes.imshow``
            accepts). Each key is used as the subplot title.

    Returns:
        None. An empty mapping draws nothing.
    """
    keys = list(images_dict)
    n = len(keys)
    if n == 0:
        # With no images `cols` would be 0 and the row count below would
        # divide by zero, so there is nothing to draw.
        return
    cols = min(3, n)
    rows = int(np.ceil(n / cols))
    _fig, axes = plt.subplots(rows, cols, figsize=(4 * cols, 4 * rows))
    # plt.subplots returns a scalar, 1-D or 2-D result depending on the grid
    # shape; flatten it so it can be iterated uniformly.
    axes = np.array(axes).reshape(-1)
    for ax, k in zip(axes, keys):
        ax.imshow(images_dict[k])
        ax.set_title(k)
        ax.axis("off")
    # Hide the unused cells of the last row
    for ax in axes[n:]:
        ax.axis("off")
    plt.tight_layout()
    plt.show()

# Load and display images for ALL members
# `sample_paths` and `img` are read by the HOG cell below, so give them
# defined values even when no images are found (avoids a NameError there).
sample_paths = []
img = None

all_image_paths = []
for root, _, files in os.walk(IMAGES_DIR):
    for f in files:
        if f.lower().endswith((".jpg", ".jpeg", ".png", ".bmp", ".heic", ".heif")):
            all_image_paths.append(os.path.join(root, f))
# os.walk order is arbitrary; sort so the chosen sample images are reproducible
all_image_paths.sort()

if all_image_paths:
    print(f"Found {len(all_image_paths)} images")

    # Group images by member (assuming naming: memberX_expression.jpg)
    from collections import defaultdict
    member_images = defaultdict(list)

    for img_path in all_image_paths:
        filename = os.path.basename(img_path)
        # Member name is everything before the last underscore,
        # the expression is what follows it
        name_parts = os.path.splitext(filename)[0].split('_')
        if len(name_parts) >= 2:
            member = '_'.join(name_parts[:-1])
            expression = name_parts[-1]
        else:
            member = "unknown"
            expression = "unknown"
        member_images[member].append((img_path, expression, filename))

    print(f"\nFound images for {len(member_images)} member(s):")
    for member, images in member_images.items():
        print(f"  {member}: {len(images)} images")
        for _, expr, fname in images:
            print(f"    - {expr}: {fname}")

    # Display original images for each member
    print("\n" + "="*60)
    print("DISPLAYING ORIGINAL IMAGES FOR ALL MEMBERS")
    print("="*60)

    # Display order: neutral, smile, surprised; anything else goes last
    expr_order = {'neutral': 0, 'smile': 1, 'smiling': 1, 'surprised': 2, 'surprise': 2}

    for member, images in sorted(member_images.items()):
        print(f"\n--- {member.upper()} ---")
        images_sorted = sorted(images, key=lambda x: expr_order.get(x[1].lower(), 99))

        fig, axes = plt.subplots(1, len(images_sorted), figsize=(5*len(images_sorted), 5))
        # With a single column plt.subplots returns one Axes, not a sequence
        if len(images_sorted) == 1:
            axes = [axes]

        for ax, (img_path, expression, filename) in zip(axes, images_sorted):
            img = load_image(img_path)
            ax.imshow(img)
            ax.set_title(f"{expression}\n{filename}", fontsize=10)
            ax.axis("off")

        plt.tight_layout()
        plt.show()

    # Display augmentations for a sample image from each member
    print("\n" + "="*60)
    print("DISPLAYING AUGMENTATIONS FOR EACH MEMBER")
    print("="*60)

    for member, images in sorted(member_images.items()):
        if images:
            # Use the first image for each member to show augmentations
            sample_path = images[0][0]
            print(f"\n--- Augmentations for {member.upper()} (using {images[0][2]}) ---")
            img = load_image(sample_path)
            show_grid(augmentations(img))

    # Store sample_paths for HOG visualization (use first image)
    sample_paths = all_image_paths
    img = load_image(sample_paths[0])  # For HOG visualization

else:
    print("No images found yet. Please add images to:", IMAGES_DIR)
    print("\nExpected naming convention:")
    print("  - member1_neutral.jpg")
    print("  - member1_smile.jpg")
    print("  - member1_surprised.jpg")
    print("  - member2_neutral.jpg")
    print("  - etc.")


In [None]:
# HOG visualization on the sample image
# `sample_paths` only exists once the image-loading cell has run and found
# images; look it up defensively so this cell prints a hint instead of
# raising NameError.
sample_paths = globals().get("sample_paths") or []
if sample_paths:
    from skimage import exposure

    # Reload the sample rather than relying on the notebook-global `img`,
    # which other cells reassign while looping over images.
    img = load_image(sample_paths[0])

    gray = rgb2gray(np.array(img))
    # visualize=True makes hog() return the descriptor and a rendering of it
    features, hog_image = hog(
        gray,
        orientations=9,
        pixels_per_cell=(16, 16),
        cells_per_block=(2, 2),
        block_norm="L2-Hys",
        visualize=True,
        transform_sqrt=True,
        feature_vector=True,
    )
    # Stretch the HOG rendering's intensities so the gradients are visible
    hog_image_rescaled = exposure.rescale_intensity(hog_image, in_range=(0, 10))

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 4))
    ax1.imshow(img)
    ax1.set_title("Original")
    ax1.axis("off")

    ax2.imshow(hog_image_rescaled, cmap="gray")
    ax2.set_title("HOG")
    ax2.axis("off")
    plt.tight_layout()
    plt.show()
else:
    print("Add images to visualize HOG.")


In [None]:
# Setup: Get Scripts from GitHub Repository
# This cell downloads scripts from your GitHub repository: https://github.com/Emmanuel-kwizera/ML-Pipeline-formative-2

import os
import sys
import subprocess
import urllib.request

# Scripts are kept in <project root>/scripts, where the project root is the
# parent of the current working directory.
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
SCRIPTS_DIR = os.path.join(BASE_DIR, "scripts")
os.makedirs(SCRIPTS_DIR, exist_ok=True)

GITHUB_REPO = "https://github.com/Emmanuel-kwizera/ML-Pipeline-formative-2"
GITHUB_RAW = "https://raw.githubusercontent.com/Emmanuel-kwizera/ML-Pipeline-formative-2/main"

print("="*60, "SETTING UP SCRIPTS FROM GITHUB", "="*60, sep="\n")
print(f"Repository: {GITHUB_REPO}")
print(f"Scripts directory: {SCRIPTS_DIR}")

# Required scripts, mapped to their path inside the repository
required_scripts = {
    name: f"scripts/{name}"
    for name in (
        'image_processing.py',
        'facial_recognition_model.py',
        'facial_recognition_predict.py',
        'test_unauthorized.py',
    )
}

# Bookkeeping lists filled in by the steps below
scripts_downloaded, scripts_found_locally, scripts_failed = [], [], []

# First, check what's already available locally
print("\n[Step 1] Checking local scripts...")
for script_name in required_scripts:
    local_path = os.path.join(SCRIPTS_DIR, script_name)
    if not os.path.exists(local_path):
        continue
    size = os.path.getsize(local_path)
    print(f"  ✓ {script_name} - Found locally ({size} bytes)")
    scripts_found_locally.append(script_name)

# Method 1: Try cloning the repository (gets all files at once)
# Only attempted when at least one required script is missing locally.
if len(scripts_found_locally) < len(required_scripts):
    print("\n[Step 2] Attempting to clone repository...")
    repo_dir = os.path.join(BASE_DIR, "ML-Pipeline-formative-2")

    try:
        if os.path.exists(repo_dir):
            print(f"  Repository already exists at: {repo_dir}")
            # Update it
            result = subprocess.run(
                ['git', '-C', repo_dir, 'pull'],
                capture_output=True,
                text=True,
                timeout=30
            )
            if result.returncode == 0:
                print("  ✓ Repository updated")
            else:
                # The existing checkout is still usable, so warn and carry on
                print(f"  ⚠ Git pull failed, using existing checkout: {result.stderr.strip()}")
        else:
            # Clone repository
            result = subprocess.run(
                ['git', 'clone', GITHUB_REPO, repo_dir],
                capture_output=True,
                text=True,
                timeout=60
            )
            if result.returncode == 0:
                print(f"  ✓ Repository cloned successfully")
            else:
                print(f"  ✗ Git clone failed: {result.stderr}")
                raise Exception("Git clone failed")

        # Copy scripts from cloned repo
        repo_scripts = os.path.join(repo_dir, "scripts")
        if os.path.exists(repo_scripts):
            import shutil
            for script_name in required_scripts:
                if script_name not in scripts_found_locally:
                    src = os.path.join(repo_scripts, script_name)
                    if os.path.exists(src):
                        dst = os.path.join(SCRIPTS_DIR, script_name)
                        shutil.copy2(src, dst)
                        print(f"  ✓ Copied {script_name}")
                        scripts_downloaded.append(script_name)
        else:
            print(f"  ✗ No scripts directory found in repository: {repo_scripts}")
    except Exception as e:
        # Missing git, timeouts and copy errors all fall through to the
        # direct-download step instead of aborting the cell.
        print(f"  ✗ Clone method failed: {str(e)}")
        print("  Trying direct download method...")

# Method 2: Download scripts directly from GitHub (works without git)
# urllib.error is referenced in the except clause below; import it explicitly
# rather than relying on urllib.request having loaded it as a side effect.
import urllib.error

missing_scripts = [s for s in required_scripts
                   if s not in scripts_found_locally and s not in scripts_downloaded]

if missing_scripts:
    print(f"\n[Step 3] Downloading {len(missing_scripts)} script(s) from GitHub...")

    for script_name in missing_scripts:
        github_path = required_scripts[script_name]
        local_path = os.path.join(SCRIPTS_DIR, script_name)

        try:
            # Download from GitHub raw content
            url = f"{GITHUB_RAW}/{github_path}"
            print(f"  Downloading {script_name}...")

            urllib.request.urlretrieve(url, local_path)

            if os.path.exists(local_path) and os.path.getsize(local_path) > 0:
                size = os.path.getsize(local_path)
                print(f"    ✓ Downloaded {script_name} ({size} bytes)")
                scripts_downloaded.append(script_name)
            else:
                # Remove the zero-byte file so the final verification, which
                # only checks for existence, does not report it as ready.
                if os.path.exists(local_path):
                    os.remove(local_path)
                print(f"    ✗ Download failed: {script_name} (file is empty)")
                scripts_failed.append(script_name)
        except urllib.error.HTTPError as e:
            if e.code == 404:
                print(f"    ✗ {script_name} not found in repository (404)")
                print(f"      → Script needs to be committed to GitHub")
            else:
                print(f"    ✗ Failed: {script_name} (HTTP {e.code})")
            scripts_failed.append(script_name)
        except Exception as e:
            print(f"    ✗ Failed to download {script_name}: {str(e)}")
            scripts_failed.append(script_name)

# Summary of where each script came from
print("\n" + "="*60, "SETUP SUMMARY", "="*60, sep="\n")
print(f"✓ Scripts found locally: {len(scripts_found_locally)}")
print(f"✓ Scripts downloaded: {len(scripts_downloaded)}")
print(f"✗ Scripts failed: {len(scripts_failed)}")

# List the non-empty groups by name
for heading, group in (
    ("\nLocal scripts:", scripts_found_locally),
    ("\nDownloaded scripts:", scripts_downloaded),
):
    if group:
        print(heading)
        for script in group:
            print(f"  - {script}")

if scripts_failed:
    print("\n⚠ Missing scripts:")
    for script in scripts_failed:
        print(f"  - {script}")
    print("\nTo fix this:")
    print("1. Push scripts to GitHub repository")
    print("2. Or upload them manually to Colab")
    print(f"3. Or ensure they exist at: {SCRIPTS_DIR}")

# Final verification: every required script must exist in SCRIPTS_DIR
print("\n" + "-"*60, "FINAL VERIFICATION", "-"*60, sep="\n")
all_ready = True
for script_name in required_scripts:
    script_path = os.path.join(SCRIPTS_DIR, script_name)
    if not os.path.exists(script_path):
        print(f"✗ {script_name} - MISSING")
        all_ready = False
        continue
    size = os.path.getsize(script_path)
    print(f"✓ {script_name} - Ready ({size} bytes)")

if not all_ready:
    print("\n⚠ Some scripts are missing. Please:")
    print(f"  1. Push scripts to: {GITHUB_REPO}")
    print(f"  2. Or place them in: {SCRIPTS_DIR}")
else:
    print("\n✓ All scripts are ready to use!")



In [None]:
def parse_filename(filename):
    """Parse a filename into ``(member, expression)``.

    Handles multiple formats:
    - memberX_neutral.jpg
    - memberX-neutral.jpg
    - Honorine-neutral.HEIC
    - Charlotte Kariza Suprised Pic .jpeg
    - Emmanuel Kwizera 2.jpg

    Args:
        filename: File name including its extension (no directory part).

    Returns:
        Tuple ``(member, expression)``. ``expression`` is ``"neutral"``,
        ``"smile"`` or ``"surprised"`` when a known keyword occurs in the
        name; otherwise it is the lower-cased last token of the name, or
        ``"unknown"`` if no token can be split off. ``member`` is
        ``"unknown"`` when nothing is left after removing the expression.
    """
    # Local import: `re` is needed below and is not imported by this cell.
    import re

    name = os.path.splitext(filename)[0]

    # Canonical expression -> keywords (including common misspellings)
    expression_keywords = {
        'neutral': ['neutral', 'neutre'],
        'smile': ['smile', 'smiling', 'smiles', 'happy'],
        'surprised': ['surprised', 'surprise', 'surprising', 'suprised']
    }

    # Try to find an expression keyword anywhere in the name (case insensitive)
    name_lower = name.lower()
    expression = "unknown"
    found_keyword = None

    for expr, keywords in expression_keywords.items():
        # Longest keyword first, so "smiles" is matched as a whole instead of
        # as "smile" followed by a stray "s" that would stay in the member name.
        for keyword in sorted(keywords, key=len, reverse=True):
            if keyword in name_lower:
                expression = expr
                found_keyword = keyword
                break
        if found_keyword:
            break

    if found_keyword:
        # Drop the keyword together with any spaces/underscores/hyphens around
        # it; text on either side of the keyword is joined directly.
        pattern = r'[\s_-]*' + re.escape(found_keyword) + r'[\s_-]*'
        member = re.sub(pattern, '', name, flags=re.IGNORECASE).strip()
        # Clean up any remaining separators at the end
        member = re.sub(r'[\s_-]+$', '', member)
    elif '_' in name:
        # No keyword: treat the part after the last underscore as the expression
        member, _, last = name.rpartition('_')
        expression = last.lower()
    elif '-' in name:
        # Same rule with a hyphen separator
        member, _, last = name.rpartition('-')
        expression = last.lower()
    elif ' ' in name:
        parts = name.split()
        # A trailing number (e.g. "Emmanuel Kwizera 2") is not an expression
        if len(parts) >= 2 and not parts[-1].isdigit():
            member = ' '.join(parts[:-1])
            expression = parts[-1].lower()
        else:
            member = name
    else:
        member = name

    # Clean up member name
    member = member.strip()
    if not member:
        member = "unknown"

    return member, expression
# Extract features for every image under IMAGES_DIR and write them to a CSV.
# NOTE(review): this part of the cell uses `pd`, `OUTPUT_FILE`,
# `extract_histogram_features`, `extract_hog_features` and
# `extract_embedding_features`, none of which are defined in the visible part
# of the cell - confirm they are defined before this code runs.
print(f"\nSearching for images in: {IMAGES_DIR}")
if not os.path.exists(IMAGES_DIR):
    print(f"✗ ERROR: Images directory does not exist: {IMAGES_DIR}")
    print("Please make sure images are downloaded/copied to this directory first.")
else:
    image_files = []
    for root, dirs, files in os.walk(IMAGES_DIR):
        for f in files:
            if f.lower().endswith((".jpg", ".jpeg", ".png", ".bmp", ".heic", ".heif")):
                image_files.append(os.path.join(root, f))

    if not image_files:
        print(f"✗ ERROR: No images found in {IMAGES_DIR}")
        print("Please add images with naming: memberX_neutral.jpg, memberX_smile.jpg, memberX_surprised.jpg")
    else:
        print(f"✓ Found {len(image_files)} images")

        # Process each image; a failure on one image is reported and skipped
        all_data = []
        for img_path in image_files:
            # Computed outside the try so the error message can always use it
            filename = os.path.basename(img_path)
            try:
                member, expression = parse_filename(filename)
                print(f"Processing: {filename} (Member: {member}, Expression: {expression})")

                img = load_image(img_path)
                histogram_features = extract_histogram_features(img)
                hog_features = extract_hog_features(img)
                embedding_features = extract_embedding_features(img)

                # One flat row per image: all feature dicts merged, plus metadata
                features = {**histogram_features, **hog_features, **embedding_features}
                features['filename'] = filename
                features['member'] = member
                features['expression'] = expression
                features['image_path'] = img_path

                all_data.append(features)
                print(f"  ✓ Successfully processed: {filename}")
            except Exception as e:
                print(f"✗ Error processing {filename}: {str(e)}")
                import traceback
                traceback.print_exc()
                continue

        if all_data:
            # Create DataFrame with the metadata columns first
            df = pd.DataFrame(all_data)
            metadata_cols = ['filename', 'member', 'expression', 'image_path']
            feature_cols = [c for c in df.columns if c not in metadata_cols]
            df = df[metadata_cols + feature_cols]

            # Save to CSV
            df.to_csv(OUTPUT_FILE, index=False)

            print("\n" + "="*60)
            print("FEATURE EXTRACTION COMPLETE")
            print("="*60)
            print(f"✓ Successfully processed {len(all_data)} images")
            print(f"✓ Features saved to: {OUTPUT_FILE}")
            print(f"✓ File exists: {os.path.exists(OUTPUT_FILE)}")
            print(f"✓ File size: {os.path.getsize(OUTPUT_FILE)} bytes")
            print(f"✓ Total features per image: {len(feature_cols)}")
            print("\nFeature breakdown:")
            print(f"  - Histogram features: {len([c for c in feature_cols if c.startswith('hist_')])}")
            print(f"  - HOG features: {len([c for c in feature_cols if c.startswith('hog_')])}")
            print(f"  - Embedding features: {len([c for c in feature_cols if c.startswith('embedding_')])}")
            print(f"\nDataFrame shape: {df.shape}")
            print("\nSample data:")
            print(df[['filename', 'member', 'expression']].head())
        else:
            print("✗ ERROR: No images were successfully processed")

In [None]:
# Generate features CSV by calling the script from the notebook
import subprocess, sys
import os

# Ensure BASE_DIR is defined (should be set in earlier cells)
if 'BASE_DIR' not in globals():
    BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), ".."))
    print(f"BASE_DIR set to: {BASE_DIR}")

script_path = os.path.abspath(os.path.join(BASE_DIR, "scripts", "image_processing.py"))
print("="*60)
print("RUNNING IMAGE PROCESSING SCRIPT")
print("="*60)
print(f"Script path: {script_path}")
print(f"Script exists: {os.path.exists(script_path)}")

if not os.path.exists(script_path):
    print(f"\n✗ ERROR: Script not found at {script_path}")
    # The scripts are fetched from GitHub by the setup cell, not from Google Drive
    print("Please run the 'Setup: Get Scripts from GitHub Repository' cell to download the script.")
else:
    print(f"\n✓ Script found. Running...")
    # Run with the notebook's own interpreter so the installed packages are visible
    result = subprocess.run([sys.executable, script_path], capture_output=True, text=True)

    print("\n" + "-"*60)
    print("SCRIPT OUTPUT:")
    print("-"*60)
    print(result.stdout)

    if result.stderr:
        print("\n" + "-"*60)
        print("WARNINGS/ERRORS:")
        print("-"*60)
        print(result.stderr)

    if result.returncode != 0:
        print(f"\n✗ Script exited with error code: {result.returncode}")
    else:
        print(f"\n✓ Script completed successfully!")

    # Check if CSV was created
    output_file = os.path.join(BASE_DIR, "data", "processed", "image_features.csv")
    if os.path.exists(output_file):
        print(f"\n✓ CSV file created at: {output_file}")
        print(f"  File size: {os.path.getsize(output_file)} bytes")
    else:
        print(f"\n⚠ CSV file not found at: {output_file}")
        print("  Checking alternative locations...")

        # Other places the script may have written to; the expected
        # location was already checked above and is not repeated here.
        possible_locations = [
            os.path.join(os.getcwd(), "data", "processed", "image_features.csv"),
            "data/processed/image_features.csv",
            "image_features.csv",
        ]

        found = False
        for loc in possible_locations:
            if os.path.exists(loc):
                print(f"  ✓ Found CSV at: {loc}")
                found = True
                break

        if not found:
            print("  ✗ CSV not found in any expected location")
            print("  Please check the script output above for errors")


In [None]:
# Verify and display the generated CSV file
import pandas as pd
import os

# Ensure BASE_DIR is defined
if 'BASE_DIR' not in globals():
    BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), ".."))
    print(f"BASE_DIR set to: {BASE_DIR}")

# Try to find the CSV file in multiple possible locations
possible_paths = [
    os.path.join(BASE_DIR, "data", "processed", "image_features.csv"),
    os.path.join(os.getcwd(), "data", "processed", "image_features.csv"),
    os.path.join(os.getcwd(), "..", "data", "processed", "image_features.csv"),
    "data/processed/image_features.csv",
    "../data/processed/image_features.csv",
    "image_features.csv",
]

# First existing candidate wins; None means the CSV was not found anywhere
OUTPUT_FILE = None
for path in possible_paths:
    if os.path.exists(path):
        OUTPUT_FILE = os.path.abspath(path)
        print(f"✓ Found CSV file at: {OUTPUT_FILE}")
        break

if OUTPUT_FILE and os.path.exists(OUTPUT_FILE):
    df = pd.read_csv(OUTPUT_FILE)

    # Count feature columns as "everything that is not a metadata column"
    # instead of assuming exactly four metadata columns are present.
    metadata_cols = ['filename', 'member', 'expression', 'image_path']
    feature_count = sum(1 for c in df.columns if c not in metadata_cols)

    print("="*60)
    print("IMAGE FEATURES CSV VERIFICATION")
    print("="*60)
    print(f"\n✓ CSV file successfully created: {OUTPUT_FILE}")
    print(f"✓ Total images processed: {len(df)}")
    print(f"✓ Total features per image: {feature_count}")

    print("\n" + "-"*60)
    print("FEATURE BREAKDOWN:")
    print("-"*60)

    # Feature families are identified by column-name prefix
    hist_features = [c for c in df.columns if c.startswith('hist_')]
    hog_features = [c for c in df.columns if c.startswith('hog_')]
    embedding_features = [c for c in df.columns if c.startswith('embedding_')]

    print(f"  • Histogram features: {len(hist_features)}")
    print(f"  • HOG features: {len(hog_features)}")
    print(f"  • Embedding features: {len(embedding_features)}")

    print("\n" + "-"*60)
    print("MEMBERS AND EXPRESSIONS:")
    print("-"*60)
    # Rows: members, columns: expressions, cells: number of images
    member_summary = df.groupby('member')['expression'].value_counts().unstack(fill_value=0)
    print(member_summary)

    print("\n" + "-"*60)
    print("SAMPLE DATA (First 3 rows):")
    print("-"*60)
    display_cols = ['filename', 'member', 'expression'] + hist_features[:3] + hog_features[:3] + embedding_features[:3]
    print(df[display_cols].head(3).to_string())

    print("\n" + "-"*60)
    print("CSV FILE SUMMARY:")
    print("-"*60)
    print(f"  • File location: {OUTPUT_FILE}")
    print(f"  • File size: {os.path.getsize(OUTPUT_FILE) / 1024:.2f} KB")
    print(f"  • Shape: {df.shape[0]} rows × {df.shape[1]} columns")
    print(f"  • Missing values: {df.isnull().sum().sum()}")

    print("\n✓ All requirements for Task 2 appear to be met!")
    print("  ✓ Images loaded and displayed for all members")
    print("  ✓ Augmentations applied (rotation, flipping, grayscale)")
    print("  ✓ Features extracted (histograms, HOG, embeddings)")
    print("  ✓ Features saved to image_features.csv")

else:
    print("="*60)
    print("ERROR: CSV FILE NOT FOUND")
    print("="*60)
    print(f"\n✗ Could not find image_features.csv in any expected location.")
    print("\nSearched in:")
    for path in possible_paths:
        abs_path = os.path.abspath(path) if not os.path.isabs(path) else path
        exists = "✓" if os.path.exists(path) else "✗"
        print(f"  {exists} {abs_path}")

    print("\nTroubleshooting steps:")
    print("1. Make sure you ran the previous cell (Cell 7) to execute the script")
    print("2. Check the script output above for any errors")
    print("3. Verify that images exist in the images/ directory")
    print("4. Check if the script completed successfully")
    print("\nTo regenerate the CSV, run Cell 7 again.")

    # Report the state of the images directory (BASE_DIR is guaranteed above)
    images_dir = globals().get('IMAGES_DIR', os.path.join(BASE_DIR, "images"))
    if os.path.exists(images_dir):
        # Search recursively, like the cells that load the images do
        image_files = [
            f
            for _root, _dirs, names in os.walk(images_dir)
            for f in names
            if f.lower().endswith(('.jpg', '.jpeg', '.png', '.bmp', '.heic', '.heif'))
        ]
        print(f"\nImages directory status:")
        print(f"  Location: {images_dir}")
        print(f"  Images found: {len(image_files)}")
        if len(image_files) == 0:
            print("  ⚠ No images found! Please add images to the images/ directory first.")
        else:
            print(f"  Sample files: {image_files[:3]}")
    else:
        print(f"\n⚠ Images directory not found: {images_dir}")


# Task 4: Facial Recognition Model

This cell trains a facial recognition model to identify members from image features.
The model uses the features extracted in the previous cells.


In [None]:
# Train Facial Recognition Model
import os
import sys
import subprocess

# Fall back to the parent of the working directory when no earlier cell set BASE_DIR
if 'BASE_DIR' not in globals():
    BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), ".."))

# Training script location
script_path = os.path.join(BASE_DIR, "scripts", "facial_recognition_model.py")

print("="*60, "TRAINING FACIAL RECOGNITION MODEL", "="*60, sep="\n")
print(f"Script: {script_path}")

if os.path.exists(script_path):
    print(f"✓ Script found. Running...")
    # Run the script with the notebook's interpreter and capture its output
    result = subprocess.run([sys.executable, script_path], capture_output=True, text=True)

    print("\n" + "-"*60, "TRAINING OUTPUT:", "-"*60, sep="\n")
    print(result.stdout)

    if result.stderr:
        print("\n" + "-"*60, "WARNINGS/ERRORS:", "-"*60, sep="\n")
        print(result.stderr)

    if result.returncode != 0:
        print(f"\n✗ Training failed with exit code: {result.returncode}")
    else:
        print("\n✓ Model training completed successfully!")
        print(f"✓ Model saved to: {os.path.join(BASE_DIR, 'models', 'facial_recognition_model.pkl')}")
else:
    print(f"✗ ERROR: Script not found at {script_path}")
    print("Please ensure the script exists.")

