In [1]:
# Install gdown if not already installed
!pip install -q gdown

# Download the file from Google Drive
file_id = "12IUNRSUmQx6YdXa_BbTfpztPoT2GfVNK"
!gdown --id {file_id} -O MLP_dataset.zip

# Create folder to store extracted files
!mkdir -p MLP_dataset

# Unzip the contents into MLP_dataset
!unzip -q MLP_dataset.zip -d MLP_dataset

# Verify contents
!ls MLP_dataset


Downloading...
From (original): https://drive.google.com/uc?id=12IUNRSUmQx6YdXa_BbTfpztPoT2GfVNK
From (redirected): https://drive.google.com/uc?id=12IUNRSUmQx6YdXa_BbTfpztPoT2GfVNK&confirm=t&uuid=7d595eaa-9ad2-4d34-8e62-b81c8b62da81
To: /content/MLP_dataset.zip
100% 203M/203M [00:02<00:00, 90.2MB/s]
Positive_FGNET	Positive_Morph


# Count the number of image samples in MLP_dataset

In [2]:
import os

base_dir = "MLP_dataset"
subfolders = ["Positive_FGNET", "Positive_Morph"]
image_suffixes = ('.png', '.jpg', '.jpeg')

# Tally the image files that sit directly inside each subfolder (no recursion)
for folder in subfolders:
    folder_path = os.path.join(base_dir, folder)
    count = sum(1 for entry in os.listdir(folder_path)
                if entry.lower().endswith(image_suffixes))
    print(f"{folder}: {count} images")


Positive_FGNET: 0 images
Positive_Morph: 0 images


In [3]:
import os

# Walk the extracted dataset and report what each directory contains
base_dir = "/content/MLP_dataset"
for root, dirs, files in os.walk(base_dir):
    # Only the first 10 file names are shown per directory
    print(
        f"Current folder: {root}",
        f"Subfolders: {dirs}",
        f"Files: {files[:10]}",
        "---",
        sep="\n",
    )


Current folder: /content/MLP_dataset
Subfolders: ['Positive_Morph', '.ipynb_checkpoints', 'Positive_FGNET']
Files: []
---
Current folder: /content/MLP_dataset/Positive_Morph
Subfolders: ['063049_42-53', '07609_17-43', '129358_34-44', '13884_27-42', '41803_17-27', '132878_30-35', '110160_29-39', '285512_19-24', '063148_36-42', '5850_17-36', '265688_39-48', '096464_35-52', '37801_24-47', '130092_27-33', '105342_39-46', '122763_24-33', '098246_45-50', '320016_16-27', '2032_44-50', '080905_38-48', '063238_48-62', '078329_37-42', '054477_43-55', '096151_38-51', '02335_35-45', '053541_44-50', '108590_37-44', '099857_36-43', '092228_40-51', '127267_37-44', '313385_29-40', '08229_18-38', '041002_50-55', '12740_27-35', '104470_33-38', '03619_20-40', '2218_17-42', '0771_16-48', '105685_32-37', '18082_18-29', '115749_32-43', '28227_17-40', '103847_40-45', '150629_42-53', '196217_39-45', '328228_28-58', '76587_37-47', '076181_34-40', '01239_27-49', '20143_20-35', '8230_24-34', '02701_43-48', '0942

In [4]:
import os

def count_images(folder):
    """Return how many .png/.jpg/.jpeg files exist under `folder`, searched recursively.

    The extension match is case-insensitive.
    """
    image_suffixes = ('.png', '.jpg', '.jpeg')
    return sum(
        1
        for _root, _dirs, names in os.walk(folder)
        for name in names
        if name.lower().endswith(image_suffixes)
    )

base_dir = "/content/MLP_dataset"
folders = ["Positive_Morph", "Positive_FGNET"]

# Report the recursive image total of each dataset folder
for f in folders:
    path = os.path.join(base_dir, f)
    total = count_images(path)
    print("{}: {} images".format(f, total))


Positive_Morph: 1410 images
Positive_FGNET: 4474 images


# Preprocess image pairs in Positive_FGNET

In [6]:
# Install facenet_pytorch, which supplies the MTCNN and InceptionResnetV1 classes imported below.
# NOTE(review): the recorded install log shows pinned numpy<2.0.0, Pillow<10.3.0 and
# torch<2.3.0 being downloaded — a runtime restart is presumably needed afterwards; confirm.
!pip install facenet_pytorch

Collecting facenet_pytorch
  Downloading facenet_pytorch-2.6.0-py3-none-any.whl.metadata (12 kB)
Collecting numpy<2.0.0,>=1.24.0 (from facenet_pytorch)
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting Pillow<10.3.0,>=10.2.0 (from facenet_pytorch)
  Downloading pillow-10.2.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (9.7 kB)
Collecting torch<2.3.0,>=2.2.0 (from facenet_pytorch)
  Downloading torch-2.2.2-cp312-cp312-manylinux1_x86_64.whl.metadata (25 kB)
Collecting torchvision<0.18.0,>=0.17.0 (from facenet_pytorch)
  Downloading torchvision-0.17.2-cp312-cp312-manylinux1_x86_64.whl.metadata (6.6 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch<2.3.0,>=2.2.0->facenet_pytorch)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia

In [1]:
import gdown
import tensorflow as tf
from facenet_pytorch import MTCNN, InceptionResnetV1
from PIL import Image
import numpy as np
import cv2
from matplotlib import pyplot as plt
from scipy.spatial.distance import cosine
import torch

# -----------------------------
# 1. Load Models
# -----------------------------
import os  # used only by the download guard below

# Age prediction model (Keras file hosted on Google Drive)
file_id = "1xyGYBYRQwjBpPreZmvv_enQ3Z92wHNSx"
download_url = f"https://drive.google.com/uc?id={file_id}"
output_path = "resnet50_age_model.keras"
# Skip the download when the file is already on disk, so re-running this
# cell does not fetch the model again (same guard as the later cells).
if not os.path.exists(output_path):
    gdown.download(download_url, output_path, quiet=False)
age_model = tf.keras.models.load_model(output_path)

# Face detection
mtcnn = MTCNN()

# Face matching model (VGGFace2-pretrained weights, switched to eval mode)
facenet = InceptionResnetV1(pretrained='vggface2').eval()

print("✓ All models loaded!")

# -----------------------------
# 2. Helper Functions
# -----------------------------
def align_face(img_cv, landmark, output_size=(224, 224)):
    """Rotate, scale and crop an image so the two eyes land at a fixed position.

    Args:
        img_cv: Image array handed to cv2.warpAffine.
        landmark: Sequence of (x, y) points; index 0 is used as the left eye
            and index 1 as the right eye. Any further points are ignored.
        output_size: (width, height) of the returned image.

    Returns:
        The warped image. The eye midpoint is placed at 50% of the output
        width and 40% of the output height, the eye line is made horizontal,
        and the inter-eye distance is scaled to 35% of the output width.

    Raises:
        ValueError: If the eye landmarks coincide or are not finite, since no
            rotation/scale can be derived from them.
    """
    # Plain Python floats keep the cv2 calls independent of the numpy scalar
    # type the landmark detector happens to return.
    left_x, left_y = float(landmark[0][0]), float(landmark[0][1])
    right_x, right_y = float(landmark[1][0]), float(landmark[1][1])

    dx, dy = right_x - left_x, right_y - left_y
    eye_dist = float(np.hypot(dx, dy))
    if not np.isfinite(eye_dist) or eye_dist == 0.0:
        raise ValueError("Cannot align face: eye landmarks coincide or are invalid")

    eyes_center = ((left_x + right_x) / 2, (left_y + right_y) / 2)
    angle = float(np.degrees(np.arctan2(dy, dx)))
    scale = (output_size[0] * 0.35) / eye_dist

    # getRotationMatrix2D keeps eyes_center fixed; the extra translation then
    # moves that point to its target position in the output image.
    M = cv2.getRotationMatrix2D(eyes_center, angle, scale)
    M[0, 2] += output_size[0] * 0.5 - eyes_center[0]
    M[1, 2] += output_size[1] * 0.4 - eyes_center[1]

    return cv2.warpAffine(img_cv, M, output_size, flags=cv2.INTER_CUBIC)

def preprocess_for_age(face_img):
    """Preprocess a BGR face image for the age model (ResNet50).

    Converts BGR to RGB, casts to float32, adds a leading batch axis and
    applies the Keras ResNet50 `preprocess_input`.

    Returns:
        The preprocessed batch containing the single input image.
    """
    from tensorflow.keras.applications.resnet50 import preprocess_input
    x = cv2.cvtColor(face_img, cv2.COLOR_BGR2RGB)
    # Explicit float32 cast, matching the other preprocess_for_age definitions
    # in this notebook, so preprocess_input always receives a float array.
    x = x.astype(np.float32)
    x = np.expand_dims(x, axis=0)
    x = preprocess_input(x)
    return x

def preprocess_for_facenet(face_img):
    """Turn a BGR face image into a 1x3x160x160 float tensor scaled as (pixel - 127.5) / 128."""
    rgb_160 = cv2.cvtColor(cv2.resize(face_img, (160, 160)), cv2.COLOR_BGR2RGB)
    scaled = (rgb_160 - 127.5) / 128.0
    # HWC -> CHW, then add the batch axis in front
    chw = torch.FloatTensor(scaled).permute(2, 0, 1)
    return chw.unsqueeze(0)

# -----------------------------
# 3. Main Function
# -----------------------------
def _analyze_face(img_path):
    """Detect, align, age-estimate and embed the first detected face of one image.

    Returns:
        (age, embedding, aligned_face), or None when no face is detected
        (a message is printed in that case).
    """
    img = Image.open(img_path).convert('RGB')
    img_cv = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
    boxes, _probs, landmarks = mtcnn.detect(img, landmarks=True)

    if boxes is None:
        print(f"No face detected in {img_path}")
        return None

    aligned_face = align_face(img_cv, landmarks[0], output_size=(224, 224))

    # Age prediction
    age = age_model.predict(preprocess_for_age(aligned_face), verbose=0)[0][0]

    # Face embedding (inference only, so gradients are disabled)
    with torch.no_grad():
        embedding = facenet(preprocess_for_facenet(aligned_face)).numpy().flatten()

    return age, embedding, aligned_face


def match_faces(img_path1, img_path2, threshold=0.6):
    """
    Match two face images

    Args:
        img_path1, img_path2: Paths to images
        threshold: Similarity threshold (default 0.6)

    Returns:
        None if no face is detected in either image; otherwise a dictionary
        with keys 'age1', 'age2', 'age_difference', 'similarity' (1 - cosine
        distance of the two embeddings), 'is_match' (similarity >= threshold,
        as a plain bool), 'threshold', 'face1' and 'face2' (the aligned faces).
    """
    # Image 1 is fully processed before image 2 is opened.
    first = _analyze_face(img_path1)
    if first is None:
        return None
    second = _analyze_face(img_path2)
    if second is None:
        return None

    age1, embedding1, aligned_face1 = first
    age2, embedding2, aligned_face2 = second

    # Compute similarity
    similarity = 1 - cosine(embedding1, embedding2)

    return {
        'age1': float(age1),
        'age2': float(age2),
        'age_difference': abs(float(age1 - age2)),
        'similarity': float(similarity),
        # bool(...) so the flag is a builtin bool like the other plain-Python fields
        'is_match': bool(similarity >= threshold),
        'threshold': threshold,
        'face1': aligned_face1,
        'face2': aligned_face2
    }

Downloading...
From (original): https://drive.google.com/uc?id=1xyGYBYRQwjBpPreZmvv_enQ3Z92wHNSx
From (redirected): https://drive.google.com/uc?id=1xyGYBYRQwjBpPreZmvv_enQ3Z92wHNSx&confirm=t&uuid=9dfe924d-ab89-4395-a89a-774597f1ac50
To: /content/resnet50_age_model.keras
100%|██████████| 185M/185M [00:01<00:00, 124MB/s]


  0%|          | 0.00/107M [00:00<?, ?B/s]

✓ All models loaded!


In [2]:
import os
import csv
import gdown
import tensorflow as tf
from facenet_pytorch import MTCNN, InceptionResnetV1
from PIL import Image
import numpy as np
import cv2
import torch
from pathlib import Path

# -----------------------------
# 1. Load Models
# -----------------------------
print("Loading models...")
# Age prediction model: fetch the Keras file only when it is not cached locally
output_path = "resnet50_age_model.keras"
if os.path.exists(output_path):
    print("Age model already exists, skipping download...")
else:
    print("Downloading age model...")
    file_id = "1xyGYBYRQwjBpPreZmvv_enQ3Z92wHNSx"
    download_url = f"https://drive.google.com/uc?id={file_id}"
    gdown.download(download_url, output_path, quiet=False)

age_model = tf.keras.models.load_model(output_path)

# Face detector
mtcnn = MTCNN()

# Face embedding network with VGGFace2-pretrained weights, in eval mode
facenet = InceptionResnetV1(pretrained='vggface2').eval()

print("✓ All models loaded!")

# -----------------------------
# 2. Helper Functions
# -----------------------------
def align_face(img_cv, landmark, output_size=(224, 224)):
    """Rotate, scale and crop an image so the two eyes land at a fixed position.

    Args:
        img_cv: Image array handed to cv2.warpAffine.
        landmark: Sequence of (x, y) points; index 0 is used as the left eye
            and index 1 as the right eye. Any further points are ignored.
        output_size: (width, height) of the returned image.

    Returns:
        The warped image. The eye midpoint is placed at 50% of the output
        width and 40% of the output height, the eye line is made horizontal,
        and the inter-eye distance is scaled to 35% of the output width.

    Raises:
        ValueError: If the eye landmarks coincide or are not finite, since no
            rotation/scale can be derived from them.
    """
    # Plain Python floats keep the cv2 calls independent of the numpy scalar
    # type the landmark detector happens to return.
    left_x, left_y = float(landmark[0][0]), float(landmark[0][1])
    right_x, right_y = float(landmark[1][0]), float(landmark[1][1])

    dx, dy = right_x - left_x, right_y - left_y
    eye_dist = float(np.hypot(dx, dy))
    if not np.isfinite(eye_dist) or eye_dist == 0.0:
        raise ValueError("Cannot align face: eye landmarks coincide or are invalid")

    eyes_center = ((left_x + right_x) / 2, (left_y + right_y) / 2)
    angle = float(np.degrees(np.arctan2(dy, dx)))
    scale = (output_size[0] * 0.35) / eye_dist

    # getRotationMatrix2D keeps eyes_center fixed; the extra translation then
    # moves that point to its target position in the output image.
    M = cv2.getRotationMatrix2D(eyes_center, angle, scale)
    M[0, 2] += output_size[0] * 0.5 - eyes_center[0]
    M[1, 2] += output_size[1] * 0.4 - eyes_center[1]

    return cv2.warpAffine(img_cv, M, output_size, flags=cv2.INTER_CUBIC)

def preprocess_for_age(face_img):
    """Build the ResNet50 input batch for the age model from a BGR face image."""
    from tensorflow.keras.applications.resnet50 import preprocess_input
    # BGR -> RGB
    rgb = cv2.cvtColor(face_img, cv2.COLOR_BGR2RGB)
    # float32 data with a leading batch axis
    batch = np.expand_dims(rgb.astype(np.float32), axis=0)
    # ResNet50 preprocessing
    return preprocess_input(batch)

def preprocess_for_facenet(face_img):
    """Turn a BGR face image into a 1x3x160x160 float tensor scaled as (pixel - 127.5) / 128."""
    rgb_160 = cv2.cvtColor(cv2.resize(face_img, (160, 160)), cv2.COLOR_BGR2RGB)
    scaled = (rgb_160 - 127.5) / 128.0
    # HWC -> CHW, then add the batch axis in front
    chw = torch.FloatTensor(scaled).permute(2, 0, 1)
    return chw.unsqueeze(0)

def process_image(img_path):
    """
    Run one image through detection, alignment, age prediction and embedding.

    Returns:
        (age, embedding) with age as a float and embedding as a list, or
        (None, None) when no face is found, a shape check fails, or any
        exception is raised (the error and its traceback are printed).
    """
    failure = (None, None)
    try:
        # Load once as PIL RGB (for the detector) and once as BGR array (for OpenCV)
        pil_img = Image.open(img_path).convert('RGB')
        bgr_img = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2BGR)

        boxes, probs, landmarks = mtcnn.detect(pil_img, landmarks=True)
        if boxes is None or len(boxes) == 0:
            print(f"  ⚠ No face detected in {img_path}")
            return failure

        # Align on the landmarks of the first detection
        face = align_face(bgr_img, landmarks[0], output_size=(224, 224))
        if face is None or face.shape != (224, 224, 3):
            print(f"  ⚠ Invalid aligned face shape in {img_path}")
            return failure

        # Age model input must be a single 224x224 RGB image batch
        age_batch = preprocess_for_age(face)
        if age_batch.shape != (1, 224, 224, 3):
            print(f"  ⚠ Invalid age input shape: {age_batch.shape}")
            return failure
        predicted_age = age_model.predict(age_batch, verbose=0)[0][0]

        # Embedding is computed without gradient tracking
        with torch.no_grad():
            vector = facenet(preprocess_for_facenet(face)).numpy().flatten()

        return float(predicted_age), vector.tolist()

    except Exception as e:
        print(f"  ✗ Error processing {img_path}: {str(e)}")
        import traceback
        traceback.print_exc()
        return failure

def process_dataset(dataset_root, output_csv):
    """
    Process all image pairs in the dataset and write one CSV row per successful pair.

    Expected layout: dataset_root/<person>/<pair>/ where each pair folder holds
    exactly two image files (.jpg/.jpeg/.png, case-insensitive). Pair folders
    with any other number of images are skipped. Hidden directories (names
    starting with '.', e.g. `.ipynb_checkpoints`) are ignored at both levels.

    Args:
        dataset_root: Path to Positive_FGNET folder
        output_csv: Path to output CSV file (overwritten). Columns:
            image_path1, image_path2, age1, age2, embedding1, embedding2.
    """
    dataset_path = Path(dataset_root)
    image_suffixes = {'.jpg', '.jpeg', '.png'}

    def visible_subdirs(parent):
        """Sorted sub-directories of `parent`, excluding hidden ones."""
        return sorted(d for d in parent.iterdir()
                      if d.is_dir() and not d.name.startswith('.'))

    # Prepare CSV
    with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        # Write header
        writer.writerow(['image_path1', 'image_path2', 'age1', 'age2',
                        'embedding1', 'embedding2'])

        person_folders = visible_subdirs(dataset_path)

        print(f"\nFound {len(person_folders)} person folders")
        print("="*60)

        total_pairs = 0
        successful_pairs = 0

        for person_folder in person_folders:
            print(f"\nProcessing {person_folder.name}...")

            for pair_folder in visible_subdirs(person_folder):
                print(f"  Processing pair: {pair_folder.name}")

                # Regular image files only, in a stable (sorted) order
                image_files = sorted(f for f in pair_folder.iterdir()
                                     if f.is_file() and f.suffix.lower() in image_suffixes)

                if len(image_files) != 2:
                    print(f"  ⚠ Expected 2 images, found {len(image_files)}. Skipping.")
                    continue

                total_pairs += 1

                img_path1 = str(image_files[0])
                img_path2 = str(image_files[1])

                # Process both images
                age1, embedding1 = process_image(img_path1)
                age2, embedding2 = process_image(img_path2)

                # Save only if both images processed successfully
                if age1 is not None and age2 is not None:
                    writer.writerow([img_path1, img_path2, age1, age2,
                                   embedding1, embedding2])
                    successful_pairs += 1
                    print(f"  ✓ Success: Age1={age1:.1f}, Age2={age2:.1f}")
                else:
                    print(f"  ✗ Failed to process pair")

        print("\n" + "="*60)
        print(f"Processing complete!")
        print(f"Total pairs found: {total_pairs}")
        print(f"Successfully processed: {successful_pairs}")
        print(f"Failed: {total_pairs - successful_pairs}")
        print(f"Results saved to: {output_csv}")
        print("="*60)

# -----------------------------
# 3. Main Execution
# -----------------------------
# Entry point: run the FGNET preprocessing and write its results CSV.
if __name__ == "__main__":
    # Set your dataset path
    # (relative path, so it is resolved against the current working directory)
    dataset_root = "MLP_dataset/Positive_FGNET"
    # CSV file that process_dataset creates/overwrites
    output_csv = "fgnet_preprocessing_results.csv"

    # Process the dataset
    process_dataset(dataset_root, output_csv)

Loading models...
Age model already exists, skipping download...
✓ All models loaded!

Found 82 person folders

Processing person001...
  Processing pair: person001_02_14
  ✓ Success: Age1=4.7, Age2=27.2
  Processing pair: person001_02_16
  ✓ Success: Age1=4.7, Age2=20.9
  Processing pair: person001_02_18
  ✓ Success: Age1=4.7, Age2=31.1
  Processing pair: person001_02_19
  ✓ Success: Age1=4.7, Age2=24.0
  Processing pair: person001_02_22
  ✓ Success: Age1=4.7, Age2=34.2
  Processing pair: person001_02_28
  ✓ Success: Age1=4.7, Age2=19.4
  Processing pair: person001_02_29
  ✓ Success: Age1=4.7, Age2=29.2
  Processing pair: person001_02_33
  ✓ Success: Age1=4.7, Age2=33.3
  Processing pair: person001_02_40
  ✓ Success: Age1=4.7, Age2=39.2
  Processing pair: person001_05_16
  ✓ Success: Age1=11.9, Age2=20.9
  Processing pair: person001_05_18
  ✓ Success: Age1=11.9, Age2=31.1
  Processing pair: person001_05_19
  ✓ Success: Age1=11.9, Age2=24.0
  Processing pair: person001_05_22
  ✓ Succes

# Preprocess image pairs in Positive_Morph

In [7]:
import os
import csv
import gdown
import tensorflow as tf
from facenet_pytorch import MTCNN, InceptionResnetV1
from PIL import Image
import numpy as np
import cv2
import torch
from pathlib import Path
from itertools import combinations

# Fix TensorFlow GPU memory issues: allocate GPU memory on demand
gpus = tf.config.list_physical_devices('GPU')
if not gpus:
    print("Running on CPU")
else:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print(f"✓ GPU memory growth enabled for {len(gpus)} GPU(s)")
    except RuntimeError as e:
        print(f"GPU configuration error: {e}")

# -----------------------------
# 1. Load Models
# -----------------------------
print("Loading models...")
# Fetch the age model file only when it is not cached locally
output_path = "resnet50_age_model.keras"
if os.path.exists(output_path):
    print("Age model already exists, skipping download...")
else:
    print("Downloading age model...")
    file_id = "1xyGYBYRQwjBpPreZmvv_enQ3Z92wHNSx"
    download_url = f"https://drive.google.com/uc?id={file_id}"
    gdown.download(download_url, output_path, quiet=False)

# Age model, face detector, and VGGFace2-pretrained embedding network (eval mode)
age_model = tf.keras.models.load_model(output_path)
mtcnn = MTCNN()
facenet = InceptionResnetV1(pretrained='vggface2').eval()
print("✓ All models loaded!")

# -----------------------------
# 2. Helper Functions
# -----------------------------
def align_face(img_cv, landmark, output_size=(224, 224)):
    """Rotate, scale and crop an image so the two eyes land at a fixed position.

    Args:
        img_cv: Image array handed to cv2.warpAffine.
        landmark: Sequence of (x, y) points; index 0 is used as the left eye
            and index 1 as the right eye. Any further points are ignored.
        output_size: (width, height) of the returned image.

    Returns:
        The warped image. The eye midpoint is placed at 50% of the output
        width and 40% of the output height, the eye line is made horizontal,
        and the inter-eye distance is scaled to 35% of the output width.

    Raises:
        ValueError: If the eye landmarks coincide or are not finite, since no
            rotation/scale can be derived from them.
    """
    # Plain Python floats keep the cv2 calls independent of the numpy scalar
    # type the landmark detector happens to return.
    left_x, left_y = float(landmark[0][0]), float(landmark[0][1])
    right_x, right_y = float(landmark[1][0]), float(landmark[1][1])

    dx, dy = right_x - left_x, right_y - left_y
    eye_dist = float(np.hypot(dx, dy))
    if not np.isfinite(eye_dist) or eye_dist == 0.0:
        raise ValueError("Cannot align face: eye landmarks coincide or are invalid")

    eyes_center = ((left_x + right_x) / 2, (left_y + right_y) / 2)
    angle = float(np.degrees(np.arctan2(dy, dx)))
    scale = (output_size[0] * 0.35) / eye_dist

    # getRotationMatrix2D keeps eyes_center fixed; the extra translation then
    # moves that point to its target position in the output image.
    M = cv2.getRotationMatrix2D(eyes_center, angle, scale)
    M[0, 2] += output_size[0] * 0.5 - eyes_center[0]
    M[1, 2] += output_size[1] * 0.4 - eyes_center[1]

    return cv2.warpAffine(img_cv, M, output_size, flags=cv2.INTER_CUBIC)

def preprocess_for_age(face_img):
    """Build the ResNet50 input batch for the age model from a BGR face image."""
    from tensorflow.keras.applications.resnet50 import preprocess_input
    rgb = cv2.cvtColor(face_img, cv2.COLOR_BGR2RGB)
    # float32 data with a leading batch axis, then ResNet50 preprocessing
    batch = np.expand_dims(rgb.astype(np.float32), axis=0)
    return preprocess_input(batch)

def preprocess_for_facenet(face_img):
    """Turn a BGR face image into a 1x3x160x160 float tensor scaled as (pixel - 127.5) / 128."""
    rgb_160 = cv2.cvtColor(cv2.resize(face_img, (160, 160)), cv2.COLOR_BGR2RGB)
    scaled = (rgb_160 - 127.5) / 128.0
    # HWC -> CHW, then add the batch axis in front
    chw = torch.FloatTensor(scaled).permute(2, 0, 1)
    return chw.unsqueeze(0)

def process_image(img_path):
    """Run one image through detection, alignment, age prediction and embedding.

    Returns:
        (age, embedding) with age as a float and embedding as a list, or
        (None, None) when no face is found, a shape check fails, or any
        exception is raised (the error and its traceback are printed).
    """
    failure = (None, None)
    try:
        # PIL RGB image for the detector, BGR array for OpenCV
        pil_img = Image.open(img_path).convert('RGB')
        bgr_img = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2BGR)

        boxes, probs, landmarks = mtcnn.detect(pil_img, landmarks=True)
        if boxes is None or len(boxes) == 0:
            print(f"  ⚠ No face detected in {img_path}")
            return failure

        # Align on the landmarks of the first detection
        face = align_face(bgr_img, landmarks[0], output_size=(224, 224))
        if face is None or face.shape != (224, 224, 3):
            print(f"  ⚠ Invalid aligned face shape in {img_path}")
            return failure

        age_batch = preprocess_for_age(face)
        if age_batch.shape != (1, 224, 224, 3):
            print(f"  ⚠ Invalid age input shape: {age_batch.shape}")
            return failure
        predicted_age = age_model.predict(age_batch, verbose=0)[0][0]

        # Embedding is computed without gradient tracking
        with torch.no_grad():
            vector = facenet(preprocess_for_facenet(face)).numpy().flatten()
        return float(predicted_age), vector.tolist()
    except Exception as e:
        print(f"  ✗ Error processing {img_path}: {str(e)}")
        import traceback
        traceback.print_exc()
        return failure

def process_dataset(dataset_root, output_csv):
    """Pair up every image of each person and write age/embedding rows to a CSV.

    Expected layout: dataset_root/<person>/ with the image files
    (.jpg/.jpeg/.png, case-insensitive) directly inside. All 2-combinations of
    a person's sorted images form the pairs. Hidden directories (names starting
    with '.', e.g. `.ipynb_checkpoints`) are ignored.

    Each image is passed to process_image at most once per person folder and
    the result is reused for every pair it appears in, so an image that fails
    prints its warning once instead of once per pair.

    Args:
        dataset_root: Folder containing one sub-folder per person.
        output_csv: Path to the output CSV file (overwritten). Columns:
            image_path1, image_path2, age1, age2, embedding1, embedding2.
    """
    dataset_path = Path(dataset_root)
    image_suffixes = {'.jpg', '.jpeg', '.png'}

    def cached_result(cache, img_path):
        """Return process_image(img_path), computing it only on first use."""
        if img_path not in cache:
            cache[img_path] = process_image(img_path)
        return cache[img_path]

    with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['image_path1', 'image_path2', 'age1', 'age2', 'embedding1', 'embedding2'])
        person_folders = sorted(f for f in dataset_path.iterdir()
                                if f.is_dir() and not f.name.startswith('.'))
        print(f"\nFound {len(person_folders)} person folders")
        print("="*60)
        total_pairs = 0
        successful_pairs = 0
        for person_folder in person_folders:
            print(f"\nProcessing {person_folder.name}...")
            image_files = sorted(f for f in person_folder.iterdir()
                                 if f.is_file() and f.suffix.lower() in image_suffixes)
            if len(image_files) < 2:
                print(f"  ⚠ Found only {len(image_files)} image(s). Skipping.")
                continue
            pairs = list(combinations(image_files, 2))
            print(f"  Found {len(image_files)} images → {len(pairs)} pairs")

            # Per-person cache: path -> (age, embedding); dropped after this folder
            results = {}
            for idx, (img_file1, img_file2) in enumerate(pairs, 1):
                total_pairs += 1
                img_path1 = str(img_file1)
                img_path2 = str(img_file2)
                print(f"  Pair {idx}/{len(pairs)}: {img_file1.name} + {img_file2.name}")
                age1, embedding1 = cached_result(results, img_path1)
                age2, embedding2 = cached_result(results, img_path2)
                if age1 is not None and age2 is not None:
                    writer.writerow([img_path1, img_path2, age1, age2, embedding1, embedding2])
                    successful_pairs += 1
                    print(f"    ✓ Success: Age1={age1:.1f}, Age2={age2:.1f}")
                else:
                    print(f"    ✗ Failed to process pair")
        print("\n" + "="*60)
        print(f"Processing complete!")
        print(f"Total pairs found: {total_pairs}")
        print(f"Successfully processed: {successful_pairs}")
        print(f"Failed: {total_pairs - successful_pairs}")
        print(f"Results saved to: {output_csv}")
        print("="*60)

# -----------------------------
# 3. Main Execution
# -----------------------------
# Entry point: run the Morph preprocessing and write its results CSV.
if __name__ == "__main__":
    # Folder with one sub-folder per person
    dataset_root = "/content/MLP_dataset/Positive_Morph"
    # Relative path, so the CSV is created in the current working directory
    output_csv = "morph_preprocessing_results.csv"
    process_dataset(dataset_root, output_csv)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
    ✓ Success: Age1=30.3, Age2=28.1
  Pair 40/153: 119392_13M32.JPG + 119392_24M33.JPG
    ✓ Success: Age1=30.3, Age2=31.5
  Pair 41/153: 119392_13M32.JPG + 119392_25M34.JPG
    ✓ Success: Age1=30.3, Age2=32.3
  Pair 42/153: 119392_13M32.JPG + 119392_26M35.JPG
    ✓ Success: Age1=30.3, Age2=29.3
  Pair 43/153: 119392_13M32.JPG + 119392_27M36.JPG
    ✓ Success: Age1=30.3, Age2=30.9
  Pair 44/153: 119392_13M32.JPG + 119392_2M31.JPG
    ✓ Success: Age1=30.3, Age2=31.6
  Pair 45/153: 119392_13M32.JPG + 119392_3M31.JPG
    ✓ Success: Age1=30.3, Age2=27.5
  Pair 46/153: 119392_13M32.JPG + 119392_4M31.JPG
    ✓ Success: Age1=30.3, Age2=31.5
  Pair 47/153: 119392_13M32.JPG + 119392_6M32.JPG
    ✓ Success: Age1=30.3, Age2=38.6
  Pair 48/153: 119392_13M32.JPG + 119392_9M32.JPG
    ✓ Success: Age1=30.3, Age2=33.7
  Pair 49/153: 119392_14M33.JPG + 119392_15M33.JPG
    ✓ Success: Age1=32.1, Age2=32.3
  Pair 50/153: 119392_14M33.JPG + 

# Negative Labels data collection

# UTKFace data collection

In [8]:
import kagglehub

# Fetch the latest version of the UTKFace dataset through kagglehub
path = kagglehub.dataset_download("jangedoo/utkface-new")

print(f"Path to dataset files: {path}")

Using Colab cache for faster access to the 'utkface-new' dataset.
Path to dataset files: /kaggle/input/utkface-new


In [9]:
import shutil
import os

# Copy the kagglehub download into a directory under /content.
# NOTE(review): the recorded output of this cell is a KeyboardInterrupt, so the
# copy presumably never completed; the later cells read from a different
# directory (utkface_final) — confirm whether this cell is still needed.

# Source directory (downloaded by kagglehub)
src = "/kaggle/input/utkface-new"

# Target directory where you want to copy the dataset
dst = "/content/UTKface"

# Create destination folder if not exists
os.makedirs(dst, exist_ok=True)

# Copy entire directory tree
# (dirs_exist_ok=True lets the copy proceed into the already-existing dst)
shutil.copytree(src, dst, dirs_exist_ok=True)

print("Dataset copied successfully to:", dst)


KeyboardInterrupt: 

In [11]:
import gdown
import zipfile
import os

# Google Drive file ID and output paths
file_id = "1Fndzbip2YzCYS5Efk1f9zFNjGVZKa2Oe"
output_zip = "utkface.zip"
output_dir = "utkface_final"

# Download the archive only when it is not already on disk, so re-running the
# cell does not fetch it again (same guard as the age-model download).
if not os.path.exists(output_zip):
    gdown.download(f"https://drive.google.com/uc?id={file_id}", output_zip, quiet=False)

# Create directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Unzip the downloaded file (existing files with the same name are overwritten)
with zipfile.ZipFile(output_zip, 'r') as zip_ref:
    zip_ref.extractall(output_dir)

print(f"Dataset extracted to '{output_dir}'")


Downloading...
From (original): https://drive.google.com/uc?id=1Fndzbip2YzCYS5Efk1f9zFNjGVZKa2Oe
From (redirected): https://drive.google.com/uc?id=1Fndzbip2YzCYS5Efk1f9zFNjGVZKa2Oe&confirm=t&uuid=3b121367-71c3-48af-b1a0-559732b6dbe5
To: /content/utkface.zip
100%|██████████| 347M/347M [00:04<00:00, 74.0MB/s]


Dataset extracted to 'utkface_final'


In [12]:
import os

# Directory containing images
img_dir = "/content/utkface_final/utkface_aligned_cropped/UTKFace"

# Count directory entries whose name ends in .jpg (case-insensitive)
num_jpg_files = sum(1 for name in os.listdir(img_dir) if name.lower().endswith(".jpg"))

print(f"Number of .jpg files: {num_jpg_files}")


Number of .jpg files: 23708


In [1]:
import os
import csv
import random
from pathlib import Path
from PIL import Image
import numpy as np
import cv2
import torch
import tensorflow as tf
from facenet_pytorch import MTCNN, InceptionResnetV1
import gdown

# -----------------------------
# GPU setup
# -----------------------------
# Ask TensorFlow to allocate GPU memory on demand
gpus = tf.config.list_physical_devices('GPU')
if not gpus:
    print("Running on CPU")
else:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print(f"✓ GPU memory growth enabled for {len(gpus)} GPU(s)")
    except RuntimeError as e:
        print(f"GPU configuration error: {e}")

# -----------------------------
# 1. Load Models
# -----------------------------
print("Loading models...")
# Fetch the age model file only when it is not cached locally
output_path = "resnet50_age_model.keras"
if os.path.exists(output_path):
    print("Age model already exists, skipping download...")
else:
    print("Downloading age model...")
    file_id = "1xyGYBYRQwjBpPreZmvv_enQ3Z92wHNSx"
    download_url = f"https://drive.google.com/uc?id={file_id}"
    gdown.download(download_url, output_path, quiet=False)

# Age model, face detector, and VGGFace2-pretrained embedding network (eval mode)
age_model = tf.keras.models.load_model(output_path)
mtcnn = MTCNN()
facenet = InceptionResnetV1(pretrained='vggface2').eval()
print("✓ All models loaded!")

# -----------------------------
# 2. Helper functions
# -----------------------------
def align_face(img_cv, landmark, output_size=(224, 224)):
    """Rotate, scale and crop an image so the two eyes land at a fixed position.

    Args:
        img_cv: Image array handed to cv2.warpAffine.
        landmark: Sequence of (x, y) points; index 0 is used as the left eye
            and index 1 as the right eye. Any further points are ignored.
        output_size: (width, height) of the returned image.

    Returns:
        The warped image. The eye midpoint is placed at 50% of the output
        width and 40% of the output height, the eye line is made horizontal,
        and the inter-eye distance is scaled to 35% of the output width.

    Raises:
        ValueError: If the eye landmarks coincide or are not finite, since no
            rotation/scale can be derived from them.
    """
    # Plain Python floats keep the cv2 calls independent of the numpy scalar
    # type the landmark detector happens to return.
    left_x, left_y = float(landmark[0][0]), float(landmark[0][1])
    right_x, right_y = float(landmark[1][0]), float(landmark[1][1])

    dx, dy = right_x - left_x, right_y - left_y
    eye_dist = float(np.hypot(dx, dy))
    if not np.isfinite(eye_dist) or eye_dist == 0.0:
        raise ValueError("Cannot align face: eye landmarks coincide or are invalid")

    eyes_center = ((left_x + right_x) / 2, (left_y + right_y) / 2)
    angle = float(np.degrees(np.arctan2(dy, dx)))
    scale = (output_size[0] * 0.35) / eye_dist

    # getRotationMatrix2D keeps eyes_center fixed; the extra translation then
    # moves that point to its target position in the output image.
    M = cv2.getRotationMatrix2D(eyes_center, angle, scale)
    M[0, 2] += output_size[0] * 0.5 - eyes_center[0]
    M[1, 2] += output_size[1] * 0.4 - eyes_center[1]

    return cv2.warpAffine(img_cv, M, output_size, flags=cv2.INTER_CUBIC)

def preprocess_for_age(face_img):
    """Build the ResNet50 input batch for the age model from a BGR face image."""
    from tensorflow.keras.applications.resnet50 import preprocess_input
    rgb = cv2.cvtColor(face_img, cv2.COLOR_BGR2RGB)
    # float32 data with a leading batch axis, then ResNet50 preprocessing
    batch = np.expand_dims(rgb.astype(np.float32), axis=0)
    return preprocess_input(batch)

def preprocess_for_facenet(face_img):
    """Turn a BGR face image into a 1x3x160x160 float tensor scaled as (pixel - 127.5) / 128."""
    rgb_160 = cv2.cvtColor(cv2.resize(face_img, (160, 160)), cv2.COLOR_BGR2RGB)
    scaled = (rgb_160 - 127.5) / 128.0
    # HWC -> CHW, then add the batch axis in front
    chw = torch.FloatTensor(scaled).permute(2, 0, 1)
    return chw.unsqueeze(0)

def process_image(img_path):
    """Run one image through detection, alignment, age prediction and embedding.

    Returns:
        (age, embedding) with age as a float and embedding as a list, or
        (None, None) when no face is found or any exception is raised
        (the error message is printed).
    """
    failure = (None, None)
    try:
        # PIL RGB image for the detector, BGR array for OpenCV
        pil_img = Image.open(img_path).convert('RGB')
        bgr_img = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2BGR)

        boxes, probs, landmarks = mtcnn.detect(pil_img, landmarks=True)
        if boxes is None or len(boxes) == 0:
            print(f"  ⚠ No face detected in {img_path}")
            return failure

        # Align on the landmarks of the first detection
        face = align_face(bgr_img, landmarks[0], output_size=(224, 224))
        predicted_age = age_model.predict(preprocess_for_age(face), verbose=0)[0][0]

        # Embedding is computed without gradient tracking
        with torch.no_grad():
            vector = facenet(preprocess_for_facenet(face)).numpy().flatten()
        return float(predicted_age), vector.tolist()
    except Exception as e:
        print(f"  ✗ Error processing {img_path}: {str(e)}")
        return failure

# -----------------------------
# 3. UTKFace dataset processing (memory-efficient)
# -----------------------------
def process_utkface(dataset_root, output_csv, num_pairs=8111, seed=None):
    """Sample distinct random image pairs from a flat folder and write their results to a CSV.

    Each sampled pair is passed image-by-image through process_image; a row is
    written only when both images succeed. Failed pairs still count towards
    `num_pairs` (it is the number of pairs attempted, not written).

    Args:
        dataset_root: Folder holding the image files (.jpg/.jpeg/.png) directly.
        output_csv: Path to the output CSV file (overwritten). Columns:
            image_path1, image_path2, age1, age2, embedding1, embedding2.
        num_pairs: Number of distinct unordered pairs to attempt. Capped at the
            number of distinct pairs that exist, because the sampling loop
            could otherwise never terminate.
        seed: Optional seed for reproducible sampling. With None, the
            module-level `random` generator is used, as before.
    """
    dataset_path = Path(dataset_root)
    all_images = sorted([f for f in dataset_path.iterdir()
                         if f.suffix.lower() in ['.jpg', '.jpeg', '.png']])
    print(f"Found {len(all_images)} images in UTKFace folder")

    if len(all_images) < 2:
        print("Not enough images to form pairs!")
        return

    # n images yield n*(n-1)/2 distinct unordered pairs; asking for more would
    # make the rejection-sampling loop below spin forever.
    max_pairs = len(all_images) * (len(all_images) - 1) // 2
    if num_pairs > max_pairs:
        print(f"Requested {num_pairs} pairs but only {max_pairs} distinct pairs exist; using {max_pairs}.")
        num_pairs = max_pairs

    rng = random if seed is None else random.Random(seed)

    with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['image_path1', 'image_path2', 'age1', 'age2', 'embedding1', 'embedding2'])
        total_pairs = 0
        successful_pairs = 0
        tried_pairs = set()  # to avoid duplicate pairs

        while total_pairs < num_pairs:
            img_file1, img_file2 = rng.sample(all_images, 2)
            # Order-independent key: (a, b) and (b, a) are the same pair
            pair_key = tuple(sorted([str(img_file1), str(img_file2)]))
            if pair_key in tried_pairs:
                continue
            tried_pairs.add(pair_key)

            total_pairs += 1
            print(f"Processing pair {total_pairs}/{num_pairs}: {img_file1.name} + {img_file2.name}")
            age1, embedding1 = process_image(str(img_file1))
            age2, embedding2 = process_image(str(img_file2))
            if age1 is not None and age2 is not None:
                writer.writerow([str(img_file1), str(img_file2), age1, age2, embedding1, embedding2])
                successful_pairs += 1
                print(f"  ✓ Success: Age1={age1:.1f}, Age2={age2:.1f}")
            else:
                print(f"  ✗ Failed to process pair")

    print(f"\nProcessing complete!")
    print(f"Total pairs attempted: {total_pairs}")
    print(f"Successfully processed: {successful_pairs}")
    print(f"Failed: {total_pairs - successful_pairs}")
    print(f"Results saved to: {output_csv}")

# -----------------------------
# 4. Main execution
# -----------------------------
# Entry point: sample UTKFace image pairs and write their results CSV.
if __name__ == "__main__":
    # Flat folder of UTKFace image files
    dataset_root = "/content/utkface_final/utkface_aligned_cropped/UTKFace"
    # Relative path, so the CSV is created in the current working directory
    output_csv = "utkface_preprocessing_results.csv"
    process_utkface(dataset_root, output_csv, num_pairs=8111)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  ✓ Success: Age1=30.1, Age2=32.6
Processing pair 5624/8111: 50_0_1_20170113152855753.jpg.chip.jpg + 2_1_1_20161219155833405.jpg.chip.jpg
  ✓ Success: Age1=35.1, Age2=1.8
Processing pair 5625/8111: 27_1_1_20170112234022671.jpg.chip.jpg + 31_0_1_20170116024000066.jpg.chip.jpg
  ✓ Success: Age1=37.2, Age2=26.6
Processing pair 5626/8111: 26_1_4_20170103225928346.jpg.chip.jpg + 39_0_4_20170104183739429.jpg.chip.jpg
  ✓ Success: Age1=20.3, Age2=42.6
Processing pair 5627/8111: 24_1_4_20170117150628875.jpg.chip.jpg + 16_0_0_20170110232315216.jpg.chip.jpg
  ✓ Success: Age1=31.1, Age2=15.3
Processing pair 5628/8111: 45_0_1_20170116010527476.jpg.chip.jpg + 20_0_1_20170113132705262.jpg.chip.jpg
  ✓ Success: Age1=38.1, Age2=21.7
Processing pair 5629/8111: 75_0_0_20170111200622706.jpg.chip.jpg + 15_0_4_20170103201002253.jpg.chip.jpg
  ✓ Success: Age1=54.4, Age2=17.5
Processing pair 5630/8111: 47_0_0_20170104211822836.jpg.chip.jpg + 24

# Merge Positive pairs csv files

In [2]:
import pandas as pd

# Read the FGNET and Morph positive-pair results from disk
fgnet_df = pd.read_csv(filepath_or_buffer="/content/fgnet_preprocessing_results.csv")
morph_df = pd.read_csv(filepath_or_buffer="/content/morph_preprocessing_results.csv")

# Stack the two tables row-wise (FGNET first) and renumber the rows 0..n-1
merged_df = pd.concat(objs=[fgnet_df, morph_df], axis=0, ignore_index=True)

# Show the column names of the combined table
print("Merged Columns:", merged_df.columns.tolist(), sep="\n")


Merged Columns:
['image_path1', 'image_path2', 'age1', 'age2', 'embedding1', 'embedding2']


  merged_df = pd.concat([fgnet_df, morph_df], ignore_index=True)


# Add person id in fgnet_csv

# Add person_id to morph csv

In [8]:
import pandas as pd

# Load the Morph positive-pair results and preview the first five rows
df = pd.read_csv(filepath_or_buffer='/content/morph_preprocessing_results.csv')

print(df.head(n=5))


                                         image_path1  \
0  /content/MLP_dataset/Positive_Morph/0007_16-44...   
1  /content/MLP_dataset/Positive_Morph/0007_16-44...   
2  /content/MLP_dataset/Positive_Morph/0007_16-44...   
3  /content/MLP_dataset/Positive_Morph/00423_42-5...   
4  /content/MLP_dataset/Positive_Morph/00423_42-5...   

                                         image_path2       age1       age2  \
0  /content/MLP_dataset/Positive_Morph/0007_16-44...  18.461182  22.550301   
1  /content/MLP_dataset/Positive_Morph/0007_16-44...  18.461182  23.926910   
2  /content/MLP_dataset/Positive_Morph/0007_16-44...  22.550301  23.926910   
3  /content/MLP_dataset/Positive_Morph/00423_42-5...  29.294632  22.782116   
4  /content/MLP_dataset/Positive_Morph/00423_42-5...  29.294632  24.953445   

                                          embedding1  \
0  [0.06365031003952026, -0.023031221702694893, 0...   
1  [0.06365031003952026, -0.023031221702694893, 0...   
2  [0.06384401023387909, -

In [10]:
import pandas as pd
import os

# Reload the Morph positive-pair results from disk
df = pd.read_csv(filepath_or_buffer='/content/morph_preprocessing_results.csv')

# The person folder is the directory that directly contains the image file
def extract_folder(path):
    """Return the name of the directory immediately containing ``path``.

    Example: ``'/a/Positive_Morph/0007_16-44/img.jpg'`` -> ``'0007_16-44'``.
    A bare file name (no directory part) yields an empty string.
    """
    parent_dir, _file_name = os.path.split(path)
    return os.path.split(parent_dir)[1]

# Temporary column: the person folder taken from the first image of each pair
df['folder'] = df['image_path1'].map(extract_folder)

# Distinct person folders, in the order they first appear in the table
unique_folders = df['folder'].unique()
# 1-based numeric ID per folder
folder_to_id = dict(zip(unique_folders, range(1, len(unique_folders) + 1)))

# Translate every row's folder into its numeric person_id
df['person_id'] = df['folder'].map(folder_to_id)

# The helper column is no longer needed once person_id exists
df = df.drop('folder', axis=1)

# Write the ID-annotated table next to the original results
output_path = '/content/morph_with_id.csv'
df.to_csv(path_or_buf=output_path, index=False)

print("Saved new CSV:", output_path)
print(df.head(n=20))



Saved new CSV: /content/morph_with_id.csv
                                          image_path1  \
0   /content/MLP_dataset/Positive_Morph/0007_16-44...   
1   /content/MLP_dataset/Positive_Morph/0007_16-44...   
2   /content/MLP_dataset/Positive_Morph/0007_16-44...   
3   /content/MLP_dataset/Positive_Morph/00423_42-5...   
4   /content/MLP_dataset/Positive_Morph/00423_42-5...   
5   /content/MLP_dataset/Positive_Morph/00423_42-5...   
6   /content/MLP_dataset/Positive_Morph/01083_25-3...   
7   /content/MLP_dataset/Positive_Morph/01083_25-3...   
8   /content/MLP_dataset/Positive_Morph/01083_25-3...   
9   /content/MLP_dataset/Positive_Morph/01083_25-3...   
10  /content/MLP_dataset/Positive_Morph/01083_25-3...   
11  /content/MLP_dataset/Positive_Morph/01083_25-3...   
12  /content/MLP_dataset/Positive_Morph/01239_27-4...   
13  /content/MLP_dataset/Positive_Morph/0142_19-41...   
14  /content/MLP_dataset/Positive_Morph/0142_19-41...   
15  /content/MLP_dataset/Positive_Morph/0142_1

In [11]:
# Highest person_id assigned so far
print("Last person_id:", df.loc[:, 'person_id'].max())


Last person_id: 289


In [12]:
# Inspect the final 20 rows of the ID-annotated table
print(df.tail(n=20))


                                            image_path1  \
5872  /content/MLP_dataset/Positive_Morph/95104_33-4...   
5873  /content/MLP_dataset/Positive_Morph/97729_20-3...   
5874  /content/MLP_dataset/Positive_Morph/97729_20-3...   
5875  /content/MLP_dataset/Positive_Morph/97729_20-3...   
5876  /content/MLP_dataset/Positive_Morph/98025_22-2...   
5877  /content/MLP_dataset/Positive_Morph/9893_50-58...   
5878  /content/MLP_dataset/Positive_Morph/9893_50-58...   
5879  /content/MLP_dataset/Positive_Morph/9893_50-58...   
5880  /content/MLP_dataset/Positive_Morph/9893_50-58...   
5881  /content/MLP_dataset/Positive_Morph/9893_50-58...   
5882  /content/MLP_dataset/Positive_Morph/9893_50-58...   
5883  /content/MLP_dataset/Positive_Morph/9893_50-58...   
5884  /content/MLP_dataset/Positive_Morph/9893_50-58...   
5885  /content/MLP_dataset/Positive_Morph/9893_50-58...   
5886  /content/MLP_dataset/Positive_Morph/9893_50-58...   
5887  /content/MLP_dataset/Positive_Morph/9893_50-58... 

# FGNET ADDING PERSON_ID

In [13]:
import pandas as pd

# Load the FGNET positive-pair results (Excel workbook) and preview the first five rows
df2 = pd.read_excel(io='/content/fgnet_preprocessing_results_v2.xlsx')

print(df2.head(n=5))


                                         image_path1  \
0  MLP_dataset/Positive_FGNET/person001/person001...   
1  MLP_dataset/Positive_FGNET/person001/person001...   
2  MLP_dataset/Positive_FGNET/person001/person001...   
3  MLP_dataset/Positive_FGNET/person001/person001...   
4  MLP_dataset/Positive_FGNET/person001/person001...   

                                         image_path2      age1       age2  \
0  MLP_dataset/Positive_FGNET/person001/person001...  4.660048  27.186808   
1  MLP_dataset/Positive_FGNET/person001/person001...  4.660048  20.907551   
2  MLP_dataset/Positive_FGNET/person001/person001...  4.660048  31.123674   
3  MLP_dataset/Positive_FGNET/person001/person001...  4.660048  23.976097   
4  MLP_dataset/Positive_FGNET/person001/person001...  4.660048  34.156021   

                                          embedding1  \
0  [-0.021442148834466934, 0.04521297663450241, 0...   
1  [-0.021442148834466934, 0.04521297663450241, 0...   
2  [-0.021442148834466934, 0.045

In [14]:
import pandas as pd
import os

# Reload the FGNET positive-pair results from the Excel workbook
df2 = pd.read_excel(io='/content/fgnet_preprocessing_results_v2.xlsx')

# Function to extract the subfolder after Positive_FGNET
def extract_person_folder(path):
    """Return the path component that directly follows ``Positive_FGNET``.

    Parameters
    ----------
    path : str
        Image path such as ``'MLP_dataset/Positive_FGNET/person001/img.jpg'``.
        Both ``/`` and ``\\`` are accepted as separators, so the result does
        not depend on the operating system that wrote the path or the one
        running this cell.

    Returns
    -------
    str
        The person folder name, or ``'unknown'`` when ``Positive_FGNET`` is
        absent from the path or is not followed by a non-empty component.
    """
    # Normalise Windows separators so a single split handles both styles.
    parts = path.replace('\\', '/').split('/')
    try:
        idx = parts.index('Positive_FGNET')
    except ValueError:
        return 'unknown'
    # Guard against 'Positive_FGNET' being the final component (or followed by
    # an empty one, e.g. a trailing slash) instead of raising IndexError.
    person_folder = parts[idx + 1] if idx + 1 < len(parts) else ''
    return person_folder or 'unknown'

# Temporary column: person folder derived from the first image of each pair
df2['folder'] = df2['image_path1'].map(extract_person_folder)

# Distinct person folders, in the order they first appear
unique_folders = df2['folder'].unique()

# Number the folders from 290 upward; the Morph table's highest person_id
# printed earlier in this notebook is 289, so FGNET continues right after it.
folder_to_id = dict(zip(unique_folders, range(290, 290 + len(unique_folders))))
df2['person_id'] = df2['folder'].map(folder_to_id)

# The helper column is no longer needed once person_id exists
df2 = df2.drop('folder', axis=1)

# Write the ID-annotated table to a new workbook
output_path = '/content/fgnet2_with_id.xlsx'
df2.to_excel(output_path, index=False)

print("Saved new Excel file with person_id:", output_path)
print(df2.head(n=20))   # first 20 rows
print("Last person_id:", df2.loc[:, 'person_id'].max())


Saved new Excel file with person_id: /content/fgnet2_with_id.xlsx
                                          image_path1  \
0   MLP_dataset/Positive_FGNET/person001/person001...   
1   MLP_dataset/Positive_FGNET/person001/person001...   
2   MLP_dataset/Positive_FGNET/person001/person001...   
3   MLP_dataset/Positive_FGNET/person001/person001...   
4   MLP_dataset/Positive_FGNET/person001/person001...   
5   MLP_dataset/Positive_FGNET/person001/person001...   
6   MLP_dataset/Positive_FGNET/person001/person001...   
7   MLP_dataset/Positive_FGNET/person001/person001...   
8   MLP_dataset/Positive_FGNET/person001/person001...   
9   MLP_dataset/Positive_FGNET/person001/person001...   
10  MLP_dataset/Positive_FGNET/person001/person001...   
11  MLP_dataset/Positive_FGNET/person001/person001...   
12  MLP_dataset/Positive_FGNET/person001/person001...   
13  MLP_dataset/Positive_FGNET/person001/person001...   
14  MLP_dataset/Positive_FGNET/person001/person001...   
15  MLP_dataset/Positi

# Merge the Morph and FGNET positive pairs

In [17]:
import pandas as pd

# Read the ID-annotated positive pairs: FGNET from Excel, Morph from CSV
fgnet_df = pd.read_excel(io='/content/fgnet2_with_id.xlsx')
morph_df = pd.read_csv(filepath_or_buffer='/content/morph_with_id.csv')

# Stack Morph rows first, then FGNET rows, with a fresh 0..n-1 index
merged_df = pd.concat(objs=[morph_df, fgnet_df], axis=0, ignore_index=True)

# List the merged column names
print("Columns in merged dataframe:", merged_df.columns.tolist())

# Optional: first and last five rows
print(merged_df.head(n=5), merged_df.tail(n=5), sep="\n")

Columns in merged dataframe: ['image_path1', 'image_path2', 'age1', 'age2', 'embedding1', 'embedding2', 'person_id']
                                         image_path1  \
0  /content/MLP_dataset/Positive_Morph/0007_16-44...   
1  /content/MLP_dataset/Positive_Morph/0007_16-44...   
2  /content/MLP_dataset/Positive_Morph/0007_16-44...   
3  /content/MLP_dataset/Positive_Morph/00423_42-5...   
4  /content/MLP_dataset/Positive_Morph/00423_42-5...   

                                         image_path2       age1       age2  \
0  /content/MLP_dataset/Positive_Morph/0007_16-44...  18.461182  22.550301   
1  /content/MLP_dataset/Positive_Morph/0007_16-44...  18.461182  23.926910   
2  /content/MLP_dataset/Positive_Morph/0007_16-44...  22.550301  23.926910   
3  /content/MLP_dataset/Positive_Morph/00423_42-5...  29.294632  22.782116   
4  /content/MLP_dataset/Positive_Morph/00423_42-5...  29.294632  24.953445   

                                          embedding1  \
0  [0.063650310039520

# Add label = 1, age range, and embedding difference

In [19]:
import pandas as pd
import ast
import numpy as np

# Read the ID-annotated positive pairs: FGNET from Excel, Morph from CSV
fgnet_df = pd.read_excel(io='/content/fgnet2_with_id.xlsx')
morph_df = pd.read_csv(filepath_or_buffer='/content/morph_with_id.csv')

# Stack FGNET rows first, then Morph rows, with a fresh 0..n-1 index
merged_df = pd.concat(objs=[fgnet_df, morph_df], axis=0, ignore_index=True)

# Every row in this table gets label 1
merged_df['label'] = 1

# Absolute gap between the two age columns of each pair
merged_df['age_range'] = merged_df['age1'].sub(merged_df['age2']).abs()

# Embeddings are stored in the CSV as list-literal text; turn them back into arrays
def str_to_array(s):
    """Parse a Python-literal string such as ``"[0.1, -0.2]"`` into a NumPy array.

    ``ast.literal_eval`` only evaluates literals, so arbitrary code in the
    string is rejected with an exception rather than executed.
    """
    parsed_values = ast.literal_eval(s)
    return np.array(parsed_values)

# Replace the text embeddings with parsed NumPy arrays (object columns)
merged_df['embedding1'] = merged_df['embedding1'].apply(str_to_array)
merged_df['embedding2'] = merged_df['embedding2'].apply(str_to_array)

# L1 distance per pair: sum of element-wise absolute differences
merged_df['embedding_diff'] = merged_df.apply(
    lambda pair: np.abs(pair['embedding1'] - pair['embedding2']).sum(),
    axis=1,
)

# Persist the positives table.
# NOTE(review): the embedding columns hold ndarrays at this point, so they are
# written in NumPy's printed form; the later printout of rows read back from
# this file shows them space-separated without commas, which str_to_array
# (ast.literal_eval) cannot parse again.
merged_df.to_csv(path_or_buf='/content/Final_Positive.csv', index=False)

# First five rows as a sanity check
print(merged_df.head(n=5))


                                         image_path1  \
0  MLP_dataset/Positive_FGNET/person001/person001...   
1  MLP_dataset/Positive_FGNET/person001/person001...   
2  MLP_dataset/Positive_FGNET/person001/person001...   
3  MLP_dataset/Positive_FGNET/person001/person001...   
4  MLP_dataset/Positive_FGNET/person001/person001...   

                                         image_path2      age1       age2  \
0  MLP_dataset/Positive_FGNET/person001/person001...  4.660048  27.186808   
1  MLP_dataset/Positive_FGNET/person001/person001...  4.660048  20.907551   
2  MLP_dataset/Positive_FGNET/person001/person001...  4.660048  31.123674   
3  MLP_dataset/Positive_FGNET/person001/person001...  4.660048  23.976097   
4  MLP_dataset/Positive_FGNET/person001/person001...  4.660048  34.156021   

                                          embedding1  \
0  [-0.021442148834466934, 0.04521297663450241, 0...   
1  [-0.021442148834466934, 0.04521297663450241, 0...   
2  [-0.021442148834466934, 0.045

In [20]:
# Last five rows of the positives table
print(merged_df.tail(n=5))

                                            image_path1  \
8106  /content/MLP_dataset/Positive_Morph/9893_50-58...   
8107  /content/MLP_dataset/Positive_Morph/9893_50-58...   
8108  /content/MLP_dataset/Positive_Morph/9893_50-58...   
8109  /content/MLP_dataset/Positive_Morph/9893_50-58...   
8110  /content/MLP_dataset/Positive_Morph/9893_50-58...   

                                            image_path2       age1       age2  \
8106  /content/MLP_dataset/Positive_Morph/9893_50-58...  52.085430  46.217159   
8107  /content/MLP_dataset/Positive_Morph/9893_50-58...  52.085430  57.117596   
8108  /content/MLP_dataset/Positive_Morph/9893_50-58...  49.811337  46.217159   
8109  /content/MLP_dataset/Positive_Morph/9893_50-58...  49.811337  57.117596   
8110  /content/MLP_dataset/Positive_Morph/9893_50-58...  46.217159  57.117596   

                                             embedding1  \
8106  [0.07272685319185257, -0.11134401708841324, 0....   
8107  [0.07272685319185257, -0.111344017

In [21]:
from google.colab import drive
import shutil

# Target location of the positives file inside the mounted Drive
destination_path = '/content/drive/MyDrive/Final_Positive.csv'

# Attach Google Drive under /content/drive (must happen before the copy)
drive.mount('/content/drive')

# Copy the locally saved positives file to Drive
shutil.copy(src='/content/Final_Positive.csv', dst=destination_path)

print(f"File saved to: {destination_path}")


Mounted at /content/drive
File saved to: /content/drive/MyDrive/Final_Positive.csv


# Prepare the negative-pair CSV file

In [22]:
import pandas as pd

# Load the UTKFace pair results
utk_df = pd.read_csv('/content/utkface_preprocessing_results.csv')

# Every row in this table gets label 0
utk_df['label'] = 0

# Add person_id column (unique for each row).
# The positive pairs already use person_id 1..N (Morph starts at 1, FGNET
# continues from 290), so numbering these rows from 1 would reuse the same IDs
# once the tables are concatenated. Continue after the highest positive ID.
positive_ids = pd.read_csv('/content/Final_Positive.csv', usecols=['person_id'])['person_id']
max_positive_id = positive_ids.max()
# An empty/all-NaN positives file yields NaN; fall back to starting at 1.
first_negative_id = 1 if pd.isna(max_positive_id) else int(max_positive_id) + 1
utk_df['person_id'] = range(first_negative_id, first_negative_id + len(utk_df))

# Save back to CSV (optional)
utk_df.to_csv('/content/utkface_with_id.csv', index=False)

# Print top rows to verify
print(utk_df.head())


                                         image_path1  \
0  /content/utkface_final/utkface_aligned_cropped...   
1  /content/utkface_final/utkface_aligned_cropped...   
2  /content/utkface_final/utkface_aligned_cropped...   
3  /content/utkface_final/utkface_aligned_cropped...   
4  /content/utkface_final/utkface_aligned_cropped...   

                                         image_path2       age1       age2  \
0  /content/utkface_final/utkface_aligned_cropped...  44.116760  26.153221   
1  /content/utkface_final/utkface_aligned_cropped...  47.330673  21.295252   
2  /content/utkface_final/utkface_aligned_cropped...  31.084494  45.400433   
3  /content/utkface_final/utkface_aligned_cropped...  38.734959  33.677563   
4  /content/utkface_final/utkface_aligned_cropped...  50.781651  30.622351   

                                          embedding1  \
0  [0.049472421407699585, 0.010554309003055096, 0...   
1  [0.003280659904703498, 0.012418401427567005, 0...   
2  [-0.044035427272319794,

In [23]:
import pandas as pd
import ast
import numpy as np
from google.colab import drive
import shutil

# --- Step 1: Load datasets ---

merged_df = pd.read_csv('/content/Final_Positive.csv')  # FGNET + Morph
utk_df = pd.read_csv('/content/utkface_preprocessing_results.csv')

# --- Step 2: Process UTKFace ---

# Every UTKFace row gets label 0
utk_df['label'] = 0

# Add person_id (unique per row).
# The positives in merged_df already occupy person_id 1..N, so numbering these
# rows from 1 would make the IDs collide after the two tables are concatenated.
# Continue counting after the highest positive ID instead.
positive_ids = merged_df.get('person_id')  # None if the column is missing
max_positive_id = positive_ids.max() if positive_ids is not None else None
if max_positive_id is None or pd.isna(max_positive_id):
    first_negative_id = 1  # no usable positive IDs: keep the original 1-based numbering
else:
    first_negative_id = int(max_positive_id) + 1
utk_df['person_id'] = range(first_negative_id, first_negative_id + len(utk_df))

# Compute age difference if age1 and age2 exist
if 'age1' in utk_df.columns and 'age2' in utk_df.columns:
    utk_df['age_range'] = (utk_df['age1'] - utk_df['age2']).abs()
else:
    utk_df['age_range'] = 0  # fallback if no age columns

# L1 embedding distance per pair, only when both embedding columns are present
if {'embedding1', 'embedding2'}.issubset(utk_df.columns):
    def str_to_array(s):
        """Parse a list-literal string into a NumPy array."""
        parsed_values = ast.literal_eval(s)
        return np.array(parsed_values)

    # Replace the text embeddings with parsed arrays (object columns)
    utk_df['embedding1'] = utk_df['embedding1'].apply(str_to_array)
    utk_df['embedding2'] = utk_df['embedding2'].apply(str_to_array)

    # Sum of element-wise absolute differences for each row
    utk_df['embedding_diff'] = utk_df.apply(
        lambda pair: np.abs(pair['embedding1'] - pair['embedding2']).sum(),
        axis=1,
    )
else:
    # No embeddings available: use a constant placeholder
    utk_df['embedding_diff'] = 0

print(utk_df.head(n=5))

                                         image_path1  \
0  /content/utkface_final/utkface_aligned_cropped...   
1  /content/utkface_final/utkface_aligned_cropped...   
2  /content/utkface_final/utkface_aligned_cropped...   
3  /content/utkface_final/utkface_aligned_cropped...   
4  /content/utkface_final/utkface_aligned_cropped...   

                                         image_path2       age1       age2  \
0  /content/utkface_final/utkface_aligned_cropped...  44.116760  26.153221   
1  /content/utkface_final/utkface_aligned_cropped...  47.330673  21.295252   
2  /content/utkface_final/utkface_aligned_cropped...  31.084494  45.400433   
3  /content/utkface_final/utkface_aligned_cropped...  38.734959  33.677563   
4  /content/utkface_final/utkface_aligned_cropped...  50.781651  30.622351   

                                          embedding1  \
0  [0.049472421407699585, 0.010554309003055096, 0...   
1  [0.003280659904703498, 0.012418401427567005, 0...   
2  [-0.044035427272319794,

In [24]:
# --- Step 3: Merge datasets ---
# Positives (merged_df) first, then the UTKFace rows, with a fresh 0..n-1 index
final_mlp_df = pd.concat(objs=[merged_df, utk_df], axis=0, ignore_index=True)

# --- Step 4: Save locally ---
final_mlp_csv_path = '/content/final_mlp_csv.csv'
final_mlp_df.to_csv(path_or_buf=final_mlp_csv_path, index=False)

# --- Step 5: Show head and tail ---
print("Head of final_mlp_df:", final_mlp_df.head(n=5), sep="\n")
print("\nTail of final_mlp_df:", final_mlp_df.tail(n=5), sep="\n")

# --- Step 6: Mount Google Drive and save there ---
destination_path = '/content/drive/MyDrive/final_mlp_csv.csv'

drive.mount('/content/drive')
shutil.copy(src=final_mlp_csv_path, dst=destination_path)

print(f"\nFile saved to Google Drive at: {destination_path}")

Head of final_mlp_df:
                                         image_path1  \
0  MLP_dataset/Positive_FGNET/person001/person001...   
1  MLP_dataset/Positive_FGNET/person001/person001...   
2  MLP_dataset/Positive_FGNET/person001/person001...   
3  MLP_dataset/Positive_FGNET/person001/person001...   
4  MLP_dataset/Positive_FGNET/person001/person001...   

                                         image_path2      age1       age2  \
0  MLP_dataset/Positive_FGNET/person001/person001...  4.660048  27.186808   
1  MLP_dataset/Positive_FGNET/person001/person001...  4.660048  20.907551   
2  MLP_dataset/Positive_FGNET/person001/person001...  4.660048  31.123674   
3  MLP_dataset/Positive_FGNET/person001/person001...  4.660048  23.976097   
4  MLP_dataset/Positive_FGNET/person001/person001...  4.660048  34.156021   

                                          embedding1  \
0  [-2.14421488e-02  4.52129766e-02  2.54299995e-...   
1  [-2.14421488e-02  4.52129766e-02  2.54299995e-...   
2  [-2.144