# In [None]:
# Import necessary libraries
import json
import os
import warnings

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, Input
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.utils import to_categorical

# Seed NumPy's global RNG so NumPy-driven randomness repeats between runs
np.random.seed(42)

# Dataset locations (point data_dir at your local copy of the dataset)
data_dir = "path_to_your_dataset"
image_dir, json_dir = (
    os.path.join(data_dir, subfolder)
    for subfolder in ("raw_images", "converted_json")
)

# Load and preprocess data
def load_data(image_dir, json_dir):
    """Load ``.jpg`` images and parse age/gender labels from their file names.

    File names are expected to look like ``<id>_<age>_<gender>.jpg`` (for
    example ``42_16_0.jpg`` gives age 16, gender 0).  Files whose name cannot
    be parsed, or that OpenCV cannot decode, are skipped with a warning
    instead of aborting the whole load.

    Args:
        image_dir: Directory scanned (non-recursively) for ``.jpg`` files.
        json_dir: Currently unused; kept so existing callers keep working.

    Returns:
        A tuple ``(images, ages, genders)`` of NumPy arrays in matching order.
        Each image is resized to 128x128 and divided by 255.0; channels stay
        in the order ``cv2.imread`` returns them (BGR).
    """
    images = []
    ages = []
    genders = []

    # os.listdir() order is arbitrary; sorting keeps the sample order (and
    # therefore the seeded train/val/test split) reproducible.
    for filename in sorted(os.listdir(image_dir)):
        if not filename.endswith(".jpg"):
            continue

        # Extract age and gender from filename (e.g., 42_16_0 -> age=16, gender=0)
        parts = filename.split("_")
        try:
            age = int(parts[1])
            gender = int(parts[2].split(".")[0])  # 0 for male, 1 for female (adjust if different)
        except (IndexError, ValueError):
            warnings.warn(f"Skipping {filename!r}: cannot parse age/gender from file name")
            continue

        # cv2.imread signals failure by returning None rather than raising.
        img = cv2.imread(os.path.join(image_dir, filename))
        if img is None:
            warnings.warn(f"Skipping {filename!r}: image could not be read")
            continue

        img = cv2.resize(img, (128, 128))  # Resize to 128x128
        img = img / 255.0  # Normalize pixel values

        images.append(img)
        ages.append(age)
        genders.append(gender)

    return np.array(images), np.array(ages), np.array(genders)

# Read every labelled image from disk
X, y_age, y_gender = load_data(image_dir=image_dir, json_dir=json_dir)

# Gender labels are used as the raw 0/1 integers, so no label encoding step is applied here.

# Two-stage split giving 60% train / 20% validation / 20% test.
# Stage 1 holds out 20% of all samples for testing.
test_split_options = dict(test_size=0.2, random_state=42)
X_train, X_test, y_age_train, y_age_test, y_gender_train, y_gender_test = train_test_split(
    X, y_age, y_gender, **test_split_options
)

# Stage 2 takes 25% of the remaining 80% for validation (0.25 x 0.8 = 0.2).
val_split_options = dict(test_size=0.25, random_state=42)
X_train, X_val, y_age_train, y_age_val, y_gender_train, y_gender_val = train_test_split(
    X_train, y_age_train, y_gender_train, **val_split_options
)

# Data augmentation: small random geometric perturbations applied to training batches.
# NOTE(review): horizontal_flip mirrors the image; the saved model name suggests
# handwriting data, where mirrored samples may not be realistic -- confirm it is wanted.
datagen = ImageDataGenerator(
    rotation_range=10,
    width_shift_range=0.1,
    height_shift_range=0.1,
    shear_range=0.1,
    zoom_range=0.1,
    horizontal_flip=True,
    fill_mode='nearest'
)
# datagen.fit() is intentionally not called: it only computes statistics for
# featurewise_center / featurewise_std_normalization / zca_whitening, none of
# which are enabled here, and it would copy the whole training set in memory.

# Build CNN model
def build_model(input_shape=(128, 128, 3)):
    """Create the two-headed CNN: a shared conv trunk with gender and age outputs.

    A Sequential model chains layers one after another, which would feed the
    gender prediction into the age layer and leave a single output.  The
    functional API is used so that both heads branch off the same features
    and the model exposes the two named outputs the losses/metrics refer to.

    Args:
        input_shape: Shape of one input image (height, width, channels).

    Returns:
        An uncompiled Keras model with outputs ``gender_output`` (sigmoid)
        and ``age_output`` (linear), in that order.
    """
    inputs = Input(shape=input_shape)

    features = Conv2D(32, (3, 3), activation='relu')(inputs)
    features = MaxPooling2D((2, 2))(features)
    features = Conv2D(64, (3, 3), activation='relu')(features)
    features = MaxPooling2D((2, 2))(features)
    features = Conv2D(64, (3, 3), activation='relu')(features)
    features = MaxPooling2D((2, 2))(features)
    features = Flatten()(features)
    features = Dense(64, activation='relu')(features)
    features = Dropout(0.5)(features)

    gender_head = Dense(1, activation='sigmoid', name='gender_output')(features)  # Binary gender output
    age_head = Dense(1, name='age_output')(features)  # Age regression output

    return Model(inputs=inputs, outputs=[gender_head, age_head])


# Seed TensorFlow as well as NumPy so weight initialisation and dropout are repeatable.
tf.random.set_seed(42)

model = build_model()

# Compile the model; dict keys must match the output layer names above.
model.compile(optimizer='adam',
              loss={'gender_output': 'binary_crossentropy', 'age_output': 'mse'},
              metrics={'gender_output': 'accuracy', 'age_output': 'mae'},
              loss_weights={'gender_output': 0.5, 'age_output': 0.5})

# Model summary
model.summary()

# Train the model
BATCH_SIZE = 32


def _multi_output_flow(generator, images, gender_labels, age_labels, batch_size):
    """Yield augmented batches with labels split per model output.

    ``ImageDataGenerator.flow`` expects a single label array, not a dict, so
    both label vectors are stacked column-wise for the flow and separated
    again for each batch.

    Args:
        generator: The ``ImageDataGenerator`` used for augmentation.
        images: Training images.
        gender_labels: Gender label per image.
        age_labels: Age label per image.
        batch_size: Number of samples per yielded batch.

    Yields:
        ``(batch_images, {'gender_output': ..., 'age_output': ...})`` where
        each label entry has shape ``(batch, 1)``.
    """
    stacked_labels = np.column_stack([gender_labels, age_labels]).astype('float32')
    for batch_images, batch_labels in generator.flow(images, stacked_labels, batch_size=batch_size):
        yield batch_images, {
            'gender_output': batch_labels[:, 0:1],
            'age_output': batch_labels[:, 1:2],
        }


history = model.fit(
    _multi_output_flow(datagen, X_train, y_gender_train, y_age_train, BATCH_SIZE),
    # The wrapped flow never ends, so the length of one epoch must be given explicitly.
    steps_per_epoch=int(np.ceil(len(X_train) / BATCH_SIZE)),
    validation_data=(X_val, {'gender_output': y_gender_val, 'age_output': y_age_val}),
    epochs=50,
    verbose=1
)

# Evaluate the model
# return_dict=True gives metrics by name, so the code does not depend on the
# order (or number) of values a particular Keras version returns.
test_metrics = model.evaluate(
    X_test, {'gender_output': y_gender_test, 'age_output': y_age_test},
    verbose=0, return_dict=True
)
test_loss = test_metrics['loss']
# Per-output losses are looked up defensively; they are not used below.
test_gender_loss = test_metrics.get('gender_output_loss')
test_age_loss = test_metrics.get('age_output_loss')
test_gender_acc = test_metrics['gender_output_accuracy']
test_age_mae = test_metrics['age_output_mae']
print(f"Test Gender Accuracy: {test_gender_acc:.4f}")
print(f"Test Age MAE: {test_age_mae:.4f}")

# Plot training history: one panel per tracked metric, training vs. validation curve
history_panels = (
    ('gender_output_accuracy', 'val_gender_output_accuracy',
     'Gender Classification Accuracy', 'Accuracy'),
    ('age_output_mae', 'val_age_output_mae',
     'Age Regression MAE', 'MAE'),
)

plt.figure(figsize=(12, 4))

for panel_index, (train_key, val_key, panel_title, metric_label) in enumerate(history_panels, start=1):
    plt.subplot(1, 2, panel_index)
    plt.plot(history.history[train_key], label=f'Training {metric_label}')
    plt.plot(history.history[val_key], label=f'Validation {metric_label}')
    plt.title(panel_title)
    plt.xlabel('Epoch')
    plt.ylabel(metric_label)
    plt.legend()

plt.tight_layout()
plt.show()

# Persist the trained model to disk
model.save('handwriting_cnn_model.h5')

# Example prediction on a test image
def predict_image(model, img_path):
    """Predict gender and age for a single image file.

    The image is preprocessed the same way as the training data (read with
    OpenCV, resized to 128x128, divided by 255.0).

    Args:
        model: Model whose ``predict`` returns the gender and age predictions,
            either as a ``[gender, age]`` list/tuple, a dict keyed by
            ``'gender_output'`` / ``'age_output'``, or a single array whose
            last axis holds ``(gender, age)``.
        img_path: Path of the image to classify.

    Returns:
        A tuple ``(gender_pred, age_pred)``: ``"Male"`` or ``"Female"`` and the
        predicted age as a float.

    Raises:
        ValueError: If the image at ``img_path`` cannot be read.
    """
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread returns None instead of raising on a missing/corrupt file.
        raise ValueError(f"Could not read image: {img_path}")
    img = cv2.resize(img, (128, 128))
    img = img / 255.0
    batch = np.expand_dims(img, axis=0)
    pred = model.predict(batch)

    # A multi-output model returns one array per output rather than one 2-D array.
    if isinstance(pred, dict):
        gender_raw, age_raw = pred['gender_output'], pred['age_output']
    elif isinstance(pred, (list, tuple)):
        gender_raw, age_raw = pred
    else:
        pred = np.asarray(pred)
        gender_raw, age_raw = pred[..., 0], pred[..., 1]

    gender_prob = float(np.ravel(gender_raw)[0])
    age_pred = float(np.ravel(age_raw)[0])

    # Labels are 0 = male, 1 = female, so the sigmoid output is P(female).
    gender_pred = "Female" if gender_prob > 0.5 else "Male"
    return gender_pred, age_pred

# Test prediction on the first .jpg image (alphabetically) in the image folder.
# Filtering by extension avoids passing a non-image file to predict_image, and
# the emptiness check avoids an IndexError when the folder has no images.
sample_candidates = sorted(
    name for name in os.listdir(image_dir) if name.endswith(".jpg")
)
if sample_candidates:
    sample_img = os.path.join(image_dir, sample_candidates[0])
    gender, age = predict_image(model, sample_img)
    print(f"Predicted Gender: {gender}, Predicted Age: {age}")
else:
    print(f"No .jpg images found in {image_dir}; skipping sample prediction")