Going to try a few things on a smaller scale 

Smaller Scale - Use Rosie Jupyter Notebooks (T4 GPUs are fine)
- Filter out all images whose label has fewer than 30 occurrences in the label set (to ensure ample training examples per label)
- Try using a small batch, e.g. 1000 images and 5 classes, to see if I get any performance at all (accuracy above 50%)
- Scale up slightly to 100 classes and see if performance holds

Large Scale - Use the DGX nodes from the command line with an sbatch script to run the job
- Try to run the scaled-up version on the DGX nodes on Rosie, with TensorBoard as well as manual logging for reviewing training runs

In [None]:
# System Imports
import os
import subprocess
import shutil

# Pre Processing Imports 
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split 
import numpy as np 
import pandas as pd

# Deep Learning Imports 
import tensorflow as tf
from tensorflow import keras

from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout

from keras.applications.resnet50 import ResNet50, preprocess_input
from keras.optimizers import Adam

from keras.applications import ResNet50
from keras.layers import GlobalAveragePooling2D, Dense
from keras.models import Model

from keras.callbacks import TensorBoard

In [None]:
# Load the identity file into a DataFrame: one whitespace-separated
# "<image filename> <identity label>" pair per line, no header row.
# sep=r'\s+' is the supported equivalent of the deprecated delim_whitespace=True.
df = pd.read_csv('data/identity_CelebA.txt', sep=r'\s+', header=None, names=['filename', 'label'])

# Tunables for this cell.
MIN_IMAGES_PER_LABEL = 30  # labels with fewer images than this are dropped
SPLIT_SEED = 42            # fixed so the train/test split (and the files copied from it) is reproducible

# Count the occurrences of each label
label_counts = df['label'].value_counts()

# Identify labels with only one occurrence
single_count_labels = label_counts[label_counts == 1]
print(f'Number of labels with only one occurrence: {len(single_count_labels)}')

# Remove rows whose label appears fewer than MIN_IMAGES_PER_LABEL times (otherwise
# stratification doesn't work well, causing class imbalances).
# `>=` keeps labels with exactly MIN_IMAGES_PER_LABEL images; the previous `>` dropped them.
df_filtered = df[df['label'].map(label_counts) >= MIN_IMAGES_PER_LABEL]

# Count the occurrences of each label that survived the filter
label_counts = df_filtered['label'].value_counts()
print(label_counts)

# Create train/test splits based on the txt file (containing file names and labels).
# Stratifying on the label keeps every identity represented in both splits.
train_df, test_df = train_test_split(
    df_filtered,
    test_size=0.2,
    stratify=df_filtered['label'],
    random_state=SPLIT_SEED,
)

# Convert label column to string - req for downstream datagenerators to one-hot encode them.
# .assign() builds new frames instead of writing into the split results, which avoids
# pandas' SettingWithCopyWarning and leaves df_filtered's integer labels untouched.
train_df = train_df.assign(label=train_df['label'].astype(str))
test_df = test_df.assign(label=test_df['label'].astype(str))

# Both splits must expose the same label set, otherwise the one-hot encodings built
# from them downstream would not line up.
assert set(train_df['label']) == set(test_df['label']), "train/test label sets differ"

print(f"Unique classes in training set: {train_df['label'].nunique()}")
print(f"Unique classes in testing set: {test_df['label'].nunique()}")


In [None]:
# Copy the actual images into per-split folders based on the train-test split of the labels
def copy_images(df, source_dir, target_dir):
    """Copy every file named in ``df['filename']`` from ``source_dir`` to ``target_dir``.

    Args:
        df: DataFrame with a ``filename`` column holding file names relative to ``source_dir``.
        source_dir: Directory the files are read from.
        target_dir: Directory the files are written to; created if it does not exist.

    The source files are left in place (this copies, it does not move).
    """
    # Make sure the destination exists; an already-existing directory is fine.
    os.makedirs(target_dir, exist_ok=True)
    for image_name in df['filename']:
        src_path = os.path.join(source_dir, image_name)
        dst_path = os.path.join(target_dir, image_name)
        shutil.copy(src_path, dst_path)

# Where the aligned CelebA images live, and where each split gets its own copy.
source_directory = 'data/img_align_celeba'
train_directory, test_directory = 'data/train', 'data/test'

# Copy (not move) the images for each split: train first, then test.
for split_df, split_directory in ((train_df, train_directory), (test_df, test_directory)):
    copy_images(split_df, source_directory, split_directory)

In [None]:
# Data generators
IMAGE_SIZE = (224, 224)  # every image is resized to this (height, width)
BATCH_SIZE = 32

# Pin the label -> one-hot index mapping once and hand it to every generator, so that
# all of them are guaranteed to encode the same identity at the same output index.
# Sorted string labels match the order Keras infers by default, so the training
# mapping is unchanged.
class_names = sorted(train_df['label'].unique())

# validation_split reserves 20% of train_df; that part is exposed below through
# subset='validation' instead of being silently dropped.
train_datagen = ImageDataGenerator(rescale=1./255, validation_split=0.2)
test_datagen = ImageDataGenerator(rescale=1./255)

# Train Generator - class_mode 'categorical' yields one-hot labels
train_generator = train_datagen.flow_from_dataframe(
    dataframe=train_df,
    directory=train_directory,
    x_col='filename',
    y_col='label',
    classes=class_names,
    target_size=IMAGE_SIZE,  # Resizes all images to 224x224 (higher resolution)
    batch_size=BATCH_SIZE,
    class_mode='categorical',
    subset='training'
)

# Validation Generator - the 20% of train_df held out by validation_split.
# Available as `validation_data=` for model selection without touching the test set.
validation_generator = train_datagen.flow_from_dataframe(
    dataframe=train_df,
    directory=train_directory,
    x_col='filename',
    y_col='label',
    classes=class_names,
    target_size=IMAGE_SIZE,
    batch_size=BATCH_SIZE,
    class_mode='categorical',
    subset='validation',
    shuffle=False
)

# Test Generator - also 'categorical'. shuffle=False keeps the batch order aligned
# with test_generator.classes / .filenames so predictions can be matched to labels.
test_generator = test_datagen.flow_from_dataframe(
    dataframe=test_df,
    directory=test_directory,
    x_col='filename',
    y_col='label',
    classes=class_names,
    target_size=IMAGE_SIZE,
    batch_size=BATCH_SIZE,
    class_mode='categorical',
    shuffle=False
)

# Print the shapes of input images and labels from the generator.
# The last dimension of the label batch is the number of classes the model must output.
inputs, labels = next(train_generator)
print('Input batch shape:', inputs.shape)
print('Label batch shape:', labels.shape)

In [None]:
# Baseline Model - Simple CNN

# Number of classes. Taken from the generator's label -> index mapping rather than
# hard-coded: the softmax width must equal the width of the one-hot labels the
# generator yields, which depends on how many identities survive the filtering step.
num_classes = len(train_generator.class_indices)
print(f'Number of output classes: {num_classes}')

# Model Definition: three conv/pool stages, then a dense head with dropout.
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(224, 224, 3)),  # must match the generators' target_size
    MaxPooling2D(2, 2),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D(2, 2),
    Conv2D(128, (3, 3), activation='relu'),
    MaxPooling2D(2, 2),
    Flatten(),
    Dense(512, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')  # one output per identity in the training labels
])

# categorical_crossentropy pairs with the generators' class_mode='categorical' (one-hot labels)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Create a TensorBoard callback
tensorboard_base_model = TensorBoard(log_dir='./logs/base_model', histogram_freq=1)

# Model Training
# NOTE(review): validation_data is the held-out test set, so the val_* curves and the
# final test score below are measured on the same images. Use a separate validation
# split if these curves are used for model selection.
history = model.fit(
    train_generator,
    epochs=10,
    validation_data=test_generator,  # Validation data to evaluate the model
    callbacks=[tensorboard_base_model]
)

# Model Evaluation
test_loss, test_accuracy = model.evaluate(test_generator)
print("Test accuracy:", test_accuracy)

In [None]:
# Advanced Model v1 - Resnet Architecture - random weights

# Output width comes from the generator's label -> index mapping so it always equals
# the width of the one-hot labels, instead of a hard-coded class count.
num_classes = len(train_generator.class_indices)

# Model Definition: ResNet50 backbone without its classifier head, randomly initialised
base_model = ResNet50(weights=None, include_top=False, input_shape=(224, 224, 3))

x = base_model.output
x = GlobalAveragePooling2D()(x)  # Adds a global spatial average pooling layer
x = Dense(1024, activation='relu')(x)  # Add a fully-connected layer
predictions = Dense(num_classes, activation='softmax')(x)  # Output layer: one unit per identity

# NOTE: this rebinds `model`; any model assigned to that name earlier is no longer reachable through it.
model = Model(inputs=base_model.input, outputs=predictions)

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy']) # Compile the model

model.summary() # Summary of the model

# Create a TensorBoard callback
tensorboard_resnet_random_weights = TensorBoard(log_dir='./logs/resnet_random_weights', histogram_freq=1)


# Model Training
# NOTE(review): validation_data is the held-out test set, so the "Test" curves below
# and the final score are measured on the same images.
history = model.fit(
    train_generator,
    steps_per_epoch=len(train_generator),
    epochs=10,
    validation_data=test_generator,
    validation_steps=len(test_generator),
    callbacks=[tensorboard_resnet_random_weights]
)

# Plot training & validation curves: accuracy first, then loss (one figure each)
for metric_name, axis_label in (('accuracy', 'Accuracy'), ('loss', 'Loss')):
    plt.plot(history.history[metric_name])
    plt.plot(history.history[f'val_{metric_name}'])
    plt.title(f'Model {metric_name}')
    plt.ylabel(axis_label)
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Test'], loc='upper left')
    plt.show()

# Final evaluation; scores is [loss, accuracy] in the order given to compile()
scores = model.evaluate(test_generator, steps=len(test_generator))
print(f"Test Accuracy: {scores[1]*100}%")