CSC6621 final-project "sandbox" notebook (Arsalon's version): prepares the CelebA identity data and defines a baseline CNN and a ResNet50 classifier.


In [8]:
# System Imports
import os
import subprocess
import shutil
from pyunpack import Archive

# Pre Processing Imports 
import matplotlib.pyplot as plt # type: ignore
from sklearn.model_selection import train_test_split # type: ignore
import numpy as np # type: ignore
import pandas as pd

# Deep Learning Imports 
import tensorflow as tf
from tensorflow import keras

from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout


from keras.applications import ResNet50
from keras.layers import GlobalAveragePooling2D, Dense
from keras.models import Model


In [2]:
# Optional one-time setup: unpack the split CelebA image archive.
# The step is opt-in so that a normal "Run All" does not redo the extraction;
# set RUN_EXTRACTION = True for the first run on a fresh checkout.
RUN_EXTRACTION = False

# Define the path to the directory containing the split archive
archive_directory = 'data/img_celeba.7z'

# Specify the first file of the split archive
# (presumably the extractor continues with the remaining parts - confirm)
first_part_path = os.path.join(archive_directory, 'img_celeba.7z.001')

# Define the output directory for the extracted files
output_directory = 'data/img_celeba_extracted'


def extract_archive(archive_path, destination):
    """Extract `archive_path` into `destination`, reporting the outcome.

    This is a best-effort step: any failure is printed instead of raised so
    the rest of the notebook can still be inspected.

    Args:
        archive_path: Path of the archive (first part of a split archive).
        destination: Directory that receives the extracted files; created
            if it does not exist yet.

    Returns:
        True when the extraction finished, False when it failed.
    """
    try:
        # Create the target up front so a missing directory is not what
        # makes the extraction fail.
        os.makedirs(destination, exist_ok=True)
        Archive(archive_path).extractall(destination)
    except Exception as e:
        print("Failed to extract:", e)
        return False
    print("Extraction successful.")
    return True


# Extract the archive (only when explicitly requested above)
if RUN_EXTRACTION:
    extract_archive(first_part_path, output_directory)




Extraction successful.


In [None]:
# Optional one-time setup: copy the extracted images into train/ and test/
# folders. The step is opt-in so that a normal "Run All" does not re-copy the
# whole dataset; set RUN_SPLIT = True for the first run on a fresh checkout.
RUN_SPLIT = False


def copy_files(files, directory):
    """Copy every path in `files` into `directory`."""
    for file in files:
        shutil.copy(file, directory)


def split_into_train_test(output_directory='data/img_celeba_extracted',
                          test_size=0.2, random_state=42):
    """Split the extracted images into train and test directories.

    Reads every regular file in `<output_directory>/img_celeba`, splits the
    list with `train_test_split`, and copies the two parts into
    `<output_directory>/train` and `<output_directory>/test`.

    Args:
        output_directory: Root of the extracted dataset.
        test_size: Fraction of the files that goes to the test directory.
        random_state: Seed passed to `train_test_split`.

    Returns:
        Tuple `(n_train, n_test)` with the number of files copied to each
        directory.
    """
    # Path to the directory where all images are stored
    image_directory = os.path.join(output_directory, 'img_celeba')

    # Define the train and test directories
    train_dir = os.path.join(output_directory, 'train')
    test_dir = os.path.join(output_directory, 'test')

    # Create directories for train and test datasets if they don't exist
    os.makedirs(train_dir, exist_ok=True)
    os.makedirs(test_dir, exist_ok=True)

    # Directory listing order is arbitrary, so sort the paths: the seeded
    # split is only reproducible when its input order is.
    with os.scandir(image_directory) as entries:
        all_files = sorted(entry.path for entry in entries if entry.is_file())

    # Split files into train and test sets (80-20 split by default)
    train_files, test_files = train_test_split(
        all_files, test_size=test_size, random_state=random_state
    )

    # Copy files to respective directories
    copy_files(train_files, train_dir)
    copy_files(test_files, test_dir)
    return len(train_files), len(test_files)


if RUN_SPLIT:
    split_into_train_test()
    print("Files successfully split into training and testing directories.")


In [None]:
# Define paths
train_dir = 'data/img_celeba_extracted/train'
test_dir = 'data/img_celeba_extracted/test'

# Load the labels dataset (whitespace-separated "filename label" rows) and
# ensure labels are string type, as required by class_mode='categorical'.
file_path = os.path.join(os.getcwd(), 'data', 'identity_CelebA.txt')
data = pd.read_csv(file_path, sep=r'\s+', header=None, names=['filename', 'label'])
data['label'] = data['label'].astype(str)  # Convert labels to string to match expected format

# Check which labelled files are present in the train and test directories
data['train_exists'] = data['filename'].apply(lambda x: os.path.exists(os.path.join(train_dir, x)))
data['test_exists'] = data['filename'].apply(lambda x: os.path.exists(os.path.join(test_dir, x)))

# Separate DataFrames for train, test based on file existence
train_data = data[data['train_exists']]
test_data = data[data['test_exists']]

if train_data.empty:
    # Fail early with a clear message instead of building empty generators.
    raise FileNotFoundError(
        f"No labelled images found in {train_dir!r}; run the extraction/split steps first."
    )

# One shared, ordered class list. Without it each generator derives its own
# label -> index mapping from the rows it keeps, so the one-hot vectors of the
# train and test generators would not line up (or even have the same width).
class_names = sorted(train_data['label'].unique())

# Identities that never occur in the training data cannot be predicted by a
# model trained on it, so they are left out of the test frame explicitly.
test_data_known = test_data[test_data['label'].isin(class_names)]

# Data generators
train_datagen = ImageDataGenerator(rescale=1./255, validation_split=0.2)
test_datagen = ImageDataGenerator(rescale=1./255)

# Settings shared by all three generators
flow_kwargs = dict(
    x_col='filename',
    y_col='label',
    target_size=(224, 224),  # Resizes all images to 224x224 (higher resolution)
    batch_size=32,
    class_mode='categorical',
    classes=class_names,
)

# Train Generator - the 80% 'training' share of the train directory
train_generator = train_datagen.flow_from_dataframe(
    dataframe=train_data,
    directory=train_dir,
    subset='training',
    **flow_kwargs
)

# Validation Generator - the held-out 20% 'validation' share of the train
# directory. Without `subset` it would yield every row, training ones included.
validation_generator = train_datagen.flow_from_dataframe(
    dataframe=train_data,
    directory=train_dir,
    subset='validation',
    **flow_kwargs
)

# Test Generator - images from the separate test directory
test_generator = test_datagen.flow_from_dataframe(
    dataframe=test_data_known,
    directory=test_dir,
    **flow_kwargs
)

# Print the shapes of input images and labels from the generator
inputs, labels = next(train_generator)
print('Input batch shape:', inputs.shape)
print('Label batch shape:', labels.shape)


In [None]:
# Baseline Model - Simple CNN

# Size of the softmax layer. Taken from the training generator so it always
# equals the width of the one-hot labels it emits, instead of a hard-coded
# count that goes stale when the data split changes.
num_classes = len(train_generator.class_indices)

model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(224, 224, 3)),
    MaxPooling2D(2, 2),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D(2, 2),
    Conv2D(128, (3, 3), activation='relu'),
    MaxPooling2D(2, 2),
    Flatten(),
    Dense(512, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')  # one output per identity class
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model. Per-epoch monitoring uses the validation generator so the
# test data stays untouched until the final evaluation below.
history = model.fit(
    train_generator,
    epochs=10,
    validation_data=validation_generator
)

# Evaluate the model on the test data
test_loss, test_accuracy = model.evaluate(test_generator)
print("Test accuracy:", test_accuracy)

In [None]:
# Advanced Model v1 - Resnet Architecture

# Size of the softmax layer. Taken from the training generator so it always
# equals the width of the one-hot labels the model will be trained on.
num_classes = len(train_generator.class_indices)

# Define the base ResNet50 model (weights=None: randomly initialised, no
# pretrained weights; include_top=False: without the original classifier head)
base_model = ResNet50(weights=None, include_top=False, input_shape=(224, 224, 3))

# Add new layers on top
x = base_model.output
x = GlobalAveragePooling2D()(x)  # Adds a global spatial average pooling layer
x = Dense(1024, activation='relu')(x)  # Add a fully-connected layer
predictions = Dense(num_classes, activation='softmax')(x)  # Output layer, one unit per identity class

# This is the model we will train
model = Model(inputs=base_model.input, outputs=predictions)

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Summary of the model
model.summary()