In [None]:
import csv
import os
import shutil

import matplotlib.pyplot as plt
import pandas as pd
import sklearn.metrics
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# List every physical device TensorFlow has detected (e.g. to see whether a GPU is visible).
# As the last expression of the cell, the returned list is shown as the cell output.
tf.config.list_physical_devices()

In [None]:
# Size every image is resized to when loaded, and the number of images per batch.
image_size = (512, 512)
batch_size = 32

# Build the training and validation datasets in a single call (subset="both").
# Class labels are inferred from the directory layout; the fixed seed keeps the
# 80/20 split the same from run to run.
train_ds, val_ds = keras.utils.image_dataset_from_directory(
    directory="H:/Photos/AI/Labeled",
    image_size=image_size,
    batch_size=batch_size,
    validation_split=0.2,
    subset="both",
    seed=327,
)

In [None]:
# Preview up to nine images from the first training batch, each titled with its
# integer class label.
fig, axes = plt.subplots(3, 3, figsize=(10, 10))
for ax in axes.flat:
    # Hide the frame of every panel up front so unused panels stay blank.
    ax.axis("off")

for images, labels in train_ds.take(1):
    # zip() stops at the shorter input, so a batch with fewer than nine images
    # fills only the panels it has images for instead of raising an index error.
    for ax, image, label in zip(axes.flat, images, labels):
        ax.imshow(image.numpy().astype("uint8"))
        ax.set_title(int(label))

In [None]:
# Random augmentations for the training images: horizontal flips and random
# rotations (rotation factor 0.1).
augmentation_layers = [
    layers.RandomFlip("horizontal"),
    layers.RandomRotation(0.1),
]
data_augmentation = keras.Sequential(augmentation_layers)


def _augment_batch(img, label):
    """Run the augmentation pipeline on the images; labels pass through unchanged."""
    return data_augmentation(img), label


# Apply data_augmentation to the training images.
# NOTE: this rebinds train_ds, so running the cell twice stacks the augmentation.
train_ds = train_ds.map(_augment_batch, num_parallel_calls=tf.data.AUTOTUNE)


# Prefetching samples in GPU memory helps maximize GPU utilization.
# (Currently disabled: the two lines below are commented out.)
#train_ds = train_ds.prefetch(tf.data.AUTOTUNE)
#val_ds = val_ds.prefetch(tf.data.AUTOTUNE)

In [None]:
def make_model(input_shape, num_classes):
    """Build a convolutional image classifier.

    The network is an entry convolution followed by three blocks of separable
    convolutions, each closed by max-pooling and added to a 1x1-convolution
    projection of the block's input (a residual connection), then a final
    separable convolution, global average pooling, dropout and a dense head.

    Args:
        input_shape: Shape of a single input image, passed to ``keras.Input``.
        num_classes: Number of target classes. Exactly 2 produces a single
            sigmoid unit; any other value produces ``num_classes`` softmax units.

    Returns:
        An uncompiled ``keras.Model`` mapping the input to the head's output.
    """
    inputs = keras.Input(shape=input_shape)

    # Entry block: multiply inputs by 1/255, then a strided convolution.
    net = layers.Rescaling(1.0 / 255)(inputs)
    net = layers.Conv2D(128, 3, strides=2, padding="same")(net)
    net = layers.BatchNormalization()(net)
    net = layers.Activation("relu")(net)

    # Tensor that the next block's residual projection starts from.
    shortcut = net

    for filters in (256, 512, 728):
        # Two identical relu -> separable conv -> batch norm stages.
        for _ in range(2):
            net = layers.Activation("relu")(net)
            net = layers.SeparableConv2D(filters, 3, padding="same")(net)
            net = layers.BatchNormalization()(net)

        net = layers.MaxPooling2D(3, strides=2, padding="same")(net)

        # The strided 1x1 convolution gives the shortcut the same spatial size
        # and channel count as the pooled branch so the two can be added.
        projected = layers.Conv2D(filters, 1, strides=2, padding="same")(shortcut)
        net = layers.add([net, projected])
        shortcut = net

    net = layers.SeparableConv2D(1024, 3, padding="same")(net)
    net = layers.BatchNormalization()(net)
    net = layers.Activation("relu")(net)

    net = layers.GlobalAveragePooling2D()(net)

    # Binary problems use one sigmoid unit; everything else uses softmax.
    is_binary = num_classes == 2
    units = 1 if is_binary else num_classes
    activation = "sigmoid" if is_binary else "softmax"

    net = layers.Dropout(0.5)(net)
    outputs = layers.Dense(units, activation=activation)(net)
    return keras.Model(inputs, outputs)


# Build the binary classifier for RGB images of the configured size.
model = make_model(input_shape=(*image_size, 3), num_classes=2)
keras.utils.plot_model(model, show_shapes=True)

epochs = 75

# The checkpoint path is formatted with the epoch number, so each save gets its own file.
checkpoint_callback = keras.callbacks.ModelCheckpoint(
    filepath="H:/Photos/AI/Models/save_at_{epoch}.keras"
)
callbacks = [checkpoint_callback]

model.compile(
    loss="binary_crossentropy",
    optimizer=keras.optimizers.Adam(1e-3),
    metrics=["accuracy"],
)
model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=epochs,
    callbacks=callbacks,
)

# Replace the in-memory model with the checkpoint written at epoch 67.
model = keras.models.load_model("H:/Photos/AI/Models/save_at_67.keras")


def predict_labeled_images(prediction_type):
    """Score every ``.jpeg`` image of a labeled set and write a CSV report.

    Walks the data directory recursively, runs the module-level ``model`` on
    each image (resized to the module-level ``image_size``) and writes one row
    per image: file path, first model output value, truth label. The report
    has no header row.

    The report is rewritten from scratch on every call, so re-running the
    notebook does not pile up duplicate rows. Rows are written with the
    ``csv`` module so a path containing a comma stays in one field, and as
    UTF-8, which is what ``pandas.read_csv`` expects by default.

    Args:
        prediction_type: ``"train"`` to score ``H:/Photos/AI/Labeled/`` or
            ``"holdout"`` to score ``H:/Photos/AI/Holdout/``.

    Raises:
        ValueError: If ``prediction_type`` is neither ``"train"`` nor ``"holdout"``.
    """
    print(f"Predicting {prediction_type} images")
    if prediction_type == "train":
        data_path = "H:/Photos/AI/Labeled/"
        report_path = "H:/Photos/AI/Labeled/labeled_predictions.csv"
    elif prediction_type == "holdout":
        data_path = "H:/Photos/AI/Holdout/"
        report_path = "H:/Photos/AI/holdout_predictions.csv"
    else:
        raise ValueError("Invalid prediction_type. Expected 'train' or 'holdout'.")

    image_paths = [
        os.path.join(dirpath, filename)
        for dirpath, _dirnames, filenames in os.walk(data_path)
        for filename in filenames
        if os.path.splitext(filename)[1] == '.jpeg'
    ]

    # Open the report once for the whole run instead of once per image.
    with open(report_path, "w", newline="", encoding="utf-8") as report_file:
        writer = csv.writer(report_file)

        for this_file in image_paths:
            img = keras.utils.load_img(this_file, target_size=image_size)
            img_array = keras.utils.img_to_array(img)
            img_array = tf.expand_dims(img_array, 0)  # Create batch axis

            # verbose=0: one progress bar per image would flood the cell output.
            predictions = model.predict(img_array, verbose=0)

            # The truth label is taken from the path. 'female' must be tested
            # first because it contains 'male' as a substring.
            lowered_path = this_file.lower()
            if 'female' in lowered_path:
                truth = 'Female'
            elif 'male' in lowered_path:
                truth = 'Male'
            else:
                truth = 'Unknown'

            writer.writerow([this_file, f"{predictions[0][0]}", truth])


# Write the prediction reports for both labeled sets, training first.
for labeled_set in ("train", "holdout"):
    predict_labeled_images(labeled_set)
# Assess training predictions
# Fallback so image_size exists even if the dataset cell was not run in this
# kernel; the metrics below do not use it.
try:
    image_size
except NameError:
    image_size = (512, 512)

# The report has no header row: file path, model output, truth label.
train_predictions = pd.read_csv("H:/Photos/AI/Labeled/labeled_predictions.csv", header=None)
train_predictions.columns = ['file', 'prediction', 'truth']

# Keep only the latest row per file in case the report holds rows from more
# than one prediction run; repeated rows would otherwise be counted twice.
train_predictions = train_predictions.drop_duplicates(subset='file', keep='last').reset_index(drop=True)

# Outputs below 0.5 are read as "Female", everything else as "Male".
train_predictions['prediction_class'] = train_predictions['prediction'].lt(0.5).map({True: "Female", False: "Male"})
train_predictions['misclassified'] = train_predictions['truth'].ne(train_predictions['prediction_class']).astype(int)

# Score only rows whose truth label is one of the two classes. An 'Unknown'
# truth can never match a prediction and makes the Brier score fail because
# the target is no longer binary.
train_scored = train_predictions[train_predictions['truth'].isin(['Female', 'Male'])]
train_unknown_count = len(train_predictions) - len(train_scored)
if train_unknown_count:
    print(f"Excluding {train_unknown_count} rows without a Female/Male truth label from the metrics")

#Model metrics
print(sklearn.metrics.accuracy_score(y_true=train_scored['truth'], y_pred=train_scored['prediction_class']))
# Fixed label order: rows/columns are always [Female, Male].
print(sklearn.metrics.confusion_matrix(y_true=train_scored['truth'], y_pred=train_scored['prediction_class'],
                                       labels=['Female', 'Male']))
# The probabilities are passed positionally because the keyword was renamed
# from y_prob to y_proba in newer scikit-learn releases.
train_brier = sklearn.metrics.brier_score_loss(train_scored['truth'], train_scored['prediction'], pos_label='Male')
print(f"Brier score: {train_brier}")
# Assess holdout predictions
# The report has no header row: file path, model output, truth label.
holdout_predictions = pd.read_csv("H:/Photos/AI/holdout_predictions.csv", header=None)
holdout_predictions.columns = ['file', 'prediction', 'truth']

# Keep only the latest row per file in case the report holds rows from more
# than one prediction run; repeated rows would otherwise be counted twice.
holdout_predictions = holdout_predictions.drop_duplicates(subset='file', keep='last').reset_index(drop=True)

# Outputs below 0.5 are read as "Female", everything else as "Male".
holdout_predictions['prediction_class'] = holdout_predictions['prediction'].lt(0.5).map(
    {True: "Female", False: "Male"})
# Distance of the output from the 0.5 decision threshold (0 = least confident).
holdout_predictions['prediction_confidence'] = (holdout_predictions['prediction'] - 0.5).abs()
holdout_predictions['misclassified'] = holdout_predictions['truth'].ne(
    holdout_predictions['prediction_class']).astype(int)

# Score only rows whose truth label is one of the two classes. An 'Unknown'
# truth can never match a prediction and makes the Brier score fail because
# the target is no longer binary.
holdout_scored = holdout_predictions[holdout_predictions['truth'].isin(['Female', 'Male'])]
holdout_unknown_count = len(holdout_predictions) - len(holdout_scored)
if holdout_unknown_count:
    print(f"Excluding {holdout_unknown_count} rows without a Female/Male truth label from the metrics")

#Model metrics
print(
    sklearn.metrics.accuracy_score(y_true=holdout_scored['truth'], y_pred=holdout_scored['prediction_class']))
# Fixed label order: rows/columns are always [Female, Male].
print(sklearn.metrics.confusion_matrix(y_true=holdout_scored['truth'],
                                       y_pred=holdout_scored['prediction_class'],
                                       labels=['Female', 'Male']))
# The probabilities are passed positionally because the keyword was renamed
# from y_prob to y_proba in newer scikit-learn releases.
holdout_brier = sklearn.metrics.brier_score_loss(holdout_scored['truth'], holdout_scored['prediction'],
                                                 pos_label='Male')
print(f"Brier score: {holdout_brier}")

# Find holdout images with inaccurate predictions or low confidence.
# They are listed here (misclassified rows first, then least confident first)
# and saved for manual review; 'Unknown'-truth rows count as misclassified and
# are therefore included.
holdout_review = holdout_predictions.query("0.4 < prediction < 0.6 or misclassified == 1").sort_values(
    by=['misclassified', 'prediction_confidence'], ascending=[False, True]).reset_index(drop=True)
holdout_review.to_csv("H:/Photos/AI/holdout_review.csv", index=False)

# Target folder
target_folder = 'H:/Photos/AI/working_folder'

# shutil.move() only moves a file *into* the destination when the destination
# is an existing directory; otherwise the file is renamed to that path. Create
# the folder first so the first image is not turned into a file called
# "working_folder".
os.makedirs(target_folder, exist_ok=True)

# Move every file flagged for review; a failure is reported and the loop goes on.
for file_path in holdout_review['file']:
    try:
        shutil.move(file_path, target_folder)
        print(f'Moved {file_path} to {target_folder}')
    except FileNotFoundError:
        print(f'File {file_path} not found')
    except Exception as e:
        # Deliberately broad: one bad file (e.g. a name that already exists in
        # the target folder) must not stop the remaining moves.
        print(f'Error moving {file_path}: {e}')

holdout_review
# Classify unlabeled images
# Fallback so this section also works when the dataset cell was not run.
try:
    image_size
except NameError:
    image_size = (512, 512)

model = keras.models.load_model("H:/Photos/AI/Models/save_at_67.keras")

unlabeled_directory = "H:/Photos/AI/Unlabeled/"
unlabeled_paths = [
    os.path.join(dirpath, filename)
    for dirpath, _dirnames, filenames in os.walk(unlabeled_directory)
    for filename in filenames
    if os.path.splitext(filename)[1] == '.jpeg'
]

# The report is rewritten on every run (no duplicate rows after a re-run) and
# opened once for the whole loop. csv.writer quotes fields that contain commas,
# and UTF-8 matches what pandas.read_csv expects by default.
with open("H:/Photos/AI/tf_unlabeled_predictions.csv", "w", newline="", encoding="utf-8") as report_file:
    writer = csv.writer(report_file)

    for this_file in unlabeled_paths:
        print(this_file)

        # The occupation is the fifth underscore-separated part of the file
        # name. A name with fewer parts is recorded as "Unknown" instead of
        # aborting the whole run with an IndexError.
        name_parts = os.path.basename(this_file).split('_')
        if len(name_parts) > 4:
            occupation = name_parts[4]
        else:
            occupation = "Unknown"
            print(f"Could not read an occupation from {this_file}")

        img = keras.utils.load_img(this_file, target_size=image_size)
        img_array = keras.utils.img_to_array(img)
        img_array = tf.expand_dims(img_array, 0)  # Create batch axis

        # verbose=0: one progress bar per image would flood the cell output.
        predictions = model.predict(img_array, verbose=0)
        score = predictions[0][0]
        prediction_class = "Female" if score < 0.5 else "Male"

        writer.writerow([this_file, occupation, f"{score}", prediction_class])
# Load unlabeled predictions
# The report has no header row, so the column names are assigned after reading.
unlabeled_predictions = pd.read_csv(
    "H:/Photos/AI/tf_unlabeled_predictions.csv", header=None
).set_axis(['file', 'occupation', 'prediction', 'prediction_class'], axis=1)
# Distance of the model output from the 0.5 decision threshold.
unlabeled_predictions['certainty'] = (unlabeled_predictions['prediction'] - 0.5).abs()
plt.hist(unlabeled_predictions['prediction'], bins=20)
unlabeled_predictions
# Analyze gender depictions of different professions
# Share of each predicted class within every occupation.
unlabeled_predictions.groupby('occupation')['prediction_class'].value_counts(normalize=True)

