## Data Pipeline for Quantifying and Mitigating Bias in the CelebA Dataset

In [1]:
pip install -r requirements.txt

Defaulting to user installation because normal site-packages is not writeable


Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np

In [3]:
from cuda_available import check_cuda_availaibility
from quantify_bias import compute_metrics, count_attributes
from read_data_image_files import read_data_file, preprocess_data, read_image_data, image_standardization
from multilabel_classifier import train_val_test_split, build_model, refit_model, evaluate_model, make_predictions, analyze_failed_samples, save_over_sampling_data
from diffusion_model import define_stable_diffusion_model, generate_images_situational_prompt, generate_images_static_prompt

2023-08-24 21:50:04.583701: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-08-24 21:50:04.625520: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Driver (to run data pipeline)

In [4]:
def pre_data_augmentation():
    """Run the baseline (pre-augmentation) stage of the pipeline.

    Steps, in order:
      1. Read the protected-attribute table from the ``Data`` directory and
         print a preview, the per-attribute counts and the bias metrics.
      2. Encode the table, read the corresponding CelebA images and split
         them with ``train_val_test_split``.
      3. Build the ``vggnet`` model, evaluate it on the test split and print
         the per-attribute accuracies returned by ``make_predictions``.
      4. Join the protected and non-protected attribute tables on
         ``image_id``, pass the failed samples and the joined table to
         ``analyze_failed_samples`` and hand the result to
         ``save_over_sampling_data`` under ``Data Generation-Stable Diffusion``.

    Returns:
        None. Everything is reported through printed output and through
        whatever the helper functions persist.
    """
    print("Prior to data augmention")

    data_dir = 'Data'

    # --- Load and describe the protected attributes -------------------------
    protected = read_data_file(data_dir, 'protected_attributes_post_eda.csv')
    print("Original data samples")
    print(protected.head())

    print("Distribution of data")
    count_attributes(protected)

    print("Bias quantification")
    print(compute_metrics(protected))

    # --- Encode labels, read images, split ----------------------------------
    encoded = preprocess_data(protected)
    images, labels = read_image_data(
        encoded, data_dir, 'celebA/img_align_celeba/img_align_celeba/'
    )
    X_train, y_train, X_val, y_val, X_test, y_test = train_val_test_split(images, labels)

    # --- Baseline model -----------------------------------------------------
    model_name = 'vggnet'
    model = build_model(model_name, X_train, y_train, X_val, y_val, X_test, y_test)
    # The aggregate scores and per-attribute precisions are not used here;
    # the three-way unpacking is kept so the helpers' return shape is still checked.
    _accuracy, _precision, _auc = evaluate_model(model, X_test, y_test)
    failed_samples, attribute_accuracies, _attribute_precisions = make_predictions(
        model, X_test, y_test, encoded
    )
    print("Accuracy by attribute:", attribute_accuracies)

    # --- Describe the failures with the full attribute set ------------------
    non_protected = read_data_file(data_dir, 'non-protected_attributes_post_eda.csv')
    # pd.merge defaults to an inner join, so only image_ids present in both tables survive.
    all_attributes = pd.merge(protected, non_protected, on='image_id')
    misclassified = analyze_failed_samples(failed_samples, all_attributes)

    save_over_sampling_data(model_name, 'Data Generation-Stable Diffusion', misclassified)

In [5]:
def generate_samples(prompt_type):
    """Generate synthetic images for the misclassified samples.

    Reads ``miss_classified_image_samples_vggnet.csv`` from the
    ``Data Generation-Stable Diffusion`` directory, obtains a pipeline from
    ``define_stable_diffusion_model`` (and prints it), then calls one of the
    two image generators with the ``Data`` directory, the image folder name
    and the output CSV name.

    Args:
        prompt_type: ``"situational"`` selects
            ``generate_images_situational_prompt``; any other value selects
            ``generate_images_static_prompt``.

    Returns:
        None. Errors raised while defining the pipeline or generating images
        are caught and printed rather than re-raised.
    """
    model_name = 'vggnet'
    misclassified = read_data_file(
        'Data Generation-Stable Diffusion',
        f"miss_classified_image_samples_{model_name}.csv",
    )

    try:
        pipe = define_stable_diffusion_model()
        print(pipe)

        # Pick the generator first, then make a single call with shared arguments.
        if prompt_type == "situational":
            generate_images = generate_images_situational_prompt
        else:
            generate_images = generate_images_static_prompt

        generate_images(
            misclassified,
            pipe,
            'Data',
            f'synthetic_data_augmentation_celebA_{model_name}',
            f'images_generated_stable_diffusion_{model_name}.csv',
        )
    except MemoryError as mem_error:
        print("Memory error:", mem_error)
    except Exception as e:
        print("An error occurred:", e)
        


In [6]:
def post_data_augmentation(X_val=None, y_val=None, X_test=None, y_test=None):
    """Refit and re-evaluate the model on the augmented dataset.

    Concatenates the original protected-attribute table with the table of
    generated images, encodes the combined table, reads the image data, refits
    the model via ``refit_model`` and evaluates it on the held-out test split.

    Args:
        X_val, y_val: Validation split passed to ``refit_model``.
        X_test, y_test: Test split passed to ``evaluate_model`` and
            ``make_predictions``.
            All four are optional. When none is given, the splits are rebuilt
            from the original CelebA images exactly as the pre-augmentation
            stage builds them. Previously these names were read as undefined
            globals, which raised ``NameError`` on a fresh kernel.

    Raises:
        ValueError: If only some of the four split arguments are supplied.

    Returns:
        None. Results are reported through printed output.
    """
    print("Post data augmention")

    ## post-data augmentation
    directory = 'Data'
    model_name = 'vggnet'

    original = read_data_file(directory, 'protected_attributes_post_eda.csv')

    generated = read_data_file(directory, f'images_generated_stable_diffusion_{model_name}.csv')
    generated.columns = ["image_id", "race", "gender", "emotion"]

    splits = {"X_val": X_val, "y_val": y_val, "X_test": X_test, "y_test": y_test}
    missing = [name for name, value in splits.items() if value is None]
    if len(missing) == len(splits):
        # No held-out data supplied: rebuild it the same way the
        # pre-augmentation stage does (same table, same image folder, same splitter).
        # NOTE(review): this only reproduces the baseline's validation/test sets if
        # train_val_test_split is deterministic (fixed seed) - confirm in
        # multilabel_classifier, otherwise pass the splits in explicitly.
        # NOTE(review): assumes encoding the original table alone yields the same
        # label columns as encoding the combined table below - confirm.
        original_encoded = preprocess_data(original)
        X, y = read_image_data(
            original_encoded, directory, 'celebA/img_align_celeba/img_align_celeba/'
        )
        _, _, X_val, y_val, X_test, y_test = train_val_test_split(X, y)
    elif missing:
        raise ValueError(
            "Pass all of X_val, y_val, X_test, y_test or none of them; missing: "
            + ", ".join(missing)
        )

    combined = pd.concat([original, generated], ignore_index=True)
    data_encoded = preprocess_data(combined)

    # Derive the folder from model_name so it matches the folder name that
    # generate_samples passes to the image generators (the old literal said 'vgg_16').
    image_dir = f'synthetic_data_augmentation_celebA_{model_name}/'
    X_generated, y_generated = read_image_data(data_encoded, directory, image_dir)

    model = refit_model(X_generated, y_generated, X_val, y_val)
    test_accuracy, precision, auc = evaluate_model(model, X_test, y_test)
    failed_samples, attribute_accuracies, attribute_precisions = make_predictions(
        model, X_test, y_test, data_encoded
    )
    # Report per-attribute accuracy like the pre-augmentation stage, so the two
    # runs can be compared directly.
    print("Accuracy by attribute:", attribute_accuracies)

In [7]:
# Run the baseline stage. Its printed output (data preview, attribute counts,
# bias metrics, test scores and per-attribute accuracy) follows this cell.
pre_data_augmentation()

Prior to data augmention
Original data samples
     image_id             race gender emotion
0  009680.jpg  latino hispanic  Woman   happy
1  009028.jpg            white  Woman   happy
2  007702.jpg            white  Woman     sad
3  009681.jpg            white    Man   happy
4  010355.jpg            white  Woman     sad
Distribution of data
Categories and counts for race:
race
white              7027
asian               968
latino hispanic     953
black               724
middle eastern      584
Name: count, dtype: int64


Categories and counts for gender:
gender
Woman    5959
Man      4297
Name: count, dtype: int64


Categories and counts for emotion:
emotion
happy       4551
neutral     3120
sad         1183
fear         760
angry        471
surprise     140
disgust       31
Name: count, dtype: int64


Bias quantification
  Attribute  Shannon Diversity  Shannon Evenness  Simpson Diversity  \
0      race           1.052934          0.654225           2.019335   
1    gender           

100%|████████████████████████████████████| 10256/10256 [00:12<00:00, 841.81it/s]
2023-08-24 21:50:27.121782: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1960] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...



Test Accuracy: 0.8394160866737366

Test loss: 0.280304878950119

Precision: 0.9724770784378052

Auc: 0.8522663116455078
Accuracy by attribute: {'Attribute_image_id': 89.78102189781022, 'Attribute_race_asian': 92.45742092457421, 'Attribute_race_black': 90.51094890510949, 'Attribute_race_latino hispanic': 94.40389294403893, 'Attribute_race_middle eastern': 35.523114355231144, 'Attribute_race_white': 88.07785888077859, 'Attribute_gender_Man': 84.42822384428223, 'Attribute_gender_Woman': 95.62043795620438, 'Attribute_emotion_angry': 99.7566909975669, 'Attribute_emotion_disgust': 92.45742092457421, 'Attribute_emotion_fear': 58.63746958637469, 'Attribute_emotion_happy': 66.18004866180048, 'Attribute_emotion_neutral': 88.32116788321169, 'Attribute_emotion_sad': 99.02676399026764}


In [None]:
# Prompt style used for image generation. generate_samples() treats any value
# other than "situational" as the static prompt.
prompt = "situational" # either "static" or "situational"
generate_samples(prompt)

In [None]:
# Final stage: refit and re-evaluate on the augmented data. It reads
# images_generated_stable_diffusion_vggnet.csv from Data/, presumably written by
# the generate_samples() cell above, so run that cell first.
post_data_augmentation()