In [9]:
import pandas as pd
import numpy as np
import math
import os

In [10]:
# Function to compute Shannon Diversity and Evenness
def compute_shannon_diversity_and_evenness(p_values):
    """Compute the Shannon diversity index and the corresponding evenness.

    Parameters
    ----------
    p_values : array-like of float
        Proportion of each category. Any sequence that ``np.asarray`` can
        convert to a float array is accepted (list, tuple, ndarray, Series).

    Returns
    -------
    (H, E) : tuple of float
        ``H = -sum(p * ln(p))`` and ``E = H / ln(S)``, where ``S`` is the
        number of categories supplied (``len(p_values)``). ``E`` is NaN when
        fewer than two categories are supplied, because ``ln(S)`` is then
        zero or undefined.
    """
    p = np.asarray(p_values, dtype=float)
    S = len(p)

    # Use the limit convention 0 * ln(0) = 0: categories with a zero
    # proportion contribute nothing, instead of turning H into NaN.
    # Only exact zeros are skipped, so invalid (negative) input still
    # surfaces as NaN rather than being silently ignored.
    present = p[p != 0]
    H = -np.sum(present * np.log(present))

    # ln(1) == 0 would make the ratio 0/0; report the evenness as undefined.
    E = H / np.log(S) if S > 1 else np.nan
    return H, E

# Function to compute Simpson Diversity and Evenness
def compute_simpson_diversity_and_evenness(p_values):
    """Compute the inverse Simpson diversity index and its evenness.

    Parameters
    ----------
    p_values : array-like of float
        Proportion of each category. Lists and tuples are accepted as well
        as arrays; the input is converted to a float array first, because
        multiplying two plain Python sequences raises ``TypeError``.

    Returns
    -------
    (D, E) : tuple of float
        ``D = 1 / sum(p ** 2)`` and ``E = D / S``, where ``S`` is the number
        of categories supplied (``len(p_values)``).
    """
    p = np.asarray(p_values, dtype=float)
    D = 1 / np.sum(p * p)
    E = D / len(p)
    return D, E


In [11]:
def compute_metrics(df, attributes=('race', 'gender', 'emotion')):
    """Build a table of diversity and evenness metrics, one row per attribute.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding one column per requested attribute.
    attributes : str or iterable of str, optional
        Column names to evaluate. Defaults to ``race``, ``gender`` and
        ``emotion``, so existing ``compute_metrics(df)`` calls are unchanged.

    Returns
    -------
    pandas.DataFrame
        Columns ``Attribute``, ``Shannon Diversity``, ``Shannon Evenness``,
        ``Simpson Diversity`` and ``Simpson Evenness``. The columns are
        present even when ``attributes`` is empty.

    Raises
    ------
    KeyError
        If a requested attribute is not a column of ``df``.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(attributes, str):
        attributes = [attributes]

    columns = [
        'Attribute',
        'Shannon Diversity',
        'Shannon Evenness',
        'Simpson Diversity',
        'Simpson Evenness',
    ]

    results = []
    for attribute in attributes:
        # Relative frequency of each category observed in the column.
        p_values = df[attribute].value_counts(normalize=True).values
        shannon_diversity, shannon_evenness = compute_shannon_diversity_and_evenness(p_values)
        simpson_diversity, simpson_evenness = compute_simpson_diversity_and_evenness(p_values)

        results.append({
            'Attribute': attribute,
            'Shannon Diversity': shannon_diversity,
            'Shannon Evenness': shannon_evenness,
            'Simpson Diversity': simpson_diversity,
            'Simpson Evenness': simpson_evenness,
        })

    # Passing the column list keeps the schema stable for an empty selection.
    results_df = pd.DataFrame(results, columns=columns)
    return results_df

In [12]:
def count_attributes(df, attributes=('race', 'gender', 'emotion')):
    """Print the categories of each attribute column together with their counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding one column per requested attribute.
    attributes : str or iterable of str, optional
        Column names to summarise. Defaults to ``race``, ``gender`` and
        ``emotion``, so existing ``count_attributes(df)`` calls are unchanged.

    Returns
    -------
    None
        The counts are written to standard output only.

    Raises
    ------
    KeyError
        If a requested attribute is not a column of ``df``.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(attributes, str):
        attributes = [attributes]

    for attribute in attributes:
        attribute_counts = df[attribute].value_counts()
        print(f"Categories and counts for {attribute}:")
        print(attribute_counts)
        print("\n")
    

### Reading Files

In [13]:
# All inputs are located relative to the directory the notebook runs from.
cwd = os.getcwd()
parent_dir = os.path.join(cwd, '..')

# Locations of the three CSV inputs (two folders next to the working directory).
file_path1 = os.path.join(parent_dir, 'Data', 'protected_attributes_post_eda.csv')
file_path2 = os.path.join(parent_dir, 'Data', 'images_generated_stable_diffusion_vggnet.csv')
file_path3 = os.path.join(
    parent_dir,
    'Data Generation-Stable Diffusion',
    'miss_classified_image_samples_vgg_net.csv',
)

# Load every file before touching any column labels.
df1 = pd.read_csv(file_path1)
df2 = pd.read_csv(file_path2)
df3 = pd.read_csv(file_path3)

# Give the first two frames the same four column labels.
attribute_columns = ["image_id", "race", "gender", "emotion"]
df1.columns = list(attribute_columns)
df2.columns = list(attribute_columns)

# Only df1 loses its identifier column in this cell.
df1 = df1.drop(columns=["image_id"])

### Original dataset

In [14]:
# Print the per-category counts of race, gender and emotion in df1,
# followed by the number of rows in df1.
count_attributes(df1)
print("Total samples:", len(df1))

Categories and counts for race:
race
white              7027
asian               968
latino hispanic     953
black               724
middle eastern      584
Name: count, dtype: int64


Categories and counts for gender:
gender
Woman    5959
Man      4297
Name: count, dtype: int64


Categories and counts for emotion:
emotion
happy       4551
neutral     3120
sad         1183
fear         760
angry        471
surprise     140
disgust       31
Name: count, dtype: int64


Total samples: 10256


In [15]:
# Diversity and evenness table for df1 (read from protected_attributes_post_eda.csv).
results_df = compute_metrics(df1)

In [16]:
# Bare expression: the notebook renders the metrics table as the cell output.
results_df

Unnamed: 0,Attribute,Shannon Diversity,Shannon Evenness,Simpson Diversity,Simpson Evenness
0,race,1.052934,0.654225,2.019335,0.403867
1,gender,0.679959,0.980973,1.948823,0.974411
2,emotion,1.382167,0.710293,3.220081,0.460012


### Synthetic Augmented dataset

### Misclassified image samples

In [17]:
# Remove the two columns that are not analysed below. errors="ignore" lets
# this cell be re-run after the columns are already gone instead of raising
# KeyError on the second execution.
df3 = df3.drop(columns=["other_attributes", "count"], errors="ignore")

In [18]:
# Print the per-category counts of race, gender and emotion in df3
# (read from miss_classified_image_samples_vgg_net.csv).
count_attributes(df3)

Categories and counts for race:
race
white              706
latino hispanic    102
asian               99
black               64
middle eastern      46
Name: count, dtype: int64


Categories and counts for gender:
gender
Woman    586
Man      431
Name: count, dtype: int64


Categories and counts for emotion:
emotion
happy       433
neutral     330
sad         116
fear         78
angry        41
surprise     15
disgust       4
Name: count, dtype: int64




In [19]:
# Report the number of rows in df3.
print("Total misclassified samples prior data augmentation:", len(df3))

Total misclassified samples prior data augmentation: 1017


In [20]:
# Diversity and evenness table for df3.
results_df1 = compute_metrics(df3)

In [21]:
# Bare expression: the notebook renders the df3 metrics table as the cell output.
results_df1

Unnamed: 0,Attribute,Shannon Diversity,Shannon Evenness,Simpson Diversity,Simpson Evenness
0,race,1.02487,0.636787,1.970626,0.394125
1,gender,0.681488,0.983179,1.954598,0.977299
2,emotion,1.386764,0.712656,3.254006,0.464858


### Samples post data augmentation

In [22]:
# The four column labels were already assigned to df2 in the loading cell, so
# repeating the assignment here is redundant. It also made this cell fail when
# re-run, because four labels no longer fit once image_id has been dropped.
# errors="ignore" keeps the drop itself safe to repeat.
df2 = df2.drop(columns=["image_id"], errors="ignore")

In [23]:
# Print the per-category counts of race, gender and emotion in df2
# (read from images_generated_stable_diffusion_vggnet.csv).
count_attributes(df2)

Categories and counts for race:
race
white              2790
latino hispanic     390
black               111
asian                 8
Name: count, dtype: int64


Categories and counts for gender:
gender
Woman    2587
Man       712
Name: count, dtype: int64


Categories and counts for emotion:
emotion
neutral     1172
happy       1144
sad          594
fear         227
angry         90
surprise      66
disgust        6
Name: count, dtype: int64




In [24]:
# Diversity and evenness table for df2.
# NOTE(review): this reuses the name results_df1, overwriting the table computed
# earlier from df3; a distinct name would keep both tables available.
results_df1 = compute_metrics(df2)

In [25]:
# Bare expression: renders the metrics table most recently assigned to
# results_df1 (at this point, the one computed from df2).
results_df1

Unnamed: 0,Attribute,Shannon Diversity,Shannon Evenness,Simpson Diversity,Simpson Evenness
0,race,0.522871,0.377172,1.369225,0.342306
1,gender,0.52157,0.752467,1.511686,0.755843
2,emotion,1.415774,0.727564,3.511707,0.501672


In [26]:
# Report the number of rows in df2.
print("Total misclassified samples after data augmentation:", len(df2))

Total misclassified samples after data augmentation: 3299
