In [1]:
import pandas as pd
from tqdm import tqdm

# Load the pair metadata CSV; its first column is used as the row index.
path = "/gscratch/jamiemmt/andersonlee/bias-mitigation/drdj/pairs/celebA_md.csv"
df = pd.read_csv(path, index_col=0)

print(len(df))

# Row masks: the two members of a pair carry the same aux value / the same label.
same_aux = df["aux_1"] == df["aux_2"]
same_label = df["label_1"] == df["label_2"]

aux_agreement = df[same_aux]
label_agreement = df[same_label]
group_agreement = df[same_aux & same_label]  # agree on both aux and label

# Fraction of all pairs that falls into each agreement subset.
for subset in (aux_agreement, label_agreement, group_agreement):
    print(len(subset) / len(df))

150873
0.9783526542191114
0.9334871050486171
0.913490153970558


# Pairs Distribution

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

def get_unbalanced_distribution(plot=False, num_groups=20):
    """
    Build a fixed two-level (unbalanced) distribution over groups.

    The first ``num_groups // 2`` groups get relative weight 0.2 and all remaining
    groups get relative weight 0.8; the weights are then normalised to sum to 1.

    :param bool plot: If True, draw the unnormalised weights as a bar chart
        (low-weight groups in green, high-weight groups in blue)
    :param int num_groups: Number of groups in the distribution
    :return: A numpy array of length ``num_groups`` whose entries sum to 1
    """
    # The weights below are deterministic, but the seed call is kept because it
    # resets NumPy's global RNG and later code may rely on that side effect.
    np.random.seed(1)
    num_low = num_groups // 2
    # Give the leftover group (odd num_groups) to the high-weight half so the
    # result always has exactly num_groups entries.
    num_high = num_groups - num_low
    dist = np.concatenate([np.full(num_low, 0.2), np.full(num_high, 0.8)])
    if plot:
        plt.figure(figsize=(30, 10))
        # One colour per bar, sized from the actual halves rather than a fixed 10 + 10.
        plt.bar(x=range(len(dist)), height=dist, color=["green"] * num_low + ["blue"] * num_high)
    return dist / dist.sum()

def get_pairs_sample_distribution(df, plot=False, sample="P", num_groups=20):
    """
    Compute the empirical distribution of the ``aux_1`` group id over the rows of ``df``.

    :param pd.DataFrame df: Frame with an ``aux_1`` column holding group ids that are
        convertible to integers in ``[0, num_groups)``
    :param bool plot: If True, draw the distribution as a bar chart
    :param str sample: Unused; kept so existing calls keep working
    :param int num_groups: Number of groups (length of the returned array)
    :return: A float numpy array of length ``num_groups``; entry ``g`` is the fraction of
        rows whose ``aux_1`` equals ``g``. All zeros when ``df`` has no rows.
    :raises IndexError: If a group id falls outside ``[0, num_groups)``
    """
    if len(df) == 0:
        # Nothing to count; avoid a 0 / 0 division.
        group_dist = np.zeros(num_groups)
    else:
        # Read the column as a whole instead of df.loc[i, ...] per row, so the result
        # does not depend on the frame having a 0..n-1 index.
        groups = df["aux_1"].astype(int).to_numpy()
        if groups.min() < 0 or groups.max() >= num_groups:
            raise IndexError(
                f"aux_1 group ids must lie in [0, {num_groups}), "
                f"got range [{groups.min()}, {groups.max()}]"
            )
        group_cnt = np.bincount(groups, minlength=num_groups)
        group_dist = group_cnt / len(df)
    if plot:
        plt.figure(figsize=(30, 10))
        plt.bar(x=range(num_groups), height=group_dist)
    return group_dist

pairs_path = "/gscratch/cse/lee0618/cifar-100/meta/pairs.csv"
unbalanced_pairs_path = "/gscratch/cse/lee0618/cifar-100/meta/unb_pairs.csv"

# First call is made only for its bar chart (plot=True); the returned array is discarded.
pairs_df = pd.read_csv(pairs_path, index_col=0)
get_pairs_sample_distribution(pairs_df, plot=True)

unbalanced_pairs_df = pd.read_csv(unbalanced_pairs_path, index_col=0)
unbalanced_pairs_dist = get_pairs_sample_distribution(unbalanced_pairs_df)
sample_dist = get_unbalanced_distribution()

# Grouped bar chart: for every group, the pairs distribution sits at x and the
# sample distribution is shifted right by one bar width.
width = 0.4
x = np.arange(20)
fig, ax = plt.subplots(layout="constrained", figsize=(30, 10))
for offset, heights, label in (
    (0, unbalanced_pairs_dist, "Pairs distribution"),
    (width, sample_dist, "Sample distribution"),
):
    rects = ax.bar(x=x + offset, height=heights, width=width, label=label)
    ax.bar_label(rects, padding=3)
ax.legend(loc='upper left', prop={'size': 25})

# Correlation between distance and pair agreement

In [5]:
import pandas as pd
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt

# Pair metadata used by the rest of this cell; the CSV's first column becomes the index.
path = "/gscratch/cse/lee0618/cifar-100/meta/pairs.csv"
df = pd.read_csv(filepath_or_buffer=path, index_col=0)

def plot_err_bar(agreement, dist, plot=True):
    """
    Compare the distances of disagreeing and agreeing entries and return the agreement rate.

    When ``plot`` is True, every distance is drawn as a hollow marker in a "Disagree"
    (x=0) or "Agree" (x=1) column, and each column's mean is overlaid with an error bar
    of one standard deviation.

    :param np.ndarray agreement: A 1-0 (or boolean) array-like; non-zero marks agreement
    :param np.ndarray dist: An array-like of distances with the same length as agreement
    :param bool plot: If False, skip all plotting and only compute the rate
    :return: Fraction of entries that agree; ``nan`` when agreement is empty
    :raises ValueError: If agreement and dist have different lengths
    """
    # Accept lists and boolean arrays as well as 0/1 integer ndarrays.
    agree_mask = np.asarray(agreement).astype(bool)
    dist = np.asarray(dist)
    if len(agree_mask) != len(dist):
        raise ValueError(
            f"agreement and dist must have the same length, got {len(agree_mask)} and {len(dist)}"
        )
    if len(agree_mask) == 0:
        # No entries: the rate is undefined and there is nothing to draw.
        return float("nan")
    if plot:
        agree_dist = dist[agree_mask]
        disagree_dist = dist[~agree_mask]
        plt.figure(figsize=(5, 8))
        y_means = np.array([np.mean(disagree_dist), np.mean(agree_dist)])
        y_stds = np.array([np.std(disagree_dist), np.std(agree_dist)])
        plt.scatter(x=np.zeros(len(disagree_dist)), y=disagree_dist, alpha=0.5, facecolor='none', s=30, edgecolor='tab:orange')
        plt.scatter(x=np.ones(len(agree_dist)), y=agree_dist, alpha=0.5, facecolor='none', s=30, edgecolor='tab:blue')
        plt.errorbar([0, 1], y_means, yerr=y_stds, fmt='s', capsize=10, color="black")
        plt.xticks([0, 1], ['Disagree', 'Agree'])
        plt.ylabel('Distance')
        plt.xlim(-0.5, 1.5)
        plt.grid(True)
    return agree_mask.mean()

def correlation(agreement, dist):
    """
    Return the correlation coefficient between distance and agreement.

    :param np.ndarray agreement: A 1-0 numpy array indicating agreement
    :param np.ndarray dist: A numpy array of distances with the same length as agreement
    :return: The off-diagonal entry of the 2x2 ``np.corrcoef`` matrix of (dist, agreement)
    """
    # Row 0 holds the distances, row 1 the agreement indicator.
    paired = np.stack([dist, agreement])
    coef_matrix = np.corrcoef(x=paired)
    return coef_matrix[0, 1]

def group_agreement_rate(df, group):
    """
    Compute the distance/agreement correlation and the agreement rate for one group.

    Only rows whose ``aux_1`` equals ``group`` are considered; a row counts as agreeing
    when its ``aux_2`` equals its ``aux_1``.

    :param pd.DataFrame df: Frame with ``aux_1``, ``aux_2`` and ``dist`` columns
    :param group: Value of ``aux_1`` that selects the rows
    :return: Tuple ``(correlation, agreement_rate)``
    """
    in_group = df[df["aux_1"] == group]
    same_aux = (in_group["aux_1"] == in_group["aux_2"]).to_numpy().astype(int)
    distances = in_group["dist"].to_numpy()
    rate = plot_err_bar(same_aux, distances, plot=False)
    corr = correlation(same_aux, distances)
    return corr, rate

def group_max_dist(df, group):
    """
    Return the largest ``dist`` among the rows whose ``aux_1`` equals ``group``.

    :param pd.DataFrame df: Frame with ``aux_1`` and ``dist`` columns
    :param group: Value of ``aux_1`` that selects the rows
    """
    distances = df.loc[df["aux_1"] == group, "dist"]
    return np.max(distances.to_numpy())

def filter_by_dist(df, dist):
    """
    Keep only the rows whose ``dist`` is strictly below a threshold.

    :param pd.DataFrame df: Frame with a ``dist`` column
    :param dist: Exclusive upper bound on ``dist``
    :return: A DataFrame with the matching rows (original index labels are kept)
    """
    below_threshold = df["dist"].lt(dist)
    return pd.DataFrame(df.loc[below_threshold])

# Optional whole-dataset view (disabled): distance vs. label agreement.
# label_agreement = np.array(df["label_1"] == df["label_2"]).astype(int)
# dist = np.array(df["dist"])
# plot_err_bar(label_agreement, dist)

# One (correlation, agreement rate) pair for each of the group ids 0..19.
per_group = [group_agreement_rate(df, group) for group in range(20)]
corrs = [group_corr for group_corr, _ in per_group]
agreement_rates = [group_rate for _, group_rate in per_group]

# Across groups: how the distance/agreement correlation relates to the agreement rate.
cor = np.corrcoef(x=np.stack([np.array(corrs), np.array(agreement_rates)]))
print(f"Correlation = {cor[0][1]}")

Correlation = -0.8263033192551323
