# Bias Analysis

In [1]:
# Run to make the plots pop out of the .ipynb file
%matplotlib qt

In [None]:
# Run to make the plots stay in the .ipynb file
%matplotlib inline 

## Importing data/Setting up

In [1]:
# Imports
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Read the csv files
norms_df = pd.read_csv("./Data/SMID_norms.csv")
# Rename columns accordingly
norms_df.rename(columns={'Unnamed: 0': 'ID'}, inplace=True)

In [17]:
with open("./Data/SMID_regions_IDs.pickle", 'rb') as handle:
    afr_ids,as_ids,eur_ids,me_ids = pickle.load(handle)
afr_mask = norms_df['ID'].isin(afr_ids)
as_mask = norms_df['ID'].isin(as_ids)
eur_mask = norms_df['ID'].isin(eur_ids)
me_mask = norms_df['ID'].isin(me_ids)

In [18]:
sum(afr_mask)

787

In [19]:
sum(as_mask)

915

In [20]:
sum(eur_mask)

841

In [21]:
sum(me_mask)

644

In [None]:
fig, axs = plt.subplots(2, 3)
col_names = [#"valence_mean", "arousal_mean", 
             "moral_mean", "authority_mean",
             "fairness_mean", "harm_mean",
             "ingroup_mean", "purity_mean"]
plt_names = [#"Valence", "Arousal", 
             "Morality", "Authority (Relevance)",
             "Fairness (Relevance)", "Care (Relevance)",
             "Ingroup (Relevance)", "Purity (Relevance)"]

afr = norms_df[afr_mask]
afr['Region'] = 'Africa'
asia = norms_df[as_mask]
asia['Region'] = 'Asia'
eur = norms_df[eur_mask]
eur['Region'] = 'Europe'
me = norms_df[me_mask]
me['Region'] = 'Middle East'
regions_df = pd.concat([afr,asia,eur,me])

for i in range(6):
    sns.boxplot(y=col_names[i], x= "Region", data=regions_df,  orient='v' , ax=axs[i//3, i%3])
    axs[i//3, i%3].set_ylabel('Image Normative Ratings')
    axs[i//3, i%3].set_title(plt_names[i])

fig.suptitle("Distribution of Ratings for Different Regions of SMID")
plt.show()

## Stat testing

In [5]:
np.random.seed(15654)
col_names = ["valence_mean", "arousal_mean", 
             "moral_mean", "authority_mean",
             "fairness_mean", "harm_mean",
             "ingroup_mean", "purity_mean"]

mask = afr_mask
arr = []
for i in range(2,8):
    df = norms_df[mask][col_names[i]]
    arr.append(np.array([np.mean(np.random.choice(df, 100, replace = True)) for _ in range(10000)]))
arr=np.stack(arr)
afr_arr = arr

mask = as_mask
arr = []
for i in range(2,8):
    df = norms_df[mask][col_names[i]]
    arr.append(np.array([np.mean(np.random.choice(df, 100, replace = True)) for _ in range(10000)]))
arr=np.stack(arr)
as_arr = arr

mask = eur_mask
arr = []
for i in range(2,8):
    df = norms_df[mask][col_names[i]]
    arr.append(np.array([np.mean(np.random.choice(df, 100, replace = True)) for _ in range(10000)]))
arr=np.stack(arr)
eur_arr = arr

mask = me_mask
arr = []
for i in range(2,8):
    df = norms_df[mask][col_names[i]]
    arr.append(np.array([np.mean(np.random.choice(df, 100, replace = True)) for _ in range(10000)]))
arr=np.stack(arr)
me_arr = arr

In [12]:
variable_index = 3 # Care
test_arr = afr_arr
other_arrs = [as_arr, eur_mask]

cum_arr = True
for arr in other_arrs:
    cum_arr=np.logical_and(cum_arr, (test_arr[variable_index])>(arr[variable_index]))
print(1-np.sum(cum_arr)/10000)

0.030399999999999983


In [13]:
variable_index = 3 # Care
test_arr = me_arr
# [spo_arr, bus_arr, tec_arr, sci_arr, hea_arr]
other_arrs = [as_arr, eur_mask]

cum_arr = True
for arr in other_arrs:
    cum_arr=np.logical_and(cum_arr, (test_arr[variable_index])>(arr[variable_index]))
print(1-np.sum(cum_arr)/10000)

0.20589999999999997


In [14]:
variable_index = 5 # Purity
test_arr = afr_arr
# [spo_arr, bus_arr, tec_arr, sci_arr, hea_arr]
other_arrs = [as_arr, eur_mask]

cum_arr = True
for arr in other_arrs:
    cum_arr=np.logical_and(cum_arr, (test_arr[variable_index])>(arr[variable_index]))
print(1-np.sum(cum_arr)/10000)

0.07120000000000004


In [15]:
variable_index = 5 # Purity
test_arr = me_arr
# [spo_arr, bus_arr, tec_arr, sci_arr, hea_arr]
other_arrs = [as_arr, eur_mask]

cum_arr = True
for arr in other_arrs:
    cum_arr=np.logical_and(cum_arr, (test_arr[variable_index])>(arr[variable_index]))
print(1-np.sum(cum_arr)/10000)

0.15259999999999996
