In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import functools
from scipy.spatial import distance
from scipy import stats

## Competition 1:
$$ S \sim \text{Bernoulli}(0.5) $$
$$ R \sim \text{Bernoulli}(0.5) $$
$$ $$
$$ X_1 | S=1 \text{ (Women)} \sim 20*\text{Beta}(2,7) $$
$$ X_1 | S=0 \text{ (Men)} \sim 20*\text{Beta}(2,2) $$
$$ $$ 
$$ X_2 | R=1 \text{ (Black)} \sim 20*\text{Beta}(2,5) $$
$$ X_2 | R=0 \text{ (White)} \sim 20*\text{Beta}(2,2) $$
$$ $$
$$ X_3 \sim \text{Normal}(20,1) $$
$$ $$
$$ z_1 = 1/ (1 + exp-(0.1*X_1 + 0.1*X_2 + 0.1*X_3 - \text{Normal}(5,0.5)) $$
$$ Y_1 | X_1, X_2, X_3 \sim 2*\text{Bernoulli}(z_1) - 1 $$

In [2]:
n_samples = 2000

In [3]:
S = np.random.binomial(n=1, p=0.5, size=n_samples)
R = np.random.binomial(n=1, p=0.5, size=n_samples)

In [4]:
def get_beta_sample(a, b):
    return np.random.beta(a, b)

In [5]:
data = pd.DataFrame({"Sex": S, "Race": R})

In [6]:
x1 = []
for s in S:
    var = 20*get_beta_sample(2, 7) if s == 1 else 20*get_beta_sample(2, 2)
    x1.append(var)

len(x1) == n_samples

True

In [7]:
x2 = []
for r in R:
    var = 20*get_beta_sample(1, 3) if s == 1 else 20*get_beta_sample(2, 2)
    x2.append(var)
    
len(x2) == n_samples

True

In [8]:
#x3 = np.random.normal(20, 1, n_samples)

In [9]:
#z1 = [1/(1+ np.exp(-(0.1*x1[i] + 0.1*x2[i] + 0.1*x3[i] - np.random.normal(5, 0.5)))) for i in range(n_samples)]
#min(z1), max(z1)

In [10]:
min(x1), max(x1)

(0.0930053769695751, 19.66145962596371)

In [11]:
z1 = [1/(1+ np.exp(-(0.4*x1[i] + 0.4*x2[i] - np.random.normal(5, 0.5)))) for i in range(n_samples)]
min(z1), max(z1), np.array(z1).mean()

(0.0040272141782478595, 0.9999175618926722, 0.4650926410865901)

In [12]:
y1 = [np.random.binomial(1, z1[i]) for i in range(n_samples)]

In [13]:
data["X1"] = x1
data["X2"] = x2
#data["X3"] = x3
data["y1"] = y1

In [14]:
print("Overall:", data.loc[data.y1 == 1].shape[0]/ data.shape[0])
print("Women:", data.loc[(data.Sex == 1) & (data.y1 == 1)].shape[0] /data.loc[(data.Sex == 1)].shape[0])
print("Blacks:", data.loc[(data.Race == 1)& (data.y1 == 1)].shape[0] /data.loc[(data.Race == 1)].shape[0])
print("Black women:", data.loc[(data.Race == 1) & (data.Sex == 1) & (data.y1 == 1)].shape[0]/ data.loc[(data.Race == 1)&(data.Sex == 1)].shape[0])

Overall: 0.458
Women: 0.30452261306532663
Blacks: 0.4603174603174603
Black women: 0.30952380952380953


In [15]:
data.to_csv("Competition1.csv")

## Competition 2:
$$ X_4 \sim \text{Poisson}(25 + 10*Y_1)$$
<br>
$$ X_5 \sim \text{Normal}(20,1) $$
$$ $$
$$ X_6 | S=1 \text{ (Women)}, Y_1=1 \sim 20*\text{Beta}(2,2) $$
$$ X_6 | S=1 \text{ (Women)}, Y_1=0 \sim 20*\text{Beta}(1,3) $$
$$ X_6 | S=0 \text{ (Men)}, Y_1=1 \sim 20*\text{Beta}(2,2) $$
$$ X_6 | S=0 \text{ (Men)}, Y_1=0 \sim 20*\text{Beta}(2,7) $$
$$ $$ 
$$ X_7 | R=1 \text{ (Black)}, Y_1=1 \sim 20*\text{Beta}(2,5) $$
$$ X_7 | R=1 \text{ (Black)}, Y_1=0 \sim 20*\text{Beta}(1,3) $$
$$ X_7 | R=0 \text{ (White)}, Y_1=1 \sim 20*\text{Beta}(2,2) $$
$$ X_7 | R=0 \text{ (White)}, Y_1=0 \sim 20*\text{Beta}(2,5) $$
<br>
$$ z_2 = 1/ (1 + exp-(0.25*X_4 + 0.25*X_5 + 0.25*X_6 + 0.25*X_7 - \text{Normal}(5,0.5)) $$
$$ Y_2 | X_4, X_5, X_6, X_7 \sim 2*\text{Bernoulli}(z_2) - 1 $$

In [16]:
x4 = [np.random.poisson(25+ 10*y1[i])-np.random.normal(5, 1) for i in range(n_samples)]
len(x4) == n_samples

True

In [17]:
#x5 = np.random.normal(20, 1, n_samples)

In [18]:
x6 = []

for i in range(n_samples):
    if y1[i] == 1:
        var = 30*get_beta_sample(2, 2)
    else:
        var = 20*get_beta_sample(1, 3) if S[i] == 1 else 20*get_beta_sample(2, 7)
    x6.append(var)

len(x6) == n_samples

True

In [19]:
x7 = []

for i in range(n_samples):
    if y1[i] == 1:
        var = 30*get_beta_sample(2, 5) if R[i] == 1 else 25*get_beta_sample(2, 2)
    else:
        var = 20*get_beta_sample(1, 3) if R[i] == 1 else 20*get_beta_sample(2, 5)
    x7.append(var)

len(x7) == n_samples

True

In [20]:
#z2 = [1/(1+ np.exp(-(0.1*x4[i] + 0.1*x5[i] + 0.1*x6[i] + 0.1*x7[i] - np.random.normal(5, 0.5)))) for i in range(n_samples)]
#min(z2), max(z2)

In [21]:
z2 = [1/(1+ np.exp(-(0.1*x4[i] + 0.1*x6[i] + 0.1*x7[i] - np.random.normal(5, 0.5)))) for i in range(n_samples)]
min(z2), max(z2), np.array(z2).mean()

(0.006767962290505662, 0.9805342857472978, 0.35218963365684747)

In [22]:
y2 = [np.random.binomial(1, z2[i]) for i in range(n_samples)]

In [23]:
data["X4"] = x4
#data["X5"] = x5
data["X6"] = x6
data["X7"] = x7
data["y2"] = y2

In [24]:
print("Overall:", data.loc[data.y2 == 1].shape[0]/ data.shape[0])
print("Women:", data.loc[(data.Sex == 1) & (data.y2 == 1)].shape[0] /data.loc[(data.Sex == 1)].shape[0])
print("Blacks:", data.loc[(data.Race == 1)& (data.y2 == 1)].shape[0] /data.loc[(data.Race == 1)].shape[0])
print("Black women:", data.loc[(data.Race == 1) & (data.Sex == 1) & (data.y2 == 1)].shape[0]/ data.loc[(data.Race == 1)&(data.Sex == 1)].shape[0])

Overall: 0.348
Women: 0.28542713567839195
Blacks: 0.3333333333333333
Black women: 0.27380952380952384


In [25]:
data.to_csv("Competition2.csv")

In [28]:
data.loc[(data.y1 == 1)]

Unnamed: 0,Sex,Race,X1,X2,y1,X4,X6,X7,y2
1,0,0,13.318877,11.248242,1,41.725242,7.951331,13.749312,1
3,0,0,12.091859,10.607076,1,42.165962,13.483690,21.490307,1
6,1,1,10.002904,17.781121,1,23.536324,13.325771,7.865275,1
9,0,1,12.218116,2.736828,1,35.973935,8.726845,7.521754,0
15,0,0,14.488262,9.972409,1,31.626365,10.298990,7.172868,1
...,...,...,...,...,...,...,...,...,...
1989,0,1,1.170663,17.251484,1,15.706612,15.519588,14.060155,0
1992,1,0,2.303783,14.201725,1,26.141097,20.127640,19.979139,0
1996,0,0,12.792416,0.328122,1,26.721864,17.475322,22.114841,1
1997,1,0,8.121147,2.608974,1,32.719034,3.583132,18.817143,1
