# This notebook converts the given `.npz` datasets into CSV format, for ease of use later on

- IHDP data can be downloaded from https://github.com/dmachlanski/CE888_2022/blob/main/project/data/ihdp.npz
- JOBS data can be downloaded from https://github.com/dmachlanski/CE888_2022/blob/main/project/data/jobs.npz



In [1]:

import numpy as np
import pandas as pd
from typing import Dict

from assignment2.a2_utils import dataframe_utils


# Opening the IHDP dataset

In [2]:

# Load the raw IHDP arrays from the .npz archive and print the shape of each
# named array as a quick sanity check.
ihdp_dict: Dict[str, np.ndarray] = dataframe_utils.npz_to_dict(
    "../data/ihdp.npz", allow_pickle=False
)
for k, v in ihdp_dict.items():
    print(f"{k}: {v.shape}")

# Dataframe holding the 'x' data for the IHDP dataset.
ihdp_df_x: pd.DataFrame = dataframe_utils.x_to_dataframe(ihdp_dict["x"])

# Dataframe holding the entirety of the IHDP dataset. A copy of the 'x' frame
# is handed over, so `ihdp_df_x` itself is not the object being extended.
ihdp_df: pd.DataFrame = dataframe_utils.add_everything_but_x_to_copy_of_dataframe(
    ihdp_df_x.copy(), ihdp_dict, "x"
)

# Dataframe with clearly marked 't0' and 't1' outcomes for each individual.
# Again a copy is passed in, so `ihdp_df` is kept separate from the result.
ihdp_df_t: pd.DataFrame = dataframe_utils.process_counterfactuals(
    ihdp_df.copy(),
    t="t",
    y_factual="yf",
    new_counterfactual_t="tcf",
    y_counterfactual="ycf",
    ite="ite",
    t0_name="t0",
    t1_name="t1",
)

# Earlier inline version of this step, kept for reference only.
# NOTE(review): presumably `process_counterfactuals` now covers this; confirm
# against the helper before deleting.
#   ihdp_df_t = ihdp_df.copy()
#   ihdp_df_t["t0"] = np.choose(ihdp_df["t"].values, [ihdp_df["yf"].values, ihdp_df["ycf"].values])
#   ihdp_df_t["t1"] = np.choose(ihdp_df["t"].values, [ihdp_df["ycf"].values, ihdp_df["yf"].values])
#   ihdp_df_t.attrs["not_01"] = (*ihdp_df_t.attrs["not_01"],  "t0","t1")
#   ihdp_df_t.attrs["01"] = (*ihdp_df_t.attrs["01"],"tcf")

# Last expression of the cell: the notebook displays the first rows.
ihdp_df_t.head()

x: (747, 25)
t: (747, 1)
yf: (747, 1)
ycf: (747, 1)
ite: (747, 1)


Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x22,x23,x24,t,yf,ycf,ite,tcf,t0,t1
0,1.397395,0.996346,-1.105624,-0.879606,0.308569,-1.023402,1,0,0,0,...,0,0,1,1,4.771232,-0.298509,4.657928,0,-0.298509,4.771232
1,0.269033,0.196818,0.383828,0.161703,-0.629189,1.460832,1,0,1,0,...,0,0,0,0,2.956273,5.78377,3.428604,1,2.956273,5.78377
2,1.051537,1.795874,-1.105624,0.161703,-0.629189,0.963985,1,0,1,1,...,0,0,1,0,4.164164,7.055789,3.658195,1,4.164164,7.055789
3,0.662446,0.196818,-0.733261,-0.879606,0.371086,-0.692171,1,0,0,0,...,0,0,0,1,6.172307,1.379697,4.585505,0,1.379697,6.172307
4,0.856992,1.795874,0.011465,-0.879606,0.558638,0.301522,0,1,1,0,...,0,0,0,1,7.834469,2.747986,4.265591,0,2.747986,7.834469


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column
# of the processed IHDP frame; displayed as the cell's output.
ihdp_df_t.describe()

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x22,x23,x24,t,yf,ycf,ite,tcf,t0,t1
count,747.0,747.0,747.0,747.0,747.0,747.0,747.0,747.0,747.0,747.0,...,747.0,747.0,747.0,747.0,747.0,747.0,747.0,747.0,747.0,747.0
mean,3.448082e-17,4.755975e-18,-2.377988e-17,-5.612051e-16,4.0425790000000004e-17,-7.633341e-16,0.514056,0.093708,0.52075,0.364123,...,0.073628,0.128514,0.157965,0.186078,3.159538,5.696107,4.016067,0.813922,2.412991,6.442653
std,1.0,1.0,1.0,1.0,1.0,1.0,0.500137,0.291618,0.499904,0.481506,...,0.261339,0.334886,0.364953,0.38943,2.179956,1.980121,0.859736,0.38943,1.596864,1.102065
min,-2.731287,-3.800823,-1.85035,-0.879606,-5.130428,-1.85148,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,-1.543902,-1.037628,-1.866989,0.0,-1.543902,3.210085
25%,-0.6669461,-0.60271,-0.733261,-0.879606,-0.566672,-0.8577868,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.626779,5.053598,3.775936,1.0,1.448038,5.734262
50%,0.1652752,0.1968181,-0.360898,0.1617025,0.1210172,-0.02970882,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,2.577294,6.209686,4.304433,1.0,2.250801,6.452305
75%,0.8137593,0.5965822,0.756191,0.1617025,0.6836721,0.6327536,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,4.494637,6.948922,4.569454,1.0,3.113372,7.176373
max,1.505476,2.595403,2.990369,2.24432,2.371637,2.951372,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,11.268228,10.171004,4.670125,1.0,11.268228,10.171004


In [4]:
# Persist the processed IHDP frame as CSV in the current working directory.
# index=False keeps the row index out of the file.
ihdp_df_t.to_csv("ihdp_full.csv", index=False)


# Opening the JOBS dataset


In [5]:
# Load the raw JOBS arrays from the .npz archive and print the shape of each
# named array as a quick sanity check.
jobs_dict: Dict[str, np.ndarray] = dataframe_utils.npz_to_dict(
    "../data/jobs.npz", allow_pickle=False
)
for k, v in jobs_dict.items():
    print(f"{k}: {v.shape}")

# Dataframe containing only the X values of jobs.
jobs_df_x: pd.DataFrame = dataframe_utils.x_to_dataframe(jobs_dict["x"])

# Dataframe for the full jobs dataset, built from a copy of the 'x' frame.
jobs_df: pd.DataFrame = dataframe_utils.add_everything_but_x_to_copy_of_dataframe(
    jobs_df_x.copy(), jobs_dict, "x"
)

# No counterfactual outcome / ITE / t0 / t1 names are supplied for JOBS
# (all passed as None); only the 'tcf' column name is requested.
# NOTE(review): y_factual is given as "yf", but the archive's keys printed
# above are x, t, y, e - confirm whether the helper needs "y" here.
# NOTE(review): unlike the IHDP cell, `jobs_df` is passed without .copy() and
# the name is rebound to the result.
jobs_df = dataframe_utils.process_counterfactuals(
    jobs_df,
    t="t",
    y_factual="yf",
    new_counterfactual_t="tcf",
    y_counterfactual=None,
    ite=None,
    t0_name=None,
    t1_name=None,
)

# Last expression of the cell: the notebook displays the first rows.
jobs_df.head()

x: (3212, 17)
t: (3212, 1)
y: (3212, 1)
e: (3212, 1)


Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x11,x12,x13,x14,x15,x16,t,y,e,tcf
0,-0.614282,1.464727,0,0,1,0,2.39325,2.746196,-0.653311,-0.656913,...,2.462337,2.937244,0,0,2.843909,0,0,1,0,1
1,-0.802463,0.101835,0,0,1,0,0.109885,0.498271,-0.785284,-0.743407,...,-0.177193,0.082537,0,0,0.038422,0,0,1,0,1
2,-0.896553,-0.238888,1,0,1,1,-0.085212,-0.148097,-0.847312,-0.781606,...,-0.286221,-0.303615,0,0,-0.191304,0,0,1,0,1
3,-0.896553,-0.238888,0,0,0,1,0.405581,0.325594,-0.847312,-0.781606,...,0.02302,-0.03963,0,0,0.173108,0,0,1,1,1
4,0.13844,-1.601779,1,0,1,1,-0.722531,-0.212734,-0.01984,-0.156019,...,-0.514563,-0.331552,0,0,-0.779227,0,0,1,0,1


In [8]:
# Report how many rows are flagged in the 'e' column (a 0/1 indicator in the
# data shown), i.e. the size of the experiment group. Series.sum() does the
# reduction vectorised inside pandas instead of iterating the Series element
# by element with the builtin sum(); the printed value is the same.
print(f"size of experiment group: {jobs_df['e'].sum()}")

# Summary statistics (count, mean, std, min, quartiles, max) for every column
# of the JOBS frame; displayed as the cell's output.
jobs_df.describe()

size of experiment group: 722


Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x11,x12,x13,x14,x15,x16,t,y,e,tcf
count,3212.0,3212.0,3212.0,3212.0,3212.0,3212.0,3212.0,3212.0,3212.0,3212.0,...,3212.0,3212.0,3212.0,3212.0,3212.0,3212.0,3212.0,3212.0,3212.0,3212.0
mean,2.013057e-16,-1.880328e-16,0.374222,0.048879,0.70797,0.411893,3.3182260000000004e-17,-9.069817000000001e-17,-1.106075e-17,1.763748e-14,...,-3.539441e-17,2.035178e-16,0.168742,0.167497,1.283047e-16,0.010897,0.092466,0.849938,0.224782,0.907534
std,1.0,1.0,0.483997,0.215649,0.454767,0.492253,1.0,1.0,1.0,1.0,...,1.0,1.0,0.374582,0.373477,1.0,0.103833,0.289727,0.357188,0.417504,0.289727
min,-1.461095,-3.98684,0.0,0.0,0.0,0.0,-1.147411,-1.110467,-1.164047,-0.9488316,...,-0.5580568,-0.5188151,0.0,0.0,-0.9799811,0.0,0.0,0.0,0.0,0.0
25%,-0.8024627,-0.5796108,0.0,0.0,0.0,0.0,-0.8923385,-0.9139141,-0.7852843,-0.7434072,...,-0.5423814,-0.509838,0.0,0.0,-0.8142367,0.0,0.0,1.0,0.0,1.0
50%,-0.3320111,0.101835,0.0,0.0,1.0,0.0,-0.08521244,-0.08346011,-0.4355556,-0.4998533,...,-0.2862214,-0.2737369,0.0,0.0,-0.203502,0.0,0.0,1.0,0.0,1.0
75%,0.7970728,0.4425579,1.0,0.0,1.0,1.0,0.5506613,0.5629081,0.6730184,0.5267384,...,0.1366588,0.1318304,0.0,0.0,0.4539777,0.0,0.0,1.0,0.0,1.0
max,2.114337,1.805449,1.0,1.0,1.0,1.0,8.751987,10.16507,2.446737,2.773336,...,23.05284,29.02264,1.0,1.0,7.13603,1.0,1.0,1.0,1.0,1.0


In [7]:
# Persist the processed JOBS frame as CSV in the current working directory.
# index=False keeps the row index out of the file.
jobs_df.to_csv("jobs_full.csv", index=False)