# This notebook converts the provided IHDP and JOBS datasets from `.npz` to CSV format (for ease of use later on)

In [7]:

import numpy as np
import pandas as pd
from typing import Dict, Tuple, List, Union, Iterable, Optional
import doctest

from a2_utils import dataframe_utils


# Opening the IHDP dataset

In [8]:
# Load the IHDP arrays from the .npz archive and print the shape stored under each key.
ihdp_dict: Dict[str, np.ndarray] = dataframe_utils.npz_to_dict(
    "../data/ihdp.npz",
    allow_pickle=False,
)
for array_name, array in ihdp_dict.items():
    print(f"{array_name}: {array.shape}")

# Dataframe holding the 'x' data for the IHDP dataset.
ihdp_df_x: pd.DataFrame = dataframe_utils.x_to_dataframe(ihdp_dict["x"])

# Dataframe holding the entirety of the IHDP dataset
# (built from a copy of `ihdp_df_x` plus the remaining entries of `ihdp_dict`).
ihdp_df: pd.DataFrame = dataframe_utils.add_everything_but_x_to_copy_of_dataframe(
    ihdp_df_x.copy(), ihdp_dict, "x"
)

# Dataframe with clearly marked 't0' and 't1' outcomes for each individual.
# The helper is handed a copy of `ihdp_df`, not the frame itself.
# In the preview rows below, 't0' matches 'yf' where t == 0 and 'ycf' where t == 1
# (and the reverse for 't1'); 'tcf' is the opposite of 't'.
ihdp_df_t: pd.DataFrame = dataframe_utils.process_counterfactuals(
    ihdp_df.copy(),
    t="t",
    y_factual="yf",
    new_counterfactual_t="tcf",
    y_counterfactual="ycf",
    ite="ite",
    t0_name="t0",
    t1_name="t1",
)

# Preview the first rows as the cell output.
ihdp_df_t.head()

x: (747, 25)
t: (747, 1)
yf: (747, 1)
ycf: (747, 1)
ite: (747, 1)


Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x22,x23,x24,t,yf,ycf,ite,tcf,t0,t1
0,1.397395,0.996346,-1.105624,-0.879606,0.308569,-1.023402,1,0,0,0,...,0,0,1,1,4.771232,-0.298509,4.657928,0,-0.298509,4.771232
1,0.269033,0.196818,0.383828,0.161703,-0.629189,1.460832,1,0,1,0,...,0,0,0,0,2.956273,5.78377,3.428604,1,2.956273,5.78377
2,1.051537,1.795874,-1.105624,0.161703,-0.629189,0.963985,1,0,1,1,...,0,0,1,0,4.164164,7.055789,3.658195,1,4.164164,7.055789
3,0.662446,0.196818,-0.733261,-0.879606,0.371086,-0.692171,1,0,0,0,...,0,0,0,1,6.172307,1.379697,4.585505,0,1.379697,6.172307
4,0.856992,1.795874,0.011465,-0.879606,0.558638,0.301522,0,1,1,0,...,0,0,0,1,7.834469,2.747986,4.265591,0,2.747986,7.834469


In [9]:
# Persist the processed IHDP frame as CSV, leaving out the index column.
# The relative path means the file is written to the current working directory.
ihdp_df_t.to_csv("ihdp_full.csv", index=False)


# Opening the JOBS dataset


In [10]:
# Load the JOBS arrays from the .npz archive and print the shape stored under each key.
jobs_dict: Dict[str, np.ndarray] = dataframe_utils.npz_to_dict(
    "../data/jobs.npz",
    allow_pickle=False,
)
for array_name, array in jobs_dict.items():
    print(f"{array_name}: {array.shape}")

# Dataframe containing only the X values of jobs.
jobs_df_x: pd.DataFrame = dataframe_utils.x_to_dataframe(jobs_dict["x"])

# Dataframe for the full jobs dataset
# (built from a copy of `jobs_df_x` plus the remaining entries of `jobs_dict`).
jobs_df: pd.DataFrame = dataframe_utils.add_everything_but_x_to_copy_of_dataframe(
    jobs_df_x.copy(), jobs_dict, "x"
)

# No counterfactual outcome, ITE, or t0/t1 column names are supplied for JOBS
# (all passed as None); only the 'tcf' column name is requested.
# NOTE(review): the keys printed for this archive are x, t, y and e, so there is no
# "yf" column here. Confirm whether y_factual should be "y", or whether the helper
# ignores it when the counterfactual arguments are None.
jobs_df = dataframe_utils.process_counterfactuals(
    jobs_df,
    t="t",
    y_factual="yf",
    new_counterfactual_t="tcf",
    y_counterfactual=None,
    ite=None,
    t0_name=None,
    t1_name=None,
)

# Preview the first rows as the cell output.
jobs_df.head()

x: (3212, 17)
t: (3212, 1)
y: (3212, 1)
e: (3212, 1)


Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x11,x12,x13,x14,x15,x16,t,y,e,tcf
0,-0.614282,1.464727,0,0,1,0,2.39325,2.746196,-0.653311,-0.656913,...,2.462337,2.937244,0,0,2.843909,0,0,1,0,1
1,-0.802463,0.101835,0,0,1,0,0.109885,0.498271,-0.785284,-0.743407,...,-0.177193,0.082537,0,0,0.038422,0,0,1,0,1
2,-0.896553,-0.238888,1,0,1,1,-0.085212,-0.148097,-0.847312,-0.781606,...,-0.286221,-0.303615,0,0,-0.191304,0,0,1,0,1
3,-0.896553,-0.238888,0,0,0,1,0.405581,0.325594,-0.847312,-0.781606,...,0.02302,-0.03963,0,0,0.173108,0,0,1,1,1
4,0.13844,-1.601779,1,0,1,1,-0.722531,-0.212734,-0.01984,-0.156019,...,-0.514563,-0.331552,0,0,-0.779227,0,0,1,0,1


In [11]:
# Persist the processed JOBS frame as CSV, leaving out the index column.
# The relative path means the file is written to the current working directory.
jobs_df.to_csv("jobs_full.csv", index=False)