# This notebook basically just converts the given datasets into csv format (for ease of use later on)

In [4]:

import numpy as np
import pandas as pd
from typing import Dict, Tuple, List, Union, Iterable, Optional
import doctest


In [8]:
def npz_to_dict(
        npz_filename: str,
        **kwargs
) -> Dict[str, np.ndarray]:
    """
    Puts the given NPZ file into a dictionary of {table name, ndarray}.
    :param npz_filename: the filename of the NPZ file that we want to put into a dictionary.
    :param kwargs: kwargs from https://numpy.org/doc/stable/reference/generated/numpy.load.html#numpy.load.
    DO NOT INCLUDE A 'mmap_mode' KWARG!!!
    :return: The data from the given npz file in a dictionary.
    """
    data_dict: Dict[str, np.ndarray] = {}
    if kwargs is None:
        kwargs = {}
    with np.load(npz_filename, mmap_mode="r",**kwargs) as npz:
        for f in npz.files:
            data_dict[f] = npz[f]
    return data_dict


def turn_01_columns_into_int(
        dataframe_to_edit: pd.DataFrame,
) -> pd.DataFrame:
    """
    Finds all of the columns that just contain values of 0 and 1,
    and converts all of those columns to ints.

    Dataframe will have an '01' and 'not_01' attr added to it.
    Labels for series that only contain values 0 and 1 will be in the '01' tuple
    Labels for every other series will be in the 'not_01' tuple

    MODIFIES THE GIVEN DATAFRAME!
    :param dataframe_to_edit: the dataframe that is being edited
    :return: The modified dataframe.
    DOES NOT COPY THE GIVEN ORIGINAL DATAFRAME.

    >>> import pandas as pd
    >>> print(pd.__version__)
    1.4.1
    >>> before: pd.DataFrame = pd.DataFrame.from_dict(data={"int01":[0,1,1,0],"flt01":[0.0, 1.0, 0.0, 1.0], "intNo": [-1,0,1,2], "fltNo":[-1.0, 0.0, 1.0, 2.0], "intNan": [0,1,None,0], "fltNan":[0.0,1.0,None,0.0]})
    >>> before_types = before.dtypes.values
    >>> after: pd.DataFrame = turn_01_columns_into_int(before.copy())
    >>> after_types = after.dtypes.values
    >>> print(after_types[0])
    uint8
    >>> print(after_types[1])
    uint8
    >>> print(f"{before_types[2] == after_types[2]} {before_types[3] == after_types[3]} {before_types[4] == after_types[4]} {before_types[5] == after_types[5]}")
    True True True True
    >>> print(f"{after.attrs['01']}")
    ('int01', 'flt01')
    >>> print(f"{after.attrs['not_01']} ")
    ('intNo', 'fltNo', 'intNan', 'fltNan')
    """
    cols_01: List[str] = []
    not_01:  List[str] = []
    for c in dataframe_to_edit.columns:
        #if dataframe_to_edit[c].dtype == np.uint8:
        #    continue
        if dataframe_to_edit[c].isin([0,1]).all():
            dataframe_to_edit[c] = dataframe_to_edit[c].astype(np.uint8)
            cols_01.append(c)
        else:
            not_01.append(c)
    dataframe_to_edit.attrs["01"] = tuple(cols_01)
    dataframe_to_edit.attrs["not_01"] = tuple(not_01)
    return dataframe_to_edit

if __name__ == "__main__":
    doctest.run_docstring_examples(turn_01_columns_into_int, globals())

def x_to_dataframe(
        x_data: np.ndarray,
        row_major = True,
        x_prefix: str = "x"
) -> pd.DataFrame:
    """
    Converts the 'x' ndarray into a pandas dataframe.
    :param x_data: the ndarray containing all of the data
    :param row_major: is this ndarray held in row-major order? [[item a data], [item b data], ... ]
    :param x_prefix: prefix to put on the names of all of the x columns
    :return: a dataframe holding the given x data.
    """
    if row_major:
        x_data: np.ndarray = x_data.T
    x_df: pd.DataFrame = pd.DataFrame.from_dict({f"{x_prefix}{i}": x_data[i] for i in range(x_data.shape[0])})

    return turn_01_columns_into_int(x_df)



def add_everything_but_x_to_copy_of_dataframe(
        original_df: pd.DataFrame,
        the_data_dict: Dict[str, np.ndarray],
        dont_add: Union[str, Iterable[str]]=frozenset('x')
) -> pd.DataFrame:
    """
    Adds everything in the npz file apart from the given 'dont_add'
    tables to the dataframe.
    Assumes that these other tables have the same shape of (whatever, 1).
    :param original_df: the original dataframe that shall be copied and have stuff added to it.
    :param the_data_dict: The data file with the data to be added to the DataFrame
    :param dont_add: the identifier(s) of the columns that must not be added to the DataFrame.
    :return: a copy of the original dataframe, with the data from every table BESIDES the 'dont add' tables
    from the given file added to it.
    """

    the_df = original_df.copy()
    if dont_add in the_data_dict.keys():
        dont_add = [dont_add]
    for k, v in the_data_dict.items():
        if k in dont_add:
            continue
        the_df[k] = pd.DataFrame(v)

    return turn_01_columns_into_int(the_df)





**********************************************************************
File "__main__", line 38, in NoName
Failed example:
    print(pd.__version__)
Expected:
    1.4.1
Got:
    1.3.1
**********************************************************************
File "__main__", line 52, in NoName
Failed example:
    print(f"{after.attrs['not_01']} ")
Expected:
    ('intNo', 'fltNo', 'intNan', 'fltNan')
Got:
    ('intNo', 'fltNo', 'intNan', 'fltNan') 


# Opening the IHDP dataset

In [9]:
ihdp_dict: Dict[str, np.ndarray] = npz_to_dict("../data/ihdp.npz", allow_pickle = False)
for k,v in ihdp_dict.items():
    print(f"{k}: {v.shape}")

ihdp_df_x: pd.DataFrame = x_to_dataframe(ihdp_dict['x'])
"""Dataframe holding the 'x' data for the IHDP dataset"""


ihdp_df: pd.DataFrame = add_everything_but_x_to_copy_of_dataframe(
    ihdp_df_x.copy(),
    ihdp_dict,
    "x"
)
"""Dataframe holding the entirety of the IHDP dataset"""

ihdp_df_t: pd.DataFrame = ihdp_df.copy()
"Dataframe with clearly marked 't0' and 't1' outcomes for each individual"

ihdp_df_t["t0"] = np.choose(ihdp_df["t"].values, [ihdp_df["yf"].values, ihdp_df["ycf"].values])
ihdp_df_t["t1"] = np.choose(ihdp_df["t"].values, [ihdp_df["ycf"].values, ihdp_df["yf"].values])

ihdp_df_t.attrs["not_01"] = (*ihdp_df_t.attrs["not_01"],  "t0","t1")

ihdp_df_t.head()

x: (747, 25)
t: (747, 1)
yf: (747, 1)
ycf: (747, 1)
ite: (747, 1)


Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x21,x22,x23,x24,t,yf,ycf,ite,t0,t1
0,1.397395,0.996346,-1.105624,-0.879606,0.308569,-1.023402,1,0,0,0,...,0,0,0,1,1,4.771232,-0.298509,4.657928,-0.298509,4.771232
1,0.269033,0.196818,0.383828,0.161703,-0.629189,1.460832,1,0,1,0,...,0,0,0,0,0,2.956273,5.78377,3.428604,2.956273,5.78377
2,1.051537,1.795874,-1.105624,0.161703,-0.629189,0.963985,1,0,1,1,...,0,0,0,1,0,4.164164,7.055789,3.658195,4.164164,7.055789
3,0.662446,0.196818,-0.733261,-0.879606,0.371086,-0.692171,1,0,0,0,...,0,0,0,0,1,6.172307,1.379697,4.585505,1.379697,6.172307
4,0.856992,1.795874,0.011465,-0.879606,0.558638,0.301522,0,1,1,0,...,0,0,0,0,1,7.834469,2.747986,4.265591,2.747986,7.834469


In [10]:
ihdp_df_t.to_csv(
    "ihdp_full.csv",
    index=False
)


# Opening the JOBS dataset


In [11]:
jobs_dict: Dict[str, np.ndarray] = npz_to_dict("../data/jobs.npz", allow_pickle=False)


for k,v in jobs_dict.items():
    print(f"{k}: {v.shape}")


jobs_df_x: pd.DataFrame = x_to_dataframe(jobs_dict['x'])
"Dataframe containing only the X values of jobs"

jobs_df: pd.DataFrame = add_everything_but_x_to_copy_of_dataframe(
    jobs_df_x.copy(),
    jobs_dict,
    "x"
)
"Dataframe for the full jobs dataset"

jobs_df.head()

x: (3212, 17)
t: (3212, 1)
y: (3212, 1)
e: (3212, 1)


Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16,t,y,e
0,-0.614282,1.464727,0,0,1,0,2.39325,2.746196,-0.653311,-0.656913,1.627531,2.462337,2.937244,0,0,2.843909,0,0,1,0
1,-0.802463,0.101835,0,0,1,0,0.109885,0.498271,-0.785284,-0.743407,-0.022502,-0.177193,0.082537,0,0,0.038422,0,0,1,0
2,-0.896553,-0.238888,1,0,1,1,-0.085212,-0.148097,-0.847312,-0.781606,-0.361348,-0.286221,-0.303615,0,0,-0.191304,0,0,1,0
3,-0.896553,-0.238888,0,0,0,1,0.405581,0.325594,-0.847312,-0.781606,-0.361348,0.02302,-0.03963,0,0,0.173108,0,0,1,1
4,0.13844,-1.601779,1,0,1,1,-0.722531,-0.212734,-0.01984,-0.156019,-1.422084,-0.514563,-0.331552,0,0,-0.779227,0,0,1,0


In [12]:
jobs_df.to_csv(
    "jobs_full.csv",
    index=False
)