# Overview of the IHDP dataset

In [5]:
import pandas as pd
import numpy as np 
import seaborn as sns 
sns.set(style='ticks', palette='Set2')
sns.set_context("talk", font_scale=1.2)
import matplotlib.pyplot as plt
import random
import warnings
warnings.filterwarnings('ignore')
import matplotlib
import os
from threading import Timer
import plotly.graph_objects as go
import plotly.express as px
from datetime import datetime
from pathlib import Path
import missingno as msno
from dateutil.relativedelta import relativedelta
import pyarrow as pa
import pyarrow.parquet as pq

In [2]:
# Directory that holds the custom font files. The historical default is a
# machine-specific path, so it can be overridden with the CUSTOM_FONTS_DIR
# environment variable to make the notebook runnable elsewhere.
font_path = os.environ.get(
    "CUSTOM_FONTS_DIR",
    "C:\\Users\\AniltonCardoso\\OneDrive - BITKA\\Projetos\\CustomFonts",
)
from matplotlib import font_manager
font_dirs = [font_path]
font_files = font_manager.findSystemFonts(fontpaths=font_dirs)

# Register every font file found in the directory with matplotlib's font manager.
for font_file in font_files:
    font_manager.fontManager.addfont(font_file)

# Keyword dicts kept for callers that pass a font name to individual text elements.
csfont = {'fontname': 'Roboto Condensed'}
hfont = {'fontname': 'Roboto Condensed'}

# Use the custom family for all figures and keep LaTeX text rendering disabled.
matplotlib.rc('font', family='Roboto Condensed')
plt.rcParams.update({"text.usetex": False})

def background_with_norm(s):
    """Return one ``background-color`` CSS declaration per value of ``s``.

    Values are mapped onto the diverging 'RdBu' colormap with 0 pinned to the
    colormap midpoint, so negative and positive values fall on opposite halves.

    Args:
        s: Object exposing ``.values`` as a 1-D numeric array (presumably a
            pandas Series handed in by a Styler; the caller is not visible here).

    Returns:
        list[str]: ``'background-color: #rrggbb'`` strings, one per element.
    """
    # matplotlib.cm.get_cmap was deprecated in Matplotlib 3.7 and removed in 3.9;
    # the colormap registry replaces it. Fall back for releases older than 3.5.
    try:
        cmap = matplotlib.colormaps['RdBu']
    except AttributeError:
        cmap = matplotlib.cm.get_cmap('RdBu')

    values = s.values
    lower, upper = float(values.min()), float(values.max())

    # TwoSlopeNorm requires vmin < vcenter < vmax and raises ValueError otherwise.
    # When the data does not straddle 0, mirror the populated side (or use +/-1
    # for all-zero data). Each half is scaled independently, so the colours of
    # the existing values are unaffected by the synthetic bound.
    if lower >= 0:
        lower = -upper if upper > 0 else -1.0
    if upper <= 0:
        upper = -lower

    norm = matplotlib.colors.TwoSlopeNorm(vmin=lower, vcenter=0, vmax=upper)
    return ['background-color: {:s}'.format(matplotlib.colors.to_hex(rgba))
            for rgba in cmap(norm(values))]

def set_css_properties():
    """Build the CSS table styles used when rendering DataFrames.

    Each entry has the ``{"selector": ..., "props": [(property, value), ...]}``
    shape (the form pandas' ``Styler.set_table_styles`` accepts; the caller is
    not visible here).

    Values carry no trailing ``;`` because the renderer adds the declaration
    terminator itself.

    Returns:
        list[dict]: Styles for the selectors ``th``, ``td``, ``caption``,
        ``tr:hover`` and ``tr``, in that order.
    """
    # Header cells.
    th_props = [('font-size', '16px'),
                ('font-family', 'Calibri'),   # was 'font-name', which is not a CSS property
                ('text-align', 'center'),
                ('border-top', '1px solid #C1C3D1'),
                ('border-bottom', '1px solid #C1C3D1'),   # was 'border-bottom-' (typo)
                ('text-shadow', '0 1px 1px rgba(256, 256, 256, 0.1)'),
                ('font-weight', 'normal'),
                ('color', '#D5DDE5'),
                ('background-color', '#1b1e24')]

    # Applied to every row through the "tr" selector below.
    table_props = [('background-color', '#f0ebeb'),
                   ('width', '100%')]

    # Highlight for the row under the mouse pointer.
    tr_hover_props = [('background-color', '#b5e3ff'),
                      ('color', 'white'),
                      ('cursor', 'pointer'),
                      ('font-weight', 'bold')]

    # Data cells.
    td_props = [('font-size', '14px'),
                ('text-align', 'left'),
                ('font-family', 'Calibri'),   # was 'font-name', which is not a CSS property
                ('vertical-align', 'middle'),
                ('font-weight', '500'),
                ('color', 'black'),
                ('border-right', '1px solid #C1C3D1')]

    caption_props = [('font-size', '20px'),
                     ('font-weight', 'bold')]

    styles = [dict(selector="th", props=th_props),
              dict(selector="td", props=td_props),
              dict(selector="caption", props=caption_props),
              dict(selector="tr:hover", props=tr_hover_props),
              dict(selector="tr", props=table_props)]

    return styles

def default_plot_layout(ax, title, xlabel, ylabel):
    """Apply the notebook's standard title, axis labels and tick styling to ``ax``.

    Args:
        ax: Matplotlib Axes to style.
        title: Text for the axes title (size 16).
        xlabel: Text for the x-axis label (size 14).
        ylabel: Text for the y-axis label (size 14).

    Returns:
        None. ``ax`` is modified in place.
    """
    ax.set_title(title, size=16)
    ax.set_ylabel(ylabel, size=14)
    ax.set_xlabel(xlabel, size=14)
    # Style the ticks of the axes that was passed in. plt.xticks/plt.yticks act
    # on the *current* axes, which is a different subplot in multi-panel figures.
    ax.tick_params(axis="both", labelsize=12)
    # Drop the top and right frame lines.
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)
    
def fill_percentage(ax, total, scale = 1.2, offset = 3):
    """Write each bar's share of ``total`` as a percentage label above the bar.

    Args:
        ax: Axes whose ``patches`` are the bars to annotate.
        total: Denominator for the percentage (``height / total * 100``).
        scale: Multiplier applied to the base font size of 12.
        offset: Vertical gap, in data units, between the top of a bar and its
            label. The default of 3 is the value that used to be hard-coded;
            pass a smaller value for plots with a small y-range.

    Returns:
        None. Text artists are added to ``ax``.
    """
    label_size = 12 * scale
    for bar in ax.patches:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width() / 2.,   # horizontal centre of the bar
                height + offset,
                '{:1.2f}%'.format(height / total * 100),
                ha="center", fontsize=label_size)

def format_cols(perc_cols, numerical_cols, money_cols, date_cols):
    """Map column labels to format strings, grouped by the kind of value they hold.

    Args:
        perc_cols: Labels formatted as percentages (``'{:.2%}'``).
        numerical_cols: Labels formatted with thousands separators and two
            decimals (``'{0:,.2f}'``).
        money_cols: Labels formatted like numbers with an ``R$`` prefix.
        date_cols: Labels formatted as ``day-month-year``.

    Returns:
        dict: ``{label: format_string}``. If a label appears in more than one
        group, the later group wins (percent < numeric < money < date).
    """
    # A plain dict merge is used instead of dict(**a, **b): keyword unpacking
    # raises TypeError for non-string labels (ints, tuples) and for a label
    # listed in two groups.
    return {
        **{col: '{:.2%}' for col in perc_cols},
        **{col: '{0:,.2f}' for col in numerical_cols},
        **{col: 'R${0:,.2f}' for col in money_cols},
        **{col: '{:%d-%m-%Y}' for col in date_cols},
    }

In [6]:
# Collect every CSV under the IHDP folder. rglob yields paths in file-system
# order, so sort them to make the row order of `df` identical on every run.
files = sorted(Path("../datasets/IHDP/").rglob("*.csv"))

# The files have no header row: five named columns followed by x1..x25.
columns = ['treatment', 'y_factual', 'y_counterfactual', 'mu0', 'mu1'] + [f"x{i}" for i in range(1, 26)]

# Stack all files into one frame with a fresh 0..n-1 index.
df = pd.concat([pd.read_csv(f, header=None) for f in files], ignore_index=True)
df.columns = columns

In [7]:
# Column dtypes and non-null counts of the concatenated frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7470 entries, 0 to 7469
Data columns (total 30 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   treatment         7470 non-null   int64  
 1   y_factual         7470 non-null   float64
 2   y_counterfactual  7470 non-null   float64
 3   mu0               7470 non-null   float64
 4   mu1               7470 non-null   float64
 5   x1                7470 non-null   float64
 6   x2                7470 non-null   float64
 7   x3                7470 non-null   float64
 8   x4                7470 non-null   float64
 9   x5                7470 non-null   float64
 10  x6                7470 non-null   float64
 11  x7                7470 non-null   int64  
 12  x8                7470 non-null   int64  
 13  x9                7470 non-null   int64  
 14  x10               7470 non-null   int64  
 15  x11               7470 non-null   int64  
 16  x12               7470 non-null   int64  


In [9]:
# First five rows, to check the column layout after renaming.
df.head()

Unnamed: 0,treatment,y_factual,y_counterfactual,mu0,mu1,x1,x2,x3,x4,x5,...,x16,x17,x18,x19,x20,x21,x22,x23,x24,x25
0,1,5.599916,4.31878,3.268256,6.854457,-0.528603,-0.343455,1.128554,0.161703,-0.316603,...,1,1,1,1,0,0,0,0,0,0
1,0,6.875856,7.856495,6.636059,7.562718,-1.736945,-1.802002,0.383828,2.24432,-0.629189,...,1,1,1,1,0,0,0,0,0,0
2,0,2.996273,6.633952,1.570536,6.121617,-0.807451,-0.202946,-0.360898,-0.879606,0.808706,...,1,0,1,1,0,0,0,0,0,0
3,0,1.366206,5.697239,1.244738,5.889125,0.390083,0.596582,-1.85035,-0.879606,-0.004017,...,1,0,1,1,0,0,0,0,0,0
4,0,1.963538,6.202582,1.685048,6.191994,-1.045229,-0.60271,0.011465,0.161703,0.683672,...,1,1,1,1,0,0,0,0,0,0


In [20]:
# Mean of the y_factual and y_counterfactual columns within each treatment group.
df.groupby('treatment')[['y_factual', 'y_counterfactual']].mean()

Unnamed: 0_level_0,y_factual,y_counterfactual
treatment,Unnamed: 1_level_1,Unnamed: 2_level_1
0,9.152685,14.081442
1,14.094862,10.138704
