In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import sys
import importlib

import itertools as it
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import statsmodels.api as sm

from itertools import product
from sklearn.pipeline import make_pipeline

from criteriaetl.utils.expansion_func import (get_value_counts_with_expansion_factor,
    get_percentage_table_with_expansion_factor)
from criteriaetl.utils.display_func import cdisplay, percentage_count_plot
from criteriaetl.utils.common_func import (get_weighted_complete_randomization_series_on_subset, 
    proportional_cut, weighted_qcut, get_partition_bool_columns_dict)
from criteriaetl.transformers.columns_base import (NameTransformer, 
    ReplaceTransformer, SelectTransformer, AssignTransformer)
from criteriaetl.transformers.rows_base import AggregateTransformer
from criteriaetl.transformers.fusion_base import MergeTransformer

from projectetl.utils.dataload import (load_survey_data_do, save_survey_with_pickle,
                                       load_survey_from_pickle)
from projectetl.utils.config import (ENCFT_SURVEY_PATH, ENCFT_PREVIOUS_SURVEY_PATH,
                                     ENCFT_OBJECT_DIR, INFLATION_OBJECT_DIR, DATA_DIR,
                                     INFLATION_DIR)
from projectetl.utils import split_survey_by

In [None]:
# Run parameters for this notebook.
# get_raw: 1 reads the raw survey data; 0 loads a previously pickled copy.
get_raw = 0
# Period under analysis.
year = 2020       # year
month = 6         # month
trimester = 2     # trimester of `year`

In [None]:
# Pickled copy of the survey, stored under ENCFT_OBJECT_DIR.
survey_cache_path = ENCFT_OBJECT_DIR / 'encft202001-202004.pkl'

if not get_raw:
    # Reuse the previously serialized survey object.
    survey_raw = load_survey_from_pickle(survey_cache_path)
else:
    # Read the raw survey and serialize it so later runs can skip this step.
    survey_raw = load_survey_data_do(ENCFT_SURVEY_PATH)
    save_survey_with_pickle(survey_raw, survey_cache_path)

In [None]:
# Pull the household, member and dwelling tables out of the loaded survey.
survey_household_raw, survey_member_raw, survey_house_raw = (
    survey_raw[table_name] for table_name in ('Hogar', 'Miembros', 'Vivienda')
)

In [None]:
# Arguments later handed to split_survey_by: the column to split on and the
# values that identify each part.
column_name = 'trimestre'
values = [20200 + quarter for quarter in range(1, 5)]

In [None]:
# Split the household table into four parts, one per entry of `values`.
(
    survey_household_raw_01,
    survey_household_raw_02,
    survey_household_raw_03,
    survey_household_raw_04,
) = split_survey_by(survey_household_raw, column_name, values)

In [None]:
# Split the member table into four parts, one per entry of `values`.
(
    survey_member_raw_01,
    survey_member_raw_02,
    survey_member_raw_03,
    survey_member_raw_04,
) = split_survey_by(survey_member_raw, column_name, values)

In [None]:
# Current survey: pick the member and household splits for `trimester`.
# An explicit mapping replaces `locals().get(...)`, which returns None for an
# unknown trimester and only fails later, far away from the real cause.
member_raw_by_trimester = {
    1: survey_member_raw_01,
    2: survey_member_raw_02,
    3: survey_member_raw_03,
    4: survey_member_raw_04,
}
household_raw_by_trimester = {
    1: survey_household_raw_01,
    2: survey_household_raw_02,
    3: survey_household_raw_03,
    4: survey_household_raw_04,
}
if trimester not in member_raw_by_trimester:
    raise ValueError(f'trimester must be one of 1, 2, 3 or 4; got {trimester!r}')

# NOTE: these two names held the unsplit tables until now. After this cell they
# hold a single trimester, so re-running the split cells without reloading the
# tables would split the already-filtered frames.
survey_member_raw = member_raw_by_trimester[trimester]
survey_household_raw = household_raw_by_trimester[trimester]

In [None]:
# Previous survey: member split of the trimester right before `trimester`.
# An explicit mapping replaces `locals().get(...)`: with trimester = 1 the old
# lookup asked for a non-existent `..._00` variable and silently produced None.
previous_member_raw_by_trimester = {
    1: survey_member_raw_01,
    2: survey_member_raw_02,
    3: survey_member_raw_03,
    4: survey_member_raw_04,
}
if (trimester - 1) not in previous_member_raw_by_trimester:
    raise ValueError(
        f'no member split available for previous trimester {trimester - 1!r} '
        f'(trimester={trimester!r}); only trimesters 1-4 were split from the loaded survey'
    )
survey_member_previous = previous_member_raw_by_trimester[trimester - 1]

# Processing previous trimester

In [None]:
# Trimester immediately before the one under analysis.
trimester_previous = trimester - 1

# Program names whose member-level amount column is `gob_<name>_monto`.
social_programs_cols = [
    'comer_primero',
    'inc_asis_escolar',
    'bono_luz',
    'bonogas_choferes',
    'bonogas_hogares',
    'proteccion_vejez',
    'bono_estudiante_prog',
    'inc_educacion_sup',
    'inc_policia_prev',
    'inc_marina_guerra',
]

# Program names whose member-level column is `gob_<name>` (no `_monto` suffix).
social_programs_covid_cols = [
    'quedate_en_casa',
    'fondo_asistencia_fase',
    'programa_pati',
]

## Aggregate Transformer

In [None]:
# Household identifier used as the group-by key here and as the merge key later.
key = 'id_hogar'

# Columns to sum per household: regular program amounts first, then the
# COVID program columns.
previous_sum_cols = [f'gob_{sp}_monto' for sp in social_programs_cols]
previous_sum_cols += [f'gob_{sp}' for sp in social_programs_covid_cols]
aggregate_previous_map = {'sum': previous_sum_cols}

aggregate_previous_transformer = AggregateTransformer(
    aggregate_previous_map,
    groupby_=key,
    keep_first=[],
)
survey_previous_aggregated = aggregate_previous_transformer.transform(
    survey_member_previous
)

# Processing current trimester

## Assign Transformer

In [None]:
# Source column for every program: `gob_<name>_monto` for the regular programs,
# `gob_<name>` for the COVID ones.
amount_column_by_program = {
    **{sp: f'gob_{sp}_monto' for sp in social_programs_cols},
    **{sp: f'gob_{sp}' for sp in social_programs_covid_cols},
}

# One boolean `recibio_<name>` column per program: True where the source
# column is strictly positive. The column name is bound through a default
# argument so each lambda keeps its own value.
assign_current_map = {
    f'recibio_{program}': lambda df, column=column: df[column] > 0.0
    for program, column in amount_column_by_program.items()
}

assign_current_transformer = AssignTransformer(assign_current_map)
survey_current_assigned = assign_current_transformer.transform(survey_member_raw)

## Aggregate Transformer

In [None]:
# Amount columns to sum per household (regular programs, then COVID programs).
current_sum_cols = (
    [f'gob_{sp}_monto' for sp in social_programs_cols]
    + [f'gob_{sp}' for sp in social_programs_covid_cols]
)
# Boolean `recibio_<name>` flags to combine with `any`, in the same program order.
current_any_cols = (
    [f'recibio_{sp}' for sp in social_programs_cols]
    + [f'recibio_{sp}' for sp in social_programs_covid_cols]
)
aggregate_current_map = {
    'sum': current_sum_cols,
    'any': current_any_cols,
}

aggregate_current_transformer = AggregateTransformer(
    aggregate_current_map,
    groupby_=key,
    keep_first=['factor_expansion'],
)
survey_current_aggregated = aggregate_current_transformer.transform(
    survey_current_assigned
)

## Merge Transformer

In [None]:
# Options for attaching the previous-trimester aggregates: join on the
# household key; when a column name exists on both sides, the current one
# keeps its name and the previous one gets `_<trimester_previous>` appended.
previous_merge_kwargs = {
    'on': key,
    'suffixes': ('', f'_{trimester_previous}'),
}
# NOTE(review): no 'how' is passed to either merge. If merge_kwargs are
# forwarded to pandas' merge, the default is an inner join, which would drop
# households missing from the previous trimester even though the select step
# later tests for NaN previous amounts — confirm this is intended.
merge_current_previous_transformer = MergeTransformer(
    lambda: survey_previous_aggregated,
    merge_kwargs=previous_merge_kwargs,
)
merge_current_household = MergeTransformer(
    lambda: survey_household_raw,
    merge_kwargs={'on': key},
)

# Apply both merges in sequence to the aggregated current-trimester frame.
merge_pipeline = make_pipeline(
    merge_current_previous_transformer,
    merge_current_household,
)
survey_current_merged = merge_pipeline.transform(survey_current_aggregated)

In [None]:
# Preview the first five rows of the merged frame.
cdisplay(survey_current_merged.head(n=5))

## Select Transformer

In [None]:
# One tuple per program to review, unpacked downstream as
# (short name, amount-column stem, `ps_<...>` check-column stem, imputed amount).
fix_qec_cols = [
    (
        'cep',
        'comer_primero',
        'comer_es_primero',
        825.0,
    ),
    (
        'ilae',
        'inc_asis_escolar',
        'incentivo_asist_escolar',
        300,
    ),
]

In [None]:
# Suffix the merge step appends to previous-trimester columns whose names
# clash with current ones. It is derived from `trimester_previous` so the
# column names stay in sync with the merge instead of assuming '_1'.
previous_suffix = f'_{trimester_previous}'

select_current_map = {
    # `<short>_revisar`: review code for each program listed in fix_qec_cols.
    #   1 -> the program is reported in the current trimester
    #   2 -> not reported now, Quedate en Casa is reported, and the
    #        previous-trimester amount is positive
    #   3 -> not reported now, Quedate en Casa is reported, `ps_<check_col>` is 1,
    #        and the previous-trimester amount is missing or zero
    #   0 -> none of the above
    **{
        f'{short}_revisar': {
            lambda df, col=monto_col: df[f'recibio_{col}_any']: 1,
            # The comparison must be parenthesised: `&` binds tighter than `>`,
            # so without the parentheses the whole conjunction was compared to 0.0.
            lambda df, col=monto_col, prev=previous_suffix: (
                ~df[f'recibio_{col}_any']
                & df['recibio_quedate_en_casa_any']
                & (df[f'gob_{col}_monto_sum{prev}'] > 0.0)
            ): 2,
            lambda df, col=monto_col, check_col=check_col, prev=previous_suffix: (
                ~df[f'recibio_{col}_any']
                & df['recibio_quedate_en_casa_any']
                & (df[f'ps_{check_col}'] == 1)
                & (
                    df[f'gob_{col}_monto_sum{prev}'].isna()
                    | (df[f'gob_{col}_monto_sum{prev}'] == 0.0)
                )
            ): 3,
            'default': 0
        }
        for short, monto_col, check_col, _ in fix_qec_cols
    },
    # `<short>_rev`: revised program amount chosen by the review code:
    # current sum (1), previous-trimester sum (2), imputed amount (3), else 0.0.
    **{
        f'{short}_rev': {
            lambda df, short=short: df[f'{short}_revisar'] == 1:
                lambda df, col=monto_col: df[f'gob_{col}_monto_sum'],
            lambda df, short=short: df[f'{short}_revisar'] == 2:
                lambda df, col=monto_col, prev=previous_suffix:
                    df[f'gob_{col}_monto_sum{prev}'],
            lambda df, short=short: df[f'{short}_revisar'] == 3:
                imputed_monto,
            'default': 0.0
        }
        for short, monto_col, _, imputed_monto in fix_qec_cols
    },
    # Quedate en Casa amount net of the `cep` amount when `cep` was coded 2 or 3.
    'qec_cep_rev': {
        lambda df: (df['cep_revisar'] == 2) | (df['cep_revisar'] == 3):
            lambda df: df['gob_quedate_en_casa_sum'] - df['cep_rev'],
        lambda df: df['gob_quedate_en_casa_sum'].isna(): 0.0,
        'default': lambda df: df['gob_quedate_en_casa_sum']
    },
    # Same adjustment for `ilae`, applied on top of `qec_cep_rev`.
    'qec_rev': {
        lambda df: (df['ilae_revisar'] == 2) | (df['ilae_revisar'] == 3):
            lambda df: df['qec_cep_rev'] - df['ilae_rev'],
        lambda df: df['qec_cep_rev'].isna(): 0.0,
        'default': lambda df: df['qec_cep_rev']
    }
}

# NOTE(review): presumably SelectTransformer evaluates each column's conditions
# in insertion order and lets later columns read earlier ones — confirm.
select_current_transformer = SelectTransformer(select_current_map)
survey_current_selected = select_current_transformer.transform(survey_current_merged)

In [None]:
# Show the first five rows of the frame with the review columns added.
survey_current_selected.head(n=5)

## Assign Transformer

In [None]:
# Differences between original and revised Quedate en Casa amounts:
#   cep_diff  = original amount     - amount after the cep adjustment
#   ilae_diff = after cep adjustment - after the ilae adjustment
#   qec_diff  = original amount     - fully revised amount
assign_member_diff_map = {
    'cep_diff': lambda df: df['gob_quedate_en_casa_sum'].sub(df['qec_cep_rev']),
    'ilae_diff': lambda df: df['qec_cep_rev'].sub(df['qec_rev']),
    'qec_diff': lambda df: df['gob_quedate_en_casa_sum'].sub(df['qec_rev']),
}

assign_member_diff_transformer = AssignTransformer(assign_member_diff_map)
survey_current_diff_assign = assign_member_diff_transformer.transform(
    survey_current_selected
)

## Replace columns

In [None]:
# Keep only the household key, the revised amounts and their differences.
# The three `gob_*_sum` overwrites that used to precede this selection were
# discarded by it, so they are omitted: the resulting frame is the same.
correction_cols = [
    'id_hogar',
    'cep_rev', 'ilae_rev', 'qec_rev',
    'qec_diff', 'cep_diff', 'ilae_diff',
]
# .copy() keeps the result independent of survey_current_diff_assign.
survey_current_final = survey_current_diff_assign[correction_cols].copy()

In [None]:
# Preview the first rows only instead of rendering the whole frame.
survey_current_final.head()

# Save correction

In [None]:
# Serialize the correction table; the file name carries the year and trimester.
correction_path = ENCFT_OBJECT_DIR / f'encft-{year}-{trimester}-correction.pkl'
save_survey_with_pickle(survey_current_final, correction_path)