In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import sys
import importlib

import itertools as it
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import statsmodels.api as sm

from itertools import product
from pandas_profiling import ProfileReport
from sklearn.pipeline import make_pipeline

from criteriaetl.utils.expansion_func import (get_value_counts_with_expansion_factor,
    get_percentage_table_with_expansion_factor)
from criteriaetl.utils.display_func import cdisplay, percentage_count_plot
from criteriaetl.utils.common_func import (get_weighted_complete_randomization_series_on_subset, 
    proportional_cut, weighted_qcut, get_partition_bool_columns_dict)
from criteriaetl.transformers.columns_base import (NameTransformer, 
    ReplaceTransformer, SelectTransformer, AssignTransformer)
from criteriaetl.transformers.rows_base import AggregateTransformer
from criteriaetl.transformers.fusion_base import MergeTransformer

from projectetl.utils.dataload import (load_survey_data_do, save_survey_with_pickle,
                                       load_survey_from_pickle)
from projectetl.utils.config import (ENHOGAR_DIR, ENCFT_SURVEY_PATH, ENCFT_PREVIOUS_SURVEY_PATH,
                                     ENCFT_OBJECT_DIR)

# Configuration variables

In [None]:
# Toggle read by the ENCFT load cell: a truthy value re-reads the raw survey and
# rewrites the pickle cache; a falsy value loads the existing pickle instead.
get_raw = 0

# Load surveys

## ENHOGAR 2016

In [None]:
def _read_spss(path):
    """Read an SPSS ``.sav`` file with pandas; the path is handed over as a plain string."""
    return pd.read_spss(str(path))


# Two ENHOGAR 2016 public files, both read through the same SPSS reader
# (judging by the file names: the 'Elegidos' file and the 'Personas' file).
enhogar_elected_raw = load_survey_data_do(
    ENHOGAR_DIR / 'Elegidos_ENHOGAR2016_PUB.sav', load_func=_read_spss)
enhogar_member_raw = load_survey_data_do(
    ENHOGAR_DIR / 'Personas_ENHOGAR2016_PUB.sav', load_func=_read_spss)

## ENCFT

In [None]:
# Either load the cached pickle (default) or rebuild it from the raw survey files.
encft_pickle_path = ENCFT_OBJECT_DIR / 'encft202001-202004.pkl'
if not get_raw:
    encft_raw = load_survey_from_pickle(encft_pickle_path)
else:
    encft_raw = load_survey_data_do(ENCFT_SURVEY_PATH)
    save_survey_with_pickle(encft_raw, encft_pickle_path)

In [None]:
# Keep only the 'Miembros' entry of the loaded ENCFT object
# (presumably the person-level table -- confirm against load_survey_data_do).
encft_member_raw = encft_raw['Miembros']

# Preprocess ENCFT

In [None]:
# Raw ENCFT predictors for the care model. The derived predictors
# (mayores_10_annos, grupo_etareo, condicion_actividad) are appended further down,
# once they have been created; 'categoria_cesantia' is currently disabled.
encft_care_features = [
    'sexo', 'parentesco', 'estado_civil',
    'nivel_ultimo_ano_aprobado', 'ultimo_ano_aprobado', 'curso_matriculado',
    'trabajo_semana_pasada', 'busco_trabajo_establ_negocio', 'aceptaria_trab_sem_pasada',
    'categoria_principal', 'horas_trabaja_semana_principal', 'afiliado_seguro_salud',
    'asiste_centro_educativo', 'realiza_curso_tecnico',
    'miembro', 'id_provincia', 'zona',
]

# The domestic-work model uses the care predictors plus two literacy/enrolment columns.
encft_domestic_work_features = [
    *encft_care_features,
    'sabe_leer_escribir',
    'nivel_se_matriculo',
]

# Identifier columns carried along with the ENCFT rows.
merge_cols = ['trimestre', 'id_hogar', 'id_persona']

# Columns needed to build the derived predictors, plus the expansion factor.
encft_extra_columns = [
    'edad',
    'motivo_no_busca_trabajo',
    'motivo_no_disponible',
    'motivo_no_disponible_esp',
    'realizo_actividad',
    'motivo_no_trab_sem_pasada',
    'tenia_empleo_negocio',
    'que_hizo_buscar_trabajo',
    'factor_expansion',
]

In [None]:
# Keep members aged 10 or older and only the columns used below.
# One .loc selection plus .copy() replaces the chained df[cols][mask] indexing, so the
# result is an independent frame rather than a possible view of the raw survey.
encft_member = encft_member_raw.loc[
    encft_member_raw['edad'] >= 10,
    merge_cols + encft_domestic_work_features + encft_extra_columns,
].copy()

## Select Transformer

In [None]:
# Inclusive (min_age, max_age) bounds of each age group; None marks an open upper bound,
# which the 'grupo_etareo' selectors replace with 96.
age_groups = [(10, 14), (15, 24), (25, 45), (46, 59), (60, 74), (75, None)]

In [None]:
# Rules for the derived ENCFT columns. Each inner key is a callable returning a boolean row
# mask (or the string 'default'); each value is the code given to the matching rows.
# NOTE(review): presumably the first matching rule wins, as with numpy.select -- confirm
# against SelectTransformer.
encft_select_member_map = {
    'condicion_actividad': {
        # 1 = employed: worked at least one hour last week (trabajo_semana_pasada), did
        # some activity for money or goods last week (realizo_actividad), has a job or
        # business to return to soon (tenia_empleo_negocio), or gave a reason for not
        # having worked last week (motivo_no_trab_sem_pasada codes 1-9).
        # NOTE(review): these two columns are compared with the label 'Sí' while the
        # other conditions use numeric codes -- confirm how the loader encodes them.
        (lambda df: (df['trabajo_semana_pasada'] == 'Sí')
            | df['realizo_actividad'].isin(range(1, 8))
            | (df['tenia_empleo_negocio'] == 'Sí')
            | df['motivo_no_trab_sem_pasada'].isin(range(1, 10))): 1,

        # 2 = disabled or elderly (the age-related reasons only count above 65).
        (lambda df: (df['motivo_no_busca_trabajo'] == 9)
            | (df['motivo_no_disponible'] == 4)
            | ((df['motivo_no_busca_trabajo'] == 6) & (df['edad'] > 65))
            | ((df['motivo_no_disponible_esp'] == 'EDAD') & (df['edad'] > 65))): 2,
        # 3 = retired or rentier.
        (lambda df: (df['motivo_no_busca_trabajo'] == 10)
            | (df['motivo_no_disponible'] == 3)): 3,
        # 4 = housework.
        (lambda df: (df['motivo_no_busca_trabajo'] == 8)
            | (df['motivo_no_disponible'] == 2)): 4,
        # 5 = student.
        (lambda df: (df['motivo_no_busca_trabajo'] == 7)
            | (df['motivo_no_disponible'] == 1)): 5,

        # 6 = unemployed: looked for work or tried to start a business in the last four
        # weeks (busco_trabajo_establ_negocio), actually did something to that end
        # (que_hizo_buscar_trabajo), or motivo_no_busca_trabajo == 1.
        # The last comparison is parenthesised on purpose: `|` binds tighter than `==`,
        # so without the parentheses the whole OR-ed mask was compared against 1.
        (lambda df: (df['busco_trabajo_establ_negocio'] == 1)
            | df['que_hizo_buscar_trabajo'].isin(range(1, 6))
            | (df['motivo_no_busca_trabajo'] == 1)): 6,

        # 7 = inactive (everything else).
        'default': 7,
    },

    'grupo_etareo': {
        # The literal '97+' age label goes to the last group.
        (lambda df: df['edad'] == '97+'): len(age_groups),
        # Group i + 1 for ages inside the i-th inclusive (min, max) bound; an open upper
        # bound (None) is treated as 96. Loop values are bound as lambda defaults so each
        # rule keeps its own bounds.
        **{
            (lambda df, min_group=min_group, max_group=max_group:
                df['edad'].isin(range(min_group, (max_group or 96) + 1))):
                (lambda _, i=i: i + 1)
            for i, (min_group, max_group) in enumerate(age_groups)
        },
        'default': 0
    }
}

encft_select_member_transformer = SelectTransformer(encft_select_member_map)
encft_member_selected = encft_select_member_transformer.transform(encft_member)

## Assign Transformer

In [None]:
encft_assign_member_map = {
    # Number of household members (per trimestre / id_hogar) aged 10 or older.
    # The previous `len(ages >= 10)` returned the length of the boolean mask, i.e. the
    # count of all rows in the group regardless of age; summing the mask counts only
    # the rows that satisfy the condition.
    'mayores_10_annos': lambda df: (
        df['edad'].ge(10).astype(int)
        .groupby([df['trimestre'], df['id_hogar']])
        .transform('sum')
    ),
}

encft_assign_member_transformer = AssignTransformer(encft_assign_member_map)
encft_member_assigned = encft_assign_member_transformer.transform(encft_member_selected)

## Fill NaN

In [None]:
# ENCFT columns whose missing answers are encoded as 0 before the recode step.
columns_with_nan = [
    'horas_trabaja_semana_principal',
    'ultimo_ano_aprobado',
    'curso_matriculado',
    'miembro',
    'aceptaria_trab_sem_pasada',
    'asiste_centro_educativo',
    'busco_trabajo_establ_negocio',
    'estado_civil',
    'trabajo_semana_pasada',
    'nivel_ultimo_ano_aprobado',
    'categoria_principal',
    'realiza_curso_tecnico',
    'sabe_leer_escribir',
    'nivel_se_matriculo',
]
_encft_zero_filled = encft_member_assigned[columns_with_nan].fillna(0)
encft_member_assigned[columns_with_nan] = _encft_zero_filled

## Replace Transformer

In [None]:
# Both education-level columns use the same recode; each gets its own copy of the dict.
_encft_education_level_recode = {4: 3, 5: 4, 6: 5, 7: 5, 8: 5, 10: 0, 99: 0}

# Value recodes applied to the ENCFT member table.
encft_replace_member_map = {
    'sexo': {1: 0, 2: 1},              # 1 (man) -> 0, 2 (woman) -> 1
    'parentesco': {11: 10, 12: 11},    # 11 ('abuelo') is grouped under 10 ('otro pariente')
    'nivel_ultimo_ano_aprobado': dict(_encft_education_level_recode),
    'nivel_se_matriculo': dict(_encft_education_level_recode),
    'categoria_principal': {5: 4, 6: 5, 7: 6, 8: 7, 4: 0},
}

# Columns cast to pandas 'category' after the recode.
categorical = [
    'parentesco', 'nivel_ultimo_ano_aprobado', 'nivel_se_matriculo',
    'categoria_principal', 'afiliado_seguro_salud', 'realiza_curso_tecnico',
    'id_provincia', 'sabe_leer_escribir',
]

# Columns cast to plain int after the recode.
integers = [
    'aceptaria_trab_sem_pasada', 'asiste_centro_educativo',
    'busco_trabajo_establ_negocio', 'estado_civil', 'trabajo_semana_pasada',
    'curso_matriculado', 'ultimo_ano_aprobado',
]

encft_astype_member_map = dict.fromkeys(categorical, 'category')
encft_astype_member_map.update(dict.fromkeys(integers, int))

encft_replace_member_transformer = ReplaceTransformer(
    encft_replace_member_map, astype_dict=encft_astype_member_map)
encft_member_replaced = encft_replace_member_transformer.transform(encft_member_assigned)

In [None]:
cdisplay(encft_member_replaced)

## Change categories dtypes

In [None]:
# Categorical columns whose category labels are converted to integers in the next cell.
change_dtype_cols = [
    'categoria_principal',
    'nivel_ultimo_ano_aprobado',
    'realiza_curso_tecnico',
    'sabe_leer_escribir',
    'nivel_se_matriculo'
]

In [None]:
# Relabel the categories of each listed column as int64 values.
# `rename_categories` replaces the in-place `.cat.categories = ...` assignment and
# `pd.Index(...).astype('int64')` replaces `pd.Int64Index`; both of the old spellings
# were removed in pandas 2.0, while the new ones also work on pandas 1.x.
for col in change_dtype_cols:
    int_categories = pd.Index(encft_member_replaced[col].cat.categories).astype('int64')
    encft_member_replaced[col] = encft_member_replaced[col].cat.rename_categories(int_categories)

## MinMax Scaler

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Rescale weekly working hours to [0, 1] on a copy of the recoded frame.
# NOTE(review): the cells visible below build the model matrices from
# `encft_member_replaced`, not from this scaled copy -- confirm whether that is intended.
_encft_hours_cols = ['horas_trabaja_semana_principal']
encft_member_scaled = encft_member_replaced.copy()
_encft_hours_scaled = MinMaxScaler().fit_transform(encft_member_scaled[_encft_hours_cols])
encft_member_scaled[_encft_hours_cols] = _encft_hours_scaled

## Final dataset

In [None]:
# Add the derived predictors to both feature lists, in place (other cells hold
# references to the same list objects). Only missing names are appended, so running
# this cell more than once no longer duplicates columns.
_derived_features = ['mayores_10_annos', 'condicion_actividad', 'grupo_etareo']
for _feature_list in (encft_care_features, encft_domestic_work_features):
    _feature_list.extend([name for name in _derived_features if name not in _feature_list])

### Care dataset

In [None]:
encft_care_pre_dummies = encft_member_replaced[merge_cols + encft_care_features]

# Columns that are NOT one-hot encoded: float columns, identifier columns and the
# predictors kept as plain numbers.
_encft_care_not_encoded = (
    set(encft_care_pre_dummies.select_dtypes(float).columns)
    | {'mayores_10_annos', 'miembro', 'sexo', 'zona', 'curso_matriculado', 'ultimo_ano_aprobado'}
    | set(merge_cols)
)
# A sorted list instead of a set: the value is passed as `columns=` to pd.get_dummies,
# where a set gives a run-dependent column order and is rejected as an indexer by pandas 2.
columns_care = sorted(set(encft_care_pre_dummies.columns) - _encft_care_not_encoded)

In [None]:
columns_care

In [None]:
# One-hot encode the categorical care predictors, dropping the first level of each.
# `sorted(...)` hands get_dummies an ordered list whatever container `columns_care` is;
# a set gives a run-dependent column order and is rejected as an indexer by pandas 2.
encft_care_impute = pd.get_dummies(encft_care_pre_dummies, drop_first=True,
                                   columns=sorted(columns_care))

In [None]:
cdisplay(encft_care_impute)

### Domestic work dataset

In [None]:
encft_domestic_work_pre_dummies = encft_member_replaced[merge_cols + encft_domestic_work_features]

# Columns that are NOT one-hot encoded: float columns, identifier columns and the
# predictors kept as plain numbers.
_encft_domestic_work_not_encoded = (
    set(encft_domestic_work_pre_dummies.select_dtypes(float).columns)
    | {'mayores_10_annos', 'miembro', 'sexo', 'zona', 'curso_matriculado', 'ultimo_ano_aprobado'}
    | set(merge_cols)
)
# A sorted list instead of a set: the value is passed as `columns=` to pd.get_dummies,
# where a set gives a run-dependent column order and is rejected as an indexer by pandas 2.
columns_domestic_work = sorted(
    set(encft_domestic_work_pre_dummies.columns) - _encft_domestic_work_not_encoded)

In [None]:
columns_domestic_work

In [None]:
# One-hot encode the categorical domestic-work predictors, dropping the first level of each.
# `sorted(...)` hands get_dummies an ordered list whatever container the column selection
# is; a set gives a run-dependent column order and is rejected as an indexer by pandas 2.
encft_domestic_work_impute = pd.get_dummies(encft_domestic_work_pre_dummies, drop_first=True,
                                            columns=sorted(columns_domestic_work))

In [None]:
cdisplay(encft_domestic_work_impute)

### Checks

In [None]:
# Dummy columns removed by drop_first=True, i.e. the baseline level of every encoded
# care predictor. The column selection is passed as a sorted list because get_dummies
# indexes the frame with it and pandas 2 rejects set indexers.
_care_all_levels = pd.get_dummies(encft_care_pre_dummies, columns=sorted(columns_care))
_care_kept_levels = pd.get_dummies(encft_care_pre_dummies, drop_first=True,
                                   columns=sorted(columns_care))
set(_care_all_levels.columns) - set(_care_kept_levels.columns)

In [None]:
# Dummy columns removed by drop_first=True, i.e. the baseline level of every encoded
# domestic-work predictor. The column selection is passed as a sorted list because
# get_dummies indexes the frame with it and pandas 2 rejects set indexers.
_domestic_work_all_levels = pd.get_dummies(encft_domestic_work_pre_dummies,
                                           columns=sorted(columns_domestic_work))
_domestic_work_kept_levels = pd.get_dummies(encft_domestic_work_pre_dummies, drop_first=True,
                                            columns=sorted(columns_domestic_work))
set(_domestic_work_all_levels.columns) - set(_domestic_work_kept_levels.columns)

# Preprocess ENHOGAR 2016

In [None]:
# ENHOGAR columns used together as the grouping key for the per-group row counter and,
# with 'h703', as the merge key between the member and elected files.
basic_cols = [
    'hhupm', 'hhviv', 'hhogar'
]

## Assign Transformer

In [None]:
def _running_row_number(df, cols=basic_cols):
    """Number the rows 1, 2, ... inside each `cols` group by cumulating the all-ones 'temp' column."""
    return df.groupby(cols)['temp'].cumsum()


# 'h703' is rebuilt on the member file so it can serve, with basic_cols, as merge key.
# NOTE(review): this assumes the file's row order within a group matches the numbering
# used by 'h703' in the elected file -- confirm.
enhogar_assign_member_map = {
    'temp': lambda _: 1,
    'h703': _running_row_number,
}

enhogar_assign_member_transformer = AssignTransformer(enhogar_assign_member_map)
enhogar_member_assigned = enhogar_assign_member_transformer.transform(enhogar_member_raw)

## Merge Transformer

In [None]:
# Attach the member-file columns to the elected file on basic_cols + 'h703';
# columns present in both keep the elected name, the member copy gets '_other'.
_enhogar_merge_kwargs = {
    'on': basic_cols + ['h703'],
    'suffixes': ('', '_other'),
}
enhogar_merge_member_elected_transformer = MergeTransformer(
    lambda: enhogar_member_assigned,
    merge_kwargs=_enhogar_merge_kwargs,
)
enhogar_elected_merged = enhogar_merge_member_elected_transformer.transform(enhogar_elected_raw)

## Name Transformer

In [None]:
# ENHOGAR questionnaire codes -> readable column names.
# NOTE(review): the codes for 509-516 carry an 'l' prefix ('lh509' ...) unlike their
# neighbours -- confirm they match the column names in the survey file.
enhogar_name_elected_map = {
    # --- demographics
    'h402': 'sexo',
    'h403': 'edad',
    'h405': 'parentesco',
    'h407': 'estado_civil',
    # --- literacy and education
    'h409': 'aprendio_leer_escribir',
    'h410': 'donde_aprendio_leer_escribir',
    'h411': 'asiste_asistio_centro_educativo',
    'h412': 'nivel_educativo_mas_alto',
    'h413': 'curso_mas_alto_que_curso',
    'h414': 'asiste_centro_educativo_actualmente',
    'h415': 'nivel_esta_asistiendo',
    'h416': 'curso_esta_asistiendo',
    # --- employment
    'h501': 'trabajo_semana_pasada',
    'h502': 'tenia_empleo_semana_pasada',
    'h503': 'cultivo_cosecho_semana_pasada',
    'h504': 'elaboro_producto_semana_pasada',
    'h505': 'ayudo_familiar_semana_pasada',
    'h506': 'tuvo_trabajo_domestico_por_paga_semana_pasada',
    'h507': 'busco_trabajo_cuatro_semanas_pasadas',
    'h508': 'motivo_no_busca_trabajo',
    'lh509': 'podria_aceptar_trabajo_semana_pasada',
    'lh510': 'dispone_condiciones_para_trabajar',
    'lh511': 'dispone_condiciones_para_trabajar_semana_pasada',
    'lh512': 'ha_trabajado_por_paga',
    'lh513': 'ultima_principal_ocupacion',
    'lh514': 'se_dedica_ultima_empresa',
    'lh515': 'ocupacion_ultimo_trabajo',
    'lh516': 'horas_trabajadas_semanalmente_ocupacion_principal',
    # --- insurance and household
    'h601': 'tiene_seguro_salud_pension',
    'h702a': 'mayores_10_annos',
    'h704': 'edad_elegido',
    # --- time use
    'h1001': 'trabajo_por_paga_semana_pasada',
    'h1002': 'dias_trabajo_semana_pasada',
    'h1003': 'horas_trabajo_semana_pasada',
    'h1005': 'estudia_actualmente',
    'h1009': 'realiza_actividad_vocacional',
    'h1015': 'dedico_tiempo_cuidados',
    'h1016': 'tiempo_cuidados_dias',
    'h1017': 'tiempo_cuidado_horas_diarias',
    'h1030': 'tiempo_agricultura_ganado_dias',
    'h1032': 'tiempo_trabajo_sin_paga_dias',
    'h1034': 'tiempo_trabajo_domestico_hogar_dias',
    'h1035': 'tiempo_trabajo_domestico_hogar_horas',
    'h1036': 'tiempo_trabajo_domestico_no_hogar_dias',
    'h1037': 'tiempo_trabajo_domestico_no_hogar_horas',
}

# Columns kept under their original names.
keep = [
    'hhupm', 'hhviv', 'hhogar', 'hzona', 'hprovi',
    # strata
    'hestrat', 'region', 'grupsec', 'zona', 'hmiembro', 'hprovin', 'estratos',
    # expansion / weighting factors
    'fexpansion', 'fponderacion', 'fexpansion_tic', 'fponderacion_tic',
]

enhogar_name_elected_transformer = NameTransformer(enhogar_name_elected_map, keep_features=keep)
enhogar_elected_named = enhogar_name_elected_transformer.transform(enhogar_elected_merged)

## Fill NaN

In [None]:
enhogar_elected_named['nivel_esta_asistiendo'].isna().sum()

In [None]:
# Numeric ENHOGAR columns whose missing values become 0.
float_columns_with_nan = ['mayores_10_annos']
_enhogar_float_filled = enhogar_elected_named[float_columns_with_nan].fillna(0)
enhogar_elected_named[float_columns_with_nan] = _enhogar_float_filled

In [None]:
# Categorical ENHOGAR columns whose missing answers are encoded as category 0.
# ('nivel_esta_asistiendo' used to be listed twice; the second pass was a no-op.)
columns_with_nan = [
    'estado_civil', 'donde_aprendio_leer_escribir',
    'nivel_educativo_mas_alto',
    'asiste_centro_educativo_actualmente', 'nivel_esta_asistiendo',
    'tenia_empleo_semana_pasada',
    'cultivo_cosecho_semana_pasada', 'elaboro_producto_semana_pasada',
    'ayudo_familiar_semana_pasada',
    'tuvo_trabajo_domestico_por_paga_semana_pasada',
    'busco_trabajo_cuatro_semanas_pasadas',
    'podria_aceptar_trabajo_semana_pasada',
    'ocupacion_ultimo_trabajo',
    'estudia_actualmente',
    'realiza_actividad_vocacional', 'dedico_tiempo_cuidados',
    'curso_mas_alto_que_curso', 'curso_esta_asistiendo',
    'horas_trabajadas_semanalmente_ocupacion_principal',
]

for col in columns_with_nan:
    column = enhogar_elected_named[col]
    # A categorical can only be filled with an existing category, so register 0 first
    # when it is not one yet.
    if 0 not in column.cat.categories:
        column = column.cat.add_categories(0)
    enhogar_elected_named[col] = column.fillna(0)

## Replace Transformer

In [None]:
# Answer labels of 'motivo_no_busca_trabajo'; the recode map assigns position + 1
# (1..13) as the numeric code, so the order of this list is significant.
motivos_no_busca_trabajo = [
    'Ha buscado trabajo y no encuentra',            # 1
    'Familiar y otras personas le están buscando',  # 2
    'Solicitó y espera respuesta',                  # 3
    'Está incapacitado permanentemente',            # 4
    'Está Incapacitado temporalmente',              # 5
    'Está estudiando',                              # 6
    'Se dedica a quehaceres del hogar',             # 7
    'Es rentista',                                  # 8
    'No tiene suficiente educación o experiencia',  # 9
    'Cree que no va a encontrar',                   # 10
    'Por razones de edad',                          # 11
    'No quiso buscar trabajo',                      # 12
    'Pensionado o jubilado',                        # 13
]

In [None]:
# Recodes shared by several columns. Every column receives its own copy so that no two
# entries of the map alias the same dict.
_enhogar_yes_no_accented = {'Sí': 1, 'No': 2, 9.0: 0}   # columns whose label is 'Sí'
_enhogar_yes_no_plain = {'Si': 1, 'No': 2, 9.0: 0}      # columns whose label is 'Si'
_enhogar_education_levels = {
    'Inicial o pres-escolar': 0,
    'Básico o Primario': 1,
    'Medio o Secundario': 2,
    'Universitario': 3,
    'Postgrado, Maestría o Doctorado': 4,
    'No sabe': 0,
    'Sin información': 0,
}
# Province labels in code order: position + 1 is the numeric code (1..32).
_enhogar_provinces = [
    'Distrito Nacional', 'Azua', 'Bahoruco', 'Barahona', 'Dajabón', 'Duarte',
    'Elías Piña', 'El Seibo', 'Espaillat', 'Independencia', 'La Altagracia',
    'La Romana', 'La Vega', 'María Trinidad Sánchez', 'Monte Cristi', 'Pedernales',
    'Peravia', 'Puerto Plata', 'Salcedo', 'Samaná', 'San Cristóbal', 'San Juan',
    'San Pedro de Macorís', 'Sánchez Ramírez', 'Santiago', 'Santiago Rodríguez',
    'Valverde', 'Monseñor Nouel', 'Monte Plata', 'Hato Mayor', 'San José de Ocoa',
    'Santo Domingo',
]

# Label -> code recodes for the ENHOGAR elected table. The duplicated
# 'podria_aceptar_trabajo_semana_pasada' entry (identical value) has been removed, and
# `np.nan` replaces the `np.NaN` alias that NumPy 2.0 no longer provides.
enhogar_replace_elected_map = {
    'motivo_no_busca_trabajo': {
        **{reason: i + 1 for i, reason in enumerate(motivos_no_busca_trabajo)},
        'Otro': 98
    },
    'curso_mas_alto_que_curso': {'No sabe': 9, 99: 9},
    'curso_esta_asistiendo': {'No sabe': 9, 99: 9},
    'horas_trabajadas_semanalmente_ocupacion_principal': {'No sabe': 0.0, 99.0: 0.0},
    'sexo': {'Hombre': 0, 'Mujer': 1},
    'parentesco': {
        'El (la) Jefe(a)': 1,
        'Esposo(a) o compañero(a)': 2,
        'Hijo(a)': 3,
        'Yerno o nuera': 6,
        'Nieto(a)': 5,
        'Padre o madre': 7,
        'Suegro(a)': 8,
        'Hermano(a)': 9,
        'Cuñado(a)': 10,
        'Tío(a)': 10,
        'Sobrino(a)': 10,
        'Otro pariente': 10,
        'Adoptado/ hijo de crianza': 4,
        'Trabajador(a) doméstico(a)': 11,
        'Otro': 11,
        99.0: 0,
    },
    'estado_civil': {
        'Casado(a)': 2,
        'Unido(a)': 1,
        'Viudo(a)': 5,
        'Divorciado(a)': 3,
        'Separado de matrimonio legal o religioso': 4,
        'Separado(a) de unión libre': 4,
        'Soltero(a)': 6,
        9.0: 0,
    },
    'nivel_educativo_mas_alto': dict(_enhogar_education_levels),
    'nivel_esta_asistiendo': {**_enhogar_education_levels, 9.0: 0},
    'trabajo_semana_pasada': dict(_enhogar_yes_no_accented),
    'busco_trabajo_cuatro_semanas_pasadas': dict(_enhogar_yes_no_accented),
    'podria_aceptar_trabajo_semana_pasada': dict(_enhogar_yes_no_accented),
    'ocupacion_ultimo_trabajo': {
        'empleado u obrero del gobierno central o municipal?': 1,
        'empleado u obrero de empresa pública?': 2,
        'empleado u obrero de empresas privadas?': 3,
        'empleador o patrón?': 5,
        'trabajador(a) para un familiar o no familiar sin paga o ganancia?': 7,
        'profesional que trabaja por cuenta propia/': 6,
        'no profesional que trabaja por cuenta propia?': 6,
        'trabajador(a) doméstica?': 4,
        'Otro': 0,
        'sin información': 0
    },
    'tiene_seguro_salud_pension': {
        'Sí': 1,
        'No': 2,
        'No sabe': 0,
        'Sin información': 0,
    },
    'estudia_actualmente': dict(_enhogar_yes_no_plain),
    'realiza_actividad_vocacional': dict(_enhogar_yes_no_plain),
    'hprovin': {name: code for code, name in enumerate(_enhogar_provinces, start=1)},
    'zona': {'Urbano': 1, 'Rural': 2},
    # Unlike the other yes/no columns, 'No' becomes 0 here (this column is the care target).
    'dedico_tiempo_cuidados': {
        'Si': 1,
        'No': 0,
        9.0: 0,
        np.nan: 0
    },
    'aprendio_leer_escribir': {
        'Sí': 1,
        'No': 2,
        'No sabe': 0,
        9: 0,
    }
}

# Target dtypes applied after the recode.
enhogar_astype_elected_map = {
    'horas_trabajadas_semanalmente_ocupacion_principal': float,
    'curso_mas_alto_que_curso': int,
    'curso_esta_asistiendo': int,
    'mayores_10_annos': int,
    'hmiembro': int,
    'motivo_no_busca_trabajo': 'category',
    'parentesco': 'category',
    'estado_civil': 'category',
    'nivel_educativo_mas_alto': 'category',
    'trabajo_semana_pasada': 'category',
    'busco_trabajo_cuatro_semanas_pasadas': 'category',
    'podria_aceptar_trabajo_semana_pasada': 'category',
    'ocupacion_ultimo_trabajo': 'category',
    'tiene_seguro_salud_pension': 'category',
    'estudia_actualmente': 'category',
    'realiza_actividad_vocacional': 'category',
    'hprovin': 'category',
    'dedico_tiempo_cuidados': 'category',
    'aprendio_leer_escribir': 'category'
}

enhogar_replace_elected_transformer = ReplaceTransformer(enhogar_replace_elected_map, astype_dict=enhogar_astype_elected_map)
enhogar_elected_replaced = enhogar_replace_elected_transformer.transform(enhogar_elected_named)

In [None]:
list(enhogar_elected_replaced['hmiembro'].unique())

## Select Transformer

In [None]:
def _weekly_hours_rules(days_col, hours_col):
    """Build the select rules for a weekly-hours column computed as days * daily hours.

    Rows whose `days_col` equals '97' or 99.0, or is missing, get 0.0; every other row
    gets `days_col` (cast to float) multiplied by `hours_col`.
    """
    return {
        (lambda df: (df[days_col] == '97') | (df[days_col] == 99.0)): 0.0,
        (lambda df: df[days_col].isna()): 0.0,
        'default': lambda df: df[days_col].astype(float) * df[hours_col],
    }


# Rules for the derived ENHOGAR columns. Each inner key is a callable returning a boolean
# row mask (or the string 'default'); each value is the code, or a callable producing the
# values, given to the matching rows.
# NOTE(review): presumably the first matching rule wins, as with numpy.select -- confirm
# against SelectTransformer.
enhogar_select_elected_map = {
    'grupo_etareo': {
        # The literal '97+' age label goes to the last group.
        (lambda df: df['edad'] == '97+'): len(age_groups),
        # Group i + 1 for ages inside the i-th inclusive (min, max) bound; an open upper
        # bound (None) is treated as 96. Non-numeric ages are coerced to NaN first.
        **{
            (lambda df, min_group=min_group, max_group=max_group:
                pd.to_numeric(df['edad'], errors='coerce')
                .isin(range(min_group, (max_group or 96) + 1))):
                (lambda _, i=i: i + 1)
            for i, (min_group, max_group) in enumerate(age_groups)
        },
        'default': 0
    },

    'condicion_actividad': {
        # 1 = employed: worked at least one hour last week (trabajo_semana_pasada,
        # recoded to 1), did paid domestic work last week, or had a job last week.
        (lambda df: (df['trabajo_semana_pasada'] == 1)
            | (df['tuvo_trabajo_domestico_por_paga_semana_pasada'] == 'Sí')
            | (df['tenia_empleo_semana_pasada'] == 'Sí')): 1,

        (lambda df: df['motivo_no_busca_trabajo'].isin([4, 5, 11])): 2,  # disabled or elderly
        (lambda df: df['motivo_no_busca_trabajo'].isin([8, 13])): 3,     # retired or rentier
        (lambda df: df['motivo_no_busca_trabajo'] == 7): 4,              # housework
        (lambda df: df['motivo_no_busca_trabajo'] == 6): 5,              # student

        # 6 = unemployed: looked for work or tried to start a business in the last four
        # weeks (busco_trabajo_cuatro_semanas_pasadas), or motivo_no_busca_trabajo is
        # one of 1, 2, 3.
        (lambda df: (df['busco_trabajo_cuatro_semanas_pasadas'] == 1)
            | df['motivo_no_busca_trabajo'].isin([1, 2, 3])): 6,

        # 7 = inactive (everything else).
        'default': 7,
    },

    'tiempo_cuidados_horas_semanales': {
        # No care time -> 0 hours. The recode step maps the 'No' label of
        # 'dedico_tiempo_cuidados' to 0 before this transformer runs, so the code 0 is
        # matched as well as the raw label (the label alone never matched recoded data).
        # NOTE(review): relies on ReplaceTransformer applying that recode -- confirm.
        (lambda df: df['dedico_tiempo_cuidados'].isin([0, 'No'])
            | df['dedico_tiempo_cuidados'].isna()): 0.0,
        'default': lambda df: df['tiempo_cuidados_dias'] * df['tiempo_cuidado_horas_diarias']
    },

    'tiempo_trabajo_domestico_hogar_horas_semanales': _weekly_hours_rules(
        'tiempo_trabajo_domestico_hogar_dias', 'tiempo_trabajo_domestico_hogar_horas'),

    # Uses the *no_hogar* hours column; the previous version multiplied the no_hogar days
    # by the household ('hogar') hours, a copy-paste slip.
    'tiempo_trabajo_domestico_no_hogar_horas_semanales': _weekly_hours_rules(
        'tiempo_trabajo_domestico_no_hogar_dias', 'tiempo_trabajo_domestico_no_hogar_horas'),
}

enhogar_select_elected_transformer = SelectTransformer(enhogar_select_elected_map)
enhogar_elected_selected = enhogar_select_elected_transformer.transform(enhogar_elected_replaced)

## Assign Transformer

In [None]:
_household_hours_col = 'tiempo_trabajo_domestico_hogar_horas_semanales'
_non_household_hours_col = 'tiempo_trabajo_domestico_no_hogar_horas_semanales'
_unpaid_hours_col = 'tiempo_trabajo_doméstico_no_remunerado_horas_semanales'

# Total unpaid domestic hours per week and a flag for "any unpaid domestic work".
# The flag reads the total, so it relies on the entries being assigned in this order.
enhogar_assign_elected_map = {
    _unpaid_hours_col: lambda df: df[_household_hours_col] + df[_non_household_hours_col],
    'dedico_tiempo_trabajo_doméstico_no_remunerado': lambda df: df[_unpaid_hours_col] > 0,
}

enhogar_assign_elected_transformer = AssignTransformer(enhogar_assign_elected_map)
enhogar_elected_assigned = enhogar_assign_elected_transformer.transform(enhogar_elected_selected)

## Name Transformer

In [None]:
# ENHOGAR column names -> the ENCFT names of the matching predictors, so both surveys
# expose the same feature columns.
enhogar_name_encft_map = {
    # education
    'nivel_educativo_mas_alto': 'nivel_ultimo_ano_aprobado',
    'curso_mas_alto_que_curso': 'ultimo_ano_aprobado',
    'curso_esta_asistiendo': 'curso_matriculado',
    # employment
    'trabajo_semana_pasada': 'trabajo_semana_pasada',
    'busco_trabajo_cuatro_semanas_pasadas': 'busco_trabajo_establ_negocio',
    'podria_aceptar_trabajo_semana_pasada': 'aceptaria_trab_sem_pasada',
    'ocupacion_ultimo_trabajo': 'categoria_principal',
    'horas_trabajadas_semanalmente_ocupacion_principal': 'horas_trabaja_semana_principal',
    # insurance and studies
    'tiene_seguro_salud_pension': 'afiliado_seguro_salud',
    'estudia_actualmente': 'asiste_centro_educativo',
    'realiza_actividad_vocacional': 'realiza_curso_tecnico',
    # household, geography and weights
    'hmiembro': 'miembro',
    'hprovin': 'id_provincia',
    'fexpansion_tic': 'factor_expansion',
    # literacy and enrolment
    'aprendio_leer_escribir': 'sabe_leer_escribir',
    'nivel_esta_asistiendo': 'nivel_se_matriculo',
}

enhogar_name_encft_transformer = NameTransformer(
    enhogar_name_encft_map, keep_features=True)
enhogar_elected_encft_named = enhogar_name_encft_transformer.transform(
    enhogar_elected_assigned)

## Sort categories

In [None]:
enhogar_elected_encft_named['parentesco'].cat.categories

In [None]:
# Columns whose first category is moved to the end of the category order (so that
# drop_first=True in get_dummies drops a different baseline level).
sort_categories_cols = ['afiliado_seguro_salud', 'asiste_centro_educativo', 'parentesco', 'realiza_curso_tecnico', 'trabajo_semana_pasada']

for col in sort_categories_cols:
    current_order = list(enhogar_elected_encft_named[col].cat.categories)
    # `reorder_categories` changes only the order of the categories. Assigning the rotated
    # labels to `.cat.categories` (as before) RENAMED them instead, silently shifting every
    # value to the next label; that setter and `pd.Int64Index` are also gone in pandas 2.0.
    # Note: each run rotates the order once more, so run this cell a single time.
    enhogar_elected_encft_named[col] = (
        enhogar_elected_encft_named[col].cat.reorder_categories(current_order[1:] + current_order[:1])
    )

## MinMaxScaler

In [None]:
# Rescale weekly working hours to [0, 1] on a copy of the renamed ENHOGAR frame.
# NOTE(review): the cells visible below build the model matrices from
# `enhogar_elected_encft_named`, not from this scaled copy -- confirm whether that is intended.
_enhogar_hours_cols = ['horas_trabaja_semana_principal']
enhogar_elected_scaled = enhogar_elected_encft_named.copy()
_enhogar_hours_scaled = MinMaxScaler().fit_transform(enhogar_elected_scaled[_enhogar_hours_cols])
enhogar_elected_scaled[_enhogar_hours_cols] = _enhogar_hours_scaled

## Final dataset

### Care dataset

In [None]:
enhogar_care_pre_dummies = enhogar_elected_encft_named[encft_care_features]

# Columns that are NOT one-hot encoded: float columns and the predictors kept as plain numbers.
_enhogar_care_not_encoded = (
    set(enhogar_care_pre_dummies.select_dtypes(float).columns)
    | {'zona', 'sexo', 'curso_matriculado', 'ultimo_ano_aprobado', 'mayores_10_annos', 'miembro'}
)
# A sorted list instead of a set: the value is passed as `columns=` to pd.get_dummies,
# where a set gives a run-dependent column order and is rejected as an indexer by pandas 2.
categories_care = sorted(set(enhogar_care_pre_dummies.columns) - _enhogar_care_not_encoded)

In [None]:
# One-hot encode the categorical care predictors, dropping the first level of each.
# `sorted(...)` hands get_dummies an ordered list whatever container `categories_care` is;
# a set gives a run-dependent column order and is rejected as an indexer by pandas 2.
enhogar_care_train = pd.get_dummies(enhogar_care_pre_dummies, drop_first=True,
                                    columns=sorted(categories_care))

In [None]:
# Dummy columns removed by drop_first=True, i.e. the baseline level of every encoded
# ENHOGAR care predictor. The column selection is passed as a sorted list because
# get_dummies indexes the frame with it and pandas 2 rejects set indexers.
_enhogar_care_all_levels = pd.get_dummies(enhogar_care_pre_dummies,
                                          columns=sorted(categories_care))
_enhogar_care_kept_levels = pd.get_dummies(enhogar_care_pre_dummies, drop_first=True,
                                           columns=sorted(categories_care))
set(_enhogar_care_all_levels.columns) - set(_enhogar_care_kept_levels.columns)

### Check distribution plot

In [None]:
# for col in enhogar_train_X_pre_dummies.select_dtypes((float, int)).columns:
#     enhogar_x = enhogar_train_X_pre_dummies[col]
#     encft_x = encft_impute_pre_dummies[col]
#     sns.distplot(enhogar_train_X_pre_dummies[col], kde=False, norm_hist=True, label='enhogar', hist_kws={'bins': enhogar_x.max()})
#     sns.distplot(encft_impute_pre_dummies[col], ax=plt.gca(), kde=False, norm_hist=True, label='encft', hist_kws={'bins': encft_x.max()})
#     plt.legend()
#     plt.show()

In [None]:
# Row counts of both surveys; the comparison plots below use them as denominators to
# turn per-value counts into percentages.
enhogar_len = len(enhogar_care_pre_dummies)
encft_len = len(encft_care_pre_dummies)

In [None]:
enhogar_care_pre_dummies['afiliado_seguro_salud'].value_counts()

In [None]:
# One figure per care predictor: percentage of rows taking each value in ENHOGAR (blue)
# versus ENCFT (orange), overlaid on the same axes.
for col in enhogar_care_pre_dummies.columns:
    fig, ax = plt.subplots()
    for survey_name, survey_frame, colour in (
        ('enhogar', enhogar_care_pre_dummies, 'blue'),
        ('encft', encft_care_pre_dummies, 'orange'),
    ):
        values = survey_frame[col].astype(int)
        # x and y are the same series: the estimator ignores the y values and only counts
        # the rows of each bar, expressed as a percentage of that survey's rows. The total
        # is bound as a default so each survey keeps its own denominator.
        sns.barplot(x=values, y=values, ax=ax, label=survey_name, color=colour, alpha=.6,
                    estimator=lambda x, total=len(survey_frame): len(x) / total * 100)
    ax.set(title=f'{col}: ENHOGAR vs ENCFT', xlabel=col, ylabel='share of rows (%)')
    ax.legend()
    plt.show()
    plt.close(fig)  # release the figure; many are created in this loop

In [None]:
enhogar_care_pre_dummies['nivel_ultimo_ano_aprobado'].value_counts()

In [None]:
cdisplay(enhogar_care_train)

### Domestic work dataset

In [None]:
enhogar_domestic_work_pre_dummies = enhogar_elected_encft_named[encft_domestic_work_features]

# Columns that are NOT one-hot encoded: float columns and the predictors kept as plain numbers.
_enhogar_domestic_work_not_encoded = (
    set(enhogar_domestic_work_pre_dummies.select_dtypes(float).columns)
    | {'zona', 'sexo', 'curso_matriculado', 'ultimo_ano_aprobado', 'mayores_10_annos', 'miembro'}
)
# A sorted list instead of a set: the value is passed as `columns=` to pd.get_dummies,
# where a set gives a run-dependent column order and is rejected as an indexer by pandas 2.
categories_domestic_work = sorted(
    set(enhogar_domestic_work_pre_dummies.columns) - _enhogar_domestic_work_not_encoded)

In [None]:
# One-hot encode the categorical domestic-work predictors, dropping the first level of each.
# `sorted(...)` hands get_dummies an ordered list whatever container the column selection
# is; a set gives a run-dependent column order and is rejected as an indexer by pandas 2.
enhogar_domestic_work_train = pd.get_dummies(enhogar_domestic_work_pre_dummies, drop_first=True,
                                             columns=sorted(categories_domestic_work))

In [None]:
# Dummy columns removed by drop_first=True, i.e. the baseline level of every encoded
# ENHOGAR domestic-work predictor. The column selection is passed as a sorted list
# because get_dummies indexes the frame with it and pandas 2 rejects set indexers.
_enhogar_domestic_work_all_levels = pd.get_dummies(
    enhogar_domestic_work_pre_dummies, columns=sorted(categories_domestic_work))
_enhogar_domestic_work_kept_levels = pd.get_dummies(
    enhogar_domestic_work_pre_dummies, drop_first=True, columns=sorted(categories_domestic_work))
set(_enhogar_domestic_work_all_levels.columns) - set(_enhogar_domestic_work_kept_levels.columns)

## Drop columns

### Care drops

In [None]:
# Care columns that exist in the ENHOGAR training matrix but not in the ENCFT one.
enhogar_encft_care_diff = set(enhogar_care_train.columns).difference(encft_care_impute.columns)
enhogar_encft_care_diff

In [None]:
# Care columns that exist in the ENCFT matrix but not in the ENHOGAR one
# (the identifier columns are expected to differ and are left out).
encft_enhogar_care_diff = (
    set(encft_care_impute.columns)
    .difference(enhogar_care_train.columns)
    .difference(merge_cols)
)
encft_enhogar_care_diff

In [None]:
# ENHOGAR care design matrix restricted to the columns ENCFT also has.
enhogar_care_X = enhogar_care_train.drop(columns=list(enhogar_encft_care_diff))

In [None]:
cdisplay(enhogar_care_X)

In [None]:
# ENCFT care design matrix: drop the ENCFT-only columns, then order the columns as
# identifiers followed by the ENHOGAR feature order.
_encft_care_shared = encft_care_impute.drop(columns=list(encft_enhogar_care_diff))
_encft_care_column_order = merge_cols + list(enhogar_care_X.columns)
encft_care_X = _encft_care_shared[_encft_care_column_order]

In [None]:
cdisplay(encft_care_X)

### Domestic work drops

In [None]:
# Domestic-work columns that exist in the ENHOGAR training matrix but not in the ENCFT one.
enhogar_encft_domestic_work_diff = set(enhogar_domestic_work_train.columns).difference(
    encft_domestic_work_impute.columns)
enhogar_encft_domestic_work_diff

In [None]:
# Domestic-work columns that exist in the ENCFT matrix but not in the ENHOGAR one
# (the identifier columns are expected to differ and are left out).
encft_enhogar_domestic_work_diff = (
    set(encft_domestic_work_impute.columns)
    .difference(enhogar_domestic_work_train.columns)
    .difference(merge_cols)
)
encft_enhogar_domestic_work_diff

In [None]:
# ENHOGAR domestic-work design matrix restricted to the columns ENCFT also has.
enhogar_domestic_work_X = enhogar_domestic_work_train.drop(
    columns=list(enhogar_encft_domestic_work_diff))

In [None]:
cdisplay(enhogar_domestic_work_X)

In [None]:
# ENCFT domestic-work design matrix: drop the ENCFT-only columns, then order the columns
# as identifiers followed by the ENHOGAR feature order.
_encft_domestic_work_shared = encft_domestic_work_impute.drop(
    columns=list(encft_enhogar_domestic_work_diff))
_encft_domestic_work_column_order = merge_cols + list(enhogar_domestic_work_X.columns)
encft_domestic_work_X = _encft_domestic_work_shared[_encft_domestic_work_column_order]

In [None]:
cdisplay(encft_domestic_work_X)

### Checks

In [None]:
from criteriaetl.utils.display_func import rdisplay

In [None]:
# Per column: (mean / std) in ENCFT minus (mean / std) in ENHOGAR, shown as a one-column
# frame. Columns present in only one of the two matrices (e.g. the ENCFT identifier
# columns) have no counterpart to subtract and come out as NaN.
rdisplay((encft_domestic_work_X.mean()/encft_domestic_work_X.std() - enhogar_domestic_work_X.mean()/enhogar_domestic_work_X.std()).to_frame())

# Estimator

In [None]:
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report

from criteriaetl.impute.model_based import ScoreBasedStratifiedImputer

from projectetl.utils.estimator import get_estimator

In [None]:
# Name of the survey weight column. It is used below as the sample weight when
# fitting and validating the estimators and as `weight_col` of the imputers.
weight_col = 'factor_expansion'

## Tiempo dedicado a Cuidados

### Classification Estimator

In [None]:
# Target column of the care classifier.
# NOTE(review): the variable name is misspelled ("classifcation") consistently
# throughout the notebook; rename everywhere at once if it is ever corrected.
classifcation_care_target_col = 'dedico_tiempo_cuidados'

#### Train

In [None]:
# Classification target taken from the ENHOGAR frame; missing answers are
# treated as 0.
enhogar_care_y = enhogar_elected_encft_named[classifcation_care_target_col].fillna(0)
# Survey weights of the same ENHOGAR rows.
enhogar_care_weight = enhogar_elected_encft_named[weight_col]

In [None]:
# L1-penalised logistic regression for the care classifier; the seed is fixed
# so the liblinear fit is repeatable.
logistic_lasso_care = LogisticRegression(
    penalty='l1',
    solver='liblinear',
    random_state=6202,
)

In [None]:
# Grid-search settings handed to `get_estimator` as `grid_kwargs`: candidate
# values of the inverse regularisation strength C, scored by negative log-loss.
grid_logistic_lasso_care_kwargs = dict(
    param_grid={'C': np.array([.01, .1, 1, 10])},
    verbose=1,
    scoring='neg_log_loss',
)

In [None]:
# Hold out part of the ENHOGAR care features for validation. `random_state` is
# fixed (same seed as the estimator) so that the split, and therefore the
# fitted model and every downstream imputation, is reproducible on
# Restart & Run All.
X_care_train, X_care_test = train_test_split(enhogar_care_X, random_state=6202)
# Targets and weights are looked up by index label to stay aligned with X.
y_care_train, y_care_test = enhogar_care_y.loc[X_care_train.index], enhogar_care_y.loc[X_care_test.index]
weight_care_train, weight_care_test = enhogar_care_weight.loc[X_care_train.index], enhogar_care_weight.loc[X_care_test.index]

In [None]:
# Fit the care classifier on the training split through the project helper
# `get_estimator`, passing the grid settings and the survey weights.
# The helper returns a pair; only the first element (the fitted estimator, used
# below for `predict` / `predict_proba`) is kept.
# NOTE(review): presumably this wraps a weighted GridSearchCV — confirm in
# projectetl.utils.estimator.
estimator_logistic_lasso_care, _ = get_estimator(logistic_lasso_care,
                                                 X=X_care_train,
                                                 y=y_care_train,
                                                 grid_kwargs=grid_logistic_lasso_care_kwargs,
                                                 weight=weight_care_train)

#### Validate

In [None]:
# Weighted precision / recall / F1 of the care classifier on the held-out split.
predicted_logistic_lasso_care = estimator_logistic_lasso_care.predict(X_care_test)
print(
    classification_report(
        y_true=y_care_test,
        y_pred=predicted_logistic_lasso_care,
        sample_weight=weight_care_test,
    )
)

#### Impute with predict

In [None]:
# Working copy of the ENCFT care frame with a zero-filled target placeholder
# and the survey weights (aligned on the row index) appended.
encft_to_impute_care = encft_care_X.assign(**{
    classifcation_care_target_col: 0,
    weight_col: encft_member[weight_col],
})

In [None]:
# Overwrite the placeholder with the hard class predictions of the classifier,
# feeding it the ENCFT features in the column order used for training.
encft_to_impute_care[classifcation_care_target_col] = estimator_logistic_lasso_care.predict(X=encft_care_X[enhogar_care_X.columns])

In [None]:
# Unweighted sum of the predicted target per value of 'sexo'.
encft_to_impute_care.groupby('sexo')[classifcation_care_target_col].sum()

In [None]:
# Percentage table of the predicted target, computed by the project helper with
# the survey weight column.
get_percentage_table_with_expansion_factor(encft_to_impute_care, classifcation_care_target_col, weight_col)

#### Impute with `ScoreBasedStratifiedImputer`

In [None]:
# Column names handed to `ScoreBasedStratifiedImputer` for the care target.
classification_care_strata_cols = ['sexo']  # strata in which targets are defined
classification_care_score_col = 'score_col'  # filled below with predict_proba[:, 1]
classification_care_candidate = 'candidate_col'  # set to 1 for every row below

In [None]:
# Rebuild the frame for the stratified imputer: zero-filled target, survey
# weights, every row flagged as a candidate, and the classifier's probability
# of the positive class as the ranking score.
encft_to_impute_care = encft_care_X.assign(**{
    classifcation_care_target_col: 0,
    weight_col: encft_member[weight_col],
    classification_care_candidate: 1,
    classification_care_score_col: estimator_logistic_lasso_care.predict_proba(
        encft_care_X[enhogar_care_X.columns]
    )[:, 1],
})

In [None]:
# Weighted share of ENHOGAR respondents with target == 1 inside each stratum:
# sum the weights per (stratum, target) cell, normalise every row to 1 and keep
# the column of the positive class.
enhogar_care_sex_percent = (
    pd.crosstab(
        index=[enhogar_care_X[col] for col in classification_care_strata_cols],
        columns=enhogar_care_y,
        values=enhogar_care_weight,
        aggfunc='sum',
    )
    .pipe(lambda table: table.div(table.sum(axis=1), axis=0))[1]
)
# Weighted total to impute per stratum: the ENCFT weight total of the stratum
# scaled by the ENHOGAR share.
classification_care_imputation_target_srs = (
    encft_to_impute_care.groupby(classification_care_strata_cols)[weight_col].sum()
    .mul(enhogar_care_sex_percent)
)
display(classification_care_imputation_target_srs)

In [None]:
# Imputer configured with the care column names and the per-stratum weighted
# totals computed above.
classifcation_care_stratified_imputer = ScoreBasedStratifiedImputer(
    target_col=classifcation_care_target_col,
    candidate_col=classification_care_candidate,
    strata_cols=classification_care_strata_cols,
    score_col=classification_care_score_col,
    imputation_target_srs=classification_care_imputation_target_srs,
    weight_col=weight_col,
)

In [None]:
# Run the imputer; the cells below read its result from the column
# f'imputed_{classifcation_care_target_col}'.
encft_imputed_care = classifcation_care_stratified_imputer.fit_transform(encft_to_impute_care)

In [None]:
# Unweighted sum of the imputed target per value of 'sexo'.
encft_imputed_care.groupby('sexo')[f'imputed_{classifcation_care_target_col}'].sum()

In [None]:
# Percentage table of the imputed target, computed by the project helper with
# the survey weight column.
get_percentage_table_with_expansion_factor(encft_imputed_care, f'imputed_{classifcation_care_target_col}', weight_col)

In [None]:
# ENHOGAR reference shares, shown for comparison with the ENCFT shares computed
# in the next cell.
enhogar_care_sex_percent

In [None]:
# Weighted share of ENCFT rows imputed as 1 inside each stratum, to be compared
# with `enhogar_care_sex_percent`.
# Strata, imputed target and weights are all read from `encft_imputed_care`, so
# the three inputs come from the same rows by construction instead of relying
# on the imputer preserving the row labels of `encft_care_X`.
encft_care_sex_percent = pd.crosstab(index=[encft_imputed_care[col] for col in classification_care_strata_cols],
                                     columns=encft_imputed_care[f'imputed_{classifcation_care_target_col}'],
                                     values=encft_imputed_care[weight_col],
                                     aggfunc='sum').apply(lambda r: r/r.sum(), axis=1)[1.0]
encft_care_sex_percent

### Regression Estimator

In [None]:
# Target column of the care regression.
regression_care_target_col = 'tiempo_cuidados_horas_semanales'

#### Train

In [None]:
# Regression target taken from the ENHOGAR frame; missing values are treated as 0.
regression_care_enhogar_y = enhogar_elected_encft_named[regression_care_target_col].fillna(0)
# Survey weights of the same ENHOGAR rows.
regression_care_enhogar_weight = enhogar_elected_encft_named[weight_col]

In [None]:
# Lasso regressor for the care hours; the iteration cap is raised to 100000.
lasso_care = Lasso(max_iter=100000)

In [None]:
# Candidate regularisation strengths 10^0 .. 10^-4 for the Lasso grid search.
alphas = [10 ** -exponent for exponent in range(5)]
grid_lasso_care_kwargs = dict(
    param_grid={'alpha': np.array(alphas)},
    verbose=1,
)

In [None]:
# Hold out part of the ENHOGAR care features for the regression. `random_state`
# is fixed so that the split, the fitted Lasso and the imputed hours are
# reproducible on Restart & Run All.
regression_care_X_train, regression_care_X_test = train_test_split(enhogar_care_X, random_state=6202)
# Targets and weights are looked up by index label to stay aligned with X.
regression_care_y_train, regression_care_y_test = \
    regression_care_enhogar_y.loc[regression_care_X_train.index], regression_care_enhogar_y.loc[regression_care_X_test.index]
regression_care_weight_train, regression_care_weight_test = \
    regression_care_enhogar_weight.loc[regression_care_X_train.index], regression_care_enhogar_weight.loc[regression_care_X_test.index]

In [None]:
# Fit the care-hours regressor on the training split through the project helper
# `get_estimator`, passing the alpha grid and the survey weights. Only the first
# element of the returned pair (the fitted estimator) is kept.
# NOTE(review): no 'scoring' is given in the grid kwargs, so model selection
# uses whatever default the helper / estimator applies — confirm.
regression_care_estimator_grid_lasso, _ = get_estimator(lasso_care,
                                                        X=regression_care_X_train,
                                                        y=regression_care_y_train,
                                                        grid_kwargs=grid_lasso_care_kwargs,
                                                        weight=regression_care_weight_train)

#### Impute with predict

In [None]:
# Features of the ENCFT rows whose imputed care flag equals 1, restricted to
# the columns (and column order) used for training.
regression_care_encft_X = encft_imputed_care[encft_imputed_care[f'imputed_{classifcation_care_target_col}'] == 1][enhogar_care_X.columns]

In [None]:
# Write the predicted weekly hours only into the rows flagged as 1; the same
# mask as in the previous cell keeps predictions and rows in matching order.
# NOTE(review): rows not flagged are not assigned here, and a linear model can
# predict negative hours — confirm both are acceptable downstream.
encft_imputed_care.loc[encft_imputed_care[f'imputed_{classifcation_care_target_col}'] == 1, regression_care_target_col] = regression_care_estimator_grid_lasso.predict(regression_care_encft_X)

In [None]:
# Rows imputed as dedicating time to care. An explicit copy is taken because a
# new column ('wa') is added to this subset in the next cell; assigning into a
# plain boolean-filtered selection triggers pandas' SettingWithCopyWarning.
encft_only_care = encft_imputed_care[encft_imputed_care[f'imputed_{classifcation_care_target_col}'] == 1].copy()

In [None]:
# Weighted average of the imputed weekly care hours per sex: each row's weight
# is divided by the weight total of its 'sexo' group and multiplied by its
# hours, so summing 'wa' within a group gives that group's weighted mean.
encft_only_care['wa'] = (
    encft_only_care[weight_col]
    .div(encft_only_care.groupby('sexo')[weight_col].transform('sum'))
    .mul(encft_only_care[regression_care_target_col])
)
encft_only_care.groupby('sexo')['wa'].sum()

## Tiempo dedicado a Trabajo Doméstico

### Classification Estimator

In [None]:
# Target column of the domestic-work classifier.
classifcation_domestic_work_target_col = 'dedico_tiempo_trabajo_doméstico_no_remunerado'

#### Train

In [None]:
# Classification target taken from the ENHOGAR frame; missing answers are
# treated as 0.
enhogar_domestic_work_y = enhogar_elected_encft_named[classifcation_domestic_work_target_col].fillna(0)
# Survey weights of the same ENHOGAR rows.
enhogar_domestic_work_weight = enhogar_elected_encft_named[weight_col]

In [None]:
# L1-penalised logistic regression for the domestic-work classifier; the seed
# is fixed so the liblinear fit is repeatable.
logistic_lasso_domestic_work = LogisticRegression(
    penalty='l1',
    solver='liblinear',
    random_state=6202,
)

In [None]:
# Grid-search settings handed to `get_estimator` as `grid_kwargs`: candidate
# values of the inverse regularisation strength C, scored by negative log-loss.
grid_logistic_lasso_domestic_work_kwargs = dict(
    param_grid={'C': np.array([.01, .1, 1, 10])},
    verbose=1,
    scoring='neg_log_loss',
)

In [None]:
# Hold out part of the ENHOGAR domestic-work features for validation.
# `random_state` is fixed (same seed as the estimator) so that the split, the
# fitted model and every downstream imputation are reproducible on
# Restart & Run All.
X_domestic_work_train, X_domestic_work_test = train_test_split(enhogar_domestic_work_X, random_state=6202)
# Targets and weights are looked up by index label to stay aligned with X.
y_domestic_work_train, y_domestic_work_test = enhogar_domestic_work_y.loc[X_domestic_work_train.index], enhogar_domestic_work_y.loc[X_domestic_work_test.index]
weight_domestic_work_train, weight_domestic_work_test = enhogar_domestic_work_weight.loc[X_domestic_work_train.index], enhogar_domestic_work_weight.loc[X_domestic_work_test.index]

In [None]:
# Fit the domestic-work classifier on the training split through the project
# helper `get_estimator`, passing the grid settings and the survey weights.
# Only the first element of the returned pair (the fitted estimator, used below
# for `predict` / `predict_proba`) is kept.
estimator_logistic_lasso_domestic_work, _ = get_estimator(logistic_lasso_domestic_work,
                                                          X=X_domestic_work_train,
                                                          y=y_domestic_work_train,
                                                          grid_kwargs=grid_logistic_lasso_domestic_work_kwargs,
                                                          weight=weight_domestic_work_train)

#### Validate

In [None]:
# Weighted precision / recall / F1 of the domestic-work classifier on the
# held-out split.
predicted_logistic_lasso_domestic_work = estimator_logistic_lasso_domestic_work.predict(X_domestic_work_test)
print(
    classification_report(
        y_true=y_domestic_work_test,
        y_pred=predicted_logistic_lasso_domestic_work,
        sample_weight=weight_domestic_work_test,
    )
)

#### Impute with predict

In [None]:
# Working copy of the ENCFT domestic-work frame with a zero-filled target
# placeholder and the survey weights (aligned on the row index) appended.
encft_to_impute_domestic_work = encft_domestic_work_X.assign(**{
    classifcation_domestic_work_target_col: 0,
    weight_col: encft_member[weight_col],
})

In [None]:
# Overwrite the placeholder with the hard class predictions of the classifier,
# feeding it the ENCFT features in the column order used for training.
encft_to_impute_domestic_work[classifcation_domestic_work_target_col] = estimator_logistic_lasso_domestic_work.predict(X=encft_domestic_work_X[enhogar_domestic_work_X.columns])

In [None]:
# Unweighted sum of the predicted target per value of 'sexo'.
encft_to_impute_domestic_work.groupby('sexo')[classifcation_domestic_work_target_col].sum()

In [None]:
# Percentage table of the predicted target, computed by the project helper with
# the survey weight column.
get_percentage_table_with_expansion_factor(encft_to_impute_domestic_work, classifcation_domestic_work_target_col, weight_col)

#### Impute with `ScoreBasedStratifiedImputer`

In [None]:
# Column names handed to `ScoreBasedStratifiedImputer` for the domestic-work target.
classification_domestic_work_strata_cols = ['sexo']  # strata in which targets are defined
classification_domestic_work_score_col = 'score_col'  # filled below with predict_proba[:, 1]
classification_domestic_work_candidate = 'candidate_col'  # set to 1 for every row below

In [None]:
# Rebuild the frame for the stratified imputer: zero-filled target, survey
# weights, every row flagged as a candidate, and the classifier's probability
# of the positive class as the ranking score.
encft_to_impute_domestic_work = encft_domestic_work_X.assign(**{
    classifcation_domestic_work_target_col: 0,
    weight_col: encft_member[weight_col],
    classification_domestic_work_candidate: 1,
    classification_domestic_work_score_col: estimator_logistic_lasso_domestic_work.predict_proba(
        encft_domestic_work_X[enhogar_domestic_work_X.columns]
    )[:, 1],
})

In [None]:
# Weighted share of ENHOGAR respondents with target == 1 inside each stratum:
# sum the weights per (stratum, target) cell, normalise every row to 1 and keep
# the column of the positive class.
enhogar_domestic_work_sex_percent = (
    pd.crosstab(
        index=[enhogar_domestic_work_X[col] for col in classification_domestic_work_strata_cols],
        columns=enhogar_domestic_work_y,
        values=enhogar_domestic_work_weight,
        aggfunc='sum',
    )
    .pipe(lambda table: table.div(table.sum(axis=1), axis=0))[1]
)
# Weighted total to impute per stratum: the ENCFT weight total of the stratum
# scaled by the ENHOGAR share.
classification_domestic_work_imputation_target_srs = (
    encft_to_impute_domestic_work.groupby(classification_domestic_work_strata_cols)[weight_col].sum()
    .mul(enhogar_domestic_work_sex_percent)
)
display(classification_domestic_work_imputation_target_srs)

In [None]:
# Imputer configured with the domestic-work column names and the per-stratum
# weighted totals computed above.
classifcation_domestic_work_stratified_imputer = ScoreBasedStratifiedImputer(
    target_col=classifcation_domestic_work_target_col,
    candidate_col=classification_domestic_work_candidate,
    strata_cols=classification_domestic_work_strata_cols,
    score_col=classification_domestic_work_score_col,
    imputation_target_srs=classification_domestic_work_imputation_target_srs,
    weight_col=weight_col,
)

In [None]:
# Run the imputer; the cells below read its result from the column
# f'imputed_{classifcation_domestic_work_target_col}'.
encft_imputed_domestic_work = classifcation_domestic_work_stratified_imputer.fit_transform(encft_to_impute_domestic_work)

In [None]:
# Unweighted sum of the imputed target per value of 'sexo'.
encft_imputed_domestic_work.groupby('sexo')[f'imputed_{classifcation_domestic_work_target_col}'].sum()

In [None]:
# Percentage table of the imputed target, computed by the project helper with
# the survey weight column.
get_percentage_table_with_expansion_factor(encft_imputed_domestic_work, f'imputed_{classifcation_domestic_work_target_col}', weight_col)

In [None]:
# Weighted share of ENCFT rows imputed as 1 inside each stratum, to be compared
# with `enhogar_domestic_work_sex_percent`.
# Strata, imputed target and weights are all read from
# `encft_imputed_domestic_work`, so the three inputs come from the same rows by
# construction instead of relying on the imputer preserving the row labels of
# `encft_domestic_work_X`.
encft_domestic_work_sex_percent = pd.crosstab(index=[encft_imputed_domestic_work[col] for col in classification_domestic_work_strata_cols],
                                              columns=encft_imputed_domestic_work[f'imputed_{classifcation_domestic_work_target_col}'],
                                              values=encft_imputed_domestic_work[weight_col],
                                              aggfunc='sum').apply(lambda r: r/r.sum(), axis=1)[1.0]
encft_domestic_work_sex_percent

### Regression Estimator

In [None]:
# Target column of the domestic-work regression.
regression_domestic_work_target_col = 'tiempo_trabajo_doméstico_no_remunerado_horas_semanales'

#### Train

In [None]:
# Regression target taken from the ENHOGAR frame; missing values are treated as 0.
regression_domestic_work_enhogar_y = enhogar_elected_encft_named[regression_domestic_work_target_col].fillna(0)
# Survey weights of the same ENHOGAR rows.
regression_domestic_work_enhogar_weight = enhogar_elected_encft_named[weight_col]

In [None]:
# Lasso regressor for the domestic-work hours; the iteration cap is raised to 100000.
lasso_domestic_work = Lasso(max_iter=100000)

In [None]:
# Candidate regularisation strengths 10^0 .. 10^-4 for the Lasso grid search.
alphas = [10 ** -exponent for exponent in range(5)]
grid_lasso_domestic_work_kwargs = dict(
    param_grid={'alpha': np.array(alphas)},
    verbose=1,
)

In [None]:
# Hold out part of the ENHOGAR domestic-work features for the regression.
# `random_state` is fixed so that the split, the fitted Lasso and the imputed
# hours are reproducible on Restart & Run All.
regression_domestic_work_X_train, regression_domestic_work_X_test = train_test_split(enhogar_domestic_work_X, random_state=6202)
# Targets and weights are looked up by index label to stay aligned with X.
regression_domestic_work_y_train, regression_domestic_work_y_test = \
    regression_domestic_work_enhogar_y.loc[regression_domestic_work_X_train.index], \
    regression_domestic_work_enhogar_y.loc[regression_domestic_work_X_test.index]
regression_domestic_work_weight_train, regression_domestic_work_weight_test = \
    regression_domestic_work_enhogar_weight.loc[regression_domestic_work_X_train.index], \
    regression_domestic_work_enhogar_weight.loc[regression_domestic_work_X_test.index]

In [None]:
# Fit the domestic-work-hours regressor on the training split through the
# project helper `get_estimator`, passing the alpha grid and the survey weights.
# Only the first element of the returned pair is kept; the next cell reads its
# `best_estimator_` attribute.
regression_domestic_work_estimator_grid_lasso, _ = get_estimator(lasso_domestic_work,
                                                                 X=regression_domestic_work_X_train,
                                                                 y=regression_domestic_work_y_train,
                                                                 grid_kwargs=grid_lasso_domestic_work_kwargs,
                                                                 weight=regression_domestic_work_weight_train)

In [None]:
# Show the estimator selected by the grid search (including the chosen alpha).
regression_domestic_work_estimator_grid_lasso.best_estimator_

#### Impute with predict

In [None]:
# Features of the ENCFT rows whose imputed domestic-work flag equals 1,
# restricted to the columns (and column order) used for training.
regression_domestic_work_encft_X = encft_imputed_domestic_work[encft_imputed_domestic_work[f'imputed_{classifcation_domestic_work_target_col}'] == 1][enhogar_domestic_work_X.columns]

In [None]:
# Write the predicted weekly hours only into the rows flagged as 1; the same
# mask as in the previous cell keeps predictions and rows in matching order.
# NOTE(review): rows not flagged are not assigned here, and a linear model can
# predict negative hours — confirm both are acceptable downstream.
encft_imputed_domestic_work.loc[encft_imputed_domestic_work[f'imputed_{classifcation_domestic_work_target_col}'] == 1, regression_domestic_work_target_col] = \
    regression_domestic_work_estimator_grid_lasso.predict(regression_domestic_work_encft_X)

In [None]:
# Rows imputed as dedicating time to unpaid domestic work. An explicit copy is
# taken because a new column ('wa') is added to this subset in the next cell;
# assigning into a plain boolean-filtered selection triggers pandas'
# SettingWithCopyWarning.
encft_only_domestic_work = encft_imputed_domestic_work[encft_imputed_domestic_work[f'imputed_{classifcation_domestic_work_target_col}'] == 1].copy()

In [None]:
# Weighted average of the imputed weekly domestic-work hours per sex: each
# row's weight is divided by the weight total of its 'sexo' group and
# multiplied by its hours, so summing 'wa' within a group gives that group's
# weighted mean.
encft_only_domestic_work['wa'] = (
    encft_only_domestic_work[weight_col]
    .div(encft_only_domestic_work.groupby('sexo')[weight_col].transform('sum'))
    .mul(encft_only_domestic_work[regression_domestic_work_target_col])
)
encft_only_domestic_work.groupby('sexo')['wa'].sum()

# Merge imputations

In [None]:
# Each transformer joins one set of imputed columns (the imputed flag and the
# imputed weekly hours) onto its input using the merge keys.
merge_care_transformer = MergeTransformer(
    lambda: encft_imputed_care[
        [*merge_cols, f'imputed_{classifcation_care_target_col}', regression_care_target_col]
    ],
    merge_kwargs=dict(on=merge_cols, how='outer'),
)
merge_domestic_work_transformer = MergeTransformer(
    lambda: encft_imputed_domestic_work[
        [*merge_cols, f'imputed_{classifcation_domestic_work_target_col}', regression_domestic_work_target_col]
    ],
    merge_kwargs=dict(on=merge_cols, how='outer'),
)
# Apply both merges, care first, to the raw ENCFT member frame.
encft_member_imputed = make_pipeline(
    merge_care_transformer,
    merge_domestic_work_transformer,
).transform(encft_member_raw)

In [None]:
# Inspect the merged result with the project's `cdisplay` helper.
cdisplay(encft_member_imputed)

# Save imputed ENCFT

In [None]:
# Persist the ENCFT member frame with the imputed time-use columns under the
# ENCFT object directory via the project's pickle helper.
save_survey_with_pickle(encft_member_imputed, ENCFT_OBJECT_DIR / 'encft-2020-use-time.pkl')