In [None]:
from pathlib import Path

In [None]:
from itables import init_notebook_mode

import pandas as pd

In [None]:
# Turn on itables' notebook mode with all_interactive=True.
# NOTE(review): presumably this makes every DataFrame displayed below render as
# an interactive table — confirm against the itables documentation.
init_notebook_mode(all_interactive=True)

In [None]:
# Datasets live in a `datasets` folder that is a sibling of the working directory.
dataset_path = Path.cwd().parent.joinpath('datasets')

In [None]:
# Location of the raw survey file (.sav) inside the datasets folder.
survey_filepath = dataset_path.joinpath('raw', 'enaho/2023/data.sav')

# Reading Dataset

In [None]:
# Load the raw survey file into a DataFrame.
df = pd.read_spss(path=survey_filepath)

In [None]:
# Print a summary of the freshly loaded frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Preview the first rows of the raw survey data.
df.head()

# Renaming Variables

In [None]:
# Collect every character used in a column name that falls outside [a-zA-Z0-9_].
special_chars = {
    char
    for matches in df.columns.str.findall('[^a-zA-Z0-9_]')
    for char in matches
}
print(f'Special characters in variables: {special_chars}')

In [None]:
# Sanitize the column names with literal (non-regex) replacements:
# '$' becomes '_' and 'Ñ' becomes 'N'.
df.columns = (
    df.columns
    .str.replace('$', '_', regex=False)
    .str.replace('Ñ', 'N', regex=False)
)

In [None]:
# Preview the frame again to check the sanitized column names.
df.head()

# Variable Selection

In [None]:
# List the available columns before choosing which ones to keep.
df.info()

In [None]:
# Columns that identify and locate a household. The trailing comments give the
# English names these columns are renamed to later in the notebook.
household_info_vars = [
    'ANO',       # year
    'MES',       # month
    'CONGLOME',  # conglomerate
    'VIVIENDA',  # house
    'HOGAR',     # household
    'LONGITUD',  # longitude
    'LATITUD',   # latitude
]

In [None]:
# Household characteristic variables, given as survey question codes.
# NOTE(review): this list used to contain 'P110' twice; the second occurrence,
# between 'P110C3' and 'P110E', was possibly meant to be 'P110D' — confirm
# against the survey dictionary. Only the duplicate is removed here, because a
# repeated label produces a duplicated column when the list is used to select
# columns from a DataFrame.
household_char_vars = [
    'P24A', 'P24B',
    'P25_1', 'P25_2', 'P25_3', 'P25_4', 'P25_5',
    'P101', 'P102', 'P103', 'P103A',
    'P104', 'P104A', 'P104B1', 'P104B2',
    'P110', 'P110A1', 'P110A', 'P110A_MODIFICADA',
    'P110C', 'P110C1', 'P110C2', 'P110C3', 'P110E',
    'P111A', 'P1121', 'P1123', 'P1124', 'P1125', 'P1126', 'P1127', 'P112A',
    'P1141', 'P1142', 'P1143', 'P1144', 'P114B1', 'P114B2', 'P114B3', 'P1145',
]

In [None]:
# Keep only the household identifiers plus the NBI1 column (renamed to
# `adequate` further down). The list is deliberately not called `vars`: a global
# with that name would shadow the builtin `vars()` in every later cell.
selected_vars = household_info_vars + ['NBI1']
df = df[selected_vars]

df.head()

# Refinement

## Rename

In [None]:
# Assign English column names positionally; the order must match the column
# order produced by the variable selection (identifiers first, NBI1 last).
df.columns = [
    'year',
    'month',
    'conglomerate',
    'house',
    'household',
    'longitude',
    'latitude',
    'adequate',
]

df.head()

## Remove Null Values

In [None]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis='index', how='any')

In [None]:
# Check the row count and non-null counts of the current frame.
df.info()

## Transform Data

In [None]:
# Inspect the current dtypes before converting them.
df.dtypes

In [None]:
# Translate the survey labels into booleans. Series.map yields NaN for every
# value missing from the mapping, and casting NaN to bool gives True, so an
# unexpected label (or a second run of this cell, when the column already holds
# booleans) would silently mark rows as adequate. Reject unmapped values instead.
adequacy_labels = {'Vivienda adecuada': True, 'Vivienda inadecuada': False}
adequate_flags = df['adequate'].map(adequacy_labels)

unmapped = df.loc[adequate_flags.isna(), 'adequate'].unique().tolist()
if unmapped:
    raise ValueError(f"Unexpected values in 'adequate': {unmapped}")

# Replace the label column with the boolean flags and fix the numeric dtypes.
df = df.assign(adequate=adequate_flags).astype(
    {'year': 'int', 'month': 'int', 'adequate': 'bool'}
)

In [None]:
# Show the dtypes again to verify the conversion.
df.dtypes

# Aggregate Values

In [None]:
# Collapse the frame to one row per (year, month, conglomerate, longitude,
# latitude). A group is flagged adequate only when every one of its rows is
# adequate; the `house` and `household` columns are not carried over.
df = (
    df
    .groupby(['year', 'month', 'conglomerate', 'longitude', 'latitude'])['adequate']
    .all()
    .reset_index(name='adequate')
)

In [None]:
# Preview the last rows of the current frame.
df.tail()

# Export

In [None]:
# Summarize the frame one last time before it is written to disk.
df.info()

In [None]:
# Write the frame as a pickle under datasets/clean. to_pickle does not create
# missing parent directories, so make sure the target folder exists first.
output_path = dataset_path / 'clean' / 'conglomerate.pkl'
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_pickle(output_path)