# Preprocessing

In [118]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [119]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

In [120]:
# Load the raw data from the CSV file into the working DataFrame.
dataset = pd.read_csv(filepath_or_buffer='../data/data.csv')

In [121]:
# Print every column name next to its dtype, one column per line.
for col, dtype in zip(dataset.columns, dataset.dtypes):
    print(f"{col}: {dtype}")

pseudo_id: int64
today: object
interviewtype: object
monitor_gender: object
interview_province: object
interview_district: object
site_001: object
site_type: object
assessment_modality: object
are_you_headhh: object
respondent_gender: object
status: object
province: object
district: object
head_age_group: object
marital_status: object
widow_o_martyrdom_certi_cate: object
what_is_your_occupation_what_i: object
what_is_sizeyour_famil: object
male_0_6: float64
male_7_14: float64
male_15_17: float64
male_18_59: float64
male_60: float64
female_0_6: float64
female_7_14: float64
female_15_17: float64
female_18_59: float64
female_60: float64
have_children: object
have_children_001: object
hh_info_person_health: float64
hh_info_legal_needs: float64
hh_info_school_dropout: float64
hh_info_unable_work: float64
hh_info_child_armed_group: float64
hh_info_family_unity: float64
hh_info_drug_dependence: float64
hh_info_none: float64
hh_info_oth: float64
hh_info_002_person_health: float64
hh_info_002_l

In [122]:
# --- 1️⃣ Identify column types ---
# 'number' selects every numeric dtype (any integer/float width), not only
# int64 and float64.
numericas = dataset.select_dtypes(include='number').columns.tolist()
# Select 'category' as well as 'object': a later cell casts the text columns to
# the category dtype, so re-running this cell afterwards with only 'object'
# would silently produce an empty list.
categoricas = dataset.select_dtypes(include=['object', 'category']).columns.tolist()

In [123]:
# --- 2️⃣ Fill in missing values ---
# Numeric columns: replace missing entries with the column median.
num_imputer = SimpleImputer(strategy='median')
# With the median strategy SimpleImputer discards columns that contain only
# missing values, so its output would have fewer columns than `numericas` and
# the assignment back into `dataset` would raise. Impute only the columns that
# have at least one observed value and leave the all-missing ones untouched.
numericas_con_datos = [col for col in numericas if dataset[col].notna().any()]
# Skip the step entirely when nothing qualifies: the imputer rejects an empty
# column selection.
if numericas_con_datos:
    # NOTE: the imputer returns a float array, so integer columns come back as
    # float64 after this assignment.
    dataset[numericas_con_datos] = num_imputer.fit_transform(dataset[numericas_con_datos])

In [124]:
# Categorical columns: replace missing entries with the literal label 'Unknown'.
cat_imputer = SimpleImputer(fill_value='Unknown', strategy='constant')
categoricas_imputadas = cat_imputer.fit_transform(dataset[categoricas])
dataset[categoricas] = categoricas_imputadas

In [125]:
# --- 3️⃣ Cast the categorical columns to the pandas category dtype ---
# Only the dtype changes here; the values remain the original text labels.
for col in categoricas:
    # Cast to category if the column was not categorical already.
    categorical_col = dataset[col].astype('category')
    dataset[col] = categorical_col
    # One call emits the header, the category labels and a blank separator line.
    print(f"Columna: {col}", categorical_col.cat.categories, "", sep="\n")

Columna: today
Index(['2024/01/02 00:00:00', '2024/01/03 00:00:00', '2024/01/04 00:00:00',
       '2024/01/07 00:00:00', '2024/01/08 00:00:00', '2024/01/09 00:00:00',
       '2024/01/10 00:00:00', '2024/01/11 00:00:00', '2024/01/14 00:00:00',
       '2024/01/15 00:00:00',
       ...
       '2024/10/27 00:00:00', '2024/10/28 00:00:00', '2024/10/29 00:00:00',
       '2024/10/30 00:00:00', '2024/10/31 00:00:00', '2024/11/02 00:00:00',
       '2024/11/03 00:00:00', '2024/11/04 00:00:00', '2024/11/05 00:00:00',
       '2024/11/06 00:00:00'],
      dtype='object', length=249)

Columna: interviewtype
Index(['hh'], dtype='object')

Columna: monitor_gender
Index(['female', 'male'], dtype='object')

Columna: interview_province
Index(['BDG', 'BDK', 'BGN', 'BLK', 'BMY', 'DKY', 'FRH', 'FYB', 'GHR', 'GZN',
       'HLD', 'HRT', 'JWZ', 'KBL', 'KDH', 'KDZ', 'KNR', 'KPS', 'KST', 'LGN',
       'LGR', 'NGR', 'NMR', 'NRT', 'PKT', 'PNR', 'PTY', 'SMN', 'SPL', 'TKH',
       'URZ', 'Unknown', 'WDK', 'ZBL'],
  

In [126]:
# --- 4️⃣ Scale numeric columns (currently disabled: the lines below are commented out) ---
# scaler = StandardScaler()
# dataset[numericas] = scaler.fit_transform(dataset[numericas])

In [129]:
# Show the category labels of every categorical column once more before exporting.
for col in categoricas:
    # The cast keeps this cell working even if the columns are still plain objects.
    etiquetas = dataset[col].astype('category').cat.categories
    print(f"Columna: {col}", etiquetas, "", sep="\n")

# Write the preprocessed frame to disk without the index column.
dataset.to_csv(path_or_buf='../data/preprocessed_data.csv', index=False)

Columna: today
Index(['2024/01/02 00:00:00', '2024/01/03 00:00:00', '2024/01/04 00:00:00',
       '2024/01/07 00:00:00', '2024/01/08 00:00:00', '2024/01/09 00:00:00',
       '2024/01/10 00:00:00', '2024/01/11 00:00:00', '2024/01/14 00:00:00',
       '2024/01/15 00:00:00',
       ...
       '2024/10/27 00:00:00', '2024/10/28 00:00:00', '2024/10/29 00:00:00',
       '2024/10/30 00:00:00', '2024/10/31 00:00:00', '2024/11/02 00:00:00',
       '2024/11/03 00:00:00', '2024/11/04 00:00:00', '2024/11/05 00:00:00',
       '2024/11/06 00:00:00'],
      dtype='object', length=249)

Columna: interviewtype
Index(['hh'], dtype='object')

Columna: monitor_gender
Index(['female', 'male'], dtype='object')

Columna: interview_province
Index(['BDG', 'BDK', 'BGN', 'BLK', 'BMY', 'DKY', 'FRH', 'FYB', 'GHR', 'GZN',
       'HLD', 'HRT', 'JWZ', 'KBL', 'KDH', 'KDZ', 'KNR', 'KPS', 'KST', 'LGN',
       'LGR', 'NGR', 'NMR', 'NRT', 'PKT', 'PNR', 'PTY', 'SMN', 'SPL', 'TKH',
       'URZ', 'Unknown', 'WDK', 'ZBL'],
  