In [94]:
import polars as pl

"""
Importing the datasets we will merge
"""
df_concentradohogar = pl.read_csv('selected_concentradohogar.csv')
df_viviendas = pl.read_csv('viviendas.csv',null_values=["&", ""])
df_clima = pl.read_csv('hogares_vulnerables_climaticos.csv')
df_gastohogar = pl.read_csv('gastoshogar.csv')#

# He retirado este conjunto de datos porque su información se utilizó para definir los hogares vulnerables al clima.
# df_hogares = pl.read_csv('selected_hogares_cleaned.csv')

In [95]:
"""
Processing "gastoshogar.csv" for merge with the rest csv files.
First, we have to select the most important fields
"""
df_gastohogar = df_gastohogar.select([
    'folioviv',
    'clave',
    'tipo_gasto',
    #'mes_dia',
    #'lugar_comp',
    #'orga_inst',
    #'frecuencia',
    #'fecha_adqu',
    #'fecha_pago',
    'gasto',
    #'costo',
    #'gasto_tri'
])

In [96]:
"""
Second, we filter this DF by the selected values:
1. clave: 043111 & 043201
2. tipo_gasto: G1
"""

df_filtrado = df_gastohogar.filter(
    pl.col('clave').is_in(['043111', '043201'])&
    pl.col('tipo_gasto').is_in(['G1'])
)
df_filtrado

folioviv,clave,tipo_gasto,gasto
i64,str,str,str
100001905,"""043111""","""G1""","""750"""
100003701,"""043201""","""G1""","""600"""
100003702,"""043111""","""G1""","""2500"""
100003702,"""043201""","""G1""","""1000"""
100003706,"""043201""","""G1""","""800"""
…,…,…,…
3260592412,"""043111""","""G1""","""250"""
3260592618,"""043111""","""G1""","""180"""
3260593508,"""043111""","""G1""","""40000"""
3260593512,"""043111""","""G1""","""2250"""


In [97]:
"""
Replace 'tipo_gasto' values. In this case, we only have to replace one value 'G1'.
"""

df_filtrado = df_filtrado.with_columns(
    pl.col('tipo_gasto').replace(
        'G1', 1
    )
)

In [98]:
df_filtrado = df_filtrado.with_columns(
    pl.col(pl.Utf8)
        .replace({"": None, " ": None})
)

In [99]:
"""
Change dtypes from str to int
"""
df_filtrado = df_filtrado.with_columns(pl.col(pl.Utf8).cast(pl.Float64))
df_filtrado.describe()

statistic,folioviv,clave,tipo_gasto,gasto
str,f64,f64,f64,f64
"""count""",9067.0,9067.0,9067.0,9067.0
"""null_count""",0.0,0.0,0.0,0.0
"""mean""",1679200000.0,43141.403662,1.0,2761.575935
"""std""",942330000.0,42.569317,0.0,6440.168261
"""min""",100001905.0,43111.0,1.0,0.0
"""25%""",863194716.0,43111.0,1.0,500.0
"""50%""",1665600000.0,43111.0,1.0,1200.0
"""75%""",2508300000.0,43201.0,1.0,2900.0
"""max""",3260600000.0,43201.0,1.0,230000.0


In [100]:
df_viviendas = df_viviendas.select([
      'folioviv', #(PRIMARY KEY)
      'ubica_geo', #(FOREIGN KEY)
      'agua_ent',
      'agua_noe',
      'dotac_agua',
      'ab_agua',
      'mat_pisos',
      'mat_pared',
      'mat_techos',
      'drenaje',
      'excusado',
      'uso_compar',
      'sanit_agua',
      'p_grietas',
      'p_pandeos',
      'p_levanta',
      'p_humedad',
      'p_fractura',
      'p_electric',
      'p_tuberias'
])

In [101]:
df_concentradohogar = df_concentradohogar.select([
      'folioviv',
      'ubica_geo',
      'ing_cor'
])


In [102]:
"""
Merging datasets by "folioviv" field
"""
df_merged = (
    df_concentradohogar
    #.join(df_hogares, on="folioviv", how="left")
    .join(df_viviendas, on="folioviv", how="left")
    .join(df_clima, on='folioviv', how='left')
    .join(df_filtrado, on='folioviv', how='left')
)

In [103]:
df_merged.describe()

statistic,folioviv,ubica_geo,ing_cor,ubica_geo_right,agua_ent,agua_noe,dotac_agua,ab_agua,mat_pisos,mat_pared,mat_techos,drenaje,excusado,uso_compar,sanit_agua,p_grietas,p_pandeos,p_levanta,p_humedad,p_fractura,p_electric,p_tuberias,camb_clim,climate_vulnerability_intensity,clave,tipo_gasto,gasto
str,f64,f64,f64,f64,f64,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""",95465.0,95465.0,95465.0,95465.0,95465.0,"""4351""",91114.0,91114.0,95464.0,95465.0,95465.0,95465.0,95465.0,93925.0,93925.0,95465.0,95465.0,95465.0,95465.0,95465.0,95465.0,95465.0,95465.0,95465.0,9590.0,9590.0,9590.0
"""null_count""",0.0,0.0,0.0,0.0,0.0,"""91114""",4351.0,4351.0,1.0,0.0,0.0,0.0,0.0,1540.0,1540.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,85875.0,85875.0,85875.0
"""mean""",1616900000.0,15921.873922,72180.080779,15921.873922,1.281276,,1.619257,1.317372,2.456088,7.783387,8.398533,1.505662,1.08732,1.944743,1.346542,1.613921,1.878175,1.88158,1.564186,1.930886,1.946745,1.97949,0.169738,1.367234,43141.406674,1.0,2712.514494
"""std""",932280000.0,9316.761232,93190.224047,9316.761232,0.541587,,1.073918,0.924306,0.553629,0.732184,2.789666,0.928413,0.334603,0.228482,0.58431,0.590819,0.525288,0.502802,0.596304,0.540317,0.514326,0.524656,0.375404,3.025489,42.570222,0.0,6320.540968
"""min""",100001901.0,1001.0,0.0,1001.0,1.0,"""1""",1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,43111.0,1.0,0.0
"""25%""",802841804.0,8019.0,34389.0,8019.0,1.0,,1.0,1.0,2.0,8.0,7.0,1.0,1.0,2.0,1.0,1.0,2.0,2.0,1.0,2.0,2.0,2.0,0.0,0.0,43111.0,1.0,500.0
"""50%""",1565400000.0,15122.0,55493.88,15122.0,1.0,,1.0,1.0,2.0,8.0,10.0,1.0,1.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0,0.0,43111.0,1.0,1150.0
"""75%""",2408000000.0,24028.0,88698.04,24028.0,1.0,,2.0,1.0,3.0,8.0,10.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0,0.0,43201.0,1.0,2800.0
"""max""",3260600000.0,32056.0,17432000.0,32056.0,3.0,"""6""",5.0,7.0,3.0,8.0,10.0,5.0,3.0,2.0,3.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,1.0,12.0,43201.0,1.0,230000.0


In [104]:
"""
Taking string columns
"""
cols_str = df_merged.select(pl.col(pl.Utf8)).columns
cols_str

['agua_noe']

In [105]:
"""
Verify whether the column contains decimals
"""
for col in cols_str:
    print(df_merged.select(pl.col(col).value_counts()))

shape: (7, 1)
┌──────────────┐
│ agua_noe     │
│ ---          │
│ struct[2]    │
╞══════════════╡
│ {"3",463}    │
│ {"5",1570}   │
│ {"6",256}    │
│ {"2",164}    │
│ {"1",1497}   │
│ {"4",401}    │
│ {null,91114} │
└──────────────┘


In [106]:
"""
Change dtypes from str to int
"""
df_merged = df_merged.with_columns(pl.col(pl.Utf8).cast(pl.Float64))
df_merged.describe()

statistic,folioviv,ubica_geo,ing_cor,ubica_geo_right,agua_ent,agua_noe,dotac_agua,ab_agua,mat_pisos,mat_pared,mat_techos,drenaje,excusado,uso_compar,sanit_agua,p_grietas,p_pandeos,p_levanta,p_humedad,p_fractura,p_electric,p_tuberias,camb_clim,climate_vulnerability_intensity,clave,tipo_gasto,gasto
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""",95465.0,95465.0,95465.0,95465.0,95465.0,4351.0,91114.0,91114.0,95464.0,95465.0,95465.0,95465.0,95465.0,93925.0,93925.0,95465.0,95465.0,95465.0,95465.0,95465.0,95465.0,95465.0,95465.0,95465.0,9590.0,9590.0,9590.0
"""null_count""",0.0,0.0,0.0,0.0,0.0,91114.0,4351.0,4351.0,1.0,0.0,0.0,0.0,0.0,1540.0,1540.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,85875.0,85875.0,85875.0
"""mean""",1616900000.0,15921.873922,72180.080779,15921.873922,1.281276,3.264537,1.619257,1.317372,2.456088,7.783387,8.398533,1.505662,1.08732,1.944743,1.346542,1.613921,1.878175,1.88158,1.564186,1.930886,1.946745,1.97949,0.169738,1.367234,43141.406674,1.0,2712.514494
"""std""",932280000.0,9316.761232,93190.224047,9316.761232,0.541587,1.846557,1.073918,0.924306,0.553629,0.732184,2.789666,0.928413,0.334603,0.228482,0.58431,0.590819,0.525288,0.502802,0.596304,0.540317,0.514326,0.524656,0.375404,3.025489,42.570222,0.0,6320.540968
"""min""",100001901.0,1001.0,0.0,1001.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,43111.0,1.0,0.0
"""25%""",802841804.0,8019.0,34389.0,8019.0,1.0,1.0,1.0,1.0,2.0,8.0,7.0,1.0,1.0,2.0,1.0,1.0,2.0,2.0,1.0,2.0,2.0,2.0,0.0,0.0,43111.0,1.0,500.0
"""50%""",1565400000.0,15122.0,55493.88,15122.0,1.0,4.0,1.0,1.0,2.0,8.0,10.0,1.0,1.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0,0.0,43111.0,1.0,1150.0
"""75%""",2408000000.0,24028.0,88698.04,24028.0,1.0,5.0,2.0,1.0,3.0,8.0,10.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0,0.0,43201.0,1.0,2800.0
"""max""",3260600000.0,32056.0,17432000.0,32056.0,3.0,6.0,5.0,7.0,3.0,8.0,10.0,5.0,3.0,2.0,3.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,1.0,12.0,43201.0,1.0,230000.0


In [107]:
print(df_merged.select(pl.col('climate_vulnerability_intensity').value_counts()))

shape: (6, 1)
┌─────────────────────────────────┐
│ climate_vulnerability_intensit… │
│ ---                             │
│ struct[2]                       │
╞═════════════════════════════════╡
│ {10,33}                         │
│ {11,2}                          │
│ {0,79261}                       │
│ {9,815}                         │
│ {8,15353}                       │
│ {12,1}                          │
└─────────────────────────────────┘


In [108]:
df_merged.columns

['folioviv',
 'ubica_geo',
 'ing_cor',
 'ubica_geo_right',
 'agua_ent',
 'agua_noe',
 'dotac_agua',
 'ab_agua',
 'mat_pisos',
 'mat_pared',
 'mat_techos',
 'drenaje',
 'excusado',
 'uso_compar',
 'sanit_agua',
 'p_grietas',
 'p_pandeos',
 'p_levanta',
 'p_humedad',
 'p_fractura',
 'p_electric',
 'p_tuberias',
 'camb_clim',
 'climate_vulnerability_intensity',
 'clave',
 'tipo_gasto',
 'gasto']

In [90]:
df_merged['dotac_agua'].value_counts()

dotac_agua,count
i64,u32
3.0,5577
,4351
1.0,60582
4.0,4406
2.0,16715
5.0,3834


In [109]:
"""
Creating new columns:
1. saneamiento_deficiente
    - excusado = 3
    - uso_compar = 3
    - drenaje = 3 & drenaje = 5
    - agua_ent = 3 & drenaje = 4
2. agua_intermitente
    - dotac_agua != 1
3. gasto_adaptacion = gasto/ing_cor
"""

# 1. saneamiento_deficiente
df_merged = df_merged.with_columns(
    pl.when(
        (pl.col('excusado') == 3) |
        (pl.col('uso_compar') == 3)|
        (pl.col('drenaje').is_in([3, 5]))|
        (
            (pl.col('drenaje') == 4) &
            (pl.col('agua_ent') == 3)
        )
    )
    .then(1)
    .otherwise(0)
    .alias('saneamiento_deficiente')
)

# 2. agua_intermitente
df_merged = df_merged.with_columns(
    pl.when(pl.col('dotac_agua') != 1)
    .then(1)
    .otherwise(0)
    .alias('agua_intermitente')
)

# 3. gasto_adaptacion
df_merged = df_merged.with_columns(
    (pl.col('gasto') / pl.col('ing_cor')).alias('gasto_adaptacion')
)

In [110]:
df_merged['saneamiento_deficiente'].value_counts()

saneamiento_deficiente,count
i32,u32
0,89723
1,5742


In [111]:
df_merged['agua_intermitente'].value_counts()

agua_intermitente,count
i32,u32
1,30532
0,64933


In [112]:
df_merged['gasto_adaptacion'].describe()

statistic,value
str,f64
"""count""",9590.0
"""null_count""",85875.0
"""mean""",0.040854
"""std""",0.10697
"""min""",0.0
"""25%""",0.006753
"""50%""",0.015624
"""75%""",0.037769
"""max""",3.296839


In [113]:
"""
Shape for each file
"""
dfs = [df_clima, df_concentradohogar, df_filtrado, df_hogares, df_viviendas, df_merged]
for df in dfs:
    print(f'Shape: {df.shape}')

Shape: (91414, 3)
Shape: (91414, 3)
Shape: (9067, 4)
Shape: (91414, 22)
Shape: (90324, 20)
Shape: (95465, 30)


In [115]:
df_merged.write_csv('merged_dataset_enunciado.csv')