In [32]:
import polars as pl

"""
Importing the datasets we will merge
"""
df_concentradohogar = pl.read_csv('selected_concentradohogar.csv')
df_hogares = pl.read_csv('selected_hogares_cleaned.csv')
df_viviendas = pl.read_csv('viviendas.csv',null_values=["&", ""])
df_clima = pl.read_csv('hogares_vulnerables_climaticos.csv')
df_gastohogar = pl.read_csv('gastoshogar.csv')

In [33]:
"""
Processing "gastoshogar.csv" for merge with the rest csv files. 
First, we have to select the most important fields
"""
df_gastohogar = df_gastohogar.select([
    'folioviv',
    'clave',
    'tipo_gasto',
    'mes_dia',
    'lugar_comp',
    'orga_inst',
    'frecuencia',
    'fecha_adqu',
    'fecha_pago',
    'gasto',
    'costo',
    'gasto_tri'   
])

In [34]:
"""
Second, we filter this DF by the selected values:
1. clave: 043111 & 043201
2. tipo_gasto: G1
"""

df_filtrado = df_gastohogar.filter(
    pl.col('clave').is_in(['043111', '043201'])&
    pl.col('tipo_gasto').is_in(['G1'])
)
df_filtrado

folioviv,clave,tipo_gasto,mes_dia,lugar_comp,orga_inst,frecuencia,fecha_adqu,fecha_pago,gasto,costo,gasto_tri
i64,str,str,i64,i64,i64,i64,i64,i64,str,str,str
100001905,"""043111""","""G1""",0,5,0,0,0,0,"""750""",""" ""","""366.84"""
100003701,"""043201""","""G1""",0,17,0,0,0,0,"""600""",""" ""","""295.08"""
100003702,"""043111""","""G1""",0,5,0,0,0,0,"""2500""",""" ""","""1229.5"""
100003702,"""043201""","""G1""",0,17,0,0,0,0,"""1000""",""" ""","""491.8"""
100003706,"""043201""","""G1""",0,17,0,0,0,0,"""800""",""" ""","""393.44"""
…,…,…,…,…,…,…,…,…,…,…,…
3260592412,"""043111""","""G1""",0,17,0,0,0,0,"""250""",""" ""","""122.28"""
3260592618,"""043111""","""G1""",0,5,0,0,0,0,"""180""",""" ""","""88.52"""
3260593508,"""043111""","""G1""",0,5,0,0,0,0,"""40000""",""" ""","""19565.21"""
3260593512,"""043111""","""G1""",0,5,0,0,0,0,"""2250""",""" ""","""1100.54"""


In [35]:
"""
Replace 'tipo_gasto' values. In this case, we only have to replace one value 'G1'.
"""

df_filtrado = df_filtrado.with_columns(
    pl.col('tipo_gasto').replace(
        'G1', 1
    )
)

In [36]:
df_filtrado = df_filtrado.with_columns(
    pl.col(pl.Utf8)
        .replace({"": None, " ": None})
)

In [37]:
"""
Change dtypes from str to int
"""
df_filtrado = df_filtrado.with_columns(pl.col(pl.Utf8).cast(pl.Float64))
df_filtrado.describe()

statistic,folioviv,clave,tipo_gasto,mes_dia,lugar_comp,orga_inst,frecuencia,fecha_adqu,fecha_pago,gasto,costo,gasto_tri
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""",9067.0,9067.0,9067.0,9067.0,9067.0,9067.0,9067.0,9067.0,9067.0,9067.0,19.0,9067.0
"""null_count""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9048.0,0.0
"""mean""",1679200000.0,43141.403662,1.0,0.0,9.404434,0.0,0.0,5.033528,5.034741,2761.575935,3962.105263,1354.7974
"""std""",942330000.0,42.569317,0.0,0.0,5.777169,0.0,0.0,109.853564,109.880011,6440.168261,3955.020128,3162.220669
"""min""",100001905.0,43111.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,300.0,0.0
"""25%""",863194716.0,43111.0,1.0,0.0,5.0,0.0,0.0,0.0,0.0,500.0,2490.0,244.56
"""50%""",1665600000.0,43111.0,1.0,0.0,5.0,0.0,0.0,0.0,0.0,1200.0,3000.0,586.95
"""75%""",2508300000.0,43201.0,1.0,0.0,17.0,0.0,0.0,0.0,0.0,2900.0,5000.0,1428.26
"""max""",3260600000.0,43201.0,1.0,0.0,18.0,0.0,0.0,2409.0,2411.0,230000.0,17500.0,113736.26


In [38]:
df_viviendas = df_viviendas.select([
    'folioviv',
    'tot_resid',
    'mat_pared',
    'mat_techos',
    'ab_agua',
    'agua_noe',
    'dotac_agua',
    'agua_ent',
    'sanit_agua',
    'drenaje',
    'uso_compar',
    'excusado',
    
])

In [39]:
"""
Merging datasets by "folioviv" field
"""
df_merged = (
    df_concentradohogar
    .join(df_hogares, on="folioviv", how="inner")
    .join(df_viviendas, on="folioviv", how="inner")
    .join(df_clima, on='folioviv', how='inner')
    .join(df_filtrado, on='folioviv', how='inner')
)

In [40]:
df_merged.describe()

statistic,folioviv,foliohog,ing_cor,gasto_mon,ubica_geo,tam_loc,acc_alim2,acc_alim7,acc_alim9,celular,tsalud1_h,tsalud1_m,camb_clim,f_sequia,f_inunda,f_helada,f_incendio,f_huracan,f_desliza,f_otro,af_viv,af_empleo,af_negocio,af_cultivo,af_trabajo,af_salud,af_otro,tot_resid,mat_pared,mat_techos,ab_agua,agua_noe,dotac_agua,agua_ent,sanit_agua,drenaje,uso_compar,excusado,camb_clim_right,climate_vulnerability_intensity,clave,tipo_gasto,mes_dia,lugar_comp,orga_inst,frecuencia,fecha_adqu,fecha_pago,gasto,costo,gasto_tri
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str,str,str,str,str,str,f64,f64,f64,f64,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""",10434.0,10434.0,10434.0,10434.0,10434.0,10434.0,10434.0,3122.0,3122.0,10434.0,10434.0,10434.0,10434.0,10434.0,10434.0,10434.0,10434.0,10434.0,10434.0,10434.0,"""2049""","""2049""","""2049""","""2049""","""2049""","""2049""","""2049""",10434.0,10434.0,10434.0,10123.0,"""311""",10123.0,10434.0,10353.0,10434.0,10353.0,10434.0,10434.0,10434.0,10434.0,10434.0,10434.0,10434.0,10434.0,10434.0,10434.0,10434.0,10434.0,19.0,10434.0
"""null_count""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,7312.0,7312.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"""8385""","""8385""","""8385""","""8385""","""8385""","""8385""","""8385""",0.0,0.0,0.0,311.0,"""10123""",311.0,0.0,81.0,0.0,81.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10415.0,0.0
"""mean""",1683100000.0,1.098141,89488.030284,56460.764307,16638.893329,2.259344,0.057984,0.264894,0.060218,0.965497,0.32787,18.288576,0.196377,0.093349,0.047345,0.006613,0.0023,0.045333,0.0023,0.009584,,,,,,,,3.879337,7.856527,8.840521,1.229082,,1.655339,1.184014,1.224379,1.366686,1.950449,1.042841,0.196377,1.581464,43141.552041,1.0,0.0,9.447192,0.0,0.0,4.374066,4.37512,2632.95668,3962.105263,1291.691926
"""std""",937300000.0,0.350747,96360.273764,51869.833095,9366.366619,1.299353,0.233723,0.441348,0.237928,0.182525,0.979878,13.1693,0.397276,0.290934,0.212386,0.081055,0.047907,0.208042,0.047907,0.097433,,,,,,,,2.329493,0.571724,2.470387,0.768229,,1.123498,0.458024,0.474806,0.717239,0.217026,0.237775,0.397276,3.200978,42.619603,0.0,0.0,5.787951,0.0,0.0,102.418199,102.442856,6138.013666,3955.020128,3013.586235
"""min""",100001905.0,1.0,2967.03,97.82,1001.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"""0.0""","""0.0""","""0.0""","""0.0""","""0.0""","""0.0""","""0.0""",1.0,1.0,1.0,1.0,"""1""",1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,43111.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,300.0,0.0
"""25%""",903263404.0,1.0,44266.28,28263.41,9004.0,1.0,0.0,0.0,0.0,1.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,2.0,8.0,10.0,1.0,,1.0,1.0,1.0,1.0,2.0,1.0,0.0,0.0,43111.0,1.0,0.0,5.0,0.0,0.0,0.0,0.0,460.0,2490.0,226.22
"""50%""",1665600000.0,1.0,69098.35,43694.36,16102.0,2.0,0.0,0.0,0.0,1.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,3.0,8.0,10.0,1.0,,1.0,1.0,1.0,1.0,2.0,1.0,0.0,0.0,43111.0,1.0,0.0,5.0,0.0,0.0,0.0,0.0,1071.0,3000.0,525.81
"""75%""",2508300000.0,1.0,108654.55,66905.98,25006.0,4.0,0.0,1.0,0.0,1.0,0.0,30.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,5.0,8.0,10.0,1.0,,2.0,1.0,1.0,2.0,2.0,1.0,0.0,0.0,43201.0,1.0,0.0,17.0,0.0,0.0,0.0,0.0,2600.0,5000.0,1271.73
"""max""",3260600000.0,3.0,4753000.0,1635900.0,32056.0,4.0,1.0,1.0,1.0,1.0,40.0,55.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,"""1.0""","""1.0""","""1.0""","""1.0""","""1.0""","""1.0""","""1.0""",16.0,8.0,10.0,7.0,"""6""",5.0,3.0,3.0,5.0,2.0,3.0,1.0,11.0,43201.0,1.0,0.0,18.0,0.0,0.0,2409.0,2411.0,230000.0,17500.0,113736.26


In [41]:
"""
Taking string columns
"""
cols_str = df_merged.select(pl.col(pl.Utf8)).columns
cols_str

['af_viv',
 'af_empleo',
 'af_negocio',
 'af_cultivo',
 'af_trabajo',
 'af_salud',
 'af_otro',
 'agua_noe']

In [42]:
"""
Verify whether the column contains decimals
"""
for col in cols_str:
    print(df_merged.select(pl.col(col).value_counts()))

shape: (3, 1)
┌──────────────┐
│ af_viv       │
│ ---          │
│ struct[2]    │
╞══════════════╡
│ {null,8385}  │
│ {"1.0",1019} │
│ {"0.0",1030} │
└──────────────┘
shape: (3, 1)
┌──────────────┐
│ af_empleo    │
│ ---          │
│ struct[2]    │
╞══════════════╡
│ {null,8385}  │
│ {"0.0",1813} │
│ {"1.0",236}  │
└──────────────┘
shape: (3, 1)
┌──────────────┐
│ af_negocio   │
│ ---          │
│ struct[2]    │
╞══════════════╡
│ {null,8385}  │
│ {"0.0",1923} │
│ {"1.0",126}  │
└──────────────┘
shape: (3, 1)
┌──────────────┐
│ af_cultivo   │
│ ---          │
│ struct[2]    │
╞══════════════╡
│ {"0.0",1569} │
│ {null,8385}  │
│ {"1.0",480}  │
└──────────────┘
shape: (3, 1)
┌──────────────┐
│ af_trabajo   │
│ ---          │
│ struct[2]    │
╞══════════════╡
│ {"0.0",1921} │
│ {null,8385}  │
│ {"1.0",128}  │
└──────────────┘
shape: (3, 1)
┌──────────────┐
│ af_salud     │
│ ---          │
│ struct[2]    │
╞══════════════╡
│ {"1.0",169}  │
│ {null,8385}  │
│ {"0.0",1880} │
└──────────────

In [43]:
"""
Change dtypes from str to int
"""
df_merged = df_merged.with_columns(pl.col(pl.Utf8).cast(pl.Float64))
df_merged.describe()

statistic,folioviv,foliohog,ing_cor,gasto_mon,ubica_geo,tam_loc,acc_alim2,acc_alim7,acc_alim9,celular,tsalud1_h,tsalud1_m,camb_clim,f_sequia,f_inunda,f_helada,f_incendio,f_huracan,f_desliza,f_otro,af_viv,af_empleo,af_negocio,af_cultivo,af_trabajo,af_salud,af_otro,tot_resid,mat_pared,mat_techos,ab_agua,agua_noe,dotac_agua,agua_ent,sanit_agua,drenaje,uso_compar,excusado,camb_clim_right,climate_vulnerability_intensity,clave,tipo_gasto,mes_dia,lugar_comp,orga_inst,frecuencia,fecha_adqu,fecha_pago,gasto,costo,gasto_tri
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""",10434.0,10434.0,10434.0,10434.0,10434.0,10434.0,10434.0,3122.0,3122.0,10434.0,10434.0,10434.0,10434.0,10434.0,10434.0,10434.0,10434.0,10434.0,10434.0,10434.0,2049.0,2049.0,2049.0,2049.0,2049.0,2049.0,2049.0,10434.0,10434.0,10434.0,10123.0,311.0,10123.0,10434.0,10353.0,10434.0,10353.0,10434.0,10434.0,10434.0,10434.0,10434.0,10434.0,10434.0,10434.0,10434.0,10434.0,10434.0,10434.0,19.0,10434.0
"""null_count""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,7312.0,7312.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8385.0,8385.0,8385.0,8385.0,8385.0,8385.0,8385.0,0.0,0.0,0.0,311.0,10123.0,311.0,0.0,81.0,0.0,81.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10415.0,0.0
"""mean""",1683100000.0,1.098141,89488.030284,56460.764307,16638.893329,2.259344,0.057984,0.264894,0.060218,0.965497,0.32787,18.288576,0.196377,0.093349,0.047345,0.006613,0.0023,0.045333,0.0023,0.009584,0.497316,0.115178,0.061493,0.234261,0.062469,0.082479,0.253294,3.879337,7.856527,8.840521,1.229082,3.356913,1.655339,1.184014,1.224379,1.366686,1.950449,1.042841,0.196377,1.581464,43141.552041,1.0,0.0,9.447192,0.0,0.0,4.374066,4.37512,2632.95668,3962.105263,1291.691926
"""std""",937300000.0,0.350747,96360.273764,51869.833095,9366.366619,1.299353,0.233723,0.441348,0.237928,0.182525,0.979878,13.1693,0.397276,0.290934,0.212386,0.081055,0.047907,0.208042,0.047907,0.097433,0.500115,0.319315,0.240292,0.423639,0.242065,0.275161,0.435004,2.329493,0.571724,2.470387,0.768229,1.845118,1.123498,0.458024,0.474806,0.717239,0.217026,0.237775,0.397276,3.200978,42.619603,0.0,0.0,5.787951,0.0,0.0,102.418199,102.442856,6138.013666,3955.020128,3013.586235
"""min""",100001905.0,1.0,2967.03,97.82,1001.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,43111.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,300.0,0.0
"""25%""",903263404.0,1.0,44266.28,28263.41,9004.0,1.0,0.0,0.0,0.0,1.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,8.0,10.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,0.0,0.0,43111.0,1.0,0.0,5.0,0.0,0.0,0.0,0.0,460.0,2490.0,226.22
"""50%""",1665600000.0,1.0,69098.35,43694.36,16102.0,2.0,0.0,0.0,0.0,1.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,8.0,10.0,1.0,4.0,1.0,1.0,1.0,1.0,2.0,1.0,0.0,0.0,43111.0,1.0,0.0,5.0,0.0,0.0,0.0,0.0,1071.0,3000.0,525.81
"""75%""",2508300000.0,1.0,108654.55,66905.98,25006.0,4.0,0.0,1.0,0.0,1.0,0.0,30.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,5.0,8.0,10.0,1.0,5.0,2.0,1.0,1.0,2.0,2.0,1.0,0.0,0.0,43201.0,1.0,0.0,17.0,0.0,0.0,0.0,0.0,2600.0,5000.0,1271.73
"""max""",3260600000.0,3.0,4753000.0,1635900.0,32056.0,4.0,1.0,1.0,1.0,1.0,40.0,55.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,16.0,8.0,10.0,7.0,6.0,5.0,3.0,3.0,5.0,2.0,3.0,1.0,11.0,43201.0,1.0,0.0,18.0,0.0,0.0,2409.0,2411.0,230000.0,17500.0,113736.26


In [44]:
print(df_merged.select(pl.col('climate_vulnerability_intensity').value_counts()))

shape: (4, 1)
┌─────────────────────────────────┐
│ climate_vulnerability_intensit… │
│ ---                             │
│ struct[2]                       │
╞═════════════════════════════════╡
│ {9,106}                         │
│ {11,1}                          │
│ {8,1942}                        │
│ {0,8385}                        │
└─────────────────────────────────┘


In [45]:
df_merged.columns

['folioviv',
 'foliohog',
 'ing_cor',
 'gasto_mon',
 'ubica_geo',
 'tam_loc',
 'acc_alim2',
 'acc_alim7',
 'acc_alim9',
 'celular',
 'tsalud1_h',
 'tsalud1_m',
 'camb_clim',
 'f_sequia',
 'f_inunda',
 'f_helada',
 'f_incendio',
 'f_huracan',
 'f_desliza',
 'f_otro',
 'af_viv',
 'af_empleo',
 'af_negocio',
 'af_cultivo',
 'af_trabajo',
 'af_salud',
 'af_otro',
 'tot_resid',
 'mat_pared',
 'mat_techos',
 'ab_agua',
 'agua_noe',
 'dotac_agua',
 'agua_ent',
 'sanit_agua',
 'drenaje',
 'uso_compar',
 'excusado',
 'camb_clim_right',
 'climate_vulnerability_intensity',
 'clave',
 'tipo_gasto',
 'mes_dia',
 'lugar_comp',
 'orga_inst',
 'frecuencia',
 'fecha_adqu',
 'fecha_pago',
 'gasto',
 'costo',
 'gasto_tri']

In [46]:
df_merged['dotac_agua'].value_counts()

dotac_agua,count
i64,u32
,311
5.0,514
2.0,1846
3.0,571
4.0,530
1.0,6662


In [49]:
"""
Creating new columns:
1. saneamiento_deficiente
    - excusado = 3
    - uso_compar = 3
    - drenaje = 3 & drenaje = 5
    - agua_ent = 3 & drenaje = 4
2. agua_intermitente
    - dotac_agua != 1
3. gasto_adaptacion = gasto/ing_cor
"""

# 1. saneamiento_deficiente
df_merged = df_merged.with_columns(
    pl.when(
        (pl.col('excusado') == 3) |
        (pl.col('uso_compar') == 3)|
        (pl.col('drenaje').is_in([3, 5]))|
        (
            (pl.col('drenaje') == 4) &
            (pl.col('agua_ent') == 3)
        )
    )
    .then(1)
    .otherwise(0)
    .alias('saneamiento_deficiente')
)

# 2. agua_intermitente
df_merged = df_merged.with_columns(
    pl.when(pl.col('dotac_agua') != 1)
    .then(1)
    .otherwise(0)
    .alias('agua_intermitente')
)

# 3. gasto_adaptacion
df_merged = df_merged.with_columns(
    (pl.col('gasto') / pl.col('ing_cor')).alias('gasto_adaptacion')
)

In [48]:
df_merged['saneamiento_deficiente'].value_counts()

saneamiento_deficiente,count
i32,u32
1,334
0,10100


In [50]:
df_merged['agua_intermitente'].value_counts()

agua_intermitente,count
i32,u32
1,3461
0,6973


In [52]:
df_merged['gasto_adaptacion'].describe()

statistic,value
str,f64
"""count""",10434.0
"""null_count""",0.0
"""mean""",0.041785
"""std""",0.112971
"""min""",0.0
"""25%""",0.006599
"""50%""",0.015491
"""75%""",0.037894
"""max""",3.296839


In [54]:
df_merged.write_csv('merged_dataset.csv')