# Monthly data generation

In [1]:
import pandas as pd
import glob
import os

from geopy.distance import geodesic
import plotly.express as px
import geopandas as gpd

In [2]:
cd csicap_fedearroz_evaluacion/

[Errno 2] No such file or directory: 'csicap_fedearroz_evaluacion/'
/home/jupyter/csicap_fedearroz_evaluacion


In [3]:
info_arroz = pd.read_parquet('data/arroz_obs_sat.parquet')

In [4]:
info_arroz['month_year'] = info_arroz['date'].dt.to_period('M')
info_arroz.head()

Unnamed: 0,latitud,longitud,dpto,mun,date,precipitation,station,evento_lluvia,fuente,month_year
0,4.39333,-72.98889,META,UPIA,2015-07-06,0.0,BARRANCA_DE_UPIA,0.0,fedearroz,2015-07
1,4.39333,-72.98889,META,UPIA,2015-07-07,0.0,BARRANCA_DE_UPIA,0.0,fedearroz,2015-07
2,4.39333,-72.98889,META,UPIA,2015-07-08,,BARRANCA_DE_UPIA,,fedearroz,2015-07
3,4.39333,-72.98889,META,UPIA,2015-07-09,,BARRANCA_DE_UPIA,,fedearroz,2015-07
4,4.39333,-72.98889,META,UPIA,2015-07-10,,BARRANCA_DE_UPIA,,fedearroz,2015-07


# monthly events

In [6]:
def max_consecutive_zeros(series):
    """Función para contar el máximo número de ceros consecutivos."""
    max_count = (series.eq(0).cumsum() - series.eq(0).cumsum().where(~series.eq(0)).ffill().fillna(0)).max()
    return max_count


summary = info_arroz[info_arroz.fuente=='fedearroz'].groupby(['dpto','mun','station', 'month_year']).agg(
    num_registros=('precipitation', 'size'),
    num_nans=('precipitation', lambda x: x.isna().sum()),
    num_ceros=('precipitation', lambda x: (x == 0).sum()),
    max_ceros_consecutivos=('precipitation', lambda x: max_consecutive_zeros(x))
).reset_index()
summary.head()


Unnamed: 0,dpto,mun,station,month_year,num_registros,num_nans,num_ceros,max_ceros_consecutivos
0,ANTIOQUIA,NECHI,FEDEARROZ_NECHI_SANTA_CLARA,2011-09,8,0,4,2.0
1,ANTIOQUIA,NECHI,FEDEARROZ_NECHI_SANTA_CLARA,2011-10,31,0,8,6.0
2,ANTIOQUIA,NECHI,FEDEARROZ_NECHI_SANTA_CLARA,2011-11,30,0,6,1.0
3,ANTIOQUIA,NECHI,FEDEARROZ_NECHI_SANTA_CLARA,2011-12,31,0,11,4.0
4,ANTIOQUIA,NECHI,FEDEARROZ_NECHI_SANTA_CLARA,2012-01,31,0,28,20.0


In [7]:
summary.to_excel('data/summary_monthly_prec.xlsx')

# monthly aggregation

In [8]:
# hasta 3 dias faltantes para que se agrege el mes
filtered_missing_data_count = summary[summary['num_nans'] <= 3]

In [9]:
info_arroz_filtered = pd.merge(info_arroz, filtered_missing_data_count[['station', 'month_year']], 
                               on=['station', 'month_year'], how='inner')


sum_prec_obs = info_arroz_filtered.groupby(['latitud','longitud','dpto','mun','station', 'month_year', 'fuente']).agg(prec_month=('precipitation','sum')).reset_index()
sum_prec_obs

Unnamed: 0,latitud,longitud,dpto,mun,station,month_year,fuente,prec_month
0,2.69814,-75.29815,HUILA,CAMPOALEGRE,FEDEARROZ_CAMPOALEGRE_ALTAGRACIA,2011-11,agera5-precipitation,228.520000
1,2.69814,-75.29815,HUILA,CAMPOALEGRE,FEDEARROZ_CAMPOALEGRE_ALTAGRACIA,2011-11,chirps-precipitation,436.494230
2,2.69814,-75.29815,HUILA,CAMPOALEGRE,FEDEARROZ_CAMPOALEGRE_ALTAGRACIA,2011-11,fedearroz,513.200000
3,2.69814,-75.29815,HUILA,CAMPOALEGRE,FEDEARROZ_CAMPOALEGRE_ALTAGRACIA,2011-12,agera5-precipitation,164.050001
4,2.69814,-75.29815,HUILA,CAMPOALEGRE,FEDEARROZ_CAMPOALEGRE_ALTAGRACIA,2011-12,chirps-precipitation,242.482750
...,...,...,...,...,...,...,...,...
11851,10.87405,-72.85299,GUAJIRA,FONSECA,FEDEARROZ_FONSECA_GRANJA_ITA,2023-11,chirps-precipitation,190.283857
11852,10.87405,-72.85299,GUAJIRA,FONSECA,FEDEARROZ_FONSECA_GRANJA_ITA,2023-11,fedearroz,412.410000
11853,10.87405,-72.85299,GUAJIRA,FONSECA,FEDEARROZ_FONSECA_GRANJA_ITA,2023-12,agera5-precipitation,26.520000
11854,10.87405,-72.85299,GUAJIRA,FONSECA,FEDEARROZ_FONSECA_GRANJA_ITA,2023-12,chirps-precipitation,15.484619


In [11]:
sum_prec_obs.to_parquet('data/prec_monthly_obs_sat.parquet')