# merging of satellite and ground data

In [1]:
import pandas as pd
import glob
import os

from geopy.distance import geodesic

In [2]:
prec = pd.read_parquet('data/fedearroz_prec_diaria_2024.parquet')


In [3]:
prec= prec.rename(columns={'lat':'latitud','lon':'longitud', 'prec':'precipitation'})
prec['fuente']='ideam'

# Loading of satellital data

In [4]:
def generate_dates(start_date, end_date, num_rows):
    date_range = pd.date_range(start=start_date, end=end_date, periods=num_rows)
    return date_range

In [5]:
file_pattern_chirps = "./chirps_data/chirps-precipitation_*.csv"
files_chirps = glob.glob(file_pattern_chirps)

file_pattern_agera5 = "./agera5_data/agera5-precipitation_*.csv"
files_agera5 = glob.glob(file_pattern_agera5)


files_combined = files_chirps + files_agera5

In [6]:
files_combined[:2]

['./chirps_data/chirps-precipitation_-74.91457_4.72333_2020-01-01_2020-12-31.csv',
 './chirps_data/chirps-precipitation_-74.26311_10.62164_2015-01-01_2015-12-31.csv']

In [7]:
grouped_data = []

for file in files_combined:
    # Extraer la información del nombre del archivo
    basename = os.path.basename(file)
    parts = basename.replace('.csv', '').split('_')
    #latitud y longitud si viene en el archivo
    start_date = parts[3]
    end_date = parts[4]
    longitud=parts[1]
    latitud=parts[2]
    source=parts[0]
    
    df = pd.read_csv(file)
    num_rows = len(df)
    df = df.rename(columns={'precipitation': 'precipitation'})
    df = df.rename(columns={'Precipitation_Flux': 'precipitation'})
    df['date'] = generate_dates(start_date, end_date, num_rows)
    df['longitud']=float(longitud)
    df['latitud']=float(latitud)
    df['fuente']=source
    df=df[['longitud','latitud', 'date','precipitation','fuente']]
    grouped_data.append(df)



In [8]:
arroz_sat = pd.concat(grouped_data)
arroz_sat['precipitation'] = arroz_sat['precipitation'].fillna(0)
arroz_sat

Unnamed: 0,longitud,latitud,date,precipitation,fuente
0,-74.91457,4.72333,2020-01-01,0.00,chirps-precipitation
1,-74.91457,4.72333,2020-01-02,0.00,chirps-precipitation
2,-74.91457,4.72333,2020-01-03,0.00,chirps-precipitation
3,-74.91457,4.72333,2020-01-04,0.00,chirps-precipitation
4,-74.91457,4.72333,2020-01-05,0.00,chirps-precipitation
...,...,...,...,...,...
360,-74.76385,4.78134,2023-12-27,0.00,agera5-precipitation
361,-74.76385,4.78134,2023-12-28,0.22,agera5-precipitation
362,-74.76385,4.78134,2023-12-29,0.46,agera5-precipitation
363,-74.76385,4.78134,2023-12-30,1.04,agera5-precipitation


def distance_cal(row):
    # Coordenadas aproximadas (lat, lon)
    coords_aprox = (row['lat'], row['lon'])
    
    # Coordenadas reales (latitud, longitud)
    coords_reales = (row['latitud'], row['longitud'])
    
    # Calcular la distancia usando geodesic de geopy
    distancia = geodesic(coords_aprox, coords_reales).kilometers
    
    return distancia


df_unicos['distancia'] = df_unicos.apply(distance_cal, axis=1)

df_unicos.head()


In [9]:
arroz_sat.columns

Index(['longitud', 'latitud', 'date', 'precipitation', 'fuente'], dtype='object')

In [10]:
prec.columns

Index(['latitud', 'longitud', 'dpto', 'mun', 'date', 'precipitation',
       'station', 'evento_lluvia', 'fuente'],
      dtype='object')

In [11]:
arroz_info_sat = pd.merge(arroz_sat, prec[['latitud', 'longitud', 'dpto', 'mun', 'station']].drop_duplicates(keep='first').reset_index(drop=True),
                            how='inner', on=['longitud', 'latitud'])
arroz_info_sat.tail()

Unnamed: 0,longitud,latitud,date,precipitation,fuente,dpto,mun,station
320653,-74.76385,4.78134,2023-12-27,0.0,agera5-precipitation,TOLIMA,AMBALEMA,FEDEARROZ_AMBALEMA_GAMBA
320654,-74.76385,4.78134,2023-12-28,0.22,agera5-precipitation,TOLIMA,AMBALEMA,FEDEARROZ_AMBALEMA_GAMBA
320655,-74.76385,4.78134,2023-12-29,0.46,agera5-precipitation,TOLIMA,AMBALEMA,FEDEARROZ_AMBALEMA_GAMBA
320656,-74.76385,4.78134,2023-12-30,1.04,agera5-precipitation,TOLIMA,AMBALEMA,FEDEARROZ_AMBALEMA_GAMBA
320657,-74.76385,4.78134,2023-12-31,0.07,agera5-precipitation,TOLIMA,AMBALEMA,FEDEARROZ_AMBALEMA_GAMBA


In [12]:
arroz_obs_sat = pd.concat([prec, arroz_info_sat], ignore_index=True)

In [13]:
arroz_obs_sat[(arroz_obs_sat['longitud']==-73.68548) &
                 (arroz_obs_sat['latitud']==3.82888)& (arroz_obs_sat['date']==pd.to_datetime('2015-01-11'))]

Unnamed: 0,latitud,longitud,dpto,mun,date,precipitation,station,evento_lluvia,fuente
42206,3.82888,-73.68548,META,CASTILLA LA NUEVA,2015-01-11,4.0,FEDEARROZ_CASTILLA_LA_NUEVA_CAPACHOS,1.0,ideam
255722,3.82888,-73.68548,META,CASTILLA LA NUEVA,2015-01-11,0.0,FEDEARROZ_CASTILLA_LA_NUEVA_CAPACHOS,,chirps-precipitation
443799,3.82888,-73.68548,META,CASTILLA LA NUEVA,2015-01-11,2.41,FEDEARROZ_CASTILLA_LA_NUEVA_CAPACHOS,,agera5-precipitation


In [15]:
arroz_obs_sat.to_parquet('data/arroz_obs_sat.parquet')