# Base de datos diaria - ICC

**Objetivo**
Con este notebook se reproduce el Excel que ICC maneja como insumo para sus cálculos, reportes y demás.

In [1]:
# check donde estamos trabajando
#pwd

In [2]:
# libreria para movernos entre diferentes rutas
#import os

#os.chdir('../../')

In [3]:
# Librerías generales
import numpy as np
import pandas as pd
import datetime
from dateutil.relativedelta import relativedelta



# Gráficos
# ==============================================================================
import matplotlib.pyplot as plt
import matplotlib.font_manager
from matplotlib import style
style.use('ggplot') or plt.style.use('ggplot')
import seaborn as sns



# Preprocesado y modelado
# ==============================================================================
from sklearn.cluster import DBSCAN, KMeans
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors


import matplotlib.pyplot as plt
from matplotlib import cm
import matplotlib

# Configuración warnings
# ==============================================================================
import warnings
warnings.filterwarnings('ignore')

In [4]:
# Show every column when a dataframe is displayed (no column truncation).
pd.set_option('display.max_columns', None)

# features load
Los datos aquí cargados pasaron por una limpieza previa en otro notebook.

In [5]:
# Load the station readings from the parquet file and preview the first rows.
dfs = pd.read_parquet('icc_stations_all.parquet')
dfs.head()

Unnamed: 0,estacion,fecha,temperatura,radiacion,humedad_relativa,precipitacion,velocidad_viento,mojadura,direccion_viento,presion_atm
0,MAGPVD,2016-08-26 00:15:00,,,,,,,,
1,MAGPVD,2016-08-26 00:30:00,,,,,,,,
2,MAGPVD,2016-08-26 00:45:00,,,,,,,,
3,MAGPVD,2016-08-26 01:00:00,,,,,,,,
4,MAGPVD,2016-08-26 01:15:00,,,,,,,,


# Agregación diaria

In [6]:
# Inspect column dtypes and the memory footprint of the loaded readings.
dfs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11501043 entries, 0 to 11501042
Data columns (total 10 columns):
 #   Column            Dtype         
---  ------            -----         
 0   estacion          object        
 1   fecha             datetime64[ns]
 2   temperatura       float64       
 3   radiacion         float64       
 4   humedad_relativa  float64       
 5   precipitacion     float64       
 6   velocidad_viento  float64       
 7   mojadura          float64       
 8   direccion_viento  float64       
 9   presion_atm       float64       
dtypes: datetime64[ns](1), float64(8), object(1)
memory usage: 877.5+ MB


In [7]:
def segmentacion(df, col_fecha):
    """
    Split a datetime column into date, week, day, month and year columns.

    The input dataframe is modified in place and also returned, so it can be
    called either as a statement or as an expression.

    Arguments:
        df: dataframe to extend.
        col_fecha: name of the column holding the timestamp; it is parsed
            with ``pd.to_datetime``.

    Returns:
        df: the input dataframe with 5 new columns: ``date`` (calendar date),
        ``semana`` (ISO 8601 week number), ``dia`` (day of month), ``mes``
        (month number) and ``año`` (year).
    """
    # Parse the column once instead of once per derived column.
    fechas = pd.to_datetime(df[col_fecha])

    # ``Series.dt.week`` was deprecated in pandas 1.1 and removed in 2.0;
    # ``isocalendar().week`` yields the same ISO week number.
    semana = fechas.dt.isocalendar().week
    # isocalendar() returns a nullable UInt32 column. Cast it back to the plain
    # dtypes the old accessor produced (int64, or float64 when dates are missing).
    semana = semana.astype('float64' if semana.isna().any() else 'int64')

    df['date'] = fechas.dt.date
    df['semana'] = semana
    df['dia'] = fechas.dt.day
    df['mes'] = fechas.dt.month
    df['año'] = fechas.dt.year
    return df

In [8]:
# Call the function with its arguments; it adds the date columns to dfs in place.
segmentacion(dfs, 'fecha')

Unnamed: 0,estacion,fecha,temperatura,radiacion,humedad_relativa,precipitacion,velocidad_viento,mojadura,direccion_viento,presion_atm,date,semana,dia,mes,año
0,MAGPVD,2016-08-26 00:15:00.000,,,,,,,,,2016-08-26,34,26,8,2016
1,MAGPVD,2016-08-26 00:30:00.000,,,,,,,,,2016-08-26,34,26,8,2016
2,MAGPVD,2016-08-26 00:45:00.000,,,,,,,,,2016-08-26,34,26,8,2016
3,MAGPVD,2016-08-26 01:00:00.000,,,,,,,,,2016-08-26,34,26,8,2016
4,MAGPVD,2016-08-26 01:15:00.000,,,,,,,,,2016-08-26,34,26,8,2016
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11501038,ICCCHI,2022-08-14 22:44:59.950,25.2,0.0,100.0,0.0,0.0,10.0,82.8,,2022-08-14,32,14,8,2022
11501039,ICCCHI,2022-08-14 22:59:59.950,25.2,0.0,100.0,0.0,0.0,10.0,116.4,,2022-08-14,32,14,8,2022
11501040,ICCCHI,2022-08-14 23:14:59.950,25.2,0.0,100.0,0.0,0.0,10.0,70.8,,2022-08-14,32,14,8,2022
11501041,ICCCHI,2022-08-14 23:29:59.950,25.2,0.0,100.0,0.0,0.0,10.0,101.3,,2022-08-14,32,14,8,2022


# Agregaciones diarias
Con la siguiente línea se construyen las columnas agregadas que tenía Carlos en el Excel.

In [9]:
# One row per station and calendar day, with the daily summary of each variable.
df_diario = (
    dfs
    .groupby(['estacion', 'semana', 'dia', 'mes', 'año'])
    .agg(
        # temperature
        temperatura_min_diaria=pd.NamedAgg(column='temperatura', aggfunc='min'),
        temperatura_max_diaria=pd.NamedAgg(column='temperatura', aggfunc='max'),
        temperatura_promedio_diaria=pd.NamedAgg(column='temperatura', aggfunc='mean'),
        # radiation
        radiacion_diaria_acumulada=pd.NamedAgg(column='radiacion', aggfunc='sum'),
        radiacion_diaria_promedio=pd.NamedAgg(column='radiacion', aggfunc='mean'),
        # relative humidity
        humedad_relativa_min_diaria=pd.NamedAgg(column='humedad_relativa', aggfunc='min'),
        humedad_relativa_max_diaria=pd.NamedAgg(column='humedad_relativa', aggfunc='max'),
        humedad_relativa_media_diaria=pd.NamedAgg(column='humedad_relativa', aggfunc='mean'),
        # rainfall
        lluvia_diaria=pd.NamedAgg(column='precipitacion', aggfunc='sum'),
        # wind speed
        velocidad_viento_media_diario=pd.NamedAgg(column='velocidad_viento', aggfunc='mean'),
        velocidad_viento_max_diaria=pd.NamedAgg(column='velocidad_viento', aggfunc='max'),
    )
    .reset_index()
)

In [10]:
# Daily thermal amplitude: daily maximum minus daily minimum temperature.
df_diario['amplitud_termica'] = df_diario['temperatura_max_diaria'].sub(
    df_diario['temperatura_min_diaria']
)

# Constantes para algunos cálculos de variables
Las tablas que se consultan fueron sacadas del Excel compartido por ICC. Se está adelantando la revisión bibliográfica para generalizar estas tablas a otras estaciones.

In [11]:
# Constants table: extraterrestrial radiation (Ra) and N, one column pair per month.
# The last row of the sheet and the "Unnamed: 22" column are discarded.
ra_n_ctes = (
    pd.read_excel('tabla_Ra_N.xlsx')
    .iloc[:-1]
    .drop(columns="Unnamed: 22")
)
ra_n_ctes.head()

Unnamed: 0,dia,1_N,1_Ra,2_N,2_Ra,3_N,3_Ra,4_N,4_Ra,5_N,5_Ra,6_N,6_Ra,7_N,7_Ra,8_N,8_Ra,9_N,9_Ra,10_N,10_Ra,11_N,11_Ra,12_N,12_Ra
0,1,11.176,28.99,11.39,31.372,11.721,34.588,12.14,37.39,12.519,38.396,12.787,38.273,12.828,38.105,12.621,38.122,12.255,37.426,11.85,35.222,11.462,31.851,11.21,29.273
1,2,11.18,29.03,11.4,31.48,11.734,34.7,12.153,37.452,12.53,38.405,12.793,38.264,12.825,38.105,12.611,38.118,12.242,37.379,11.837,35.124,11.452,31.743,11.205,29.219
2,3,11.183,29.072,11.411,31.589,11.747,34.811,12.166,37.511,12.541,38.411,12.798,38.254,12.822,38.105,12.601,38.112,12.229,37.329,11.824,35.024,11.441,31.636,11.2,29.169
3,4,11.187,29.117,11.421,31.699,11.76,34.921,12.18,37.568,12.552,38.417,12.802,38.245,12.819,38.106,12.59,38.106,12.216,37.279,11.81,34.922,11.43,31.529,11.195,29.121
4,5,11.191,29.165,11.432,31.81,11.773,35.03,12.193,37.623,12.563,38.421,12.807,38.236,12.815,38.106,12.58,38.099,12.202,37.226,11.797,34.82,11.419,31.424,11.191,29.076


In [12]:
def extraer_info(df, filtros_str, name_feature, name_col):
    """
    Reshape one family of per-month constant columns into long format.

    The source table has one row per day and, for each month, one column per
    feature named ``<month>_<feature>``. This function keeps the ``dia``
    column plus the columns of a single feature, strips the feature suffix so
    the column names are just the month number, and melts the matrix so that
    the month becomes a row key and the values end up in a single column.

    Arguments:
        df: wide dataframe with a ``dia`` column and ``<month>_<feature>`` columns.
        filtros_str: regular expression passed to ``DataFrame.filter``; it must
            select ``dia`` and the columns of the wanted feature.
        name_feature: feature suffix (without the leading underscore) to
            remove from the selected column names.
        name_col: name given to the value column of the result.

    Returns:
        dataframe with the columns ``dia``, ``mes`` (integer) and ``name_col``.
    """
    matriz = df.filter(regex=filtros_str)
    # Strip the suffix as a literal string. The default of ``regex`` changed in
    # pandas 2.0, so it is set explicitly to behave the same on every version.
    matriz.columns = matriz.columns.str.replace("_" + name_feature, "", regex=False)
    df_feature = matriz.melt(
        id_vars=["dia"],
        var_name="mes",
        value_name=name_col,
    )
    # After melting, the month labels are strings taken from the column names.
    df_feature["mes"] = df_feature["mes"].astype('int')
    return df_feature

In [13]:
# Long-format lookup tables (dia, mes, value) for the N and Ra columns of ra_n_ctes.
df_n = extraer_info(ra_n_ctes, 'dia|N', 'N', 'N Daylight hours')
df_ra = extraer_info(ra_n_ctes, 'dia|Ra', 'Ra', 'Ra')

In [14]:
# Load the radiation constants table and preview the first rows.
ctes_radiacion = pd.read_excel('constantes_radiacion.xlsx')
ctes_radiacion.head()

Unnamed: 0,dia,1_rgl,1_rdp,2_rgl,2_rdp,3_rgl,3_rdp,4_rgl,4_rdp,5_rgl,5_rdp,6_rgl,6_rdp,7_rgl,7_rdp,8_rgl,8_rdp,9_rgl,9_rdp,10_rgl,10_rdp,11_rgl,11_rdp,12_rgl,12_rdp
0,1,336.3,254.1,363.8,274.9,402.3,304.0,434.3,328.1,445.5,336.6,443.9,335.4,442.0,334.0,442.2,334.1,434.1,328.0,408.6,308.7,369.5,279.2,339.5,256.5
1,2,336.7,254.4,365.1,275.8,403.6,305.0,435.0,328.7,445.5,336.7,443.8,335.3,442.0,334.0,442.2,334.1,433.6,327.6,407.4,307.9,368.2,278.2,338.9,256.1
2,3,337.2,254.8,366.3,276.8,404.9,305.9,435.6,329.2,445.6,336.7,443.7,335.2,442.0,334.0,442.1,334.1,433.0,327.2,406.3,307.0,367.0,277.3,338.3,255.6
3,4,337.8,255.2,367.6,277.7,406.1,306.9,436.3,329.6,445.7,336.7,443.6,335.2,442.0,334.0,442.0,334.0,432.4,326.7,405.1,306.1,365.7,276.4,337.8,255.2
4,5,338.3,255.6,368.9,278.7,407.4,307.8,436.9,330.1,445.7,336.8,443.5,335.1,442.0,334.0,441.9,333.9,431.8,326.3,403.9,305.2,364.5,275.4,337.2,254.8


In [15]:
# Long-format lookup tables (dia, mes, value) for the rgl and rdp columns of ctes_radiacion.
df_rg = extraer_info(ctes_radiacion, 'rgl|dia', 'rgl', 'radiacion_global')
df_rdp = extraer_info(ctes_radiacion, 'rdp|dia', 'rdp', 'radiacion_dia_despejado')

## Join all dfs

In [16]:
# Attach the constant lookup tables to the daily table; the shared keys are
# dia and mes, and df_diario is overwritten with the joined result.
# NOTE(review): re-running this cell without rebuilding df_diario merges the
# same tables again.
df_diario = (
    df_diario
    .merge(df_rg, how='inner', on=['dia', 'mes'])
    .merge(df_rdp, how='inner', on=['dia', 'mes'])
    .merge(df_n, how='inner', on=['dia', 'mes'])
    .merge(df_ra, how='inner', on=['dia', 'mes'])
)
df_diario

Unnamed: 0,estacion,semana,dia,mes,año,temperatura_min_diaria,temperatura_max_diaria,temperatura_promedio_diaria,radiacion_diaria_acumulada,radiacion_diaria_promedio,humedad_relativa_min_diaria,humedad_relativa_max_diaria,humedad_relativa_media_diaria,lluvia_diaria,velocidad_viento_media_diario,velocidad_viento_max_diaria,amplitud_termica,radiacion_global,radiacion_dia_despejado,N Daylight hours,Ra
0,CASSAAGS,1,3,1,2022,21.4,34.5,27.184375,23096.50000,240.588542,44.0,100.0,79.260417,0.0,6.528421,23.0,13.1,337.2,254.8,11.183,29.072
1,CASSAAGS,53,3,1,2021,21.5,33.5,26.898958,21606.40000,225.066667,47.0,100.0,81.177083,0.0,5.842708,16.5,12.0,337.2,254.8,11.183,29.072
2,CASSACIZ,1,3,1,2020,21.6,30.4,25.312500,13596.30000,141.628125,57.0,91.0,73.968750,0.0,7.712500,15.6,8.8,337.2,254.8,11.183,29.072
3,CASSACIZ,1,3,1,2022,22.4,34.1,26.969792,20672.50000,215.338542,43.0,100.0,77.083333,0.0,10.581250,28.8,11.7,337.2,254.8,11.183,29.072
4,CASSACIZ,53,3,1,2021,21.4,32.6,25.936458,15596.30000,162.461458,49.0,98.0,78.104167,0.0,6.311458,12.3,11.2,337.2,254.8,11.183,29.072
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119801,TBUPUY,9,29,2,2016,21.0,34.7,26.740625,20228.40000,210.712500,47.0,100.0,81.322917,0.0,6.248958,15.2,13.7,401.0,303.0,11.714,34.531
119802,TBUPUY,9,29,2,2020,19.1,37.2,26.497917,24770.16000,258.022500,41.0,100.0,82.447917,0.0,5.488542,24.6,18.1,401.0,303.0,11.714,34.531
119803,TULTLA,9,29,2,2012,19.9,34.1,25.833333,19328.00000,201.333333,39.0,97.0,72.531250,0.0,6.373958,25.8,14.2,401.0,303.0,11.714,34.531
119804,TULTLA,9,29,2,2016,22.1,32.6,26.055208,14122.00000,147.104167,54.0,100.0,87.833333,0.4,6.271875,12.2,10.5,401.0,303.0,11.714,34.531


## Cálculo de variables secundarias (las variables que requieren cálculos)

In [17]:
# Estimated mean radiation: 0.16 * sqrt(thermal amplitude) * radiacion_global,
# rounded to one decimal.
df_diario['radiacion_media_estimada Heargreaves'] = (
    np.sqrt(df_diario['amplitud_termica'])
    .mul(0.16)
    .mul(df_diario['radiacion_global'])
    .round(1)
)


In [18]:
# Scale the accumulated daily radiation by a fixed factor.
# NOTE(review): 0.00089681 looks like a unit-conversion constant — confirm its origin.
df_diario['Rg'] = df_diario['radiacion_diaria_acumulada'].mul(0.00089681)

In [19]:
# n = (-0.32 + 1.61 * Rg / Ra) * N, rounded to one decimal.
# NOTE(review): presumably an inverted Angstrom-type relation — confirm the
# coefficients against the ICC workbook.
df_diario['n'] = (
    df_diario['Rg']
    .div(df_diario['Ra'])
    .mul(1.61)
    .add(-0.32)
    .mul(df_diario['N Daylight hours'])
    .round(1)
)

In [20]:
# Preview the first rows of the daily table.
df_diario.head()

Unnamed: 0,estacion,semana,dia,mes,año,temperatura_min_diaria,temperatura_max_diaria,temperatura_promedio_diaria,radiacion_diaria_acumulada,radiacion_diaria_promedio,humedad_relativa_min_diaria,humedad_relativa_max_diaria,humedad_relativa_media_diaria,lluvia_diaria,velocidad_viento_media_diario,velocidad_viento_max_diaria,amplitud_termica,radiacion_global,radiacion_dia_despejado,N Daylight hours,Ra,radiacion_media_estimada Heargreaves,Rg,n
0,CASSAAGS,1,3,1,2022,21.4,34.5,27.184375,23096.5,240.588542,44.0,100.0,79.260417,0.0,6.528421,23.0,13.1,337.2,254.8,11.183,29.072,195.3,20.713172,9.2
1,CASSAAGS,53,3,1,2021,21.5,33.5,26.898958,21606.4,225.066667,47.0,100.0,81.177083,0.0,5.842708,16.5,12.0,337.2,254.8,11.183,29.072,186.9,19.376836,8.4
2,CASSACIZ,1,3,1,2020,21.6,30.4,25.3125,13596.3,141.628125,57.0,91.0,73.96875,0.0,7.7125,15.6,8.8,337.2,254.8,11.183,29.072,160.0,12.193298,4.0
3,CASSACIZ,1,3,1,2022,22.4,34.1,26.969792,20672.5,215.338542,43.0,100.0,77.083333,0.0,10.58125,28.8,11.7,337.2,254.8,11.183,29.072,184.5,18.539305,7.9
4,CASSACIZ,53,3,1,2021,21.4,32.6,25.936458,15596.3,162.461458,49.0,98.0,78.104167,0.0,6.311458,12.3,11.2,337.2,254.8,11.183,29.072,180.6,13.986918,5.1
