In [35]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

Leemos los datos ya procesados de processed_data_energy.csv.

In [23]:
processed_path = '../data/processed/processed_data_energy.csv'

# Leemos el csv
df = pd.read_csv(processed_path)
df.head()

Unnamed: 0,hora,dia,mes,anio,tmax-cab,tmax-hmo,tmax-obr,tmax-lmo,tmax-cul,tmin-cab,...,martes_postfestivo,semana_santa,1_mayo,10_mayo,16_sep,2_nov.,pre-navidad_y_new_year,navidad_y_new_year,post-navidad_y_new_year,demanda_energia
0,0,1,1,2007,21.0,22.0,25.0,30.0,29.0,2.0,...,0,0,0,0,0,0,0,1,0,1394
1,1,1,1,2007,21.0,22.0,25.0,30.0,29.0,2.0,...,0,0,0,0,0,0,0,1,0,1297
2,2,1,1,2007,21.0,22.0,25.0,30.0,29.0,2.0,...,0,0,0,0,0,0,0,1,0,1255
3,3,1,1,2007,21.0,22.0,25.0,30.0,29.0,2.0,...,0,0,0,0,0,0,0,1,0,1222
4,4,1,1,2007,21.0,22.0,25.0,30.0,29.0,2.0,...,0,0,0,0,0,0,0,1,0,1168


In [24]:
df.columns

Index(['hora', 'dia', 'mes', 'anio', 'tmax-cab', 'tmax-hmo', 'tmax-obr',
       'tmax-lmo', 'tmax-cul', 'tmin-cab', 'tmin-hmo', 'tmin-obr', 'tmin-lmo',
       'tmin-cul', 'prec_hmo_mm', 'prec_obr_mm', 'prec_lmo_mm', 'prec_cul_mm',
       'lunes_festivo', 'martes_postfestivo', 'semana_santa', '1_mayo',
       '10_mayo', '16_sep', '2_nov.', 'pre-navidad_y_new_year',
       'navidad_y_new_year', 'post-navidad_y_new_year', 'demanda_energia'],
      dtype='object')

Embebemos las columnas provenientes de la fecha en un mismo dato datetime. Cambiamos los nombres de anio, mes, dia y hora a year, month, day, hour; esto nos facilita el trabajo de embeber estos datos a una variable datetime, con la función de pandas to_datetime(), ya que esta función toma estos nombres para incoporarlos. Posteriormente, eliminamos estas columnas.

In [25]:
df.rename(columns={"anio": "year", "mes": "month", "dia": "day", "hora": "hour"}, inplace=True)
df["datetime"] = pd.to_datetime(df[["year", "month", "day", "hour"]])

#borramos los valores que ya fueron embebidos en datetime
df.drop(["year", "month", "day", "hour"], axis=1, inplace=True)

Como estamos manejando series de tiempo, usamos esta variable como indice de nuestra tabla.

In [30]:
df.set_index("datetime", inplace=True)

La variables de clima se dan por razon de dia, asi por lo que se repiten a lo largo del transucrso de las horas; vamos a eliminar los dator repetidos y quedarnos con un dato por dia; hacemos esto haciend un remuestreo de los datos por dia tomando el promedio de los datos de todo un dia; como los datos son los mismos por dia el promedio sera igual a los valores.

In [31]:
df_mean = df.resample("1D").mean()

In [32]:
df_mean.head()

Unnamed: 0_level_0,tmax-cab,tmax-hmo,tmax-obr,tmax-lmo,tmax-cul,tmin-cab,tmin-hmo,tmin-obr,tmin-lmo,tmin-cul,...,martes_postfestivo,semana_santa,1_mayo,10_mayo,16_sep,2_nov.,pre-navidad_y_new_year,navidad_y_new_year,post-navidad_y_new_year,demanda_energia
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2007-01-01,21.0,22.0,25.0,30.0,29.0,2.0,9.0,8.0,10.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1165.25
2007-01-02,21.0,22.0,22.0,22.0,27.0,2.0,7.0,7.0,11.0,14.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1335.416667
2007-01-03,22.0,22.0,23.0,25.0,27.0,6.0,6.0,10.0,11.0,15.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1452.125
2007-01-04,22.0,24.0,26.0,26.0,28.0,5.0,11.0,9.0,11.0,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1457.75
2007-01-05,24.0,21.0,25.0,28.0,29.0,6.0,10.0,9.0,10.0,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1458.208333


Tomamos los valores que pertenescan al clima.

In [33]:
resumen_clima = ['tmax-cab', 'tmax-hmo', 'tmax-obr',
       'tmax-lmo', 'tmax-cul', 'tmin-cab', 'tmin-hmo', 'tmin-obr', 'tmin-lmo',
       'tmin-cul', 'prec_hmo_mm', 'prec_obr_mm', 'prec_lmo_mm', 'prec_cul_mm']

df_clima = df[resumen_clima]

Sacamos los 10 componentes principales de los valores df_clima

In [40]:
datos_clima = df_clima.values

n_components = 10
pipe = Pipeline(
    [('scaler', StandardScaler()), 
     ('pca', PCA(n_components=n_components))]
)

pipe.fit(datos_clima)

pca = pipe.get_params()['pca']

pd.DataFrame(
    pca.components_.T, 
    columns=[f'PC {x}' for x in range(1,n_components+1)],
    index = df_clima.columns
)

Unnamed: 0,PC 1,PC 2,PC 3,PC 4,PC 5,PC 6,PC 7,PC 8,PC 9,PC 10
tmax-cab,-0.321841,-0.046594,0.006603,0.045792,0.016737,0.200631,-0.43781,-0.137434,-0.094097,-0.185862
tmax-hmo,-0.316232,-0.101055,0.017197,0.066787,0.02902,0.311966,-0.388686,-0.125074,0.251413,0.132946
tmax-obr,-0.31685,-0.12322,0.018398,0.038945,0.043874,0.292611,-0.065332,-0.232685,0.343085,-0.004761
tmax-lmo,-0.299939,-0.115873,-0.020361,0.019542,0.017605,0.326914,0.255679,0.764762,0.021471,-0.347949
tmax-cul,-0.29136,-0.162668,-0.094003,0.062216,0.032496,0.323904,0.636059,-0.326044,-0.328759,0.340538
tmin-cab,-0.319997,0.063115,-0.002421,-0.034962,-0.013788,-0.203996,-0.173196,-0.048118,-0.655119,-0.276863
tmin-hmo,-0.324657,0.032336,0.013136,-0.005087,-0.019756,-0.203018,-0.202347,0.266936,-0.216444,0.426344
tmin-obr,-0.324175,0.049822,0.007922,-0.04333,-0.008311,-0.356177,0.012053,-0.03664,0.010212,0.157784
tmin-lmo,-0.314163,0.063873,0.00089,-0.052479,-0.03518,-0.376339,0.129777,0.210534,0.395066,0.321482
tmin-cul,-0.31298,0.019248,-0.028922,-0.018333,0.00048,-0.362353,0.286795,-0.313174,0.259522,-0.569034


In [43]:
import numpy as np
import plotly.express as px

print("Varianza por componente: ", pca.explained_variance_)
print("Porcentaje de varianza explicada por componente: ", 
      pca.explained_variance_ratio_)

df_ev = pd.DataFrame({
    "Varianza explicada": np.cumsum(pca.explained_variance_ratio_),
    "Componente": [f'PC {x}' for x in range(1,n_components+1)]
})
df_ev

Varianza por componente:  [8.46210975 1.75037496 0.8720356  0.78443136 0.61389572 0.52296502
 0.28332862 0.21712031 0.15694906 0.09993889]
Porcentaje de varianza explicada por componente:  [0.60443192 0.12502585 0.06228779 0.0560304  0.04384937 0.03735437
 0.02023761 0.01550848 0.01121056 0.00713844]


Unnamed: 0,Varianza explicada,Componente
0,0.604432,PC 1
1,0.729458,PC 2
2,0.791746,PC 3
3,0.847776,PC 4
4,0.891625,PC 5
5,0.92898,PC 6
6,0.949217,PC 7
7,0.964726,PC 8
8,0.975936,PC 9
9,0.983075,PC 10


In [44]:
fig = px.line(df_ev, x='Componente', y='Varianza explicada', markers=True,
              title="Varianza explicada por componente")
fig.show()

Podemos visualizar que los primeros dos coponentes pueden explicar mas del 70% de la varianza, y los 10 componentes principales de los datos de clima pueden expicar alrededordel 98% de la varianza.