## Load Irradiance

From header, 
```
ALLSKY_KT              CERES SYN1deg All Sky Insolation Clearness Index (dimensionless) 
ALLSKY_SRF_ALB         CERES SYN1deg All Sky Surface Albedo (dimensionless) 
SZA                    CERES SYN1deg Solar Zenith Angle (Degrees) 
ALLSKY_SFC_PAR_TOT     CERES SYN1deg All Sky Surface PAR Total (W/m^2) 
CLRSKY_SFC_SW_DWN      CERES SYN1deg Clear Sky Surface Shortwave Downward Irradiance (Wh/m^2) 
```

Further Reading at [NASA Source](https://power.larc.nasa.gov/docs/methodology/energy-fluxes/derived-parameters/).

**PAR** : Photosynthetically Active Radiation (PAR)

In [104]:
import pandas as pd
import os
import numpy as np
from pathlib import Path
from datetime import datetime

import pvlib
from pvlib.location import Location

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LassoCV

In [105]:
# Load Irradiance
dataPath = Path('./data')
fileName = 'POWER_Point_Hourly.csv'
df = pd.read_csv(dataPath / fileName,  skiprows=13)
df['MO'] = df['MO'].astype(str).str.zfill(2)
df['DY'] = df['DY'].astype(str).str.zfill(2)
df['HR'] = df['HR'].astype(str).str.zfill(2)
df['date'] = df['YEAR'].astype(str) + ' ' + df['MO'] + ' ' + df['DY'] + ' ' + df['HR']
df['date'] = pd.to_datetime(df['date'], format='%Y %m %d %H')

# subset
df = df[['date', 'ALLSKY_KT', 'ALLSKY_SRF_ALB', 'SZA', 'ALLSKY_SFC_PAR_TOT', 'CLRSKY_SFC_SW_DWN']]

# Convert Datetime
df.tail()

# Y : CLRSKY_SFC_SW_DWN or ALLSKY_SFC_PAR_TOT
df_Y = df

# check visually
df_Y.head(2)

Unnamed: 0,date,ALLSKY_KT,ALLSKY_SRF_ALB,SZA,ALLSKY_SFC_PAR_TOT,CLRSKY_SFC_SW_DWN
0,2020-12-31 18:00:00,-999.0,-999.0,-999.0,0.0,0.0
1,2020-12-31 19:00:00,-999.0,-999.0,-999.0,0.0,0.0


## Load CFSv2 Data

In [106]:
# load / process
CFSv2_list = [dataPath / file for file in os.listdir('./data') if file.startswith('CFSV2')]
df_list = []
for fileLoc in CFSv2_list:
    df = pd.read_csv(fileLoc)
    df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d_%H_%M')
    colName = str(fileLoc).split('_')[1].split('.')[0]
    df = df.rename(columns = {'value' : colName})
    
    df_list.append(df)
    
    del df
    
# fuse dataframes
df_all = None
for df_loc in df_list:
    df_loc = df_loc.groupby('date', as_index=False).mean()
    if(df_all is None):
        df_all = df_loc
    else:
        df_all = pd.merge(left=df_all, right=df_loc, on='date', how='inner')

# rename
df_CFSv2 = df_all

# check visually
df_CFSv2.head(1)

Unnamed: 0,date,DownLong,UpwardLong,uWind,DownShort,vWind,TempHeight
0,2002-01-01 06:00:00,209.067383,266.0,2.625,92.774399,-1.025,259.974503


## Merge Frames & process

In [107]:
# pd.merge(left=df_all, right=df_loc, on='date', how='inner')
df_all = pd.merge(left=df_Y, right=df_CFSv2, on='date', how='inner')

# No NaNs !
existNaNs = False
for col in df_all.columns:
    existNaNs |= any(df_all[col].isna())
print('Exist NaNs in the dataset? : {}!'.format(existNaNs))
    
# last copy for PERSISTENCE
df = df_all
df_all = df_all.rename(columns = {'ALLSKY_SFC_PAR_TOT' : 'Y'})
df_ts  = df_all[['date', 'Y']]

# add next timestamp's variable as endogenous variable
df['Y'] = df['ALLSKY_SFC_PAR_TOT'].shift(-1)

# delete NaNs
df = df[df['Y'] != -999]

# drop duplicate columns
df = df.drop(labels=['ALLSKY_SRF_ALB'], axis=1)

# restrict to prediction of 12noon (i.e. 6am data rows)
df = df[df.date.dt.hour == 6]

# subset to X & y
X, y = df.drop(labels=['date', 'Y'], axis=1).to_numpy(), df['Y'].to_numpy()

Exist NaNs in the dataset? : False!


## Visualize Data

In [108]:
df.corr()

Unnamed: 0,ALLSKY_KT,SZA,ALLSKY_SFC_PAR_TOT,CLRSKY_SFC_SW_DWN,DownLong,UpwardLong,uWind,DownShort,vWind,TempHeight,Y
ALLSKY_KT,1.0,0.999926,0.59301,0.651586,0.556844,0.722641,-0.230022,0.562907,0.012401,0.718318,0.578983
SZA,0.999926,1.0,0.584798,0.642602,0.552942,0.718075,-0.229331,0.556482,0.012385,0.713559,0.574889
ALLSKY_SFC_PAR_TOT,0.59301,0.584798,1.0,0.923871,0.4309,0.660764,-0.158034,0.809486,-0.005181,0.66721,0.752973
CLRSKY_SFC_SW_DWN,0.651586,0.642602,0.923871,1.0,0.518015,0.685489,-0.188696,0.757588,-0.024012,0.697176,0.658489
DownLong,0.556844,0.552942,0.4309,0.518015,1.0,0.864228,-0.158355,0.272202,0.108401,0.877207,0.170434
UpwardLong,0.722641,0.718075,0.660764,0.685489,0.864228,1.0,-0.149516,0.597532,0.182598,0.992535,0.509659
uWind,-0.230022,-0.229331,-0.158034,-0.188696,-0.158355,-0.149516,1.0,-0.101052,0.227026,-0.176772,0.001208
DownShort,0.562907,0.556482,0.809486,0.757588,0.272202,0.597532,-0.101052,1.0,0.016821,0.569614,0.855818
vWind,0.012401,0.012385,-0.005181,-0.024012,0.108401,0.182598,0.227026,0.016821,1.0,0.203099,0.050824
TempHeight,0.718318,0.713559,0.66721,0.697176,0.877207,0.992535,-0.176772,0.569614,0.203099,1.0,0.480593


## Split Data: *Train*, *Val* and *Test*

In [109]:
# regression split (LASSO, ...)
seed_1, seed_2 = 5645, 8373
X_tr, X_, y_tr, y_     = train_test_split(X,  y,  train_size=280, random_state=seed_1) #  train: 280
X_te, X_va, y_te, y_va = train_test_split(X_, y_, train_size=60,  random_state=seed_2) # "test" : 60  -> val: 59

# time series split (...)
lTr   = datetime.strptime('2021-10-07 12:00:00', '%Y-%m-%d %H:%M:%S') # 280
lVa   = datetime.strptime('2021-12-06 12:00:00', '%Y-%m-%d %H:%M:%S') # 60
lTe   = datetime.strptime('2022-03-23 12:00:00', '%Y-%m-%d %H:%M:%S') # 59

## Get Clearsky Index

In [110]:
# location 
loc = Location(41.883, -87.641, 'US/Central', 597, 'Chicago')

# times 
times = pd.date_range(start='2021-01-01', end='2022-03-23', freq='1h', tz=loc.tz)
times = times[(times.hour == 6) | (times.hour == 12)]

# ghi
df_cSky = loc.get_clearsky(times) # ineichen with climatology table by default
df_cSky['date'] = df_cSky.index.tz_localize(None)
df_cSky = df_cSky.reset_index()
df_cSky = df_cSky[['date', 'ghi', 'dni', 'dhi']]

# add to ts
df_ts = pd.merge(left=df_ts, right=df_cSky, on='date', how='inner')

## Save

In [113]:
df_ts.to_csv('df_ts.csv', index=None)
df.to_csv('df.csv', index=None)