In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os

import joblib
import torch

import numpy as np
import pandas as pd
import xgboost as xgb

from scipy.stats import pearsonr

from IPython.display import Image

import plotly.graph_objects as go
from plotly.subplots import make_subplots

from utils import scale_data
from utils import preprocess_AUT_data
from utils import set_seeds

from utils import bayes_filter
from utils import l2norm_km
from utils import print_metrics

from utils import train_teleport
from utils import test_models

In [None]:
from constants import SEED

from constants import NUM_ESTIMATORS

In [None]:
# Feature columns passed to the Model H regressors and to the teleport models.
SENSOR_COLS = ['Short-wave irradiation', 'YTD']
# Column the Model H regressors are fitted to predict.
TARGET = ['Temperature']
# Grid x thresholds: stations with real_x < X_WEST form the "west" group and
# stations with real_x > X_EAST the "east" group; everything in between is
# excluded from the teleport experiment.
X_WEST = 120
X_EAST = 160
# CSV cache of the preprocessed station table.
DF_SOLAR = 'df_solar.csv'
# Cache of k_maps; it is written with joblib.dump despite the .npy extension.
FN_KRIGED_SOLAR = 'assets/solar_maps.npy'
# Cache of the fitted Model H regressors (dict keyed by split name).
FN_MODELS_H_SOLAR = 'assets/solar_models_H.pkl'
# Cache of the teleport models returned by train_teleport.
FN_MODEL_TELEPORT_SOLAR = 'assets/solar_models_teleport.pkl'

# Prepare Data

In [None]:
# Called before any of the np.random-based splits and window sampling below.
# NOTE(review): presumably seeds numpy/torch/random with a fixed value; which
# libraries and which seed is defined in utils — confirm.
set_seeds()

In [None]:
# Rebuild both on-disk caches whenever at least one of them is missing.
caches_present = os.path.exists(DF_SOLAR) and os.path.exists(FN_KRIGED_SOLAR)
if not caches_present:
    df, k_maps = preprocess_AUT_data(
        'postal_code', SENSOR_COLS, TARGET, ['postal_code', 'datetime']
    )
    df.to_csv(DF_SOLAR, index=False)
    joblib.dump(k_maps, FN_KRIGED_SOLAR)

# Always reload from disk so a fresh run and a cached run follow the same path.
df = pd.read_csv(DF_SOLAR)
df, scaler = scale_data(df, SENSOR_COLS + TARGET)
k_maps = joblib.load(FN_KRIGED_SOLAR)

In [None]:
# Collect one (grid_x, grid_y, postal_code) triple per real station.
# Grouping by the bare column name (not a one-element list) keeps the group
# key a scalar: pandas >= 2.0 yields 1-tuples for list keys, which would turn
# the virtual station ids built below into strings such as "(8063,)_-1".
tele_coord = []

for post_code, station_df in df.groupby('postal_code'):
    rx = int(station_df['real_x'].iloc[0])
    ry = int(station_df['real_y'].iloc[0])
    tele_coord.append((rx, ry, post_code))

# YTD values of the first station, re-indexed to 0..n-1 so the assignment
# below lines up row-by-row with the map time axis instead of depending on
# where that station's rows happen to sit inside `df`.
first_station = df['postal_code'].unique()[0]
ytd_reference = df.loc[df['postal_code'] == first_station, 'YTD'].reset_index(drop=True)

# Build one virtual station per real station and per x offset (-1 and +1).
# Frames are collected in a list and concatenated once, which avoids
# re-copying the growing `df` on every iteration.
virtual_frames = []
for i in [-1, 1]:
    for rx, ry, post_code in tele_coord:
        # NOTE(review): the map values are sampled at the station's own cell
        # (rx, ry) while real_x is recorded as rx + i — confirm this is intended.
        virtual_df = pd.DataFrame({k: k_maps[k][:, rx, ry] for k in k_maps.keys()})
        virtual_df['real_x'] = rx + i
        virtual_df['real_y'] = ry
        # Virtual stations get a string id "<postal_code>_<offset>".
        virtual_df['postal_code'] = str(post_code) + '_' + str(i)
        virtual_df['YTD'] = ytd_reference
        virtual_frames.append(virtual_df)

df = pd.concat([df, *virtual_frames], ignore_index=True)

In [None]:
df = df.reset_index().drop(columns='index')

# --- Row-level ("data") split -------------------------------------------
# 30% of the rows are held out of training; the validation frame contains
# every row of every station that has at least one held-out row.
# NOTE(review): the splits depend on the RNG state at this point, while the
# models cached under FN_MODELS_H_SOLAR were fitted on whatever split existed
# when the cache was written — confirm the two still match.
n_examples_data = len(df)
val_examples_data = np.random.choice(df.index, int(n_examples_data * 0.3), replace=False)
val_codes_data = df.iloc[val_examples_data]['postal_code'].unique()
df_val_data = df.loc[df['postal_code'].isin(val_codes_data)]
df_train_data = df.drop(val_examples_data)

# --- Station-level split ------------------------------------------------
# 30% of the stations are held out entirely.
all_station_codes = df['postal_code'].unique()
n_examples_station = len(all_station_codes)
val_examples_station = np.random.choice(all_station_codes, int(n_examples_station * 0.3), replace=False)
held_out_station = df['postal_code'].isin(val_examples_station)
df_val_station = df.loc[held_out_station]
df_train_station = df.loc[~held_out_station]

# --- East / west split --------------------------------------------------
# Only stations outside the [X_WEST, X_EAST] band take part; 30% of the
# stations on each side are drawn for validation.
df_joined = df.query(f'real_x < {X_WEST} or real_x > {X_EAST}')
west_station_codes = df_joined.query(f'real_x < {X_WEST}')['postal_code'].unique()
east_station_codes = df_joined.query(f'real_x > {X_EAST}')['postal_code'].unique()
n_examples_west = len(west_station_codes)
n_examples_east = len(east_station_codes)
val_examples_west = np.random.choice(west_station_codes, int(n_examples_west * 0.3), replace=False)
val_examples_east = np.random.choice(east_station_codes, int(n_examples_east * 0.3), replace=False)

In [None]:
# Report how many stations exist and how the station-level split divides them.
# The width specifiers right-align the three numbers in one column.
print(f'Total Stations: {n_examples_station:22}')
print(f'Station Split Validation Stations: {len(val_examples_station):3}')
print(f'Station Split Traning Stations: {n_examples_station-len(val_examples_station):6}')

In [None]:
# Summary statistics of the irradiation column stored under the 'ORIGINAL_' prefix.
# NOTE(review): presumably the unscaled values kept by scale_data — confirm in utils.
df['ORIGINAL_Short-wave irradiation'].describe()

# Training
## Train Model H

In [None]:
# Restrict both validation frames to real stations. Virtual stations carry
# string ids of the form "<postal_code>_<offset>", real ones float codes.
# Membership is tested with isin() rather than by interpolating the list's
# repr into a query string, which depends on how the values print.
valval = [v for v in df_val_data['postal_code'].unique() if isinstance(v, float)]
df_val_data = df_val_data.loc[df_val_data['postal_code'].isin(valval)]

# Fix: the station-split validation frame is filtered from itself. It was
# previously rebuilt from df_val_data, which silently replaced the
# station-level hold-out set with the data-split validation stations.
real_val_stations = [v for v in df_val_station['postal_code'].unique() if isinstance(v, float)]
df_val_station = df_val_station.loc[df_val_station['postal_code'].isin(real_val_stations)]

In [None]:
# Load the cached Model H regressors if present; otherwise fit one regressor
# per split on that split's training frame and cache the resulting dict.
if os.path.exists(FN_MODELS_H_SOLAR):
    # joblib.load unpickles the file — only load caches you produced yourself.
    models_H = joblib.load(FN_MODELS_H_SOLAR)
else:
    training_frames = {
        'data_split': df_train_data,
        'station_split': df_train_station,
    }

    models_H = {}
    for split_name, train_df in training_frames.items():
        reg = xgb.XGBRegressor(n_estimators=NUM_ESTIMATORS, random_state=SEED, n_jobs=8)
        reg.fit(train_df[SENSOR_COLS], train_df[TARGET])
        models_H[split_name] = reg

    joblib.dump(models_H, FN_MODELS_H_SOLAR)

In [None]:
# Evaluate each Model H on the validation frame that belongs to its split and
# keep whatever print_metrics returns, keyed by split name.
dist = {}

for split_name, model_h in models_H.items():
    if split_name == 'data_split':
        # Use every row in `df` of the stations present in the data-split
        # validation frame.
        val_codes = df_val_data['postal_code'].unique()
        df_val = df.loc[df['postal_code'].isin(val_codes)]
    elif split_name == 'station_split':
        df_val = df_val_station

    dist[split_name] = print_metrics(
        df_val, 'postal_code', model_h, SENSOR_COLS, TARGET, k_maps, scaler, 'A'
    )

## Train Teleport Models

In [None]:
# Boolean masks over df_joined: which side a row lies on, and whether its
# station was drawn for validation on that side.
on_west_side = df_joined['real_x'] < X_WEST
on_east_side = df_joined['real_x'] > X_EAST
held_out_west = df_joined['postal_code'].isin(val_examples_west)
held_out_east = df_joined['postal_code'].isin(val_examples_east)

# Training rows: on the given side and not held out for that side.
df_train_west = df_joined.loc[~held_out_west & on_west_side]
df_train_east = df_joined.loc[~held_out_east & on_east_side]
# Validation rows: every row of the held-out stations.
df_val_west = df_joined.loc[held_out_west]
df_val_east = df_joined.loc[held_out_east]

In [None]:
# Load the cached teleport models if present; otherwise train them on the
# west/east sensor columns and cache the result.
if os.path.exists(FN_MODEL_TELEPORT_SOLAR):
    models_T = joblib.load(FN_MODEL_TELEPORT_SOLAR)
else:
    teleport_inputs = [
        frame[SENSOR_COLS].to_numpy()
        for frame in (df_train_west, df_train_east, df_val_west, df_val_east)
    ]
    models_T = train_teleport(*teleport_inputs)
    joblib.dump(models_T, FN_MODEL_TELEPORT_SOLAR)

# Expose the individual networks under short names for the cells below.
enc_a, enc_b, lat, dec_a, dec_b = (
    models_T[part] for part in ('enc_a', 'enc_b', 'lat', 'dec_a', 'dec_b')
)

# Test

In [None]:
# Evaluate the teleport models together with the data-split Model H on the
# east/west validation stations. Pass models_H['station_split'] instead to
# evaluate against the station-split regressor.
test_outputs = test_models(
    'postal_code',
    df_val_west,
    df_val_east,
    SENSOR_COLS,
    models_H['data_split'],
    models_T,
    k_maps['Temperature'],
    scaler,
    'A',
)
orig_a, tele_a, orig_b, tele_b, rse_tele_a, rse_tele_b, mae_ae = test_outputs

In [None]:
# Turn the mae_ae values returned by test_models into an array and print their mean.
mae_ae = np.array(mae_ae)
print(f'AutoEncoder MAE: {mae_ae.mean()}')

In [None]:
# Distance between the two x thresholds, measured along y = 0.
# NOTE(review): l2norm_km presumably converts grid cells to kilometres and 'A'
# selects the region/grid — both are defined in utils; confirm.
distance_ew = l2norm_km((X_WEST, 0), (X_EAST, 0), 'A')
print(f'Distance East-West: {distance_ew}')

# Plots

In [None]:
# Stations whose traces are shown in the sample plots further down.
to_plot = [8063, 4794]

# Validation frame used by all plots: east and west validation stations,
# plus any station from `to_plot` that is not already part of it.
df_val = pd.concat([df_val_west, df_val_east]).reset_index(drop=True)
for station_code in to_plot:
    already_included = station_code in df_val['postal_code'].unique()
    if not already_included:
        extra_rows = df.query(f'postal_code == {station_code}')
        df_val = pd.concat([df_val, extra_rows]).reset_index(drop=True)

## Scatter Plot Real vs Predicted Values (Model H)

In [None]:
# Gather real and predicted target values, station by station, after mapping
# both back through scaler.inverse_transform.
orig, pred = [], []

for _, station_df in df_val.groupby('postal_code'):
    truth = scaler.inverse_transform(station_df[TARGET].to_numpy(), TARGET)
    orig.extend(truth.ravel())

    w_prime = models_H['data_split'].predict(station_df[SENSOR_COLS])
    pred.extend(scaler.inverse_transform(w_prime, TARGET).ravel())

# Scatter of predictions against real values, with the identity line drawn
# between 260 and 300 as a visual reference.
fig = go.Figure()
fig.add_trace(go.Scatter(x=orig, y=np.array(pred).ravel(), mode='markers', name='Prediction'))
fig.add_trace(go.Scatter(x=[260, 300], y=[260, 300], mode='lines'))
fig.update_layout(
    showlegend=False,
    autosize=False,
    width=500,
    height=500,
    font={'size': 24},
    template='simple_white',
    margin={'l': 0,'r': 0, 'b': 0,'t': 0},
    xaxis_title='Real Temperature (K)',
    yaxis_title='Predicted Temperature (K)',
)

# Render once to PNG, store it on disk and show the same bytes inline.
img = fig.to_image(format="png")
with open('plots/solar_weather_scatter.png', 'wb') as f:
    f.write(img)
Image(img)

##  Localisation over Time

In [None]:
# Localisation error as a function of the observation window length.
# For every validation station and every duration, localise the station from
# a window of Model H predictions and record the distance to its true cell.
results = []
durations = [7, 14, 31, 90, 180, 365]
max_time = k_maps['Temperature'].shape[0]

# Group by the bare column name so `postal_code` is a scalar; a one-element
# list key makes pandas >= 2.0 yield 1-tuples, which would end up in `results`.
for postal_code, station_df in df_val.groupby('postal_code'):
    real_x = int(station_df['real_x'].iloc[0])
    real_y = int(station_df['real_y'].iloc[0])

    # The prediction does not depend on the window, so compute it once per
    # station instead of once per run.
    w_prime = models_H['data_split'].predict(station_df[SENSOR_COLS])

    for duration in durations:
        # The 365-day window is evaluated once, starting at day 0; shorter
        # windows get 20 runs with a random start each.
        full_year = duration == 365
        n_runs = 1 if full_year else 20

        for run in range(n_runs):
            start = 0 if full_year else np.random.randint(0, max_time - duration)

            # Copy the slice so bayes_filter can never alter the shared
            # per-station prediction through a view.
            w_prime_range = w_prime[start:start + duration].copy()
            x, y = bayes_filter(w_prime_range, k_maps['Temperature'], start, duration)

            # Kept in a separate local so the `dist` metrics dict built in
            # the Model H evaluation cell is not overwritten.
            dist_km = l2norm_km((real_x, real_y), (x, y), 'A')
            results.append({
                'postal_code': postal_code, 'dist': dist_km, 'pred_x': x, 'pred_y': y,
                'real_x': real_x, 'real_y': real_y, 'run': run, 'duration': duration, 'start': start,
            })

In [None]:
df_results = pd.DataFrame(results)

# One box per window length, showing the spread of localisation errors.
fig = go.Figure()
for duration in durations:
    errors_for_duration = df_results.loc[df_results['duration'] == duration, 'dist']
    fig.add_trace(go.Box(y=errors_for_duration, name=duration))

fig.update_layout(
    showlegend=False,
    template='simple_white',
    xaxis_title='Days',
    font={'size': 24},
    margin={'l': 0,'r': 0, 'b': 0,'t': 0},
)
fig.update_yaxes(title_text='Error (km)', range=[0, 80])
fig.show()

fig.write_image('plots/solar_localization.pdf')

## Sample Traces and Confidence/Error over Time

In [None]:
# Three figures for the stations listed in `to_plot`:
#   fig_temp      - temperature trace
#   fig_sens      - short-wave irradiation trace
#   fig_dist_conf - localisation error (left axis, log scale) and posterior
#                   value at the true cell (right axis) as observations accumulate
fig_temp = go.Figure()
fig_sens = go.Figure()
fig_dist_conf = make_subplots(specs=[[{"secondary_y": True}]])

log_tickvals = np.array([1, 5, 10, 20, 50, 100, 200])
zero_margin = {'l': 0,'r': 0, 'b': 0,'t': 0}
idx = 1

for ref_station in df_val['postal_code'].unique():
    if ref_station not in to_plot:
        continue

    tmp_df = df_val.loc[df_val['postal_code'] == ref_station]

    temperature = scaler.inverse_transform(tmp_df['Temperature'].to_numpy(), ['Temperature']).ravel()
    fig_temp.add_trace(go.Scatter(y=temperature))

    irradiation = scaler.inverse_transform(
        tmp_df['Short-wave irradiation'].to_numpy(), ['Short-wave irradiation']
    ).ravel()
    fig_sens.add_trace(go.Scatter(y=irradiation))

    real_x = int(tmp_df['real_x'].unique().item())
    real_y = int(tmp_df['real_y'].unique().item())

    # Run the filter on the station's temperature column and keep the
    # posterior returned for every step.
    x, y, post = bayes_filter(tmp_df['Temperature'].to_numpy(), k_maps['Temperature'], 0, 365, posteriors=True)
    post = np.array(post)
    fig_dist_conf.add_trace(
        go.Scatter(y=post[:, real_x, real_y], name='Confidence ' + str(idx)),
        secondary_y=True,
    )

    # Error per step: distance from the true cell to the posterior's argmax cell.
    distances = [
        l2norm_km((real_x, real_y), np.unravel_index(step_post.argmax(), step_post.shape), 'A')
        for step_post in post
    ]
    fig_dist_conf.add_trace(go.Scatter(y=distances, name='Error ' + str(idx)), secondary_y=False)
    idx += 1

# Layout is the same no matter how many stations were added, so apply it once.
fig_temp.update_layout(
    template='simple_white', showlegend=False, width=1000, height=500,
    margin=zero_margin, font={'size': 24},
    yaxis_title='Temperature (K)', xaxis_title='Days',
)
fig_sens.update_layout(
    template='simple_white', showlegend=False, width=1000, height=500,
    margin=zero_margin, font={'size': 24},
    yaxis_title='Short-wave irradiation (Wh/m²)', xaxis_title='Days',
)
fig_dist_conf.update_layout(
    xaxis_title='Days', template='simple_white', margin=zero_margin, font={'size': 24},
    width=1000, height=500,
    legend={'orientation': 'h', 'yanchor': 'bottom', 'y': 1.02, 'xanchor': 'right', 'x': 1},
)
fig_dist_conf.update_yaxes(title_text='Confidence', secondary_y=True)
fig_dist_conf.update_yaxes(title_text='Error (km)', secondary_y=False, type='log', tickvals=log_tickvals)

# Traces are added in (confidence, error) pairs per station: the first
# station is drawn in orange, the second in blue; confidence lines are solid,
# error lines dotted / dashed. This expects exactly the two stations in `to_plot`.
trace_styles = [
    ('orange', {'width': 3}),
    ('orange', {'dash': 'dot', 'width': 4}),
    ('#1f77b4', {'width': 3}),
    ('#1f77b4', {'dash': 'dash', 'width': 4}),
]
for position, (color, line_style) in enumerate(trace_styles):
    fig_dist_conf['data'][position]['marker'] = {'color': color}
    fig_dist_conf['data'][position]['line'] = line_style

for figure in (fig_temp, fig_sens, fig_dist_conf):
    figure.show()

fig_temp.write_image('plots/solar_h_temp_sample.pdf')
fig_sens.write_image('plots/solar_h_sens_sample.pdf')
fig_dist_conf.write_image('plots/solar_h_dist_conf.pdf')

## Original vs Teleported

In [None]:
# Push the sensor trace of station 8232 through encoder A, the latent
# mapping and decoder B, then compare it with the untouched trace.
tmp_df = df_val.loc[df_val['postal_code'] == 8232]
sensor_tensor = torch.Tensor(tmp_df[SENSOR_COLS].to_numpy())
with torch.no_grad():
    encoded = enc_a(sensor_tensor)
    mapped = lat(encoded)
    teleported = dec_b(mapped)
S_ab = sensor_tensor.numpy()
s_tele_a = teleported.numpy()

# Only the first sensor column is plotted, mapped back through the scaler.
first_sensor = [SENSOR_COLS[0]]
original_trace = scaler.inverse_transform(S_ab[:, 0], first_sensor).squeeze()
teleported_trace = scaler.inverse_transform(s_tele_a[:, 0], first_sensor).squeeze()

fig = go.Figure()
fig.add_trace(go.Scatter(y=original_trace, name='Original', line={'width': 3}))
fig.add_trace(go.Scatter(y=teleported_trace, name='Teleported'))
fig.update_layout(
    template='simple_white',
    margin={'l': 0,'r': 0, 'b': 0,'t': 0},
    font={'size': 24},
    xaxis_title='Days',
    yaxis_title='Solar Radiation (Wh/m²)',
    legend={'orientation': 'h', 'yanchor': 'bottom', 'y': 1.02, 'xanchor': 'right', 'x': 1},
)
fig.show()
fig.write_image('plots/solar_teleport_sample.pdf')

In [None]:
# Pearson correlation (statistic and p-value) between the first sensor column
# of the original trace and of the teleported trace.
pearsonr(S_ab[:, 0], s_tele_a[:, 0])

## Distance from Teleported

In [None]:
# Pairwise distances between every west and every east validation station,
# used as the baseline box in the teleport localisation plot.
stations_west = df_val_west['postal_code'].unique()
stations_east = df_val_east['postal_code'].unique()

# Look up each east station's coordinates once instead of once per west station.
east_coords = []
for east_code in stations_east:
    east_rows = df.loc[df['postal_code'] == east_code]
    east_coords.append((east_rows['real_x'].iloc[0], east_rows['real_y'].iloc[0]))

# Distinct loop variables for both levels: the inner loop previously reused
# the outer loop's index name, which only worked because the outer index was
# read before the inner loop started.
station_distances = []
for west_code in stations_west:
    west_rows = df.loc[df['postal_code'] == west_code]
    west_xy = (west_rows['real_x'].iloc[0], west_rows['real_y'].iloc[0])
    for east_xy in east_coords:
        station_distances.append(l2norm_km(west_xy, east_xy, 'A'))

In [None]:
# Box plot comparing the east-west station distances with the values
# returned by test_models for original and teleported traces on each side.
box_series = [
    (station_distances, 'Station Distances'),
    (orig_a, 'Original Trace West'),
    (tele_a, 'Teleported Trace West'),
    (orig_b, 'Original Trace East'),
    (tele_b, 'Teleported Trace East'),
]

fig = go.Figure()
for values, label in box_series:
    fig.add_trace(go.Box(y=values, name=label))
fig.update_layout(
    showlegend=False,
    template='simple_white',
    margin={'l': 0,'r': 0, 'b': 0,'t': 0},
    font={'size': 24},
    yaxis_title='Error (km)',
)
fig.show()

fig.write_image('plots/solar_teleport_localization.pdf')

In [None]:
# Mean absolute difference between original and teleported values per side,
# averaged over the two sides.
gap_a = np.abs(np.array(orig_a) - np.array(tele_a)).mean()
gap_b = np.abs(np.array(orig_b) - np.array(tele_b)).mean()
avg_err = (gap_a + gap_b)/2
print(f'Average Error : {avg_err} km')

In [None]:
# Same mean absolute difference as above, reported separately for each side.
diff_a = np.array(orig_a) - np.array(tele_a)
diff_b = np.array(orig_b) - np.array(tele_b)
err_a = np.abs(diff_a).mean()
err_b = np.abs(diff_b).mean()
print(f'Error A: {err_a}km, Error B: {err_b} km')

In [None]:
# Signed relative difference (teleported minus original, divided by the
# magnitude of the teleported value), averaged and expressed in percent.
tele_a_arr = np.array(tele_a)
rel_error = ((tele_a_arr - np.array(orig_a)) / np.abs(tele_a_arr)).mean() * 100
print(f'Relative Error A: {rel_error}%')

tele_b_arr = np.array(tele_b)
rel_error = ((tele_b_arr - np.array(orig_b)) / np.abs(tele_b_arr)).mean() * 100
print(f'Relative Error B: {rel_error}%')

In [None]:
# Box plot of the rse_tele_a / rse_tele_b values returned by test_models.
# NOTE(review): in the teleport localisation plot the *_a series are labelled
# "West" and the *_b series "East", whereas here rse_tele_a is labelled
# "East" and rse_tele_b "West" — confirm which labelling is intended.
# NOTE(review): the axis title says "Mean Absolute Error" while the variables
# are named rse_* — confirm the metric.
error_series = [
    (rse_tele_a, 'Teleported Trace East'),
    (rse_tele_b, 'Teleported Trace West'),
]

fig = go.Figure()
for values, label in error_series:
    fig.add_trace(go.Box(y=np.array(values), name=label))
fig.update_layout(
    showlegend=False,
    template='simple_white',
    margin={'l': 0,'r': 0, 'b': 0,'t': 0},
    font={'size': 24},
    yaxis_title='Mean Absolute Error',
)
fig.update_yaxes(range=[0, 0.12])
fig.show()

fig.write_image('plots/solar_teleport_mae.pdf')