In [2]:
# Scenario Generation with Copulas 
# 
# Hugo S. de Araujo
# Nov. 14th, 2022 | Mays Group | Cornell University
################################################################################

#=======================================================================
PROJECT SETUP
=======================================================================#
using Pkg
Pkg.activate("copulas");
Pkg.instantiate();
# Import "here" function. Wrapper to allow easy path concatenation.
include(joinpath(@__DIR__, "functions", "fct_here.jl"))

# Import all required packages. 
begin
    # using AWSS3
    using CSV
    using DataFrames
    using Dates
    using DelimitedFiles
    using Distributions
    using HDF5
    using JuliaFormatter
    using LaTeXStrings
    using LinearAlgebra
    using LinearSolve
    #using Measures
    using Random
    using RCall
    using Revise
    using Statistics
    using StatsBase
    #using StatsPlots
    using OhMyREPL
    using Plots
    #using PrettyTables
    using Tables
    using TSFrames
    using TimeZones
end

# Include functions 
#= functions_dirpath = joinpath(pwd(),"src", "functions");
function_paths = readdir(functions_dirpath, join=true);
function_index = occursin.(".jl", function_paths);
functions_only = function_paths[function_index];

for str in functions_only
    include(str)
end =#

include(here("src", "functions", "fct_bind_historical_forecast.jl"));
include(here("src", "functions", "fct_compute_hourly_average_actuals.jl"));
include(here("src", "functions", "fct_compute_landing_probability.jl"));
include(here("src", "functions", "fct_convert_hours_2018.jl"));
include(here("src", "functions", "fct_convert_ISO_standard.jl"));
include(here("src", "functions", "fct_convert_land_prob_to_data.jl"));
include(here("src", "functions", "fct_generate_probability_scenarios.jl"));
include(here("src", "functions", "fct_getplots.jl"));
include(here("src", "functions", "fct_plot_correlation_heatmap.jl"));
include(here("src", "functions", "fct_plot_historical_landing.jl"));
include(here("src", "functions", "fct_plot_historical_synthetic_autocorrelation.jl"));
include(here("src", "functions", "fct_plot_correlogram_landing_probability.jl"));
include(here("src", "functions", "fct_plot_scenarios_and_actual.jl"));
include(here("src", "functions", "fct_read_h5_file.jl"));
include(here("src", "functions", "fct_read_input_file.jl"));
include(here("src", "functions", "fct_transform_landing_probability.jl"));
include(here("src", "functions", "fct_write_percentiles.jl"));
#=======================================================================
READ INPUT FILE
=======================================================================#
input_file_path = here("src\\copulas.txt")

# XXX Needs to be updated to be a hardcoded instead of reading in a text file
data_type,
scenario_length,
number_of_scenarios,
scenario_hour,
scenario_day,
scenario_month,
scenario_year,
read_locally,
historical_load,
forecast_load,
historical_solar,
forecast_da_solar,
forecast_2da_solar,
historical_wind,
forecastd_da_wind,
forecast_2da_wind,
write_percentile = read_input_file(input_file_path);

#=======================================================================
READ INPUT DATA: ARPA-E PERFORM PROJECT H5 FILES
=======================================================================#
# Function that reads the .h5 file and binds the time index and the actuals/fore-
# cast values into a single dataframe.

# Load data
load_actuals_raw = read_h5_file(here("data", historical_load), "load");
load_forecast_raw = read_h5_file(here("data", "ercot_BA_load_forecast_day_ahead_2018.h5"), "load", false);

# Solar data
solar_actuals_raw = read_h5_file(here("data", "ercot_BA_solar_actuals_Existing_2018.h5"), "solar");
solar_forecast_dayahead_raw = read_h5_file(here("data", "ercot_BA_solar_forecast_day_ahead_existing_2018.h5"), "solar", false);
solar_forecast_2dayahead_raw = read_h5_file(here("data", "ercot_BA_solar_forecast_2_day_ahead_existing_2018.h5"), "solar", false);

# Wind data
wind_actuals_raw = read_h5_file(here("data", "ercot_BA_wind_actuals_Existing_2018.h5"), "wind");
wind_forecast_dayahead_raw = read_h5_file(here("data", "ercot_BA_wind_forecast_day_ahead_existing_2018.h5"), "wind", false);
wind_forecast_2dayahead_raw = read_h5_file(here("data", "ercot_BA_wind_forecast_2_day_ahead_existing_2018.h5"), "wind", false);

#=======================================================================
Compute the hourly average for the actuals data
=======================================================================#
# Load
aux = compute_hourly_average_actuals(load_actuals_raw);
load_actual_avg_raw = DataFrame();
time_index = aux[:, :Index];
avg_actual = aux[:, :values_mean];
load_actual_avg_raw[!, :time_index] = time_index;
load_actual_avg_raw[!, :avg_actual] = avg_actual;

# Solar
aux = compute_hourly_average_actuals(solar_actuals_raw);
time_index = aux[:, :Index];
avg_actual = aux[:, :values_mean];
solar_actual_avg_raw = DataFrame();
solar_actual_avg_raw[!, :time_index] = time_index;
solar_actual_avg_raw[!, :avg_actual] = avg_actual;

# Wind
aux = compute_hourly_average_actuals(wind_actuals_raw);
time_index = aux[:, :Index];
avg_actual = aux[:, :values_mean];
wind_actual_avg_raw = DataFrame();
wind_actual_avg_raw[!, :time_index] = time_index;
wind_actual_avg_raw[!, :avg_actual] = avg_actual;

[32m[1m  Activating[22m[39m project at `c:\Users\ks885\Documents\aa_research\Modeling\norta_scenarios\copulas\src\copulas`


In [3]:
scenario_length

48

In [4]:
# create a function to do this for all data
function upd_convert_hours_2018(data, is_actual = true)
    if is_actual
        x = copy(data);
        x[:,:time_index] = x[:,:time_index] .- Dates.Hour(6);
        return x;
    else
        df = copy(data);
        df[:,:forecast_time] = df[:,:forecast_time] .- Dates.Hour(6);
        df[:,:issue_time] = df[:,:issue_time] .- Dates.Hour(6);
        return df;
    end
end

upd_convert_hours_2018 (generic function with 2 methods)

In [5]:
# Load data
load_actuals = upd_convert_hours_2018(load_actuals_raw);
load_actual_avg = upd_convert_hours_2018(load_actual_avg_raw);
load_forecast = upd_convert_hours_2018(load_forecast_raw, false);

# Solar data
solar_actuals = upd_convert_hours_2018(solar_actuals_raw);
solar_actual_avg = upd_convert_hours_2018(solar_actual_avg_raw);
solar_forecast_dayahead = upd_convert_hours_2018(solar_forecast_dayahead_raw, false);
solar_forecast_2dayahead = upd_convert_hours_2018(solar_forecast_2dayahead_raw, false);

# Wind data
wind_actuals = upd_convert_hours_2018(wind_actuals_raw);
wind_actual_avg = upd_convert_hours_2018(wind_actual_avg_raw);
wind_forecast_dayahead = upd_convert_hours_2018(wind_forecast_dayahead_raw, false);
wind_forecast_2dayahead = upd_convert_hours_2018(wind_forecast_2dayahead_raw, false);


In [6]:
#=======================================================================
BIND HOURLY HISTORICAL DATA WITH FORECAST DATA
========================================================================#
#= The binding is made by ("forecast_time" = "time_index"). This causes the 
average actual value to be duplicated, which is desired, given the # of rows
in the load_forecast is double that of load_actual. To distinguish a 
one-day-ahead forecast from a two-day-ahead forecast, the column "ahead_factor"
is introduced. Bind the day-ahead and two-day-ahead forecasts for wind and solar
to get all the forecast data into one object as it is for load forecast =#
    load_data = bind_historical_forecast(true,
    load_actual_avg,
    load_forecast);

solar_data = bind_historical_forecast(false,
    solar_actual_avg,
    solar_forecast_dayahead,
    solar_forecast_2dayahead);

wind_data = bind_historical_forecast(false,
    wind_actual_avg,
    wind_forecast_dayahead,
    wind_forecast_2dayahead);

In [7]:
#=======================================================================
Landing probability
=======================================================================#
#= This section holds the calculation of the probability that the actual
value was equaled or superior than the forecast percentiles for a given
day. This is made possible by the estimation of an approximate CDF
computed on the forecast percentiles. Once estimated, this function is
used to find the "landing probability"; the prob. that the actual value
is equal or greater than a % percentage of the forecast percentile.
=#
#include(here("src", "functions", "fct_compute_landing_probability.jl"))
landing_probability_load = compute_landing_probability(load_data);
landing_probability_solar = compute_landing_probability(solar_data);
landing_probability_wind = compute_landing_probability(wind_data);

In [8]:
#=======================================================================
ADJUST LANDING PROBABILITY DATAFRAME
=======================================================================#
lp_load = transform_landing_probability(landing_probability_load);
lp_solar = transform_landing_probability(landing_probability_solar);
lp_wind = transform_landing_probability(landing_probability_wind);

In [9]:
#=======================================================================
Determine length of Decision Problem
=======================================================================#
x = copy(wind_data);
# Sort data by issue time
sort!(x, :issue_time);
# Group data by issue time and count occurences in every group
df = combine(groupby(x, [:issue_time]), DataFrames.nrow => :count);
# Filter data by count. Only keep groups with 48 entries
df_filtered = filter(:count => ==(48), df);
issue_times_interest = df_filtered[!, :issue_time];
# find all forecast times for these issue times of interest
subset_wind_data = filter(row -> row[:issue_time] in issue_times_interest, wind_data);
subset_forecast_times = subset_wind_data[!, :forecast_time];
unique_forecast_times = unique(subset_forecast_times);
decision_T = length(unique_forecast_times);

unique_issue_times = unique(subset_wind_data[!, :issue_time]);

In [10]:
x = copy(lp_solar)
allequal_set = Set(findall(allequal, eachcol(x)));
allequal_ind = sort(collect(allequal_set));
allindex_set = Set(collect(1:48));
alldifferent_ind = sort(collect(setdiff(allindex_set, allequal_set))); # Index for columns whose st. dev. isn't zero
x = x[:, alldifferent_ind];

In [11]:
x

363×28 Matrix{Float64}:
 1.0  1.0  1.0       0.868687  0.747475   …  0.343434   0.232323   1.0  1.0
 1.0  1.0  0.0       0.0       0.0           0.171717   0.10101    1.0  1.0
 1.0  1.0  0.0       0.505051  0.515152      0.131313   0.0707071  1.0  1.0
 1.0  1.0  0.848485  0.868687  0.848485      0.232323   0.121212   1.0  1.0
 1.0  1.0  0.757576  0.656566  0.606061      0.272727   0.151515   1.0  1.0
 1.0  1.0  0.0       0.0       0.424242   …  0.505051   0.20202    1.0  1.0
 1.0  1.0  1.0       0.0       0.0           0.494949   0.212121   1.0  1.0
 1.0  1.0  0.575758  0.585859  0.494949      0.0909091  0.141414   1.0  1.0
 1.0  1.0  0.0       0.0       0.0           0.0808081  0.151515   1.0  1.0
 1.0  1.0  0.0       0.0       0.434343      0.363636   0.20202    1.0  1.0
 ⋮                                        ⋱             ⋮               
 1.0  1.0  0.0       0.636364  0.616162      0.222222   0.0909091  1.0  1.0
 1.0  1.0  0.0       0.0       0.0505051  …  0.272727   0.10101    

In [12]:
# ==================================================================
# COLUMNS TO KEEP
# ==================================================================
#=Here, we care only about the columns in x::DataFrame whose elements 
are not all equal. If they are, the correlation is not defined b/c
the standard deviation will be zero for columns whose elements
are all the same =#
if ishermitian(cor(x))
    Σ_Z = LinearAlgebra.cholesky(cor(x));
else
    Σ_Z = factorize(cor(x));
end
M = Σ_Z.L;
dim_M = size(M, 1);

# ==================================================================
# CORRELATION MATRIX
# ==================================================================
#= Determine a lower-triangular, nonsingular factorization M of the 
    the correlation matrix for Z such that MM' = Sigma_Z. =#
if ishermitian(cor(x))
    Σ_Z = LinearAlgebra.cholesky(cor(x));
else
    Σ_Z = factorize(cor(x));
end
M = Σ_Z.L;
dim_M = size(M, 1);


Random.seed!(12345);

In [13]:
scenario_length

48

In [14]:
left_half = lp_solar[:, 1:size(lp_load, 2) ÷ 2];
q_landing_probability = vec(left_half);


In [15]:
println(scenario_year)
println(scenario_month)
println(scenario_day)
println(scenario_hour)   

2018
8
29
14


In [16]:
scenario_hour = 

ErrorException: syntax: incomplete: premature end of input

In [17]:
start_date = DateTime(string(scenario_year) * "-" * string(scenario_month) * "-" * string(scenario_day) * "T" * string(scenario_hour));

start_index = findfirst(isequal(start_date), unique_forecast_times)


5775

In [18]:
# ==================================================================
# PROBABILITY GENERATION LOOP
# ==================================================================
#= In certain cases, as in solar power, not all columns will be 
useful. Some will be discarded to avoid problems in the factorization
of the correlation matrix. Here we check if the dimension n of the 
matrix M (n x n) is equal to the scenario length stipulated as 48.
If it is not, the vector W will have its length adjusted to n. 
The variable allequal_ind stores the indices of the columns that 
were discarded. After the scenarios Y are generated, Y column dimension
will be expanded and all-zero columns will be introduced in the 
location of the allequal_ind
=#
#Random.seed!(29031990)
Random.seed!(12345)
Y = Matrix{Float64}(undef, number_of_scenarios, scenario_length)

# need_expansion = 0 # This is specially important for solar data with several columns whose st. dev. is zero
# if dim_M != scenario_length
#     original_scen_length = scenario_length
#     upd_scenario_length = dim_M
#     Y = Matrix{Float64}(undef, number_of_scenarios, upd_scenario_length)
#     need_expansion = 1
# else
#     upd_scenario_length = scenario_length;
# end



1000×48 Matrix{Float64}:
   5.0e-324        0.0           …    5.0e-324        2.0e-323
   8.18978e-312    0.0                8.18978e-312    0.0
 NaN             NaN                NaN             NaN
   0.0             8.18757e-312       8.18506e-312    0.0
   0.0             6.36599e-314       1.4854e-313     0.0
   0.0             5.0e-324      …    5.4e-323        0.0
   0.0             0.0                0.0             0.0
 NaN             NaN                NaN             NaN
   8.18506e-312    8.18831e-312       8.18832e-312    8.18879e-312
   2.122e-314      1.4854e-313        6.36599e-314    1.4854e-313
   ⋮                             ⋱                  
   0.0             0.0                0.0             0.0
 NaN               8.18472e-312       8.18508e-312  NaN
   8.18506e-312    8.18506e-312       8.18506e-312    8.18506e-312
   2.122e-314      0.0                0.0             2.122e-314
   5.0e-324        5.4e-323      …    5.0e-324        5.4e-323
   8.18978e-312

Ok, so the trick is that the cholesky factorization gets rid of a lot of the excess stuff:

* how?

then

* if I need q_solar_probability, how do I avoid the 1s?

Also, 

* if the solar is only forecasting for 28 hours, ... how do i ensure that the forecast is for 48 hours but puts the 28 correct hours appropriately

In [19]:
for nscen in 1:number_of_scenarios
    # Set up normal distribution with mean 0 and sd equal to 1
    d = Normal(0,1);

    #Generate vector W = (W_1, ..., W_k) ~ iid standard normal
    W = rand(d, upd_scenario_length);

    # Create vector Z such that Z <- MW
    Z = M * W;

    #Compute the CDF of Z
    #cdf_Z = sort(cdf.(d, Z));
    cdf_Z = cdf.(d, Z);
    
    for k in 1:upd_scenario_length
        #Apply the inverse CDF for X_k
        # Y[nscen, k] = quantile(x[:, k], cdf_Z[k])
        Y[nscen, k] = quantile(cdf_Z, q_landing_probability[start_index]);  
        #= tells us the simulated quantile that we are at \
        from the simulation scenario probability distribution...
        =#
    end
end

UndefVarError: UndefVarError: `upd_scenario_length` not defined

we could define the 42-28=14 hours where there is for sure no solar output based on some process and manually change any 1 to 0

In [20]:
Y

1000×48 Matrix{Float64}:
   5.0e-324        0.0           …    5.0e-324        2.0e-323
   8.18978e-312    0.0                8.18978e-312    0.0
 NaN             NaN                NaN             NaN
   0.0             8.18757e-312       8.18506e-312    0.0
   0.0             6.36599e-314       1.4854e-313     0.0
   0.0             5.0e-324      …    5.4e-323        0.0
   0.0             0.0                0.0             0.0
 NaN             NaN                NaN             NaN
   8.18506e-312    8.18831e-312       8.18832e-312    8.18879e-312
   2.122e-314      1.4854e-313        6.36599e-314    1.4854e-313
   ⋮                             ⋱                  
   0.0             0.0                0.0             0.0
 NaN               8.18472e-312       8.18508e-312  NaN
   8.18506e-312    8.18506e-312       8.18506e-312    8.18506e-312
   2.122e-314      0.0                0.0             2.122e-314
   5.0e-324        5.4e-323      …    5.0e-324        5.4e-323
   8.18978e-312

A couple of notes
* The are all zero as I expected
* The diurnal pattern.... is that preserved? no i don't think so... let's print


In [21]:
# print out Y to a csv
CSV.write("011a_print_scenarios.csv", DataFrame(Y, :auto), writeheader=false)

"011a_print_scenarios.csv"

Ok, but if you don't comment out the dim size thing then Y is N by 28.

In [22]:
# ==================================================================
# PROBABILITY GENERATION LOOP
# ==================================================================
#= In certain cases, as in solar power, not all columns will be 
useful. Some will be discarded to avoid problems in the factorization
of the correlation matrix. Here we check if the dimension n of the 
matrix M (n x n) is equal to the scenario length stipulated as 48.
If it is not, the vector W will have its length adjusted to n. 
The variable allequal_ind stores the indices of the columns that 
were discarded. After the scenarios Y are generated, Y column dimension
will be expanded and all-zero columns will be introduced in the 
location of the allequal_ind
=#
#Random.seed!(29031990)
Random.seed!(12345)
Y = Matrix{Float64}(undef, number_of_scenarios, scenario_length)

need_expansion = 0 # This is specially important for solar data with several columns whose st. dev. is zero
if dim_M != scenario_length
    original_scen_length = scenario_length
    upd_scenario_length = dim_M
    Y = Matrix{Float64}(undef, number_of_scenarios, upd_scenario_length)
    need_expansion = 1
else
    upd_scenario_length = scenario_length;
end

1

In [23]:
Y

1000×28 Matrix{Float64}:
 0.0           0.0           0.0           …  8.1894e-312   1.49874e-314
 0.0           0.0           0.0              8.18941e-312  0.0
 0.0           0.0           0.0              8.18941e-312  5.30499e-313
 0.0           0.0           0.0              8.18941e-312  1.6e-322
 0.0           0.0           0.0              8.18941e-312  8.16968e-312
 0.0           0.0           0.0           …  8.1894e-312   1.6e-322
 0.0           0.0           0.0              8.1894e-312   0.0
 0.0           0.0           0.0              8.18941e-312  1.49874e-314
 0.0           0.0           0.0              8.18941e-312  0.0
 0.0           0.0           0.0              8.18941e-312  5.30499e-313
 ⋮                                         ⋱                
 0.0           0.0           0.0              3.6586e-320   8.18941e-312
 0.0           0.0           0.0              0.0           8.1894e-312
 0.0           0.0           8.18523e-312     1.49874e-314  8.18941e-312
 

In [24]:
for nscen in 1:number_of_scenarios
    # Set up normal distribution with mean 0 and sd equal to 1
    d = Normal(0,1);

    #Generate vector W = (W_1, ..., W_k) ~ iid standard normal
    W = rand(d, upd_scenario_length);

    # Create vector Z such that Z <- MW
    Z = M * W;

    #Compute the CDF of Z
    #cdf_Z = sort(cdf.(d, Z));
    cdf_Z = cdf.(d, Z);
    
    for k in 1:upd_scenario_length
        #Apply the inverse CDF for X_k
        # Y[nscen, k] = quantile(x[:, k], cdf_Z[k])
        Y[nscen, k] = quantile(cdf_Z, q_landing_probability[start_index]);  
        #= tells us the simulated quantile that we are at \
        from the simulation scenario probability distribution...
        =#
    end
end

In [25]:
Y

1000×28 Matrix{Float64}:
 0.187397  0.187397  0.187397  0.187397  …  0.187397  0.187397  0.187397
 0.260757  0.260757  0.260757  0.260757     0.260757  0.260757  0.260757
 0.467003  0.467003  0.467003  0.467003     0.467003  0.467003  0.467003
 0.247095  0.247095  0.247095  0.247095     0.247095  0.247095  0.247095
 0.221442  0.221442  0.221442  0.221442     0.221442  0.221442  0.221442
 0.423861  0.423861  0.423861  0.423861  …  0.423861  0.423861  0.423861
 0.357687  0.357687  0.357687  0.357687     0.357687  0.357687  0.357687
 0.337133  0.337133  0.337133  0.337133     0.337133  0.337133  0.337133
 0.267158  0.267158  0.267158  0.267158     0.267158  0.267158  0.267158
 0.323514  0.323514  0.323514  0.323514     0.323514  0.323514  0.323514
 ⋮                                       ⋱  ⋮                   
 0.157193  0.157193  0.157193  0.157193     0.157193  0.157193  0.157193
 0.364469  0.364469  0.364469  0.364469     0.364469  0.364469  0.364469
 0.539027  0.539027  0.539027  0.5

But all of the scenarios are still identical... Is this the case if i were doing wind or load instead? Am I still misunderstanding the quantile function?