In [1]:
using Pkg
Pkg.activate("C:\\Users\\ks885\\Documents\\aa_research\\Modeling\\norta_scenarios")
Pkg.instantiate()

[32m[1m  Activating[22m[39m project at `C:\Users\ks885\Documents\aa_research\Modeling\norta_scenarios`




In [2]:
using DrWatson
@quickactivate("norta_scenarios")

# Import all required packages. 
begin
    using CSV
    using DataFrames
    using Dates
    using DelimitedFiles
    using Distributions
    using HDF5
    using LaTeXStrings
    using LinearAlgebra
    using LinearSolve
    using Random
    using Statistics
    using StatsBase
    using Plots
    using Tables
    using TSFrames
    using TimeZones
end

# Include functions 
include(projectdir("src", "fct_bind_historical_forecast.jl"));
include(projectdir("src", "fct_compute_hourly_average_actuals.jl"));
include(projectdir("src", "fct_compute_landing_probability.jl"));
include(projectdir("src", "fct_convert_hours_2018.jl"));
include(projectdir("src", "fct_convert_ISO_standard.jl"));
include(projectdir("src", "fct_convert_land_prob_to_data.jl"));
include(projectdir("src", "fct_generate_probability_scenarios.jl"));
#include(projectdir("src", "fct_getplots.jl"));
#include(projectdir("src", "fct_plot_correlation_heatmap.jl"));
include(projectdir("src", "fct_plot_historical_landing.jl"));
include(projectdir("src", "fct_plot_historical_synthetic_autocorrelation.jl"));
include(projectdir("src", "fct_plot_correlogram_landing_probability.jl"));
include(projectdir("src", "fct_plot_scenarios_and_actual.jl"));
include(projectdir("src", "fct_read_h5_file.jl"));
include(projectdir("src", "fct_read_input_file.jl"));
include(projectdir("src", "fct_transform_landing_probability.jl"));
include(projectdir("src", "fct_write_percentiles.jl"));




[32m[1m  Activating[22m[39m project at `c:\Users\ks885\Documents\aa_research\Modeling\norta_scenarios\copulas`


ErrorException: The activated project did not match asserted name. Current project name is copulas while the asserted name is norta_scenarios.

In [3]:
#=======================================================================
READ INPUT FILE
=======================================================================#
input_file_path = projectdir("copulas.txt")

# XXX Needs to be updated to be a hardcoded instead of reading in a text file
data_type,
scenario_length,
number_of_scenarios,
scenario_hour,
scenario_day,
scenario_month,
scenario_year,
read_locally,
historical_load,
forecast_load,
historical_solar,
forecast_da_solar,
forecast_2da_solar,
historical_wind,
forecastd_da_wind,
forecast_2da_wind,
write_percentile = read_input_file(input_file_path);

UndefVarError: UndefVarError: `read_input_file` not defined

In [4]:
scenario_length

UndefVarError: UndefVarError: `scenario_length` not defined

In [5]:

#=======================================================================
READ INPUT DATA: ARPA-E PERFORM PROJECT H5 FILES
=======================================================================#
# Function that reads the .h5 file and binds the time index and the actuals/fore-
# cast values into a single dataframe.

# Load data
load_actuals = read_h5_file(datadir("exp_raw", historical_load), "load");
load_forecast = read_h5_file(datadir("exp_raw", "ercot_BA_load_forecast_day_ahead_2018.h5"), "load", false);

# Solar data
solar_actuals = read_h5_file(datadir("exp_raw", "ercot_BA_solar_actuals_Existing_2018.h5"), "solar");
solar_forecast_dayahead = read_h5_file(datadir("exp_raw", "ercot_BA_solar_forecast_day_ahead_existing_2018.h5"), "solar", false);
solar_forecast_2dayahead = read_h5_file(datadir("exp_raw", "ercot_BA_solar_forecast_2_day_ahead_existing_2018.h5"), "solar", false);

# Wind data
wind_actuals = read_h5_file(datadir("exp_raw","ercot_BA_wind_actuals_Existing_2018.h5"), "wind");
wind_forecast_dayahead = read_h5_file(datadir("exp_raw", "ercot_BA_wind_forecast_day_ahead_existing_2018.h5"), "wind", false);
wind_forecast_2dayahead = read_h5_file(datadir("exp_raw", "ercot_BA_wind_forecast_2_day_ahead_existing_2018.h5"), "wind", false);

#=======================================================================
Compute the hourly average for the actuals data
=======================================================================#
# Load
aux = compute_hourly_average_actuals(load_actuals);
load_actual_avg = DataFrame();
time_index = aux[:, :Index];
avg_actual = aux[:, :values_mean];
load_actual_avg[!, :time_index] = time_index;
load_actual_avg[!, :avg_actual] = avg_actual;

# Solar
aux = compute_hourly_average_actuals(solar_actuals);
time_index = aux[:, :Index];
avg_actual = aux[:, :values_mean];
solar_actual_avg = DataFrame();
solar_actual_avg[!, :time_index] = time_index;
solar_actual_avg[!, :avg_actual] = avg_actual;

# Wind
aux = compute_hourly_average_actuals(wind_actuals);
time_index = aux[:, :Index];
avg_actual = aux[:, :values_mean];
wind_actual_avg = DataFrame();
wind_actual_avg[!, :time_index] = time_index;
wind_actual_avg[!, :avg_actual] = avg_actual;

#=======================================================================
ADJUST THE TIME 
=======================================================================#
#= For the year of 2018, adjust the time to Texas' UTC (UTC-6 or UTC-5)
depending on daylight saving time =#

# Load data
load_actuals = convert_hours_2018(load_actuals);
load_actual_avg = convert_hours_2018(load_actual_avg);
load_forecast = convert_hours_2018(load_forecast, false);

# Solar data
solar_actuals = convert_hours_2018(solar_actuals);
solar_actual_avg = convert_hours_2018(solar_actual_avg);
solar_forecast_dayahead = convert_hours_2018(solar_forecast_dayahead, false);
solar_forecast_2dayahead = convert_hours_2018(solar_forecast_2dayahead, false);

# Wind data
wind_actuals = convert_hours_2018(wind_actuals);
wind_actual_avg = convert_hours_2018(wind_actual_avg);
wind_forecast_dayahead = convert_hours_2018(wind_forecast_dayahead, false);
wind_forecast_2dayahead = convert_hours_2018(wind_forecast_2dayahead, false);

UndefVarError: UndefVarError: `historical_load` not defined

In [6]:
#=======================================================================
BIND HOURLY HISTORICAL DATA WITH FORECAST DATA
========================================================================#
#= The binding is made by ("forecast_time" = "time_index"). This causes the 
average actual value to be duplicated, which is desired, given the # of rows
in the load_forecast is double that of load_actual. To distinguish a 
one-day-ahead forecast from a two-day-ahead forecast, the column "ahead_factor"
is introduced. Bind the day-ahead and two-day-ahead forecasts for wind and solar
to get all the forecast data into one object as it is for load forecast =#
    load_data = bind_historical_forecast(true,
    load_actual_avg,
    load_forecast);

solar_data = bind_historical_forecast(false,
    solar_actual_avg,
    solar_forecast_dayahead,
    solar_forecast_2dayahead);

wind_data = bind_historical_forecast(false,
    wind_actual_avg,
    wind_forecast_dayahead,
    wind_forecast_2dayahead);

UndefVarError: UndefVarError: `bind_historical_forecast` not defined

In [7]:
wind_data

UndefVarError: UndefVarError: `wind_data` not defined

In [8]:
#=======================================================================
Write forecast percentile to files 
=======================================================================#
#write_percentile(load_data, "load", scenario_year, scenario_month, scenario_day, scenario_hour);
write_percentile = true
if write_percentile
    write_percentiles(load_data, "load", scenario_year, scenario_month, scenario_day, scenario_hour, 48)
    write_percentiles(solar_data, "solar", scenario_year, scenario_month, scenario_day, scenario_hour, 48)
    write_percentiles(wind_data, "wind", scenario_year, scenario_month, scenario_day, scenario_hour, 48)
end

UndefVarError: UndefVarError: `write_percentiles` not defined

In [9]:
#=======================================================================
Landing probability
=======================================================================#
#= This section holds the calculation of the probability that the actual
value was equaled or superior than the forecast percentiles for a given
day. This is made possible by the estimation of an approximate CDF
computed on the forecast percentiles. Once estimated, this function is
used to find the "landing probability"; the prob. that the actual value
is equal or greater than a % percentage of the forecast percentile.
=#
#include(here("src", "functions", "fct_compute_landing_probability.jl"))
landing_probability_load = compute_landing_probability(load_data);
landing_probability_solar = compute_landing_probability(solar_data);
landing_probability_wind = compute_landing_probability(wind_data);

UndefVarError: UndefVarError: `compute_landing_probability` not defined

In [10]:
data = load_data
percentile_column_index = startswith.(names(data), "p_");
landing_probability = Vector{Float64}(undef, size(data, 1));

# The BaseStats.ecdf() function applied to the forecasts returns a 
# function. The returned function is then saved as empirical_cdf 
# and used to calculate the empirical CDF of the historical 
# averaged actual data.
#mm = Matrix{}(undef, size(data,1), 4);
for i in range(1, size(data, 1))
    quantiles = collect(data[i, percentile_column_index]);
    empirical_cdf = ecdf(quantiles);
    landing_probability[i] = empirical_cdf(data[i, :avg_actual]);
    # -----
    #= Storing min and max of adjust empirical_cdf function
    mm[i,3] = empirical_cdf(0); #Min. value emp. CDF can compute
    mm[i,4] = empirical_cdf(1); #Max. value emp. CDF can compute
    =#
end

# -----
#= Adding forecast and issue times to mm
println(pwd())
mm[:,1] = data[!, :forecast_time];
mm[:,2] = data[!, :issue_time];
writedlm(joinpath(pwd(),"output","max_min_ecdf.txt"), mm, ";")=#

# Create a DataFrame with the forecast time, issue time, ahead factor
# and the landing probability.
landing_probability = DataFrame(landing_probability=landing_probability)
landing_probability[!, :forecast_time] = data[!, :forecast_time]
landing_probability[!, :issue_time] = data[!, :issue_time]
landing_probability[!, :ahead_factor] = data[:, :ahead_factor]
select!(landing_probability, [:issue_time, :forecast_time, :landing_probability, :ahead_factor])

UndefVarError: UndefVarError: `load_data` not defined

In [11]:
#=======================================================================
ADJUST LANDING PROBABILITY DATAFRAME
=======================================================================#
#= Analysis to address point J.Mays raised on Slack on Dec. 29,2022.
Sort the landing_probability dataframe by issue time. Then group the 
dataset by issue_time and count how many observations exist per 
issue_time. We're only interested in keeping the forecasts that share
the same issue_time 48 times since 48 is the length for the generation=#
lp_load = transform_landing_probability(landing_probability_load);
lp_solar = transform_landing_probability(landing_probability_solar);
lp_wind = transform_landing_probability(landing_probability_wind);

UndefVarError: UndefVarError: `transform_landing_probability` not defined

In [12]:
lp_load

UndefVarError: UndefVarError: `lp_load` not defined

* Does the ordering into a data process matrix double count the two-day ahead? 
    * no right? because it's by issue time...

In [13]:
data = landing_probability_load
x = copy(data);
# Sort data by issue time
sort!(x, :issue_time);

# Group data by issue time and count occurences in every group
df = combine(groupby(x, [:issue_time]), DataFrames.nrow => :count);

# Filter data by count. Only keep groups with 48 entries
df_filtered = filter(:count => ==(48), df);

issue_times_interest = df_filtered[!, :issue_time];
landing_probability_filtered = innerjoin(x, df_filtered, on=:issue_time);
landing_probability_filtered_matrix = reshape(landing_probability_filtered[!, :landing_probability], (48, size(df_filtered, 1)));
landing_probability_filtered_matrix = transpose(landing_probability_filtered_matrix)

UndefVarError: UndefVarError: `landing_probability_load` not defined

In [14]:
lp_load

UndefVarError: UndefVarError: `lp_load` not defined

In [15]:
data = lp_load
scenario_length = 48
number_of_scenarios = 1000

x = copy(data)
allequal_set = Set(findall(allequal, eachcol(x)))
allequal_ind = sort(collect(allequal_set));
allindex_set = Set(collect(1:48));
alldifferent_ind = sort(collect(setdiff(allindex_set, allequal_set))); # Index for columns whose st. dev. isn't zero
x = x[:, alldifferent_ind]


UndefVarError: UndefVarError: `lp_load` not defined

In [16]:
if ishermitian(cor(x))
    Σ_Z = LinearAlgebra.cholesky(cor(x));
else
    Σ_Z = factorize(cor(x));
end
M = Σ_Z.L;
dim_M = size(M, 1);

UndefVarError: UndefVarError: `cor` not defined

In [17]:
    #Random.seed!(29031990)
    Random.seed!(12345)
    Y = Matrix{Float64}(undef, number_of_scenarios, scenario_length)

    need_expansion = 0 # This is specially important for solar data with several columns whose st. dev. is zero
    if dim_M != scenario_length
        original_scen_length = scenario_length
        scenario_length = dim_M
        Y = Matrix{Float64}(undef, number_of_scenarios, scenario_length)
        need_expansion = 1
    end

UndefVarError: UndefVarError: `Random` not defined

In [18]:
nscen = 1

# Set up normal distribution with mean 0 and sd equal to 1
d = Normal(0,1)



UndefVarError: UndefVarError: `Normal` not defined

In [19]:
#Generate vector W = (W_1, ..., W_k) ~ iid standard normal
W = rand(d, scenario_length)



UndefVarError: UndefVarError: `d` not defined

* does scenario_length need to be changed here? 
    - no, i don't think so because you need to multiply it by a the lower triangular autocorrelation

In [20]:
# Create vector Z such that Z <- MW
Z = M * W

UndefVarError: UndefVarError: `M` not defined

In [21]:
cdf_Z = cdf.(d, Z)


UndefVarError: UndefVarError: `cdf` not defined

In [22]:
k = 1 # each idx of scenario length

Y[nscen, k] = quantile(x[:, k], cdf_Z[k])

UndefVarError: UndefVarError: `x` not defined

* Is this incorrect? the documentation on quantile says 
    - quantile(d::UnivariateDistribution, q::Real)
    * the column of first indices in x is the first hour in every issue... how is that supposed to be related to a simulation for just the first hour in our lookahead?
    

In [23]:
x[:,k]

UndefVarError: UndefVarError: `x` not defined

In [24]:
cdf_Z[k]

UndefVarError: UndefVarError: `cdf_Z` not defined

* what should be the distribution?
* what should be the real number? - shouldn't it be x[1,k] for k = 1:scenario_length

In [25]:
test_scen = quantile(cdf_Z, x[1,k])

UndefVarError: UndefVarError: `cdf_Z` not defined

* this seems correct.... 

So is it here that the scenario_length object needs to differ from the scenario_length as it has been defined previously? 

scenario length has previously been assumed to be 48, and that is how the base process matrix of landing probabilities is generated.

However, we want to create a scenario of different length (based on some process i have yet to do). 


Update 2/6/2024

* we need to draw from the random distribution created by the NORTA process, with respect to the current here&now actual point (I believe the quantile probability and not the actual data)
* so I believe I was correct to say that the univariate distribution we use is the cdf_Z, and the point estimate we use is $q_t(x_{t,i},s^{avg}_t)$

In code this is:


first, need to take landing probability vector and put it into something that can be called easily later:

In [26]:
q = landing_probability_load[:,:landing_probability]

UndefVarError: UndefVarError: `landing_probability_load` not defined

Oh, but you need the one that is related to the current time that you are in... so we have to keep track of the index location of where we are in the forecast with respect to the issue time!

* how do we do that? 
    * we already have k, that tells us where in the scenario we are... but when k goes beyond the period we stay in a issue date forecast, how do we make sure we leave that?
    * there are a number of times that we must keep track of:
        * issue time of current issue
        * issue time of next issue
        * current rolling horizon index... which determines lookahead
        * and when generating, where we are in the lookahead - 
            k in 1:scenario_length

In [27]:
# Set the date and time for the forecasts
start_date = DateTime(string(scenario_year) * "-" * string(scenario_month) * "-" * string(scenario_day) * "T" * string(scenario_hour));
start_date

UndefVarError: UndefVarError: `scenario_year` not defined

Lets try to use just the solar forecast for the times, NOTE that I would need to check that the times are correlated correctly with the load and wind data

In [28]:
times_2day = solar_forecast_2dayahead[!, [:forecast_time, :issue_time]]
unique_2dayahead = unique(times_2day[!,:issue_time])

times_1day = solar_forecast_dayahead[!, [:forecast_time, :issue_time]]
unique_1dayahead = unique(times_1day[!,:issue_time])

unique_1dayahead == unique_2dayahead

UndefVarError: UndefVarError: `solar_forecast_2dayahead` not defined

as I suspected, there are not the same unique issue times in day ahead to 2 day ahead

* --- check that solar load and wind are correlated later

* are the forecast times the same?

In [29]:
forecast_times_1day = times_1day[!,:forecast_time]
forecast_times_2day = times_2day[!,:forecast_time]

forecast_times_1day == forecast_times_2day

UndefVarError: UndefVarError: `times_1day` not defined

- yes they are the same... good.

In [30]:
wind_times = wind_data[!, [:forecast_time, :issue_time]]
unique_wind = unique(wind_data[!,:issue_time])

solar_times = solar_data[!, [:forecast_time, :issue_time]]
unique_solar = unique(solar_times[!,:issue_time])

load_times = load_data[!, [:forecast_time, :issue_time]]
unique_load = unique(load_times[!,:issue_time])

unique_wind == unique_load

UndefVarError: UndefVarError: `wind_data` not defined

In [31]:
times = wind_times
hour_index = findall(x -> x == start_date, times[!,:forecast_time])[1]
current_issue = times[hour_index, :issue_time]
unique_issue_times = unique(times[!,:issue_time])


UndefVarError: UndefVarError: `wind_times` not defined

This is only going to get 24 elements. Doing the active times will get less than 24 elements. 

In [32]:
start_date > next_issue

UndefVarError: UndefVarError: `start_date` not defined

In [33]:
start_date

UndefVarError: UndefVarError: `start_date` not defined

In [34]:
next_issue

UndefVarError: UndefVarError: `next_issue` not defined

In [35]:
next_issue

UndefVarError: UndefVarError: `next_issue` not defined

In [36]:
all_times = forecast_times[all_indices]
# filter for the times that are after the current time, (greater than the current hour)
active_times = all_times[all_times .>= start_date]

UndefVarError: UndefVarError: `forecast_times` not defined

active_times is the length of how long this scenario generation can go....
* but how does the code know when to switch?
    * can it switch automatically when index _____ goes past the current issue.... so then it is in the next issue.... then the if statement can actually just describe the current_issue as the next issue...
        * does that iteratively solve/work?
* also, how do we know what the scenario length is then? 
    * is the scenario length just this + 24? 



If the start datetime is at the next issue datetime, then the next scenario generation must begin

In [37]:
scen = quantile(cdf_Z, )

UndefVarError: UndefVarError: `quantile` not defined