In [6]:
import os
import pickle
import sys
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import r2_score
from tensorflow import keras
from tensorflow.keras import layers
from tqdm.auto import tqdm, trange

tqdm.pandas()

from collections import defaultdict


# Intended to hide all CUDA devices from this process (CPU-only run).
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
# Query the GPU device list; the result is neither stored nor displayed here.
tf.config.list_physical_devices('GPU')
# Used both as an extra import search path and as the folder the base
# input workbooks are read from in the cells below.
local_root_path = "."
sys.path.append(local_root_path)
import annutils
import importlib
# Re-import annutils so edits made to annutils.py after the first import are picked up;
# as the last expression of the cell, the reloaded module is displayed.
importlib.reload(annutils)


<module 'annutils' from 'd:\\projects\\delta_salinity\\scripts\\rma_ann_repo\\annutils.py'>

Basic Setup

In [7]:
# Compression options handed to every DataFrame.to_csv call in this notebook:
# each output file is a zip archive whose member is named 'out.csv'.
compression_opts = dict(method='zip', archive_name='out.csv')


# Root folder for all experiment outputs. exist_ok=True replaces the
# check-then-create pattern, so the cell can be re-run without error.
os.makedirs("Experiments", exist_ok=True)

# Passed as the sheet count to annutils.read_and_split for every workbook.
num_sheets = 9

# Station codes passed to annutils.read_and_split in every experiment cell
# (the variable name says they are ordered by median; the ordering itself is not computed here).
observed_stations_ordered_by_median = ['RSMKL008', 'RSAN032', 'RSAN037', 'RSAC092', 'SLTRM004', 'ROLD024',
                                       'CHVCT000', 'RSAN018', 'CHSWP003', 'CHDMC006', 'SLDUT007', 'RSAN072',
                                       'OLD_MID', 'RSAN058', 'ROLD059', 'RSAN007', 'RSAC081', 'SLMZU025',
                                       'RSAC075', 'SLMZU011', 'SLSUS012', 'SLCBN002', 'RSAC064']

# Full "<code>-<description>" station labels.
# NOTE(review): 'NMR: N FORK MOKELUMNE R NEAR WALNUT GROVE' uses ':' where every other
# entry uses '-'; confirm annutils.read_output_stations handles that separator.
output_stations = ['CHDMC006-CVP INTAKE', 'CHSWP003-CCFB_INTAKE', 'CHVCT000-VICTORIA INTAKE',
                   'OLD_MID-OLD RIVER NEAR MIDDLE RIVER', 'ROLD024-OLD RIVER AT BACON ISLAND',
                   'ROLD059-OLD RIVER AT TRACY BLVD', 'RSAC064-SACRAMENTO R AT PORT CHICAGO',
                   'RSAC075-MALLARDISLAND', 'RSAC081-COLLINSVILLE', 'RSAC092-EMMATON',
                   'RSAC101-SACRAMENTO R AT RIO VISTA', 'RSAN007-ANTIOCH', 'RSAN018-JERSEYPOINT',
                   'RSAN032-SACRAMENTO R AT SAN ANDREAS LANDING', 'RSAN037-SAN JOAQUIN R AT PRISONERS POINT',
                   'RSAN058-ROUGH AND READY ISLAND', 'RSAN072-SAN JOAQUIN R AT BRANDT BRIDGE',
                   'RSMKL008-S FORK MOKELUMNE AT TERMINOUS', 'SLCBN002-CHADBOURNE SLOUGH NR SUNRISE DUCK CLUB',
                   'SLDUT007-DUTCH SLOUGH', 'SLMZU011-MONTEZUMA SL AT BELDONS LANDING',
                   'SLMZU025-MONTEZUMA SL AT NATIONAL STEEL', 'SLSUS012-SUISUN SL NEAR VOLANTI SL',
                   'SLTRM004-THREE MILE SLOUGH NR SAN JOAQUIN R', 'SSS-STEAMBOAT SL', 'CCW-MIDDLE RIVER INTAKE',
                   'OH4-OLD R @ HWY 4', 'SLRCK005-CCWD_Rock', 'MRU-MIDDLE RIVER AT UNDINE ROAD', 'HLL-HOLLAND TRACT',
                   'BET-PIPER SLOUGH @ BETHEL TRACT', 'GES-SACRAMENTO R BELOW GEORGIANA SLOUGH',
                   'NMR: N FORK MOKELUMNE R NEAR WALNUT GROVE', 'IBS-CORDELIA SLOUGH @ IBIS CLUB',
                   'GYS-GOODYEAR SLOUGH AT MORROW ISLAND CLUB', 'BKS-SLBAR002-North Bay Aqueduct/Barker Sl']

# Rebinds output_stations to the helper's first return value and keeps the second as name_mapping.
output_stations, name_mapping = annutils.read_output_stations(output_stations, observed_stations_ordered_by_median)

### Experiment: 6 Years
This is just the base data for the 6 selected years.
If we do this right, the training data should have approximately 6 * 365 ≈ 2190 rows.


In [3]:
# Experiment "6years": read the raw inputs/outputs from the base workbook.
experiment_name = "6years"
# makedirs(..., exist_ok=True) creates Experiments/6years (and the parent if it
# is missing) and does not fail when the cell is re-run.
os.makedirs(os.path.join("Experiments", experiment_name), exist_ok=True)

# (start, end) date pairs selecting the training rows; each pair spans Oct 1 - Sep 30.
picked_training_years = [
    ('2007-10-1','2008-9-30'),
    ('2008-10-1','2009-9-30'),
    ('2010-10-1','2011-9-30'),
    ('2011-10-1','2012-9-30'),
    ('2013-10-1','2014-9-30'),
    ('2016-10-1','2017-9-30')
]

input_files = ["dsm2_ann_inputs_base.xlsx"]

# Read every workbook first and concatenate once, instead of growing the
# frames with pd.concat on every loop pass.
input_frames = []
output_frames = []

for data_file in input_files:
    data_path = os.path.join(local_root_path, data_file)
    dfinps, dfouts = annutils.read_and_split(data_path, num_sheets, observed_stations_ordered_by_median)
    input_frames.append(dfinps)
    output_frames.append(dfouts)

# X_df / Y_df are consumed by the next cell (antecedent inputs + train/test split).
X_df = pd.concat(input_frames, axis=0)
Y_df = pd.concat(output_frames, axis=0)

# now X_df should have 8 input features and Y_df should have 23 target salinity values






In [4]:
# Settings forwarded to annutils.create_antecedent_inputs.
ndays = 118
window_size = 0
nwindows = 0
df_plus = annutils.create_antecedent_inputs(
    X_df, ndays=ndays, window_size=window_size, nwindows=nwindows
)
# df_plus should now have 118 * 8 = 944 input features

# synchronize trims off the na values so the row numbers go from 10920 to 10803
df_X2, df_Y2 = annutils.synchronize(df_plus, Y_df)

# Training rows fall inside the picked date ranges; test rows are everything else.
train_X, train_Y = (annutils.include(part, picked_training_years) for part in (df_X2, df_Y2))
test_X, test_Y = (annutils.exclude(part, picked_training_years) for part in (df_X2, df_Y2))

# Write the four splits into the experiment folder.
experiment_dir = os.path.join("Experiments", experiment_name)
for csv_name, split_frame in (
    ("train_X.csv", train_X),
    ("train_Y.csv", train_Y),
    ("test_X.csv", test_X),
    ("test_Y.csv", test_Y),
):
    split_frame.to_csv(os.path.join(experiment_dir, csv_name), compression=compression_opts)



# Experiment: 4 years

This is the same as the 6 years experiment, but with only 4 years in the training data.

In [5]:
# Build train/test CSVs for each selected 4-year experiment variant.
# experiment_names = ["4years","4years_DCC","4years_SacLag","4years_SacMag"]
experiment_names = ["4years_DCC"]

# (start, end) date pairs selecting the training rows; each pair spans Oct 1 - Sep 30.
picked_training_years = [
    ('2007-10-1','2008-9-30'),
    ('2009-10-1','2010-9-30'),
    ('2011-10-1','2012-9-30'),
    ('2013-10-1','2014-9-30')
]

# Experiment name -> workbooks whose rows are stacked for that experiment.
input_files_dict = {"4years":["dsm2_ann_inputs_base.xlsx"],
                    "4years_DCC":["dsm2_ann_inputs_base.xlsx",
                                  "dsm2_ann_inputs_dcc0.xlsx",
                                  "dsm2_ann_inputs_dcc1.xlsx"],
                                  # disabled: "dsm2_ann_inputs_rsacminus15day.xlsx",
                                  #           "dsm2_ann_inputs_rsacminus20pct.xlsx"
                    "4years_SacLag":["dsm2_ann_inputs_base.xlsx",
                                     "dsm2_ann_inputs_rsacplus15day.xlsx",
                                     "dsm2_ann_inputs_rsanminus15day.xlsx"],
                    "4years_SacMag":["dsm2_ann_inputs_base.xlsx",
                                     "dsm2_ann_inputs_rsacplus20pct.xlsx",
                                     "dsm2_ann_inputs_rsanminus20pct.xlsx"]
}

# Settings forwarded to annutils.create_antecedent_inputs.
ndays=118
window_size=0
nwindows=0

for experiment_name in experiment_names:
    # exist_ok=True makes the cell re-runnable and also creates the parent folder if missing.
    os.makedirs(os.path.join("Experiments", experiment_name), exist_ok=True)

    input_files = input_files_dict[experiment_name]

    # Antecedent inputs are built per workbook, before stacking, so one
    # workbook's rows never feed another workbook's antecedent columns.
    input_frames = []
    output_frames = []

    for data_file in tqdm(input_files):
        data_path = os.path.join(local_root_path,data_file)
        dfinps, dfouts = annutils.read_and_split(data_path, num_sheets, observed_stations_ordered_by_median)
        dfinps = annutils.create_antecedent_inputs(dfinps,ndays=ndays,window_size=window_size,nwindows=nwindows)
        dfinps, dfouts = annutils.synchronize(dfinps, dfouts)
        input_frames.append(dfinps)
        output_frames.append(dfouts)

    # Concatenate once instead of growing the frames inside the loop.
    X_df = pd.concat(input_frames, axis=0)
    Y_df = pd.concat(output_frames, axis=0)

    # now X_df should have 118 * 8 = 944 input features and Y_df should have 23 target salinity values

    train_X = annutils.include(X_df, picked_training_years)
    train_Y = annutils.include(Y_df, picked_training_years)

    test_X = annutils.exclude(X_df, picked_training_years)
    test_Y = annutils.exclude(Y_df, picked_training_years)

    train_X.to_csv(os.path.join("Experiments", experiment_name, "train_X.csv"), compression=compression_opts)
    train_Y.to_csv(os.path.join("Experiments", experiment_name, "train_Y.csv"), compression=compression_opts)
    test_X.to_csv(os.path.join("Experiments", experiment_name, "test_X.csv"), compression=compression_opts)
    test_Y.to_csv(os.path.join("Experiments", experiment_name, "test_Y.csv"), compression=compression_opts)

    print(f"Finished compiling inputs for {experiment_name} experiment")

100%|██████████| 3/3 [00:02<00:00,  1.02it/s]


Finished compiling inputs for 4years_DCC experiment


# Experiment: 4 years (calendar years)

This is the same as the 6 years experiment, but with only 4 years in the training data and using Jan 1 - Dec 31 instead of water-year limits.

In [6]:
# Experiment "4years_cal": base run split on calendar years (Jan 1 - Dec 31).

picked_training_years = [
    ('2008-1-1','2008-12-31'),
    ('2010-1-1','2010-12-31'),
    ('2012-1-1','2012-12-31'),
    ('2014-1-1','2014-12-31')
]

input_files = ["dsm2_ann_inputs_base.xlsx"]

experiment = '4years_cal'

# Settings forwarded to annutils.create_antecedent_inputs.
ndays=118
window_size=0
nwindows=0

# Create the folder the CSVs below are written to. The previous code keyed this on
# `experiment_name` (a leftover from an earlier cell), so Experiments/4years_cal
# was never created by this cell.
os.makedirs(os.path.join("Experiments", experiment), exist_ok=True)

input_frames = []
output_frames = []

for data_file in tqdm(input_files):
    data_path = os.path.join(local_root_path,data_file)
    dfinps, dfouts = annutils.read_and_split(data_path, num_sheets, observed_stations_ordered_by_median)
    dfinps = annutils.create_antecedent_inputs(dfinps,ndays=ndays,window_size=window_size,nwindows=nwindows)
    dfinps, dfouts = annutils.synchronize(dfinps, dfouts)
    input_frames.append(dfinps)
    output_frames.append(dfouts)

# Concatenate once instead of growing the frames inside the loop.
X_df = pd.concat(input_frames, axis=0)
Y_df = pd.concat(output_frames, axis=0)

# now X_df should have 118 * 8 = 944 input features and Y_df should have 23 target salinity values

train_X = annutils.include(X_df, picked_training_years)
train_Y = annutils.include(Y_df, picked_training_years)

test_X = annutils.exclude(X_df, picked_training_years)
test_Y = annutils.exclude(Y_df, picked_training_years)

train_X.to_csv(os.path.join("Experiments", experiment, "train_X.csv"), compression=compression_opts)
train_Y.to_csv(os.path.join("Experiments", experiment, "train_Y.csv"), compression=compression_opts)
test_X.to_csv(os.path.join("Experiments", experiment, "test_X.csv"), compression=compression_opts)
test_Y.to_csv(os.path.join("Experiments", experiment, "test_Y.csv"), compression=compression_opts)

print(f"Finished compiling inputs for {experiment} experiment")

100%|██████████| 1/1 [00:00<00:00,  1.65it/s]


Finished compiling inputs for 4years_cal experiment


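# Experiment: 4 years (calendar years) plus augmented data

This is the same as the 6 years experiment, but with only 4 years in the training data and using Jan 1 - Dec 31 instead of water-year limits. In addition, the calendar-year 2014 rows from the augmented `hiexplonf2` DSM2 run are appended to the training data.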

In [7]:
# Experiment "4years_cal_hiexplonf2": calendar-year split of the base run, with
# calendar-year 2014 rows from the hiexplonf2 run appended to the training set.

picked_training_years = [
    ('2008-1-1','2008-12-31'),
    ('2010-1-1','2010-12-31'),
    ('2012-1-1','2012-12-31'),
    ('2014-1-1','2014-12-31')
]

# Date range taken from the augmented run.
aug_data = [('2014-1-1','2014-12-31')]

input_files = ["dsm2_ann_inputs_base.xlsx"]
aug_input_files = [r"D:\projects\delta_salinity\model\dsm2\DSP_DSM2_202307\modified_bc\anninputs\hiexplonf2\dsm2_ann_inputs_hiexplonf2.xlsx"]

experiment = '4years_cal_hiexplonf2'
dcc_sub_part_f = 'DSP_HIEXPLONF2_202308'
dcc_part_f = 'DWR-DMS-DSM2'

# Settings forwarded to annutils.create_antecedent_inputs.
ndays, window_size, nwindows = 118, 0, 0

experiment_dir = os.path.join("Experiments", experiment)
if not os.path.exists(experiment_dir):
    os.mkdir(experiment_dir)

# --- Base run: read, build antecedent inputs, align, stack ---
X_df = Y_df = None
for data_file in tqdm(input_files):
    data_path = os.path.join(local_root_path, data_file)
    dfinps, dfouts = annutils.read_and_split(data_path, num_sheets, observed_stations_ordered_by_median)
    dfinps = annutils.create_antecedent_inputs(dfinps, ndays=ndays, window_size=window_size, nwindows=nwindows)
    dfinps, dfouts = annutils.synchronize(dfinps, dfouts)
    X_df = pd.concat([X_df, dfinps], axis=0)
    Y_df = pd.concat([Y_df, dfouts], axis=0)

# now X_df should have 118 * 8 = 944 input features and Y_df should have 23 target salinity values

train_X, train_Y = (annutils.include(part, picked_training_years) for part in (X_df, Y_df))
test_X, test_Y = (annutils.exclude(part, picked_training_years) for part in (X_df, Y_df))

# --- Augmented run: same pipeline on the hiexplonf2 workbook ---
X_df = Y_df = None
for data_file in tqdm(aug_input_files):
    data_path = data_file
    dfinps, dfouts = annutils.read_and_split(data_path, num_sheets, observed_stations_ordered_by_median)
    dfinps = annutils.create_antecedent_inputs(dfinps, ndays=ndays, window_size=window_size, nwindows=nwindows)
    dfinps, dfouts = annutils.synchronize(dfinps, dfouts)
    X_df = pd.concat([X_df, dfinps], axis=0)
    Y_df = pd.concat([Y_df, dfouts], axis=0)

x_aug_train = annutils.include(X_df, aug_data)
y_aug_train = annutils.include(Y_df, aug_data)

# needs to combine the DCC gate operation DSS codes into one column:
# rewrite the augmented run's column labels so they match the base run's labels.
column_rewrites = (
    (dcc_sub_part_f, dcc_part_f),
    ('01JAN2013 - 01JAN2014', '01JAN1953 - 01JAN2020'),
    ('1DAY', 'IR-YEAR'),
)
for old_text, new_text in column_rewrites:
    x_aug_train.columns = [s.replace(old_text, new_text) for s in x_aug_train.columns]
    y_aug_train.columns = [s.replace(old_text, new_text) for s in y_aug_train.columns]

# Sanity check: 0 means both frames carry exactly the same column labels.
print(f'non-overlapping columns: {len(set(train_X.columns) ^ set(x_aug_train.columns))}')

train_X = pd.concat([train_X, x_aug_train], axis=0)
train_Y = pd.concat([train_Y, y_aug_train], axis=0)

for csv_name, split_frame in (
    ("train_X.csv", train_X),
    ("train_Y.csv", train_Y),
    ("test_X.csv", test_X),
    ("test_Y.csv", test_Y),
):
    split_frame.to_csv(os.path.join(experiment_dir, csv_name), compression=compression_opts)

print(f"Finished compiling inputs for {experiment} experiment")

  0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:00<00:00,  1.80it/s]
100%|██████████| 1/1 [00:00<00:00,  4.94it/s]


non-overlapping columns: 0
Finished compiling inputs for 4years_cal_hiexplonf2 experiment


# Experiment: 4 years (calendar years) but perturbed

This is the same as the 6 years experiment, but with only 4 years in the training data and using Jan 1 - Dec 31 instead of water-year limits.
However, the input data to DSM2 has been perturbed from the baseline using perturbhist.

In [4]:
# Experiment "4years_perturbhist": train on rows from the perturbhist run and
# test on the same date ranges taken from the base run.

aug_data = [('2008-1-1','2008-12-31'),
    ('2010-1-1','2010-12-31'),
    ('2012-1-1','2012-12-31'),
    ('2014-1-1','2014-12-31')]

aug_input_files = [r"D:\projects\delta_salinity\model\dsm2\DSP_DSM2_202307\modified_bc\anninputs\perturbhist\dsm2_ann_inputs_perturbhist.xlsx"]

test_files = ["dsm2_ann_inputs_base.xlsx"]
# The test set uses the same date ranges as the training set.
test_data = aug_data

experiment = '4years_perturbhist'
dcc_sub_part_f = 'DSP_PERTURBHIST_202308'
dcc_part_f = 'DWR-DMS-DSM2'

# Settings forwarded to annutils.create_antecedent_inputs.
ndays, window_size, nwindows = 118, 0, 0

experiment_dir = os.path.join("Experiments", experiment)
if not os.path.exists(experiment_dir):
    os.mkdir(experiment_dir)

# --- Training data: perturbed run ---
X_df = Y_df = None
for data_file in tqdm(aug_input_files):
    data_path = data_file
    dfinps, dfouts = annutils.read_and_split(data_path, num_sheets, observed_stations_ordered_by_median)
    dfinps = annutils.create_antecedent_inputs(dfinps, ndays=ndays, window_size=window_size, nwindows=nwindows)
    dfinps, dfouts = annutils.synchronize(dfinps, dfouts)
    X_df = pd.concat([X_df, dfinps], axis=0)
    Y_df = pd.concat([Y_df, dfouts], axis=0)

train_X = annutils.include(X_df, aug_data)
train_Y = annutils.include(Y_df, aug_data)

# --- Test data: base run ---
X_df = Y_df = None
for data_file in tqdm(test_files):
    data_path = data_file
    dfinps, dfouts = annutils.read_and_split(data_path, num_sheets, observed_stations_ordered_by_median)
    dfinps = annutils.create_antecedent_inputs(dfinps, ndays=ndays, window_size=window_size, nwindows=nwindows)
    dfinps, dfouts = annutils.synchronize(dfinps, dfouts)
    X_df = pd.concat([X_df, dfinps], axis=0)
    Y_df = pd.concat([Y_df, dfouts], axis=0)

test_X = annutils.include(X_df, test_data)
test_Y = annutils.include(Y_df, test_data)

# needs to combine the DCC gate operation DSS codes into one column:
# rewrite the perturbed run's column labels so they match the base run's labels.
column_rewrites = (
    (dcc_sub_part_f, dcc_part_f),
    ('01JAN2007 - 01JAN2014', '01JAN1953 - 01JAN2020'),
    ('1DAY', 'IR-YEAR'),
)
for old_text, new_text in column_rewrites:
    train_X.columns = [s.replace(old_text, new_text) for s in train_X.columns]
    train_Y.columns = [s.replace(old_text, new_text) for s in train_Y.columns]

# Sanity check: 0 means train and test carry exactly the same column labels.
print(f'non-overlapping columns: {len(set(train_X.columns) ^ set(test_X.columns))}')

for csv_name, split_frame in (
    ("train_X.csv", train_X),
    ("train_Y.csv", train_Y),
    ("test_X.csv", test_X),
    ("test_Y.csv", test_Y),
):
    split_frame.to_csv(os.path.join(experiment_dir, csv_name), compression=compression_opts)

print(f"Finished compiling inputs for {experiment} experiment")

  0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:00<00:00,  4.82it/s]
100%|██████████| 1/1 [00:00<00:00,  1.41it/s]


non-overlapping columns: 0
Finished compiling inputs for 4years_perturbhist experiment


NameError: name 'x_aug_train' is not defined

# Experiment: 4 years plus augmented data

This is the same as the 6 years experiment, but with only 4 years in the training data plus the augmented inputs/outputs from DSM2.

In [1]:
# Experiment "4years_hiexplonf": water-year split of the base run, with rows
# from the hiexplonf run appended to the training set.
# experiment_names = ["4years","4years_DCC","4years_SacLag","4years_SacMag"]
experiment_names = ["4years_hiexplonf"]

picked_training_years = [
    ('2007-10-1','2008-9-30'),
    ('2009-10-1','2010-9-30'),
    ('2011-10-1','2012-9-30'),
    ('2013-10-1','2014-9-30')
]

# Date range taken from the augmented run.
aug_data = [('2013-11-1','2014-6-1')]

input_files = ["dsm2_ann_inputs_base.xlsx"]
aug_input_files = [r"D:\projects\delta_salinity\model\dsm2\DSP_DSM2_202307\modified_bc\anninputs\hiexplonf\dsm2_ann_inputs_hiexplonf.xlsx"]

# Settings forwarded to annutils.create_antecedent_inputs.
ndays, window_size, nwindows = 118, 0, 0

experiment_name = '4years_hiexplonf'

experiment_dir = os.path.join("Experiments", experiment_name)
if not os.path.exists(experiment_dir):
    os.mkdir(experiment_dir)

# --- Base run ---
X_df = Y_df = None
for data_file in tqdm(input_files):
    data_path = os.path.join(local_root_path, data_file)
    dfinps, dfouts = annutils.read_and_split(data_path, num_sheets, observed_stations_ordered_by_median)
    dfinps = annutils.create_antecedent_inputs(dfinps, ndays=ndays, window_size=window_size, nwindows=nwindows)
    dfinps, dfouts = annutils.synchronize(dfinps, dfouts)
    X_df = pd.concat([X_df, dfinps], axis=0)
    Y_df = pd.concat([Y_df, dfouts], axis=0)

train_X, train_Y = (annutils.include(part, picked_training_years) for part in (X_df, Y_df))
test_X, test_Y = (annutils.exclude(part, picked_training_years) for part in (X_df, Y_df))

# --- Augmented run ---
X_df = Y_df = None
for data_file in tqdm(aug_input_files):
    data_path = data_file
    dfinps, dfouts = annutils.read_and_split(data_path, num_sheets, observed_stations_ordered_by_median)
    dfinps = annutils.create_antecedent_inputs(dfinps, ndays=ndays, window_size=window_size, nwindows=nwindows)
    dfinps, dfouts = annutils.synchronize(dfinps, dfouts)
    X_df = pd.concat([X_df, dfinps], axis=0)
    Y_df = pd.concat([Y_df, dfouts], axis=0)

x_aug_train = annutils.include(X_df, aug_data)
y_aug_train = annutils.include(Y_df, aug_data)

# NOTE(review): unlike the hiexplonf2 cell, the augmented column labels are not
# rewritten before stacking; confirm they already match the base run's labels.
train_X = pd.concat([train_X, x_aug_train], axis=0)
train_Y = pd.concat([train_Y, y_aug_train], axis=0)

for csv_name, split_frame in (
    ("train_X.csv", train_X),
    ("train_Y.csv", train_Y),
    ("test_X.csv", test_X),
    ("test_Y.csv", test_Y),
):
    split_frame.to_csv(os.path.join(experiment_dir, csv_name), compression=compression_opts)

print(f"Finished compiling inputs for {experiment_name} experiment")

NameError: name 'os' is not defined

# Experiment: Perturbed Latinhypercube

This has 7 years using 2008, 2010, 2012, 2014 with variations of perturbed data

In [10]:
# Experiment "latinhypercube": train on one calendar year from each
# Latin-hypercube case workbook, test on date ranges from the base run.
experiment = 'latinhypercube'

test_files = ["dsm2_ann_inputs_base.xlsx"]
test_data = [
    ('2007-10-1','2008-9-30'),
    ('2009-10-1','2010-9-30'),
    ('2011-10-1','2012-9-30'),
    ('2013-10-1','2014-9-30')
]
# Case number -> calendar year kept from that case's workbook.
numbers = {1: 2014,
            2: 2010,
            3: 2008,
            4: 2014,
            5: 2012,
            6: 2010,
            7: 2008
}
# Settings forwarded to annutils.create_antecedent_inputs.
ndays=118
window_size=0
nwindows=0

# exist_ok=True makes the cell re-runnable and also creates the parent folder if missing.
os.makedirs(os.path.join("Experiments", experiment), exist_ok=True)

dcc_part_f = 'DWR-DMS-DSM2'

train_X_parts = []
train_Y_parts = []

for case_num, yearran in numbers.items():

    # Calendar year to keep from this case.
    aug_data = [(f'{yearran}-1-1', f'{yearran}-12-31')]

    aug_input_file = fr"D:\projects\delta_salinity\model\dsm2\DSP_DSM2_202307\latinhypercube\anninputs\lathypcub_{case_num}\dsm2_ann_inputs_lathypcub_{case_num}.xlsx"

    # Cases 3 and 7 carry different label text than the other cases,
    # so the substrings to rewrite below differ for them.
    if case_num in (3, 7):
        gate_per = '01JAN1953 - 01JAN2022'
        dcc_sub_part_f = 'DWR-DMS-DSM2'
    else:
        gate_per = f'01JAN{yearran-1} - 01JAN{yearran}'
        dcc_sub_part_f = f'DSP_LATHYPCUB_{case_num}_202309'

    dfinps, dfouts = annutils.read_and_split(aug_input_file, num_sheets, observed_stations_ordered_by_median)
    dfinps = annutils.create_antecedent_inputs(dfinps,ndays=ndays,window_size=window_size,nwindows=nwindows)
    dfinps, dfouts = annutils.synchronize(dfinps, dfouts)

    # needs to combine the DCC gate operation DSS codes into one column.
    for old_text, new_text in ((dcc_sub_part_f, dcc_part_f),
                               (gate_per, '01JAN1953 - 01JAN2020'),
                               ('1DAY', 'IR-YEAR')):
        dfinps.columns = [s.replace(old_text, new_text) for s in dfinps.columns]
        dfouts.columns = [s.replace(old_text, new_text) for s in dfouts.columns]

    # Select this case's own year here. Filtering after the loop used only the
    # last case's `aug_data`, so every other case's year was dropped from training.
    train_X_parts.append(annutils.include(dfinps, aug_data))
    train_Y_parts.append(annutils.include(dfouts, aug_data))

train_X = pd.concat(train_X_parts, axis=0)
train_Y = pd.concat(train_Y_parts, axis=0)

# Add test data
test_X_parts = []
test_Y_parts = []

for data_file in tqdm(test_files):
    data_path = data_file
    dfinps, dfouts = annutils.read_and_split(data_path, num_sheets, observed_stations_ordered_by_median)
    dfinps = annutils.create_antecedent_inputs(dfinps,ndays=ndays,window_size=window_size,nwindows=nwindows)
    dfinps, dfouts = annutils.synchronize(dfinps, dfouts)
    test_X_parts.append(dfinps)
    test_Y_parts.append(dfouts)

X_df = pd.concat(test_X_parts, axis=0)
Y_df = pd.concat(test_Y_parts, axis=0)

test_X = annutils.include(X_df, test_data)
test_Y = annutils.include(Y_df, test_data)

# Sanity check: 0 means train and test carry exactly the same column labels.
print(f'non-overlapping columns: {len(set(train_X.columns) ^ set(test_X.columns))}')

train_X.to_csv(os.path.join("Experiments", experiment, "train_X.csv"), compression=compression_opts)
train_Y.to_csv(os.path.join("Experiments", experiment, "train_Y.csv"), compression=compression_opts)
test_X.to_csv(os.path.join("Experiments", experiment, "test_X.csv"), compression=compression_opts)
test_Y.to_csv(os.path.join("Experiments", experiment, "test_Y.csv"), compression=compression_opts)

print(f"Finished compiling inputs for {experiment} experiment")

  return pd.concat([df.loc[start:end] for start, end in start_and_end])
  return pd.concat([df.loc[start:end] for start, end in start_and_end])
100%|██████████| 1/1 [00:00<00:00,  1.15it/s]


non-overlapping columns: 0
Finished compiling inputs for latinhypercube experiment


# Experiment: Perturbed Latinhypercube with tidal shift

This has 7 years using 2008, 2010, 2012, 2014 with variations of perturbed data but always has a perturbed tidal cycle

In [5]:
# Experiment "lathypcub_tideshift": train on one calendar year from each
# tide-shift Latin-hypercube case workbook, test on date ranges from the base run.
experiment = 'lathypcub_tideshift'

test_files = ["dsm2_ann_inputs_base.xlsx"]
test_data = [
    ('2007-10-1','2008-9-30'),
    ('2009-10-1','2010-9-30'),
    ('2011-10-1','2012-9-30'),
    ('2013-10-1','2014-9-30')
]
# Case number -> calendar year kept from that case's workbook.
numbers = {1: 2014,
            2: 2010,
            3: 2008,
            4: 2014,
            5: 2012,
            6: 2010,
            7: 2008
}
# Settings forwarded to annutils.create_antecedent_inputs.
ndays=118
window_size=0
nwindows=0

# exist_ok=True makes the cell re-runnable and also creates the parent folder if missing.
os.makedirs(os.path.join("Experiments", experiment), exist_ok=True)

dcc_part_f = 'DWR-DMS-DSM2'

train_X_parts = []
train_Y_parts = []

for case_num, yearran in numbers.items():

    # Calendar year to keep from this case.
    aug_data = [(f'{yearran}-1-1', f'{yearran}-12-31')]

    aug_input_file = fr"D:\projects\delta_salinity\model\dsm2\DSP_DSM2_202307\lathypcub_tideshift\anninputs\lathypcub_tideshift_{case_num}\dsm2_ann_inputs_lathypcub_tideshift_{case_num}.xlsx"

    # Cases 3 and 7 carry different label text than the other cases,
    # so the substrings to rewrite below differ for them.
    if case_num in (3, 7):
        gate_per = '01JAN1953 - 01JAN2022'
        dcc_sub_part_f = 'DWR-DMS-DSM2'
    else:
        gate_per = f'01JAN{yearran-1} - 01JAN{yearran}'
        dcc_sub_part_f = f'DSP_LATHYPCUB_{case_num}_202309'

    dfinps, dfouts = annutils.read_and_split(aug_input_file, num_sheets, observed_stations_ordered_by_median)
    dfinps = annutils.create_antecedent_inputs(dfinps,ndays=ndays,window_size=window_size,nwindows=nwindows)
    dfinps, dfouts = annutils.synchronize(dfinps, dfouts)

    # needs to combine the DCC gate operation DSS codes into one column.
    for old_text, new_text in ((dcc_sub_part_f, dcc_part_f),
                               (gate_per, '01JAN1953 - 01JAN2020'),
                               ('1DAY', 'IR-YEAR')):
        dfinps.columns = [s.replace(old_text, new_text) for s in dfinps.columns]
        dfouts.columns = [s.replace(old_text, new_text) for s in dfouts.columns]

    # Select this case's own year here. Filtering after the loop used only the
    # last case's `aug_data`, so every other case's year was dropped from training.
    train_X_parts.append(annutils.include(dfinps, aug_data))
    train_Y_parts.append(annutils.include(dfouts, aug_data))

train_X = pd.concat(train_X_parts, axis=0)
train_Y = pd.concat(train_Y_parts, axis=0)

# Add test data
test_X_parts = []
test_Y_parts = []

for data_file in tqdm(test_files):
    data_path = data_file
    dfinps, dfouts = annutils.read_and_split(data_path, num_sheets, observed_stations_ordered_by_median)
    dfinps = annutils.create_antecedent_inputs(dfinps,ndays=ndays,window_size=window_size,nwindows=nwindows)
    dfinps, dfouts = annutils.synchronize(dfinps, dfouts)
    test_X_parts.append(dfinps)
    test_Y_parts.append(dfouts)

X_df = pd.concat(test_X_parts, axis=0)
Y_df = pd.concat(test_Y_parts, axis=0)

test_X = annutils.include(X_df, test_data)
test_Y = annutils.include(Y_df, test_data)

# Sanity check: 0 means train and test carry exactly the same column labels.
print(f'non-overlapping columns: {len(set(train_X.columns) ^ set(test_X.columns))}')

train_X.to_csv(os.path.join("Experiments", experiment, "train_X.csv"), compression=compression_opts)
train_Y.to_csv(os.path.join("Experiments", experiment, "train_Y.csv"), compression=compression_opts)
test_X.to_csv(os.path.join("Experiments", experiment, "test_X.csv"), compression=compression_opts)
test_Y.to_csv(os.path.join("Experiments", experiment, "test_Y.csv"), compression=compression_opts)

print(f"Finished compiling inputs for {experiment} experiment")

  return pd.concat([df.loc[start:end] for start, end in start_and_end])
  return pd.concat([df.loc[start:end] for start, end in start_and_end])
100%|██████████| 1/1 [00:00<00:00,  1.09it/s]


non-overlapping columns: 0
Finished compiling inputs for lathypcub_tideshift experiment


# Experiment: Perturbed Latinhypercube no tidal shift

This has 7 years using 2008, 2010, 2012, 2014 with variations of perturbed data but never has a perturbed tidal cycle

In [8]:
# Experiment "lathypcub_regtide": train on one calendar year from each
# regular-tide Latin-hypercube case workbook, test on date ranges from the base run.
experiment = 'lathypcub_regtide'

test_files = ["dsm2_ann_inputs_base.xlsx"]
test_data = [
    ('2007-10-1','2008-9-30'),
    ('2009-10-1','2010-9-30'),
    ('2011-10-1','2012-9-30'),
    ('2013-10-1','2014-9-30')
]
# Case number -> calendar year kept from that case's workbook.
numbers = {1: 2014,
            2: 2010,
            3: 2008,
            4: 2014,
            5: 2012,
            6: 2010,
            7: 2008
}
# Settings forwarded to annutils.create_antecedent_inputs.
ndays=118
window_size=0
nwindows=0

# exist_ok=True makes the cell re-runnable and also creates the parent folder if missing.
os.makedirs(os.path.join("Experiments", experiment), exist_ok=True)

dcc_part_f = 'DWR-DMS-DSM2'

train_X_parts = []
train_Y_parts = []

for case_num, yearran in numbers.items():

    # Calendar year to keep from this case.
    aug_data = [(f'{yearran}-1-1', f'{yearran}-12-31')]

    aug_input_file = fr"D:\projects\delta_salinity\model\dsm2\DSP_DSM2_202307\lathypcub_regtide\anninputs\lathypcub_regtide_{case_num}\dsm2_ann_inputs_lathypcub_regtide_{case_num}.xlsx"

    # Cases 3 and 7 carry different label text than the other cases,
    # so the substrings to rewrite below differ for them.
    if case_num in (3, 7):
        gate_per = '01JAN1953 - 01JAN2022'
        dcc_sub_part_f = 'DWR-DMS-DSM2'
    else:
        gate_per = f'01JAN{yearran-1} - 01JAN{yearran}'
        dcc_sub_part_f = f'DSP_LATHYPCUB_{case_num}_202309'

    dfinps, dfouts = annutils.read_and_split(aug_input_file, num_sheets, observed_stations_ordered_by_median)
    dfinps = annutils.create_antecedent_inputs(dfinps,ndays=ndays,window_size=window_size,nwindows=nwindows)
    dfinps, dfouts = annutils.synchronize(dfinps, dfouts)

    # needs to combine the DCC gate operation DSS codes into one column.
    for old_text, new_text in ((dcc_sub_part_f, dcc_part_f),
                               (gate_per, '01JAN1953 - 01JAN2020'),
                               ('1DAY', 'IR-YEAR')):
        dfinps.columns = [s.replace(old_text, new_text) for s in dfinps.columns]
        dfouts.columns = [s.replace(old_text, new_text) for s in dfouts.columns]

    # Select this case's own year here. Filtering after the loop used only the
    # last case's `aug_data`, so every other case's year was dropped from training.
    train_X_parts.append(annutils.include(dfinps, aug_data))
    train_Y_parts.append(annutils.include(dfouts, aug_data))

train_X = pd.concat(train_X_parts, axis=0)
train_Y = pd.concat(train_Y_parts, axis=0)

# Add test data
test_X_parts = []
test_Y_parts = []

for data_file in tqdm(test_files):
    data_path = data_file
    dfinps, dfouts = annutils.read_and_split(data_path, num_sheets, observed_stations_ordered_by_median)
    dfinps = annutils.create_antecedent_inputs(dfinps,ndays=ndays,window_size=window_size,nwindows=nwindows)
    dfinps, dfouts = annutils.synchronize(dfinps, dfouts)
    test_X_parts.append(dfinps)
    test_Y_parts.append(dfouts)

X_df = pd.concat(test_X_parts, axis=0)
Y_df = pd.concat(test_Y_parts, axis=0)

test_X = annutils.include(X_df, test_data)
test_Y = annutils.include(Y_df, test_data)

# Sanity check: 0 means train and test carry exactly the same column labels.
print(f'non-overlapping columns: {len(set(train_X.columns) ^ set(test_X.columns))}')

train_X.to_csv(os.path.join("Experiments", experiment, "train_X.csv"), compression=compression_opts)
train_Y.to_csv(os.path.join("Experiments", experiment, "train_Y.csv"), compression=compression_opts)
test_X.to_csv(os.path.join("Experiments", experiment, "test_X.csv"), compression=compression_opts)
test_Y.to_csv(os.path.join("Experiments", experiment, "test_Y.csv"), compression=compression_opts)

print(f"Finished compiling inputs for {experiment} experiment")

  return pd.concat([df.loc[start:end] for start, end in start_and_end])
  return pd.concat([df.loc[start:end] for start, end in start_and_end])
100%|██████████| 1/1 [00:00<00:00,  1.41it/s]


non-overlapping columns: 0
Finished compiling inputs for lathypcub_regtide experiment


# Experiment: Perturbed Latinhypercube 7 day tidal shift

This has 7 years using 2008, 2010, 2012, 2014 with variations of perturbed data but always has a 7 day tidal shift

In [9]:
# Build the train/test CSVs for the "lathypcub_7tide" experiment.
experiment = 'lathypcub_7tide'

# Workbook(s) supplying the test set, and the date ranges kept from them.
test_files = ["dsm2_ann_inputs_base.xlsx"]
test_data = [
    ('2007-10-1','2008-9-30'),
    ('2009-10-1','2010-9-30'),
    ('2011-10-1','2012-9-30'),
    ('2013-10-1','2014-9-30')
]
# Perturbed case number -> calendar year whose rows are kept from that case.
numbers = {1: 2014,
            2: 2010,
            3: 2008,
            4: 2014,
            5: 2012,
            6: 2010,
            7: 2008
}
ndays=118
window_size=0
nwindows=0

# makedirs(exist_ok=True) is safe to re-run and also creates "Experiments" if needed.
os.makedirs(os.path.join("Experiments", experiment), exist_ok=True)

dcc_part_f = 'DWR-DMS-DSM2'

# Per-case training frames, already restricted to each case's own year.
train_X_parts = []
train_Y_parts = []

for case_num, yearran in numbers.items():

    # Training window for THIS case only. It must be applied inside the loop:
    # applying it after the loop would filter every case with the last case's year.
    aug_data = [(f'{yearran}-1-1', f'{yearran}-12-31')]

    # NOTE(review): absolute, machine-specific path -- consider moving the root
    # folder into a configuration variable.
    aug_input_file = fr"D:\projects\delta_salinity\model\dsm2\DSP_DSM2_202307\lathypcub_7tide\anninputs\lathypcub_7tide_{case_num}\dsm2_ann_inputs_lathypcub_7tide_{case_num}.xlsx"

    # Cases 3 and 7 carry different gate-period / part-F labels in their column
    # names than the other cases.
    if case_num in (3, 7):
        gate_per = '01JAN1953 - 01JAN2022'
        dcc_sub_part_f = 'DWR-DMS-DSM2'
    else:
        gate_per = f'01JAN{yearran-1} - 01JAN{yearran}'
        dcc_sub_part_f = f'DSP_LATHYPCUB_{case_num}_202309'

    dfinps, dfouts = annutils.read_and_split(aug_input_file, num_sheets, observed_stations_ordered_by_median)
    dfinps = annutils.create_antecedent_inputs(dfinps,ndays=ndays,window_size=window_size,nwindows=nwindows)
    dfinps, dfouts = annutils.synchronize(dfinps, dfouts)

    # Normalise the case-specific pieces of the column names so that every case
    # ends up with the same column labels (needed to stack the frames row-wise
    # and to line up with the test columns checked below).
    column_renames = [
        (dcc_sub_part_f, dcc_part_f),
        (gate_per, '01JAN1953 - 01JAN2020'),
        ('1DAY', 'IR-YEAR'),
    ]
    for old_text, new_text in column_renames:
        dfinps.columns = [s.replace(old_text, new_text) for s in dfinps.columns]
        dfouts.columns = [s.replace(old_text, new_text) for s in dfouts.columns]

    # Filter on the single-case frame before stacking, so the date slice runs on
    # this case's own index rather than on the stacked, repeated one.
    train_X_parts.append(annutils.include(dfinps, aug_data))
    train_Y_parts.append(annutils.include(dfouts, aug_data))

train_X = pd.concat(train_X_parts, axis=0)
train_Y = pd.concat(train_Y_parts, axis=0)

# Add test data
test_X_parts = []
test_Y_parts = []

for data_file in tqdm(test_files):
    # Resolve against local_root_path, as the other experiment cells do.
    data_path = os.path.join(local_root_path, data_file)
    dfinps, dfouts = annutils.read_and_split(data_path, num_sheets, observed_stations_ordered_by_median)
    dfinps = annutils.create_antecedent_inputs(dfinps,ndays=ndays,window_size=window_size,nwindows=nwindows)
    dfinps, dfouts = annutils.synchronize(dfinps, dfouts)
    test_X_parts.append(dfinps)
    test_Y_parts.append(dfouts)

X_df = pd.concat(test_X_parts, axis=0)
Y_df = pd.concat(test_Y_parts, axis=0)

test_X = annutils.include(X_df, test_data)
test_Y = annutils.include(Y_df, test_data)


# Sanity check: train and test must expose exactly the same feature columns.
print(f'non-overlapping columns: {len(set(train_X.columns) ^ set(test_X.columns))}')

train_X.to_csv(os.path.join("Experiments", experiment, "train_X.csv"), compression=compression_opts)
train_Y.to_csv(os.path.join("Experiments", experiment, "train_Y.csv"), compression=compression_opts)
test_X.to_csv(os.path.join("Experiments", experiment, "test_X.csv"), compression=compression_opts)
test_Y.to_csv(os.path.join("Experiments", experiment, "test_Y.csv"), compression=compression_opts)

print(f"Finished compiling inputs for {experiment} experiment")

  return pd.concat([df.loc[start:end] for start, end in start_and_end])
  return pd.concat([df.loc[start:end] for start, end in start_and_end])
100%|██████████| 1/1 [00:00<00:00,  1.49it/s]


non-overlapping columns: 0
Finished compiling inputs for lathypcub_7tide experiment


# Experiment: 6 Years with Augmented data
This is the same as the previous experiment but adds in the augmented data.
The augmented data:
    sac + 15 days
    sac - 15 days
    sjr + 15 days
    sjr - 15 days
    sac + 20%
    sac - 20%
    sjr + 20%
    sjr - 20%

If we do this right, the training data should have approximately 6 * 365 * 9 = 19710 rows.

In [7]:
# Build the train/test CSVs for the "6yearsAugmented" experiment.
experiment_name = "6yearsAugmented"
experiment_dir = "Experiments/" + experiment_name
if not os.path.exists(experiment_dir):
    os.mkdir(experiment_dir)

# (start, end) date pairs that make up the training split.
picked_training_years = [
    ('2007-10-1', '2008-9-30'),
    ('2008-10-1', '2009-9-30'),
    ('2010-10-1', '2011-9-30'),
    ('2011-10-1', '2012-9-30'),
    ('2013-10-1', '2014-9-30'),
    ('2016-10-1', '2017-9-30'),
]

# Base run plus the eight perturbed runs.
input_files = [
    "dsm2_ann_inputs_base.xlsx",
    "dsm2_ann_inputs_rsacminus15day.xlsx",
    "dsm2_ann_inputs_rsacminus20pct.xlsx",
    "dsm2_ann_inputs_rsacplus15day.xlsx",
    "dsm2_ann_inputs_rsacplus20pct.xlsx",
    "dsm2_ann_inputs_rsanminus15day.xlsx",
    "dsm2_ann_inputs_rsanminus20pct.xlsx",
    "dsm2_ann_inputs_rsanplus15day.xlsx",
    "dsm2_ann_inputs_rsanplus20pct.xlsx",
]

# Settings forwarded to annutils.create_antecedent_inputs.
ndays, window_size, nwindows = 118, 0, 0

# Collect one input/output frame per workbook, then stack them once.
input_frames = []
output_frames = []
for data_file in tqdm(input_files):
    data_path = os.path.join(local_root_path, data_file)
    dfinps, dfouts = annutils.read_and_split(data_path, num_sheets, observed_stations_ordered_by_median)
    dfinps = annutils.create_antecedent_inputs(dfinps, ndays=ndays, window_size=window_size, nwindows=nwindows)
    dfinps, dfouts = annutils.synchronize(dfinps, dfouts)
    input_frames.append(dfinps)
    output_frames.append(dfouts)

X_df = pd.concat(input_frames, axis=0)
Y_df = pd.concat(output_frames, axis=0)

# Expectation noted by the original author: X_df has 118 * 8 = 944 input
# features and Y_df has 23 target salinity values.

# Training rows come from annutils.include with the picked periods; the test
# rows come from annutils.exclude with the same periods.
train_X = annutils.include(X_df, picked_training_years)
train_Y = annutils.include(Y_df, picked_training_years)

test_X = annutils.exclude(X_df, picked_training_years)
test_Y = annutils.exclude(Y_df, picked_training_years)

for csv_name, frame in (("train_X.csv", train_X),
                        ("train_Y.csv", train_Y),
                        ("test_X.csv", test_X),
                        ("test_Y.csv", test_Y)):
    frame.to_csv(os.path.join("Experiments", experiment_name, csv_name), compression=compression_opts)



100%|██████████| 9/9 [00:08<00:00,  1.07it/s]


# Experiment: Colab with Ryan's method
This is the same as the 6 years + augmented data but done with the list of "picked training years" in the colab script
The augmented data:
    sac + 15 days
    sac - 15 days
    sjr + 15 days
    sjr - 15 days
    sac + 20%
    sac - 20%
    sjr + 20%
    sjr - 20%

In [8]:
# Build the train/test CSVs for the "colab_simple" experiment.
experiment_name = "colab_simple"
experiment_dir = "Experiments/" + experiment_name
if not os.path.exists(experiment_dir):
    os.mkdir(experiment_dir)

# (start, end) date pairs that make up the training split.
picked_training_years = [
    ('1990-10-1', '1991-9-30'),
    ('1992-10-1', '1995-9-30'),
    ('1996-10-1', '1998-9-30'),
    ('1999-10-1', '2003-9-30'),
    ('2004-10-1', '2006-9-30'),
    ('2007-10-1', '2010-9-30'),
    ('2011-10-1', '2013-9-30'),
    ('2014-10-1', '2016-9-30'),
    ('2017-10-1', '2019-9-30'),
]

# Base run plus the eight perturbed runs.
input_files = [
    "dsm2_ann_inputs_base.xlsx",
    "dsm2_ann_inputs_rsacminus15day.xlsx",
    "dsm2_ann_inputs_rsacminus20pct.xlsx",
    "dsm2_ann_inputs_rsacplus15day.xlsx",
    "dsm2_ann_inputs_rsacplus20pct.xlsx",
    "dsm2_ann_inputs_rsanminus15day.xlsx",
    "dsm2_ann_inputs_rsanminus20pct.xlsx",
    "dsm2_ann_inputs_rsanplus15day.xlsx",
    "dsm2_ann_inputs_rsanplus20pct.xlsx",
]

# Settings forwarded to annutils.create_antecedent_inputs.
ndays, window_size, nwindows = 118, 0, 0

# Collect one input/output frame per workbook, then stack them once.
input_frames = []
output_frames = []
for data_file in tqdm(input_files):
    data_path = os.path.join(local_root_path, data_file)
    dfinps, dfouts = annutils.read_and_split(data_path, num_sheets, observed_stations_ordered_by_median)
    dfinps = annutils.create_antecedent_inputs(dfinps, ndays=ndays, window_size=window_size, nwindows=nwindows)
    dfinps, dfouts = annutils.synchronize(dfinps, dfouts)
    input_frames.append(dfinps)
    output_frames.append(dfouts)

X_df = pd.concat(input_frames, axis=0)
Y_df = pd.concat(output_frames, axis=0)

# Expectation noted by the original author: X_df has 118 * 8 = 944 input
# features and Y_df has 23 target salinity values.

# Training rows come from annutils.include with the picked periods; the test
# rows come from annutils.exclude with the same periods.
train_X = annutils.include(X_df, picked_training_years)
train_Y = annutils.include(Y_df, picked_training_years)

test_X = annutils.exclude(X_df, picked_training_years)
test_Y = annutils.exclude(Y_df, picked_training_years)

for csv_name, frame in (("train_X.csv", train_X),
                        ("train_Y.csv", train_Y),
                        ("test_X.csv", test_X),
                        ("test_Y.csv", test_Y)):
    frame.to_csv(os.path.join("Experiments", experiment_name, csv_name), compression=compression_opts)


100%|██████████| 9/9 [00:06<00:00,  1.33it/s]


# Experiment: Colab with Ryan's method - minus 2015
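This is the same as the 6 years + augmented data but done with the list of "picked training years" in the colab script, with the period 2014-10-1 to 2015-9-30 removed from the training periods (the test split is built by excluding the training periods, so that period falls into the test split).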
The augmented data:
    sac + 15 days
    sac - 15 days
    sjr + 15 days
    sjr - 15 days
    sac + 20%
    sac - 20%
    sjr + 20%
    sjr - 20%

In [9]:
# Build the train/test CSVs for the "colab_simple_wo2015" experiment.
experiment_name = "colab_simple_wo2015"
experiment_dir = "Experiments/" + experiment_name
if not os.path.exists(experiment_dir):
    os.mkdir(experiment_dir)

# (start, end) date pairs that make up the training split. Compared with the
# "colab_simple" list, the eighth period starts on 2015-10-1 instead of 2014-10-1.
picked_training_years = [
    ('1990-10-1', '1991-9-30'),
    ('1992-10-1', '1995-9-30'),
    ('1996-10-1', '1998-9-30'),
    ('1999-10-1', '2003-9-30'),
    ('2004-10-1', '2006-9-30'),
    ('2007-10-1', '2010-9-30'),
    ('2011-10-1', '2013-9-30'),
    ('2015-10-1', '2016-9-30'),
    ('2017-10-1', '2019-9-30'),
]

# Base run plus the eight perturbed runs.
input_files = [
    "dsm2_ann_inputs_base.xlsx",
    "dsm2_ann_inputs_rsacminus15day.xlsx",
    "dsm2_ann_inputs_rsacminus20pct.xlsx",
    "dsm2_ann_inputs_rsacplus15day.xlsx",
    "dsm2_ann_inputs_rsacplus20pct.xlsx",
    "dsm2_ann_inputs_rsanminus15day.xlsx",
    "dsm2_ann_inputs_rsanminus20pct.xlsx",
    "dsm2_ann_inputs_rsanplus15day.xlsx",
    "dsm2_ann_inputs_rsanplus20pct.xlsx",
]

# Settings forwarded to annutils.create_antecedent_inputs.
ndays, window_size, nwindows = 118, 0, 0

# Collect one input/output frame per workbook, then stack them once.
input_frames = []
output_frames = []
for data_file in tqdm(input_files):
    data_path = os.path.join(local_root_path, data_file)
    dfinps, dfouts = annutils.read_and_split(data_path, num_sheets, observed_stations_ordered_by_median)
    dfinps = annutils.create_antecedent_inputs(dfinps, ndays=ndays, window_size=window_size, nwindows=nwindows)
    dfinps, dfouts = annutils.synchronize(dfinps, dfouts)
    input_frames.append(dfinps)
    output_frames.append(dfouts)

X_df = pd.concat(input_frames, axis=0)
Y_df = pd.concat(output_frames, axis=0)

# Expectation noted by the original author: X_df has 118 * 8 = 944 input
# features and Y_df has 23 target salinity values.

# Training rows come from annutils.include with the picked periods; the test
# rows come from annutils.exclude with the same periods.
train_X = annutils.include(X_df, picked_training_years)
train_Y = annutils.include(Y_df, picked_training_years)

test_X = annutils.exclude(X_df, picked_training_years)
test_Y = annutils.exclude(Y_df, picked_training_years)

for csv_name, frame in (("train_X.csv", train_X),
                        ("train_Y.csv", train_Y),
                        ("test_X.csv", test_X),
                        ("test_Y.csv", test_Y)):
    frame.to_csv(os.path.join("Experiments", experiment_name, csv_name), compression=compression_opts)

100%|██████████| 9/9 [00:07<00:00,  1.17it/s]


# Experiment: Colab standard
This is how the Colab notebook builds the datasets.



In [16]:
# Output folder for the "colab" experiment.
experiment_name = "colab"
experiment_dir = "Experiments/" + experiment_name
if not os.path.exists(experiment_dir):
    os.mkdir(experiment_dir)

# Workbooks stacked into the training split by read_training_data.
train_data = [
    "dsm2_ann_inputs_rsacminus15day.xlsx",
    "dsm2_ann_inputs_rsacminus20pct.xlsx",
    "dsm2_ann_inputs_rsacplus15day.xlsx",
    "dsm2_ann_inputs_rsacplus20pct.xlsx",
    "dsm2_ann_inputs_rsanminus15day.xlsx",
    "dsm2_ann_inputs_rsanminus20pct.xlsx",
    "dsm2_ann_inputs_rsanplus15day.xlsx",
    "dsm2_ann_inputs_rsanplus20pct.xlsx",
]

# Workbooks stacked into the test split (only the dict values are used below).
test_data = {
    'dcc0': "dsm2_ann_inputs_dcc0.xlsx",
    'smscg1': "dsm2_ann_inputs_smscg1.xlsx",
    'dcc1': "dsm2_ann_inputs_dcc1.xlsx",
    'smscg0': "dsm2_ann_inputs_smscg0.xlsx",
}

# Observed data is split by year between train and test:
# which_part_for_test picks the splitting branch ('last', 'first', 'middle'
# or 'manual'); extra_data_test_ratio feeds the split-year arithmetic.
extra_data = {'observed': "observed_data_daily.xlsx"}
which_part_for_test = 'last'
extra_data_test_ratio = 0.3

# Settings forwarded to annutils.create_training_sets_no_scaling.
ndays, window_size, nwindows = 118, 0, 0



def read_training_data(train_data):
    """Read every workbook in ``train_data`` and stack the samples row-wise.

    Each file name is joined onto ``local_root_path`` and loaded with
    ``annutils.read_and_split``. The frames are then passed to
    ``annutils.create_training_sets_no_scaling`` with ``train_frac=1`` and only
    the first (training) pair of its result is kept. ``ndays``, ``window_size``
    and ``nwindows`` are read from module scope at call time.

    Args:
        train_data: iterable of workbook file names.

    Returns:
        ``(x_train, y_train)``: the stacked inputs and outputs. When
        ``train_data`` is empty the result is ``(None, None)``; with a single
        file the pair is returned as produced, without concatenation.
    """
    x_parts = []
    y_parts = []
    for data_file in tqdm(train_data):
        data_path = os.path.join(local_root_path, data_file)
        dfinps, dfouts = annutils.read_and_split(data_path, num_sheets, observed_stations_ordered_by_median)

        # train_frac=1: the second (test) pair is discarded.
        (x_part, y_part), (_, _) = annutils.create_training_sets_no_scaling(
            [dfinps],
            [dfouts],
            train_frac=1,
            ndays=ndays, window_size=window_size, nwindows=nwindows)
        x_parts.append(x_part)
        y_parts.append(y_part)

    if not x_parts:
        return None, None
    if len(x_parts) == 1:
        return x_parts[0], y_parts[0]
    return pd.concat(x_parts, axis=0), pd.concat(y_parts, axis=0)

# Stack the workbooks listed for the training and test splits.
train_X, train_Y = read_training_data(train_data)
test_X, test_Y = read_training_data(test_data.values())


######### Read extra observed dataset ###############
for data_file in tqdm(extra_data.values()):
    data_path = os.path.join(local_root_path, data_file)

    # One frame per sheet, joined column-wise; rows with any missing value are dropped.
    dflist = [annutils.read_excel_sheet(data_path, sheet_no) for sheet_no in range(num_sheets)]
    df_inpout = pd.concat(dflist, axis=1).dropna(axis=0)

    # Columns that come from the last sheet are the outputs; the rest are inputs.
    col_mask = df_inpout.columns.isin(dflist[num_sheets - 1].columns)
    dfinps = df_inpout.loc[:, ~col_mask]
    dfouts = df_inpout.loc[:, col_mask]
    # dfouts = dfouts[output_stations]  # out_stations is None here...

    start_year = max(dfinps.index[0].year, dfouts.index[0].year)
    end_year = min(dfinps.index[-1].year, dfouts.index[-1].year)
    year_span = end_year - start_year

    if which_part_for_test in ('last', 'first'):
        # A single cut year: [start_year .. split_year] and [split_year + 1 .. end_year].
        split_year = int(start_year + (1 - extra_data_test_ratio) * year_span)
        early_years = slice(str(start_year), str(split_year))
        late_years = slice(str(split_year + 1), str(end_year))
        if which_part_for_test == 'last':
            calib_slice, valid_slice = early_years, late_years
        else:
            calib_slice, valid_slice = late_years, early_years
    elif which_part_for_test == 'middle':
        # Test years sit in the middle; calibration uses the years on both sides.
        lower_cut = int(start_year + (1 - extra_data_test_ratio) / 2 * year_span)
        upper_cut = int(start_year + (1 + extra_data_test_ratio) / 2 * year_span)
        resume_year = int(start_year + (1 + extra_data_test_ratio) / 2 * year_span + 1)
        calib_slice = [slice(str(start_year), str(lower_cut)),
                       slice(str(resume_year), str(end_year))]
        valid_slice = slice(str(lower_cut + 1), str(upper_cut))
    elif which_part_for_test == 'manual' and picked_training_years is not None:
        # Calibration = the picked periods; validation = the gaps between them,
        # bracketed by '1989-10-1' and '2020-9-30'.
        calib_slice = [slice(str(period_start), str(period_end))
                       for (period_start, period_end) in picked_training_years]
        gap_starts = ['1989-10-1'] + [period_end for (_, period_end) in picked_training_years]
        gap_ends = [period_start for (period_start, _) in picked_training_years] + ['2020-9-30']
        valid_slice = [slice(gap_start, gap_end) for gap_start, gap_end in zip(gap_starts, gap_ends)]
    else:
        raise Exception('Unknown data splitting method')

    # Split the observed data with the slices chosen above.
    (x_extra_train, y_extra_train), (x_extra_test, y_extra_test) = annutils.create_training_sets_no_scaling(
        [dfinps],
        [dfouts],
        calib_slice=calib_slice,
        valid_slice=valid_slice,
        ndays=ndays, window_size=window_size, nwindows=nwindows,
    )

    # Append the observed samples to the splits built above.
    train_X = pd.concat([train_X, x_extra_train], axis=0)
    train_Y = pd.concat([train_Y, y_extra_train], axis=0)
    test_X = pd.concat([test_X, x_extra_test], axis=0)
    test_Y = pd.concat([test_Y, y_extra_test], axis=0)

print("Done")
# takes about 2minutes...
for csv_name, frame in (("train_X.csv", train_X),
                        ("train_Y.csv", train_Y),
                        ("test_X.csv", test_X),
                        ("test_Y.csv", test_Y)):
    frame.to_csv(os.path.join("Experiments", experiment_name, csv_name), compression=compression_opts)

  0%|          | 0/8 [00:00<?, ?it/s]

Randomly selecting 10803 samples for training, 0 for test
Randomly selecting 10803 samples for training, 0 for test
Randomly selecting 10803 samples for training, 0 for test
Randomly selecting 10803 samples for training, 0 for test
Randomly selecting 10803 samples for training, 0 for test
Randomly selecting 10803 samples for training, 0 for test
Randomly selecting 10803 samples for training, 0 for test
Randomly selecting 10803 samples for training, 0 for test


  0%|          | 0/4 [00:00<?, ?it/s]

Randomly selecting 10803 samples for training, 0 for test
Randomly selecting 10803 samples for training, 0 for test
Randomly selecting 10803 samples for training, 0 for test
Randomly selecting 10803 samples for training, 0 for test


  0%|          | 0/1 [00:00<?, ?it/s]

Done


# Experiment: Colab minus 2015
This builds the datasets the same way the Colab notebook does, but without the year 2015.

Name experiment: colab_wo2015



In [7]:
# Output folder for the "colab_wo2015" experiment.
experiment_name = "colab_wo2015"
experiment_dir = "Experiments/" + experiment_name
if not os.path.exists(experiment_dir):
    os.mkdir(experiment_dir)

# (start, end) date pairs used by the 'manual' split of the observed data.
picked_training_years = [
    ('1990-10-1', '1991-9-30'),
    ('1992-10-1', '1995-9-30'),
    ('1996-10-1', '1998-9-30'),
    ('1999-10-1', '2003-9-30'),
    ('2004-10-1', '2006-9-30'),
    ('2007-10-1', '2010-9-30'),
    ('2011-10-1', '2013-9-30'),
    ('2015-10-1', '2016-9-30'),
    ('2017-10-1', '2019-9-30'),
]

# Workbooks stacked into the training split by read_training_data.
train_data = [
    "dsm2_ann_inputs_rsacminus15day.xlsx",
    "dsm2_ann_inputs_rsacminus20pct.xlsx",
    "dsm2_ann_inputs_rsacplus15day.xlsx",
    "dsm2_ann_inputs_rsacplus20pct.xlsx",
    "dsm2_ann_inputs_rsanminus15day.xlsx",
    "dsm2_ann_inputs_rsanminus20pct.xlsx",
    "dsm2_ann_inputs_rsanplus15day.xlsx",
    "dsm2_ann_inputs_rsanplus20pct.xlsx",
]

# Workbooks stacked into the test split (only the dict values are used below).
test_data = {
    'dcc0': "dsm2_ann_inputs_dcc0.xlsx",
    'smscg1': "dsm2_ann_inputs_smscg1.xlsx",
    'dcc1': "dsm2_ann_inputs_dcc1.xlsx",
    'smscg0': "dsm2_ann_inputs_smscg0.xlsx",
}

# Observed data is split by date between train and test:
# which_part_for_test picks the splitting branch ('last', 'first', 'middle'
# or 'manual'); extra_data_test_ratio feeds the split-year arithmetic of the
# non-manual branches.
extra_data = {'observed': "observed_data_daily.xlsx"}
which_part_for_test = 'manual'
extra_data_test_ratio = 0.3

# Settings forwarded to annutils.create_training_sets_no_scaling.
ndays, window_size, nwindows = 118, 0, 0



def read_training_data(train_data):
    """Read every workbook in ``train_data`` and stack the samples row-wise.

    Each file name is joined onto ``local_root_path`` and loaded with
    ``annutils.read_and_split``. The frames are then passed to
    ``annutils.create_training_sets_no_scaling`` with ``train_frac=1`` and only
    the first (training) pair of its result is kept. ``ndays``, ``window_size``
    and ``nwindows`` are read from module scope at call time.

    Args:
        train_data: iterable of workbook file names.

    Returns:
        ``(x_train, y_train)``: the stacked inputs and outputs. When
        ``train_data`` is empty the result is ``(None, None)``; with a single
        file the pair is returned as produced, without concatenation.
    """
    x_parts = []
    y_parts = []
    for data_file in tqdm(train_data):
        data_path = os.path.join(local_root_path, data_file)
        dfinps, dfouts = annutils.read_and_split(data_path, num_sheets, observed_stations_ordered_by_median)

        # train_frac=1: the second (test) pair is discarded.
        (x_part, y_part), (_, _) = annutils.create_training_sets_no_scaling(
            [dfinps],
            [dfouts],
            train_frac=1,
            ndays=ndays, window_size=window_size, nwindows=nwindows)
        x_parts.append(x_part)
        y_parts.append(y_part)

    if not x_parts:
        return None, None
    if len(x_parts) == 1:
        return x_parts[0], y_parts[0]
    return pd.concat(x_parts, axis=0), pd.concat(y_parts, axis=0)

# Stack the workbooks listed for the training and test splits.
train_X, train_Y = read_training_data(train_data)
test_X, test_Y = read_training_data(test_data.values())


######### Read extra observed dataset ###############
for data_file in tqdm(extra_data.values()):
    data_path = os.path.join(local_root_path, data_file)

    # One frame per sheet, joined column-wise; rows with any missing value are dropped.
    dflist = [annutils.read_excel_sheet(data_path, sheet_no) for sheet_no in range(num_sheets)]
    df_inpout = pd.concat(dflist, axis=1).dropna(axis=0)

    # Columns that come from the last sheet are the outputs; the rest are inputs.
    col_mask = df_inpout.columns.isin(dflist[num_sheets - 1].columns)
    dfinps = df_inpout.loc[:, ~col_mask]
    dfouts = df_inpout.loc[:, col_mask]
    # dfouts = dfouts[output_stations]  # out_stations is None here...

    start_year = max(dfinps.index[0].year, dfouts.index[0].year)
    end_year = min(dfinps.index[-1].year, dfouts.index[-1].year)
    year_span = end_year - start_year

    if which_part_for_test in ('last', 'first'):
        # A single cut year: [start_year .. split_year] and [split_year + 1 .. end_year].
        split_year = int(start_year + (1 - extra_data_test_ratio) * year_span)
        early_years = slice(str(start_year), str(split_year))
        late_years = slice(str(split_year + 1), str(end_year))
        if which_part_for_test == 'last':
            calib_slice, valid_slice = early_years, late_years
        else:
            calib_slice, valid_slice = late_years, early_years
    elif which_part_for_test == 'middle':
        # Test years sit in the middle; calibration uses the years on both sides.
        lower_cut = int(start_year + (1 - extra_data_test_ratio) / 2 * year_span)
        upper_cut = int(start_year + (1 + extra_data_test_ratio) / 2 * year_span)
        resume_year = int(start_year + (1 + extra_data_test_ratio) / 2 * year_span + 1)
        calib_slice = [slice(str(start_year), str(lower_cut)),
                       slice(str(resume_year), str(end_year))]
        valid_slice = slice(str(lower_cut + 1), str(upper_cut))
    elif which_part_for_test == 'manual' and picked_training_years is not None:
        # Calibration = the picked periods; validation = the gaps between them,
        # bracketed by '1989-10-1' and '2020-9-30'.
        calib_slice = [slice(str(period_start), str(period_end))
                       for (period_start, period_end) in picked_training_years]
        gap_starts = ['1989-10-1'] + [period_end for (_, period_end) in picked_training_years]
        gap_ends = [period_start for (period_start, _) in picked_training_years] + ['2020-9-30']
        valid_slice = [slice(gap_start, gap_end) for gap_start, gap_end in zip(gap_starts, gap_ends)]
    else:
        raise Exception('Unknown data splitting method')

    # Split the observed data with the slices chosen above.
    (x_extra_train, y_extra_train), (x_extra_test, y_extra_test) = annutils.create_training_sets_no_scaling(
        [dfinps],
        [dfouts],
        calib_slice=calib_slice,
        valid_slice=valid_slice,
        ndays=ndays, window_size=window_size, nwindows=nwindows,
    )

    # Append the observed samples to the splits built above.
    train_X = pd.concat([train_X, x_extra_train], axis=0)
    train_Y = pd.concat([train_Y, y_extra_train], axis=0)
    test_X = pd.concat([test_X, x_extra_test], axis=0)
    test_Y = pd.concat([test_Y, y_extra_test], axis=0)

print("Done")
# takes about 2minutes...
for csv_name, frame in (("train_X.csv", train_X),
                        ("train_Y.csv", train_Y),
                        ("test_X.csv", test_X),
                        ("test_Y.csv", test_Y)):
    frame.to_csv(os.path.join("Experiments", experiment_name, csv_name), compression=compression_opts)

 12%|█▎        | 1/8 [00:00<00:05,  1.20it/s]

Randomly selecting 10803 samples for training, 0 for test


 25%|██▌       | 2/8 [00:01<00:05,  1.10it/s]

Randomly selecting 10803 samples for training, 0 for test


 38%|███▊      | 3/8 [00:02<00:04,  1.04it/s]

Randomly selecting 10803 samples for training, 0 for test
Randomly selecting 10803 samples for training, 0 for test


 50%|█████     | 4/8 [00:03<00:04,  1.02s/it]

Randomly selecting 10803 samples for training, 0 for test


 62%|██████▎   | 5/8 [00:05<00:03,  1.06s/it]

Randomly selecting 10803 samples for training, 0 for test


 75%|███████▌  | 6/8 [00:06<00:02,  1.09s/it]

Randomly selecting 10803 samples for training, 0 for test


 88%|████████▊ | 7/8 [00:07<00:01,  1.14s/it]

Randomly selecting 10803 samples for training, 0 for test


100%|██████████| 8/8 [00:08<00:00,  1.08s/it]
 25%|██▌       | 1/4 [00:00<00:02,  1.07it/s]

Randomly selecting 10803 samples for training, 0 for test


 50%|█████     | 2/4 [00:01<00:01,  1.08it/s]

Randomly selecting 10803 samples for training, 0 for test


 75%|███████▌  | 3/4 [00:02<00:01,  1.03s/it]

Randomly selecting 10803 samples for training, 0 for test
Randomly selecting 10803 samples for training, 0 for test


100%|██████████| 4/4 [00:04<00:00,  1.02s/it]
100%|██████████| 1/1 [00:01<00:00,  1.20s/it]


Done
