# LOAD LIBRARIES

In [1]:
import sys
sys.path.append("../")
import config
import os
import copy
import random
import numpy as np
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
from datetime import datetime

# DEFINE DIRECTORIES

In [2]:
# Directories taken from the project config: RAW_DIR is read with np.load,
# PREPROCESSED_DIR receives the .npz files written by this notebook.
RAW_DIR, PREPROCESSED_DIR = config.RAW_DIR, config.PREPROCESSED_DIR

# LOAD DATA & REMOVE LEAP DAYS (29 FEBRUARY)

In [3]:
# Load the raw array as float32 and turn the config-defined "unknown" sentinel into NaN.
data = np.load(os.path.join(RAW_DIR, "data.npy")).astype(np.float32)
data[data==config.unknown] = np.nan
# allow_pickle=True unpickles objects from disk: only load dates.npy from a trusted source.
date = np.load(os.path.join(RAW_DIR, "dates.npy"), allow_pickle=True)
print(date.shape, data.shape, "Original data")
# NOTE(review): the recorded output of this cell shows 16436 dates but only 16425
# steps on axis 1 of data, so positions derived from `date` may not line up with
# `data` -- confirm the two files are aligned before relying on the deletion below.

# Parse the dates once (unparseable entries become NaT and never match) and select
# every 29 February a single time instead of rebuilding the selection per use.
date_df = pd.DataFrame(date)
date_df = pd.to_datetime(date_df[0], errors='coerce')
leap_days = date_df.loc[(date_df.dt.month == 2) & (date_df.dt.day == 29)]
print(leap_days)

# The Series keeps the default 0..n-1 index, so its index values are positions in `date`.
leap_day_positions = leap_days.index.values
date = np.delete(date, leap_day_positions)
data = np.delete(data, leap_day_positions, axis=1)
print(date.shape, data.shape, "Leap year removed data")

(16436,) (671, 16425, 26) Original data
516     1972-02-29
1977    1976-02-29
3438    1980-02-29
4899    1984-02-29
6360    1988-02-29
7821    1992-02-29
9282    1996-02-29
10743   2000-02-29
12204   2004-02-29
13665   2008-02-29
15126   2012-02-29
Name: 0, dtype: datetime64[ns]
(16425,) (671, 16414, 26) Leap year removed data


# CREATE TRAIN AND TEST BASINS

In [4]:
# Positions in `date` of 1 October 00:00 for config.test_year["start"] and
# config.train_year["end"]; each is looked up once and shared by both slices.
total_start = np.where(date == pd.Timestamp(config.test_year["start"], 10, 1, 0))[0][0]
total_stop = np.where(date == pd.Timestamp(config.train_year["end"], 10, 1, 0))[0][0]

# Half-open span [total_start, total_stop) applied to the dates and to axis 1 of data.
total_span = slice(total_start, total_stop)
total_date = date[total_span]
total_data = data[:, total_span]
print(total_date.shape, total_data.shape)

(7300,) (671, 7300, 26)


In [5]:
# Per-node summary of the config.output_channels values over the total period:
# column 0 holds the mean, column 1 the median.
node_flow = np.zeros((len(total_data), 2))
for node in range(len(total_data)):
	node_labels = total_data[node, :, config.output_channels]
	# np.mean / np.median propagate NaN, so a node with any missing value gets NaN here.
	node_flow[node] = np.mean(node_labels), np.median(node_labels)

# Rank the nodes by mean (argsort places NaN means last); every 4th ranked position
# is cleared in `mask` and therefore goes to the test split, the rest to train.
indices = np.argsort(node_flow[:,0])
mask = np.ones(len(node_flow)).astype(bool)
mask[::4] = 0

# Exclude nodes whose mean is NaN with a single vectorised membership test. This
# replaces a per-element `index not in array` scan and always yields integer index
# arrays, even when nothing is selected.
nan_indices = np.where(np.isnan(node_flow[:,0]))[0]
keep = ~np.isin(indices, nan_indices)
total_indices = indices[keep]
train_indices = indices[mask & keep]
test_indices = indices[~mask & keep]
print(len(total_indices), len(train_indices), len(test_indices))

# Report the extreme mean / median values of each group and the node they belong to.
total_node_flow = node_flow[total_indices]
print("Total\tMin Mean SF:{:.4f}\tNode:{}\tMin Median SF:{:.4f}\tNode:{}".format(np.min(total_node_flow[:,0]), total_indices[np.argmin(total_node_flow[:,0])], np.min(total_node_flow[:,1]), total_indices[np.argmin(total_node_flow[:,1])]))
print("Total\tMax Mean SF:{:.4f}\tNode:{}\tMax Median SF:{:.4f}\tNode:{}".format(np.max(total_node_flow[:,0]), total_indices[np.argmax(total_node_flow[:,0])], np.max(total_node_flow[:,1]), total_indices[np.argmax(total_node_flow[:,1])]))

train_node_flow = node_flow[train_indices]
print("Train\tMin Mean SF:{:.4f}\tNode:{}\tMin Median SF:{:.4f}\tNode:{}".format(np.min(train_node_flow[:,0]), train_indices[np.argmin(train_node_flow[:,0])], np.min(train_node_flow[:,1]), train_indices[np.argmin(train_node_flow[:,1])]))
print("Train\tMax Mean SF:{:.4f}\tNode:{}\tMax Median SF:{:.4f}\tNode:{}".format(np.max(train_node_flow[:,0]), train_indices[np.argmax(train_node_flow[:,0])], np.max(train_node_flow[:,1]), train_indices[np.argmax(train_node_flow[:,1])]))

test_node_flow = node_flow[test_indices]
print("Test\tMin Mean SF:{:.4f}\tNode:{}\tMin Median SF:{:.4f}\tNode:{}".format(np.min(test_node_flow[:,0]), test_indices[np.argmin(test_node_flow[:,0])], np.min(test_node_flow[:,1]), test_indices[np.argmin(test_node_flow[:,1])]))
print("Test\tMax Mean SF:{:.4f}\tNode:{}\tMax Median SF:{:.4f}\tNode:{}".format(np.max(test_node_flow[:,0]), test_indices[np.argmax(test_node_flow[:,0])], np.max(test_node_flow[:,1]), test_indices[np.argmax(test_node_flow[:,1])]))

376 282 94
Total	Min Mean SF:0.1849	Node:264	Min Median SF:0.0000	Node:358
Total	Max Mean SF:8.1869	Node:658	Max Median SF:5.2050	Node:33
Train	Min Mean SF:0.2108	Node:168	Min Median SF:0.0000	Node:358
Train	Max Mean SF:8.1869	Node:658	Max Median SF:5.2050	Node:33
Test	Min Mean SF:0.1849	Node:264	Min Median SF:0.0700	Node:206
Test	Max Mean SF:6.9310	Node:610	Max Median SF:4.0700	Node:336


In [6]:
# fig = plt.figure(figsize=(total_node_flow.shape[0]//4, total_node_flow.shape[0]//8))
# fontsize = 10
# width = 0.4
# ax = fig.add_subplot(111)
# ax.set_xticks(range(len(total_node_flow[:,0])))
# ax.set_xticklabels(total_indices, fontsize=fontsize, rotation=90)
# ax.bar(np.array(range(len(total_node_flow))), total_node_flow[:,0], width=width , color="Green", label = "Mean")
# ax.bar(np.array(range(len(total_node_flow))), total_node_flow[:,1], width=width , color="Blue", label = "Median")
# ax.legend(loc="upper left")
# plt.show()

# fig = plt.figure(figsize=(train_node_flow.shape[0]//4, train_node_flow.shape[0]//8))
# fontsize = 10
# width = 0.4
# ax = fig.add_subplot(111)
# ax.set_xticks(range(len(train_node_flow[:,0])))
# ax.set_xticklabels(train_indices, fontsize=fontsize, rotation=90)
# ax.bar(np.array(range(len(train_node_flow))), train_node_flow[:,0], width=width , color="Green", label = "Mean")
# ax.bar(np.array(range(len(train_node_flow))), train_node_flow[:,1], width=width , color="Blue", label = "Median")
# ax.legend(loc="upper left")
# plt.show()

# fig = plt.figure(figsize=(test_node_flow.shape[0]//4, test_node_flow.shape[0]//8))
# fontsize = 10
# width = 0.4
# ax = fig.add_subplot(111)
# ax.set_xticks(range(len(test_node_flow[:,0])))
# ax.set_xticklabels(test_indices, fontsize=fontsize, rotation=90)
# ax.bar(np.array(range(len(test_node_flow))), test_node_flow[:,0], width=width , color="Green", label = "Mean")
# ax.bar(np.array(range(len(test_node_flow))), test_node_flow[:,1], width=width , color="Blue", label = "Median")
# ax.legend(loc="upper left")
# plt.show()

# CREATE TRAIN & TEST DATA

In [7]:
# Each split is the half-open span between 1 October 00:00 of its configured
# "start" year and 1 October 00:00 of its configured "end" year. The span is
# built once per split and applied to both the dates and axis 1 of data.
train_span = slice(
	np.where(date == pd.Timestamp(config.train_year["start"], 10, 1, 0))[0][0],
	np.where(date == pd.Timestamp(config.train_year["end"], 10, 1, 0))[0][0],
)
train_date, train_data = date[train_span], data[:, train_span]
print(train_date.shape, train_data.shape)

valid_span = slice(
	np.where(date == pd.Timestamp(config.valid_year["start"], 10, 1, 0))[0][0],
	np.where(date == pd.Timestamp(config.valid_year["end"], 10, 1, 0))[0][0],
)
valid_date, valid_data = date[valid_span], data[:, valid_span]
print(valid_date.shape, valid_data.shape)

test_span = slice(
	np.where(date == pd.Timestamp(config.test_year["start"], 10, 1, 0))[0][0],
	np.where(date == pd.Timestamp(config.test_year["end"], 10, 1, 0))[0][0],
)
test_date, test_data = date[test_span], data[:, test_span]
print(test_date.shape, test_data.shape)

# Per-channel statistics of the train period only, reduced over axes 0 and 1
# with NaN entries ignored.
train_data_means = np.nanmean(train_data, axis=(0,1))
train_data_stds = np.nanstd(train_data, axis=(0,1))

(3650,) (671, 3650, 26)
(730,) (671, 730, 26)
(2920,) (671, 2920, 26)


# SAVE TRAIN & TEST DATA

In [8]:
# Arrays written into every archive next to the split's own date/data: the
# train-period statistics and the train/test node indices.
shared_arrays = dict(
	train_data_means=train_data_means,
	train_data_stds=train_data_stds,
	train_index=train_indices,
	test_index=test_indices,
)
# Their shapes, in the order the summary lines report them.
shared_shapes = (train_data_means.shape, train_data_stds.shape, train_indices.shape, test_indices.shape)

# One summary line and one .npz archive per split.
print("Date:{}\tData:{}\tMeans:{}\tStds:{}\tTrain_Index:{}\tTest_Index:{}\tTrain".format(train_date.shape, train_data.shape, *shared_shapes))
np.savez(os.path.join(PREPROCESSED_DIR, "train.npz"), date=train_date, data=train_data, **shared_arrays)

print("Date:{}\tData:{}\tMeans:{}\tStds:{}\tTrain_Index:{}\tTest_Index:{}\tValid".format(valid_date.shape, valid_data.shape, *shared_shapes))
np.savez(os.path.join(PREPROCESSED_DIR, "valid.npz"), date=valid_date, data=valid_data, **shared_arrays)

print("Date:{}\tData:{}\tMeans:{}\tStds:{}\tTrain_Index:{}\tTest_Index:{}\tTest".format(test_date.shape, test_data.shape, *shared_shapes))
np.savez(os.path.join(PREPROCESSED_DIR, "test.npz"), date=test_date, data=test_data, **shared_arrays)

Date:(3650,)	Data:(671, 3650, 26)	Means:(26,)	Stds:(26,)	Train_Index:(282,)	Test_Index:(94,)	Train
Date:(730,)	Data:(671, 730, 26)	Means:(26,)	Stds:(26,)	Train_Index:(282,)	Test_Index:(94,)	Valid
Date:(2920,)	Data:(671, 2920, 26)	Means:(26,)	Stds:(26,)	Train_Index:(282,)	Test_Index:(94,)	Test


# CREATE STRIDED DATA

In [9]:
def createStridedData(date, data, window=None, stride=None):
	"""Cut `data` into fixed-length windows along axis 1.

	For every whole window starting at 0, window, 2*window, ... the window itself
	is emitted, followed by a second window shifted by `stride` steps whenever
	that shifted window still fits completely inside `data`. Steps left over
	after the last whole window are dropped rather than raising, so `len(date)`
	no longer has to be a multiple of `window`.

	Args:
		date: sequence whose length gives the number of time steps to window.
		data: array indexed as (node, time, channel); windows are taken on axis 1.
		window: window length in time steps; defaults to config.window.
		stride: offset of the shifted window from each whole window;
			defaults to config.stride.

	Returns:
		float32 array of shape (data.shape[0], n_windows, window, data.shape[-1]).
		n_windows is 0 when `date` is shorter than one window.

	Raises:
		ValueError: if `window` is not positive.
	"""
	window = config.window if window is None else window
	stride = config.stride if stride is None else stride
	if window <= 0:
		raise ValueError("window must be a positive integer, got {}".format(window))

	# At most two windows (aligned + shifted) are produced per whole window.
	full_windows = len(date) // window
	strided_data = np.zeros((data.shape[0], 2*full_windows, window, data.shape[-1]), dtype=np.float32)

	k = 0
	for i in range(0, full_windows*window, window):
		strided_data[:, k] = data[:, i:i+window]
		k += 1

		# Slicing past the end silently shortens the result, so a shifted window
		# is kept only when it is still `window` steps long.
		shifted = data[:, i+stride:i+stride+window]
		if shifted.shape[1] == window:
			strided_data[:, k] = shifted
			k += 1

	# Trim the slots that were reserved but never filled.
	return strided_data[:, :k]

# Window the train, valid and test splits (in that order) with the same routine.
strided_train_data, strided_valid_data, strided_test_data = [
	createStridedData(split_date, split_data)
	for split_date, split_data in (
		(train_date, train_data),
		(valid_date, valid_data),
		(test_date, test_data),
	)
]

# SAVE STRIDED TRAIN & TEST DATA

In [10]:
# Arrays written into every strided archive next to the split's own date/data:
# the train-period statistics and the train/test node indices.
shared_arrays = dict(
	train_data_means=train_data_means,
	train_data_stds=train_data_stds,
	train_index=train_indices,
	test_index=test_indices,
)
# Their shapes, in the order the summary lines report them.
shared_shapes = (train_data_means.shape, train_data_stds.shape, train_indices.shape, test_indices.shape)

# One summary line and one .npz archive per split; `date` is the unwindowed
# date vector of the split, `data` the windowed array.
print("Date:{}\tData:{}\tMeans:{}\tStds:{}\tTrain_Index:{}\tTest_Index:{}\tTrain".format(train_date.shape, strided_train_data.shape, *shared_shapes))
np.savez(os.path.join(PREPROCESSED_DIR, "strided_train.npz"), date=train_date, data=strided_train_data, **shared_arrays)

print("Date:{}\tData:{}\tMeans:{}\tStds:{}\tTrain_Index:{}\tTest_Index:{}\tValid".format(valid_date.shape, strided_valid_data.shape, *shared_shapes))
np.savez(os.path.join(PREPROCESSED_DIR, "strided_valid.npz"), date=valid_date, data=strided_valid_data, **shared_arrays)

print("Date:{}\tData:{}\tMeans:{}\tStds:{}\tTrain_Index:{}\tTest_Index:{}\tTest".format(test_date.shape, strided_test_data.shape, *shared_shapes))
np.savez(os.path.join(PREPROCESSED_DIR, "strided_test.npz"), date=test_date, data=strided_test_data, **shared_arrays)

Date:(3650,)	Data:(671, 19, 365, 26)	Means:(26,)	Stds:(26,)	Train_Index:(282,)	Test_Index:(94,)	Train
Date:(730,)	Data:(671, 3, 365, 26)	Means:(26,)	Stds:(26,)	Train_Index:(282,)	Test_Index:(94,)	Valid
Date:(2920,)	Data:(671, 15, 365, 26)	Means:(26,)	Stds:(26,)	Train_Index:(282,)	Test_Index:(94,)	Test
