# Data processing

In [1]:
#import packages

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import os


The input to the model consists of the DEM and the first measurement of the water depth. In that way the U-Net model knows from which direction the water is coming. The model is trained on the training dataset and tested on three different test datasets. The training data is split into a training part and a validation part in another notebook.

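In order to run this notebook, all the datasets need to be downloaded and stored locally, and the paths defined below (`base_path_DEM`, `base_path_WD` and `output_folder`) need to point to the folders where they are stored.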

In [2]:
# Choose the testing data set

test_dataset = 3 # can be 1, 2 or 3

# Define numbers for the test and training data. These refer to the names of the data files
# (DEM_<number>.txt and WD_<number>.txt).
# For the first two datasets, there are 97 time steps of 30 minutes, for the third dataset
# there are 241 time steps of 30 minutes, of which only the first 98 are used.
# `time` is the number of time steps read from each water-depth record.

if test_dataset == 1:
    numbers_test = np.arange(500, 519 + 1)        # files 500 ... 519
    time = 97
elif test_dataset == 2:
    numbers_test = np.arange(10000, 10020 + 1)    # files 10000 ... 10020
    time = 97
elif test_dataset == 3:
    numbers_test = np.arange(15001, 15010 + 1)    # files 15001 ... 15010
    time = 98
else:
    # Fail here instead of leaving `numbers_test` / `time` undefined (or stale
    # from a previous run of this cell) and crashing further down.
    raise ValueError(f"test_dataset must be 1, 2 or 3, got {test_dataset!r}")

# The training data is the same for all datasets
numbers_train = np.arange(1, 80 + 1)              # files 1 ... 80


In [13]:
# Define path to stored DEMs and waterdepths
#
# The default root is the location used on the original author's machine.
# Set the FLOOD_DATA_ROOT environment variable to the folder that contains the
# `DEM`, `WD` and `Data_processed` sub-folders to run the notebook elsewhere
# without editing this cell.
data_root = os.environ.get(
    'FLOOD_DATA_ROOT',
    'C:/Users/emma1/Dropbox/Env Eng 23-24/DSAIE/Project/raw_datasets/raw_datasets',
)

base_path_DEM = f'{data_root}/DEM'
base_path_WD = f'{data_root}/WD'

output_folder = f'{data_root}/Data_processed'

In [4]:
# Loading the DEMs and waterdepth data
def _load_series(indices):
    """Read DEM_<k>.txt and WD_<k>.txt for every k in `indices`.

    Returns two lists (DEM arrays, water-depth arrays) in the order of `indices`.
    """
    dems, depths = [], []
    for k in indices:
        dems.append(np.loadtxt(os.path.join(base_path_DEM, f"DEM_{k}.txt")))
        depths.append(np.loadtxt(os.path.join(base_path_WD, f"WD_{k}.txt")))
    return dems, depths


train_DEM, train_WD = _load_series(numbers_train)
test_DEM, test_WD = _load_series(numbers_test)


## Creating and reshaping training and testing data

With the function below, the input to the model is created. For every location, the input is a tensor containing the DEM and the corresponding first non-zero water depth measurement.

In [5]:
def input_processing(DEM, WD, time):
    """Process DEM and WD into the two-channel format used as model input.

    For every location a float32 tensor of shape (2, n_y, n_x) is created:
    channel 0 is the DEM grid and channel 1 is the first time step, among the
    first `time` rows of the water-depth record, that contains a non-zero
    value (the start of the flooding event).

    Parameters
    ----------
    DEM : sequence of 2D array-like
        One entry per location; each row is read as (x, y, value), where the
        third column is reshaped into the grid.
    WD : sequence of 2D array-like
        One entry per location; row `t` holds the water depth of every grid
        cell at time step `t`.
    time : int
        Number of time steps (rows of each WD entry) to search.

    Returns
    -------
    list of torch.Tensor
        One (2, n_y, n_x) float32 tensor per location, in input order.

    Raises
    ------
    ValueError
        If a location has no non-zero water depth within the first `time` steps.
    """
    inputs = []

    for i in range(len(DEM)):
        dem = np.asarray(DEM[i])
        wd = np.asarray(WD[i])

        # The grid size follows from the number of distinct x and y coordinates.
        # NOTE(review): the reshape assumes the cells are listed row by row - confirm.
        n_x = len(np.unique(dem[:, 0]))
        n_y = len(np.unique(dem[:, 1]))
        z = dem[:, 2].reshape(n_y, n_x)

        # The first timestep with flooding is selected, this is an input in the
        # model so it knows where the water is coming from. Stop at the first
        # hit instead of reshaping every flooded time step.
        first_WD = None
        for line in range(time):
            time_step = wd[line]
            if np.any(time_step != 0):
                first_WD = time_step.reshape(n_y, n_x)
                break

        if first_WD is None:
            raise ValueError(
                f"No non-zero water depth found in the first {time} time steps of sample {i}"
            )

        # Stack into one ndarray first: building a tensor from a list of
        # ndarrays is very slow and triggers a PyTorch UserWarning.
        inputs.append(torch.tensor(np.stack([z, first_WD]), dtype=torch.float32))
    return inputs
    
    
    

In [6]:
# The training records span 97 time steps; the test horizon follows the selected test dataset
train_input = input_processing(DEM=train_DEM, WD=train_WD, time=97)
test_input = input_processing(DEM=test_DEM, WD=test_WD, time=time)

  inputs.append(torch.tensor(input_val, dtype=torch.float32))


An example of the input is shown below: a tensor with the DEM and the corresponding first water depth measured. In dataset 1, every flooding event starts in the top left corner.

In [7]:
# Inspect the first training input: channel 0 is the DEM grid, channel 1 the first non-zero water depth
print(train_input[0])

tensor([[[ 0.0000, -0.4338, -0.9741,  ...,  1.3949,  1.1974,  0.8884],
         [ 0.3768, -0.0538, -0.5870,  ...,  1.3094,  0.9917,  0.5849],
         [ 0.7063,  0.2907, -0.2208,  ...,  1.1202,  0.6850,  0.1888],
         ...,
         [-0.1749, -0.3337, -0.6197,  ...,  1.1239,  1.0451,  0.7989],
         [-0.1829, -0.2944, -0.5222,  ...,  0.6289,  0.6465,  0.5163],
         [-0.1654, -0.2384, -0.4206,  ...,  0.2556,  0.3745,  0.3580]],

        [[ 0.0100,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]]])


Below, the targets for the training and testing data are computed. These consist of the measured water depths at the different time steps. The training dataset consists of 97 time steps of 30 minutes, of which the first time step is not part of the targets.

In [8]:
def create_targets(DEM, WD, time):
    """Create the water-depth (WD) targets in the format used by the model.

    For every location, time steps 1 .. time-1 of the water-depth record are
    reshaped to the DEM grid and stacked into a single float32 tensor. Time
    step 0 is skipped.

    Parameters
    ----------
    DEM : sequence of 2D array-like
        One entry per location; only the first two columns (x, y) are read,
        to determine the grid size.
    WD : sequence of 2D array-like
        One entry per location; row `t` holds the water depth of every grid
        cell at time step `t`.
    time : int
        Exclusive upper bound of the time steps to include.

    Returns
    -------
    list of torch.Tensor
        One tensor of shape (time - 1, n_y, n_x) per location, in input order.
    """
    targets = []
    for i in range(len(WD)):
        dem = np.asarray(DEM[i])
        wd = np.asarray(WD[i])

        # The grid size follows from the number of distinct x and y coordinates
        n_x = len(np.unique(dem[:, 0]))
        n_y = len(np.unique(dem[:, 1]))

        # The water depths at all the timesteps after the first are reshaped to the grid
        frames = [wd[row].reshape(n_y, n_x) for row in range(1, time)]

        # Convert to one ndarray before creating the tensor: building a tensor
        # from a list of ndarrays is very slow and triggers a PyTorch UserWarning.
        targets.append(torch.tensor(np.array(frames), dtype=torch.float32))
    return targets
        

In [9]:
# The training records span 97 time steps; the test horizon follows the selected test dataset
train_targets = create_targets(DEM=train_DEM, WD=train_WD, time=97)
test_targets = create_targets(DEM=test_DEM, WD=test_WD, time=time)

# Sanity check: number of target time steps of the third training sample
print(len(train_targets[2]))

96


The training input and targets are combined in one training dataset: train. The same is done for the testing dataset: test

In [10]:
# Pair every input tensor with its target tensor as (input, target) tuples
train = list(zip(train_input, train_targets))
test = list(zip(test_input, test_targets))

In [11]:
# Number of target time steps of the first test sample (time - 1 frames are created per sample)
print(len(test[0][1]))

97


The training and testing datasets can be stored with the code below.

In [16]:
# Derive the file name from the selected test dataset, so that choosing another
# `test_dataset` cannot silently write its data under the dataset-3 name.
test_name = f'test_dataset_{test_dataset}_CNN_time_two_inputs.pt'

# torch.save does not create missing directories
os.makedirs(output_folder, exist_ok=True)

output_file_path_test = os.path.join(output_folder, test_name)
torch.save(test, output_file_path_test)


# Saving the training set is disabled; uncomment the lines below to (re)write it.
#train_name = 'train_dataset_CNN_time_two_inputs.pt'

#output_file_path_train = os.path.join(output_folder, train_name)
#torch.save(train, output_file_path_train)
