In [3]:
import yaml

class MyDumper(yaml.Dumper):
    """YAML dumper that indents block sequences nested under a mapping key.

    ``increase_indent`` always forwards ``indentless=False`` to the parent
    dumper, regardless of what the caller asked for.
    """

    def increase_indent(self, flow=False, indentless=False):
        # The incoming ``indentless`` value is deliberately ignored.
        return super().increase_indent(flow=flow, indentless=False)

# Sample mapping containing a nested list of mappings, used to show how
# MyDumper indents list items.
foo = dict(
    name='foo',
    my_list=[
        dict(foo='test', bar='test2'),
        dict(foo='test3', bar='test4'),
    ],
    hello='world',
)

print(yaml.dump(foo, default_flow_style=False, Dumper=MyDumper))

hello: world
my_list:
  - bar: test2
    foo: test
  - bar: test4
    foo: test3
name: foo



Simple LSTM: forecasting the passenger time series

In [None]:
import os
from models.LSTM import LSTM

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
from utils import series_to_samples

# Initialize an empty array
# main_array = np.empty((0, 3))  # Adjust the shape depending on the dimensions of the arrays you're stacking

# # Simulating a loop where new data arrays are added
# for i in range(5):
#     # Generate some sample data (each array has 3 elements)
#     new_array = np.array([i, i + 1, i + 2])
    
#     # Reshape the new_array to make sure it has the same shape as the main_array for stacking
#     new_array = new_array.reshape(1, -1)
    
#     if i == 0:
#         print(main_array.shape)
#         print(new_array.shape)
#     # Stack the new array vertically
#     main_array = np.vstack([main_array, new_array])

# print(main_array)

def create_dataset(dataset, lookback):
    """Transform a time series into a prediction dataset.

    Every sample is a window of ``lookback`` consecutive time steps; its target
    is the same window shifted forward by one step.

    Args:
        dataset: A numpy array of time series, first dimension is the time steps
        lookback: Size of window for prediction

    Returns:
        A tuple ``(X, y)`` of tensors, each holding ``len(dataset) - lookback``
        windows along the first dimension. When the series is too short to
        form a single window, both tensors are empty but still carry the
        ``(0, lookback, ...)`` shape and the dtype of the input.
    """
    # Convert once up front. Building a tensor from a Python list of numpy
    # arrays (the previous approach) is the slow path and triggers a PyTorch
    # UserWarning; slicing a tensor and stacking the slices avoids both.
    series = torch.as_tensor(dataset)
    n_windows = len(series) - lookback

    if n_windows <= 0:
        empty = series.new_empty((0, lookback) + tuple(series.shape[1:]))
        return empty, empty.clone()

    # torch.stack copies, so the results do not alias the caller's array.
    X = torch.stack([series[i:i + lookback] for i in range(n_windows)])
    y = torch.stack([series[i + 1:i + lookback + 1] for i in range(n_windows)])
    return X, y

# Load the "Passengers" column as a 2-D float32 array (one column, one row per time step).
df = pd.read_csv('code/data/passengers.csv')
timeseries = df[["Passengers"]].values.astype('float32')

# Chronological train/test split: the first 80% of the steps train the model,
# the remainder is held out.
train_size = int(0.8 * len(timeseries))
test_size = len(timeseries) - train_size
train = timeseries[:train_size]
test = timeseries[train_size:]

# Turn each split into (window, shifted-window) pairs.
lookback = 6
X_train, y_train = create_dataset(train, lookback=lookback)
X_test, y_test = create_dataset(test, lookback=lookback)

# Quick look at the resulting shapes and the first two samples.
print(X_train.shape)
print(X_train[:2])
print(y_train[:2])

class AirModel(nn.Module):
    """Four-layer LSTM (hidden size 50) followed by a linear layer with one output."""

    def __init__(self):
        super().__init__()
        # batch_first=True: the LSTM receives inputs with the batch dimension first.
        self.lstm = nn.LSTM(
            input_size=1,
            hidden_size=50,
            num_layers=4,
            batch_first=True,
        )
        self.linear = nn.Linear(in_features=50, out_features=1)

    def forward(self, x):
        """Run ``x`` through the LSTM and apply the linear layer to its output sequence."""
        lstm_out, _final_state = self.lstm(x)
        return self.linear(lstm_out)

# Model, optimizer, loss and mini-batch loader for training.
model = AirModel()
optimizer = optim.Adam(model.parameters(), weight_decay=0.1)
loss_fn = nn.MSELoss()
loader = data.DataLoader(data.TensorDataset(X_train, y_train), shuffle=True, batch_size=8)

n_epochs = 2000
for epoch in range(n_epochs):
    model.train()
    for X_batch, y_batch in loader:
        y_pred = model(X_batch)
        loss = loss_fn(y_pred, y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # Validation: only report every 100th epoch.
    if epoch % 100 != 0:
        continue
    model.eval()
    with torch.no_grad():
        # Compute the RMSE with torch.sqrt and convert to a plain Python float
        # via .item(), instead of passing a torch tensor through a NumPy ufunc.
        y_pred = model(X_train)
        train_rmse = torch.sqrt(loss_fn(y_pred, y_train)).item()
        y_pred = model(X_test)
        test_rmse = torch.sqrt(loss_fn(y_pred, y_test)).item()
    print("Epoch %d: train RMSE %.4f, test RMSE %.4f" % (epoch, train_rmse, test_rmse))

# Switch to evaluation mode before predicting; the training loop may have left
# the model in train mode.
model.eval()
with torch.no_grad():
    # shift train predictions for plotting: keep only the last step of each
    # predicted window and place it at the time index it forecasts.
    train_plot = np.full_like(timeseries, np.nan)
    y_pred = model(X_train)[:, -1, :]  # computed once and reused below
    train_plot[lookback:train_size] = y_pred.numpy()
    # shift test predictions for plotting
    test_plot = np.full_like(timeseries, np.nan)
    test_plot[train_size+lookback:len(timeseries)] = model(X_test)[:, -1, :].numpy()
# plot
plt.plot(timeseries)
plt.plot(train_plot, c='r')
plt.plot(test_plot, c='g')
plt.show()

Test the attribute files for missing rows

In [None]:
import os
import pandas as pd

from utils import get_files

def get_name(path):
    """Return the final component of ``path`` with its last extension removed."""
    filename = os.path.basename(path)
    stem, _extension = os.path.splitext(filename)
    return stem

def check_folder(folder_name):
    """Report attribute files whose ``gauge_id`` column does not line up with the data files.

    For each attribute CSV under ``data/Caravan/attributes/<folder_name>``, the
    ``gauge_id`` values are compared position by position with the base names
    (extension removed) of the files under
    ``data/Caravan/timeseries/csv/<folder_name>``. The first disagreement in an
    attribute file prints one message naming the folder and that file.

    Args:
        folder_name: Sub-folder to check below both Caravan directories.
    """
    # NOTE(review): assumes get_files returns (files, dirs) with the files in
    # the same order as the attribute rows; confirm against utils.get_files.
    data_files, _ = get_files(f'data/Caravan/timeseries/csv/{folder_name}')
    attribute_files, _ = get_files(f'data/Caravan/attributes/{folder_name}')

    data_files_names = list(map(get_name, data_files))

    for att_path in attribute_files:
        # Keep the path in its own variable so the message below names the
        # file instead of dumping the whole DataFrame.
        att_names = pd.read_csv(att_path)['gauge_id'].to_list()

        for i, att_name in enumerate(att_names):
            # An attribute row without a corresponding data file is also a
            # mismatch (previously this raised IndexError).
            if i >= len(data_files_names) or att_name != data_files_names[i]:
                print(f'mismatch at ({folder_name}, {att_path})')
                break
        # NOTE(review): attribute files with FEWER rows than data files are not
        # flagged when the rows they do have match; confirm whether that is intended.

# print(get_name('data/Caravan/timeseries/csv/camelsaus'))
# Check every sub-folder listed under the time-series CSV directory.
files, dirs = get_files('data/Caravan/timeseries/csv')

# Avoid naming the loop variable ``dir``: it would shadow the builtin for the
# rest of the notebook session.
for folder_name in dirs:
    check_folder(folder_name)

Check whether all CSV files have the same headers, and locate NaN gaps in the streamflow column

In [None]:
import os
import pandas as pd

def check_csv_headers(directory, files):
    """Return the header shared by all CSV files in ``files``, or ``None``.

    Args:
        directory: Folder that contains the files.
        files: File names (relative to ``directory``); names that do not end
            in ``.csv`` are ignored.

    Returns:
        The list of column names if every readable CSV has the same columns in
        the same order. ``None`` if at least one file differs from the first
        one read, or if no CSV could be read at all.
    """
    headers = None
    all_same = True

    print('checking ', len(files), ' files in ', directory)

    for filename in files:
        if not filename.endswith(".csv"):
            continue

        file_path = os.path.join(directory, filename)
        try:
            # nrows=0 parses only the header row; the data rows are not needed
            # to compare column names.
            current_headers = list(pd.read_csv(file_path, nrows=0).columns)
        except Exception as e:
            # Best effort: report the unreadable file and keep checking the rest.
            print(f"Error reading {filename}: {e}")
            continue

        if headers is None:
            # The first readable file defines the reference header.
            headers = current_headers
        elif headers != current_headers:
            all_same = False

    return headers if all_same else None
    
def check_nans(directory, files):
    """Print every interior run of NaNs in the ``streamflow`` column of each CSV.

    A run is reported once it ends (a non-NaN value follows it), and only if it
    began after at least one non-NaN value; a run starting at the first row and
    a run that lasts until the final row are therefore not reported.

    Args:
        directory: Folder that contains the files.
        files: File names (relative to ``directory``); names that do not end
            in ``.csv`` are ignored.
    """
    for filename in files:
        if not filename.endswith(".csv"):
            continue

        file_path = os.path.join(directory, filename)
        missing = pd.read_csv(file_path)['streamflow'].isna().values

        # Index where the most recent NaN run began; 0 means "none seen yet".
        run_start = 0
        for idx in range(1, len(missing)):
            prev_missing = missing[idx - 1]
            cur_missing = missing[idx]
            if prev_missing and not cur_missing:
                # A NaN run just ended at idx - 1.
                if run_start > 0:
                    print('NaN break at [', run_start, ', ', idx - 1, '] in ', file_path)
            elif cur_missing and not prev_missing:
                # A NaN run begins here.
                run_start = idx


def check_all_csv_under_root(root, func: callable):
    """Apply ``func`` to every folder below ``root`` that contains files and compare results.

    ``func(folder, files)`` is expected to return the folder's common header
    list, or ``None`` when the folder's files do not agree. A per-folder line
    is printed for each call, followed by a final summary.

    Args:
        root: Directory to walk recursively.
        func: Callable taking ``(folder_path, file_names)``.
    """
    headers = None
    mismatched_folders = []

    # Use a distinct loop name so the ``root`` argument is not overwritten.
    for folder, _dirs, files in os.walk(root):
        if not files:
            continue

        new_headers = func(folder, files)
        if new_headers is None:
            print('folder ', folder, ' has mismatched columns')
            # A folder that is inconsistent internally must also prevent the
            # final "all the same" message (previously it did not).
            mismatched_folders.append(folder)
        else:
            print('folder ', folder, ' has matched columns')
            if headers is None:
                # The first consistent folder defines the reference header.
                headers = new_headers
            elif headers != new_headers:
                mismatched_folders.append(folder)

    if not mismatched_folders:
        print("All CSV files have the same columns.")
    else:
        print("Folders with mismatched columns:", mismatched_folders)

# Example usage: walk one dataset folder and report NaN gaps in each CSV.
# NOTE(review): check_nans has no return statement, so the walker receives None
# for every folder and prints its "has mismatched columns" line; pass
# check_csv_headers instead when the column comparison is what is wanted.
folder_path = "./data/Caravan/timeseries/csv/camelsaus"
check_all_csv_under_root(folder_path, check_nans)
