In [1]:
import os
import sys

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from models import new_models
from config import load_data

from ray import tune
from ray.air.integrations.mlflow import MLflowLoggerCallback
from ray.tune.schedulers import ASHAScheduler
from torch.optim.lr_scheduler import ReduceLROnPlateau

import mlflow
from mlflow.tracking import MlflowClient

In [2]:
# Notebook-level settings used by the interactive cells below.
sequence_length = 25              # passed to create_lagged_matrix / create_dataloader
batch_size = 200                  # mini-batch size of the notebook-level data loaders
vars = ['Nedbør Nilsebu']         # columns handed to create_lagged_matrix as vars_to_lag

# Data helper bound to the target column 'Q_Kalltveit'.
ld = load_data(
    data_dir="./data/",
    target_variable='Q_Kalltveit',
)

# Lagged feature matrix and target, then the train / validation / test partitions.
X, y = ld.create_lagged_matrix(sequence_length, vars_to_lag=vars)
(
    X_train, y_train,
    X_val, y_val,
    X_test, y_test,
) = ld.split_data(X, y)

In [3]:
# Only the training batches are shuffled. Validation and test data are only
# evaluated, so keeping their order makes evaluation deterministic and keeps
# predictions aligned with the original sample order.
train_dataloader = ld.create_dataloader(X_train, y_train, sequence_length, batch_size=batch_size, shuffle=True)
val_dataloader = ld.create_dataloader(X_val, y_val, sequence_length, batch_size=batch_size, shuffle=False)
test_dataloader = ld.create_dataloader(X_test, y_test, sequence_length, batch_size=batch_size, shuffle=False)

In [4]:
# Sanity check: print the shape of the inputs of the first test batch (if any).
first_batch = next(iter(test_dataloader), None)
if first_batch is not None:
    batch_inputs, _ = first_batch
    print(batch_inputs.shape)

torch.Size([200, 25, 2])


In [5]:
def _run_phase(net, loss_function, optimizer, loader, device, scaler, use_amp, training):
    """Run one full pass over ``loader`` and return the sample-weighted mean loss.

    When ``training`` is true the network is put in train mode and the
    optimizer is stepped after every batch; otherwise the network is put in
    eval mode and autograd is disabled.

    Raises
    ------
    ZeroDivisionError
        If ``loader`` yields no samples.
    """
    if training:
        net.train()
    else:
        net.eval()

    total_samples = 0
    total_loss = 0.0
    for X_batch, y_batch in tqdm(loader):
        # Inputs and targets are both cast to float32 so the loss never mixes
        # dtypes (e.g. float32 predictions against float64 targets).
        X_batch = X_batch.to(device).float()
        y_batch = y_batch.to(device).float()
        if training:
            # Clear gradients *before* the backward pass so that stale
            # gradients left on the parameters cannot leak into this step.
            optimizer.zero_grad()
        with torch.set_grad_enabled(training):  # Autograd activated only during training
            with torch.cuda.amp.autocast(enabled=use_amp):  # Mixed-precision support for compatible GPUs
                batch_output = net(X_batch)
                batch_loss = loss_function(batch_output, y_batch)
            if training:
                scaler.scale(batch_loss).backward()
                scaler.step(optimizer)
                scaler.update()
        n_samples = y_batch.shape[0]
        total_samples += n_samples
        total_loss += n_samples * batch_loss.item()
    return total_loss / total_samples


def fit(net, loss_function, optimizer, data_loader, num_epochs, mode, lr_scheduler, use_amp=False):
    """Train ``net`` for ``num_epochs`` epochs and report the losses to Ray Tune.

    Every epoch runs a training pass and a validation pass; the last epoch
    additionally runs a test pass. The learning-rate scheduler is stepped on
    the validation loss. All losses of an epoch are sent to Tune in a single
    ``tune.report`` call, so one epoch corresponds to one Tune result that
    contains both ``train_loss`` and ``val_loss`` (plus ``test_loss`` on the
    last epoch).

    Parameters
    ----------
    net : torch.nn.Module
        Model to train; it is expected to already live on ``mode["device"]``.
    loss_function : callable
        Called as ``loss_function(prediction, target)``; must return a scalar tensor.
    optimizer : torch.optim.Optimizer
        Optimizer over the parameters of ``net``.
    data_loader : dict
        Mapping with the keys ``"train"``, ``"val"`` and ``"test"``, each an
        iterable of ``(X_batch, y_batch)`` pairs.
    num_epochs : int
        Number of epochs to run.
    mode : dict
        Must contain the key ``"device"`` (the device batches are moved to).
    lr_scheduler
        Scheduler stepped as ``lr_scheduler.step(metrics=val_loss)``.
    use_amp : bool, optional
        Enables CUDA automatic mixed precision (autocast + gradient scaling).

    Returns
    -------
    torch.nn.Module
        The same ``net`` object, trained in place.
    """
    device = mode["device"]
    scaler = torch.cuda.amp.GradScaler(enabled=use_amp)  # Mixed-precision support for compatible GPUs
    print("\nTraining the model:")
    for epoch in range(num_epochs):
        print("\nEpoch", epoch+1)
        train_loss = _run_phase(net, loss_function, optimizer, data_loader["train"],
                                device, scaler, use_amp, training=True)
        val_loss = _run_phase(net, loss_function, optimizer, data_loader["val"],
                              device, scaler, use_amp, training=False)
        lr_scheduler.step(metrics=val_loss)

        metrics = {"train_loss": train_loss, "val_loss": val_loss}
        if epoch == num_epochs - 1:
            # The held-out test set is evaluated once, after the final epoch.
            test_loss = _run_phase(net, loss_function, optimizer, data_loader["test"],
                                   device, scaler, use_amp, training=False)
            print("\nEvaluating the model:")
            print("test", "loss:", test_loss)
            metrics["test_loss"] = test_loss

        # One report per epoch: every tune.report call counts as a separate
        # Tune iteration, so the losses must be reported together.
        tune.report(**metrics)
    return net

In [6]:
from config import load_data

def train_model(config, data_dir):
    """Ray Tune trainable: build data and model for one trial, train it, save the weights.

    The epoch loop (training, validation, final test pass, learning-rate
    scheduling and Tune reporting) is delegated to ``fit``.

    Parameters
    ----------
    config : dict
        Trial configuration. Keys read here: 'arch', 'sequence_length',
        'batch_size', 'num_epochs', 'lr', 'weigth_decay' (sic; a correctly
        spelled 'weight_decay' is accepted too and takes precedence),
        'target_variable', 'hidden_size', 'num_layers', 'cwd' and
        'weights_dir'. Every entry except 'cwd' and 'weights_dir' also becomes
        part of the saved weights file name.
    data_dir : str
        Directory handed to ``load_data``. A relative path is resolved against
        ``config['cwd']``.

    Raises
    ------
    ValueError
        If ``config['arch']`` is not one of the supported architecture names.
    """
    # Fail fast on an unknown architecture instead of hitting a NameError on
    # an undefined ``net`` after the data has already been loaded.
    supported_archs = ("FCN", "FCNTemporalAttention", "LSTM", "LSTMTemporalAttention")
    arch = config['arch']
    if arch not in supported_archs:
        raise ValueError(
            "Unknown architecture {!r}; expected one of {}".format(arch, ", ".join(supported_archs))
        )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    mode = {"name": device.type, "device": device}

    # Define hyperparameters
    sequence_length = config['sequence_length']
    batch_size = config['batch_size']
    num_epochs = config['num_epochs']
    lr = config['lr']
    if 'weight_decay' in config:
        weight_decay = config['weight_decay']
    else:
        weight_decay = config['weigth_decay']  # historical (misspelled) key used by the search space
    lagged_vars = ['Nedbør Nilsebu']

    # Ray Tune may run the trainable with the trial directory as working
    # directory, so a relative data path has to be anchored to the directory
    # the notebook was started from.
    base_dir = config.get('cwd')
    if base_dir and not os.path.isabs(data_dir):
        data_dir = os.path.join(base_dir, data_dir)

    ld = load_data(data_dir = data_dir, target_variable = config['target_variable'])

    X, y = ld.create_lagged_matrix(window_size=sequence_length, vars_to_lag=lagged_vars)

    X_train, y_train, X_val, y_val, X_test, y_test = ld.split_data(X, y)

    # Only the training batches are shuffled; evaluation order is kept deterministic.
    data_loader = {
        "train": ld.create_dataloader(X_train, y_train, sequence_length, batch_size=batch_size, shuffle=True),
        "val": ld.create_dataloader(X_val, y_val, sequence_length, batch_size=batch_size, shuffle=False),
        "test": ld.create_dataloader(X_test, y_test, sequence_length, batch_size=batch_size, shuffle=False),
    }

    # Model inputs
    input_size = X_train.shape[-1]
    hidden_size = config['hidden_size']
    num_layers = config['num_layers']
    output_size = 1

    # Each supported architecture name is also the name of its class in new_models.
    net = getattr(new_models, arch)(input_size, hidden_size, num_layers, output_size)
    net.to(device)

    loss_function = nn.MSELoss().to(device)
    optimizer = optim.Adam(net.parameters(), lr=lr, weight_decay=weight_decay)

    # Halve the learning rate when the validation loss stops improving.
    lr_scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5, verbose=True)

    # Mixed precision stays disabled, as in the original training loop.
    fit(net, loss_function, optimizer, data_loader, num_epochs, mode, lr_scheduler, use_amp=False)

    # File name encodes every config entry except the two path entries.
    out_name = "_".join(
        '{}-{}'.format(k, v) for k, v in config.items() if k not in ('weights_dir', 'cwd')
    )
    out_dir = os.path.join(config['cwd'], config['weights_dir'])
    os.makedirs(out_dir, exist_ok=True)
    torch.save(net.state_dict(), os.path.join(out_dir, out_name + '.pth'))

In [7]:
client = MlflowClient()
cwd = os.getcwd()
exp_base_name = "Test_of_interface"

# Find the first free experiment name "<base>_<i>". create_experiment raises
# when the name cannot be used, in which case the next index is tried.
created = 0
last_error = None
for i in range(100):
    exp_name = exp_base_name+"_{}".format(i)
    try:
        experiment_id = client.create_experiment(exp_name)
    except (TypeError, mlflow.exceptions.MlflowException) as err:
        last_error = err  # kept so the real cause is shown if every attempt fails
        continue
    created = 1
    break

if not created:
    # Raise instead of sys.exit(): in a notebook this stops "Run All" with a
    # readable traceback that includes the last MLflow error.
    raise RuntimeError("ERROR: Try new experiment name.") from last_error

# One weights directory per experiment. makedirs also creates the
# "./model_weights/" root when it does not exist yet.
weights_root = "./model_weights/"
weights_dir = weights_root+exp_name+'/'
os.makedirs(weights_dir, exist_ok=True)

In [8]:
# Absolute path (with trailing separator, like the original "./data/"):
# Ray Tune trials may run with a different working directory than the
# notebook, where a relative "./data/" would no longer point at the data.
data_dir = os.path.join(os.getcwd(), "data", "")
target_variable = 'Q_Kalltveit'

In [9]:
from functools import partial

# Resources requested for every trial. Capped by what this machine offers:
# a request for more CPUs/GPUs than exist would leave every trial PENDING
# forever, while train_model itself falls back to the CPU without a GPU.
cpus_per_trial = min(12, os.cpu_count() or 1)
gpus_per_trial = 1 if torch.cuda.is_available() else 0

# Search space. grid_search entries are enumerated exhaustively; choice and
# loguniform entries are sampled once per grid point (num_samples=1).
config = {
    "mlflow_experiment_id": experiment_id,
    "weights_dir": weights_dir,
    "cwd": cwd,
    "target_variable": target_variable,
    "arch": tune.grid_search(["LSTM", "LSTMTemporalAttention"]), # "FCN", "FCNTemporalAttention", 
    "sequence_length": tune.grid_search([25, 40, 50, 60, 70]),
    'num_epochs': tune.grid_search([150]),
    'num_layers': tune.choice([2, 3, 4]),
    "lr": tune.loguniform(1e-4, 1e-1),
    "weigth_decay": tune.choice([0, 0.001, 0.0001]),  # key spelling (sic) must match what train_model reads
    "batch_size": tune.choice([256, 256*2]),
    "hidden_size": tune.grid_search([64]),
}
# ['Nedbør Nilsebu', 'Q_Lyngsaana']  # NOTE(review): presumably candidate input variables - confirm

analysis = tune.run(
    partial(train_model, data_dir=data_dir),
    config=config,
    resources_per_trial={"cpu": cpus_per_trial, "gpu": gpus_per_trial},
    num_samples=1,
    callbacks=[MLflowLoggerCallback(experiment_name=exp_name)],
)

0,1
Current time:,2023-03-10 19:02:10
Running for:,00:00:20.96
Memory:,17.2/31.9 GiB

Trial name,# failures,error file
train_model_a12f8_00000,1,"C:\Users\magnu\ray_results\train_model_2023-03-10_19-01-49\train_model_a12f8_00000_0_arch=LSTM,batch_size=512,hidden_size=64,lr=0.0047,num_epochs=150,num_layers=4,sequence_length=25,weigth__2023-03-10_19-01-49\error.txt"
train_model_a12f8_00001,1,"C:\Users\magnu\ray_results\train_model_2023-03-10_19-01-49\train_model_a12f8_00001_1_arch=LSTMTemporalAttention,batch_size=256,hidden_size=64,lr=0.0001,num_epochs=150,num_layers=2,sequence__2023-03-10_19-01-58\error.txt"

Trial name,status,loc,arch,batch_size,hidden_size,lr,num_epochs,num_layers,sequence_length,weigth_decay
train_model_a12f8_00002,RUNNING,127.0.0.1:24072,LSTM,256,64,0.00942057,150,3,40,0.001
train_model_a12f8_00003,PENDING,,LSTMTemporalAtt_c7b0,512,64,0.0497403,150,4,40,0.0
train_model_a12f8_00004,PENDING,,LSTM,512,64,0.0837472,150,2,50,0.001
train_model_a12f8_00005,PENDING,,LSTMTemporalAtt_c7b0,512,64,0.00902571,150,3,50,0.001
train_model_a12f8_00006,PENDING,,LSTM,256,64,0.000160153,150,4,60,0.0
train_model_a12f8_00007,PENDING,,LSTMTemporalAtt_c7b0,256,64,0.00402484,150,4,60,0.001
train_model_a12f8_00008,PENDING,,LSTM,512,64,0.0777629,150,4,70,0.0
train_model_a12f8_00009,PENDING,,LSTMTemporalAtt_c7b0,512,64,0.0366631,150,4,70,0.0001
train_model_a12f8_00000,ERROR,127.0.0.1:23596,LSTM,512,64,0.00473555,150,4,25,0.0
train_model_a12f8_00001,ERROR,127.0.0.1:24484,LSTMTemporalAtt_c7b0,256,64,0.000108066,150,2,25,0.0001
