In [1]:
import os

import torch
import torch.nn as nn
import torch.onnx
import torch.optim as optim
from torch.utils.data import DataLoader

from classes import SpeedEstimatorRNN, VehicleSpeedDataset, SpeedEstimatorLSTM, SpeedEstimatorGRU

In [2]:
# Report which compute device PyTorch can see before any training starts.
if not torch.cuda.is_available():
    print("CUDA is not available. Training will be performed on the CPU.")
else:
    print("CUDA is available! You can use a GPU for training.")
    print("Number of GPUs available:", torch.cuda.device_count())
    active_gpu = torch.cuda.current_device()
    print("Current GPU being used:", active_gpu)
    print("GPU Name:", torch.cuda.get_device_name(torch.cuda.current_device()))

CUDA is available! You can use a GPU for training.
Number of GPUs available: 1
Current GPU being used: 0
GPU Name: NVIDIA GeForce RTX 3050 Laptop GPU


In [3]:
# Dataset locations: folders holding the training and test CSV files.
training_data_path = "data/i7/it_1/1_training"
extension = "*.csv"

test_data_path = "data/i7/it_1/2_testing"

# Per-model hyperparameters: every list below holds one entry per model
# variant, and the training cells index them with the model number j.
input_size = 20  # Number of CAN signals per timestep
hidden_size = [128, 256, 512, 512, 768]

# The number of variants follows from the hyperparameter lists instead of
# being maintained by hand, so adding or removing a variant cannot leave the
# count out of sync with the lists.
num_models = len(hidden_size)

num_layers = [2, 3, 2, 3, 3]
learning_rate = [0.0001] * num_models
# Number of sequences in one batch
batch_size = [128] * num_models
# NOTE(review): dropout_rate is not referenced by any training cell visible in
# this notebook — confirm whether the models are meant to receive it.
dropout_rate = [0.2] * num_models
sequence_length = [800, 800, 800, 800, 1000]

# Fail here, not in the middle of a training run, when the lists disagree.
_per_model_settings = {
    "hidden_size": hidden_size,
    "num_layers": num_layers,
    "learning_rate": learning_rate,
    "batch_size": batch_size,
    "dropout_rate": dropout_rate,
    "sequence_length": sequence_length,
}
_wrong_lengths = {
    _name: len(_values)
    for _name, _values in _per_model_settings.items()
    if len(_values) != num_models
}
if _wrong_lengths:
    raise ValueError(
        f"Every per-model hyperparameter list needs {num_models} entries, got: {_wrong_lengths}"
    )

# Settings shared by every model.
# step_size is handed to VehicleSpeedDataset as `step_size`; presumably the
# stride between the starts of consecutive extracted sequences — TODO confirm.
step_size = 10
output_size = 2
num_epochs = 35

# Output path prefixes; the training cells append the model index and a file
# extension to these.
location_state = "Simple RNN/trained_models/i7/it_1/state_models/model_"
location_traced = "Simple RNN/trained_models/i7/it_1/traced_models/model_"

location_state_LSTM = "LSTM/trained_models/i7/it_1/state_models/model_LSTM_"
location_traced_LSTM = "LSTM/trained_models/i7/it_1/traced_models/model_LSTM_"

# NOTE(review): the GRU outputs go to it_2 while the data paths and the other
# model families use it_1 — confirm this is intended.
location_state_GRU = "GRU/trained_models/i7/it_2/state_models/model_GRU_"
location_traced_GRU = "GRU/trained_models/i7/it_2/traced_models/model_GRU_"

RNN

In [None]:
# Train every simple-RNN configuration in turn. After each epoch the model is
# scored on the test split; whenever that score improves, the weights are
# checkpointed and the model is re-exported to ONNX. Training of a model stops
# early once `patience` consecutive epochs bring no improvement.
# NOTE(review): the test split also drives checkpoint selection and early
# stopping, so the printed test loss is not an independent estimate.
patience = 5

# Create the output folders up front: a missing folder would otherwise only
# surface at the first save, after a full epoch of training has been spent.
for output_prefix in (location_state, location_traced):
    output_dir = os.path.dirname(output_prefix)
    if output_dir:  # a bare filename prefix has no folder to create
        os.makedirs(output_dir, exist_ok=True)

# Training loops
for j in range(num_models):

    early_stopping_counter = 0
    best_test_loss = float('inf')

    print("-------------------------------------")
    print(f"Training has started for simple RNN model {j}")

    # Load dataset and DataLoader (rebuilt per model: the sequence length depends on j)
    train_dataset = VehicleSpeedDataset(training_data_path, extension, seq_length=sequence_length[j], step_size=step_size)
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size[j], shuffle=True, num_workers=6, pin_memory=True)

    # Load test dataset and DataLoader
    test_dataset = VehicleSpeedDataset(test_data_path, extension, seq_length=sequence_length[j], step_size=step_size)
    test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False)  # Batch size = 1 for test evaluation

    # Initialize model, loss function, and optimizer
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = SpeedEstimatorRNN(input_size, hidden_size[j], num_layers[j], output_size).to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate[j])

    # Dummy input of shape (1, sequence_length, input_size), used only for the ONNX export
    example_input = torch.rand(1, sequence_length[j], input_size).to(device)

    for epoch in range(num_epochs):
        # ---- training pass ----
        model.train()
        total_train_loss = 0

        for batch_idx, (features, speeds) in enumerate(train_dataloader):
            speeds = speeds.squeeze(1)  # Drops dimension 1 of speeds when it has size 1
            features, speeds = features.to(device), speeds.to(device)

            # Forward pass
            outputs = model(features)

            # MSELoss would broadcast mismatched shapes, so check them explicitly
            assert outputs.shape == speeds.shape, f"Shape mismatch: outputs {outputs.shape} vs speeds {speeds.shape}"

            train_loss = criterion(outputs, speeds)

            # Backward pass
            optimizer.zero_grad()
            train_loss.backward()
            optimizer.step()

            total_train_loss += train_loss.item()

        print(f"Model: {j}, Epoch [{epoch+1}/{num_epochs}], Loss: {total_train_loss/len(train_dataloader):.4f}")

        # ---- evaluation pass ----
        model.eval()
        total_test_loss = 0

        with torch.no_grad():  # No need to compute gradients for validation/test
            for features, speeds in test_dataloader:
                speeds = speeds.squeeze(1)
                features, speeds = features.to(device), speeds.to(device)

                # Forward pass
                test_outputs = model(features)
                test_loss = criterion(test_outputs, speeds)

                total_test_loss += test_loss.item()

        avg_test_loss = total_test_loss / len(test_dataloader)

        print(f"Model: {j}, Epoch [{epoch+1}/{num_epochs}], Test Loss: {avg_test_loss:.4f}")

        # Checkpoint: Save model if test loss improves
        if avg_test_loss < best_test_loss:
            print(f"New best model found! Test Loss improved from {best_test_loss:.4f} to {avg_test_loss:.4f}")
            best_test_loss = avg_test_loss
            early_stopping_counter = 0

            # Save model state and optimizer state. The hyperparameter entries
            # are the full per-model lists; "model_index" records which
            # position in those lists belongs to this checkpoint.
            torch.save({
                "model_state_dict": model.state_dict(),
                "optimizer_state_dict": optimizer.state_dict(),
                "model_index": j,
                "sequence_length": sequence_length,
                "input_size": input_size,
                "hidden_size": hidden_size,
                "num_layers": num_layers,
                "output_size": output_size,
                "learning_rate": learning_rate,
                "num_epochs": num_epochs
            }, location_state + str(j) + ".pt")

            # TorchScript tracing for MATLAB is disabled in this cell; only the
            # ONNX export below is produced.

            # Export model to ONNX
            onnx_model_path = location_traced + str(j) + "_traced.onnx"

            torch.onnx.export(
                model,                     # PyTorch model
                example_input,             # Example input with the model's expected dimensions
                onnx_model_path,           # Output filename
                export_params=True,
                opset_version=11,          # MATLAB supports up to opset 11/12 reliably
                do_constant_folding=True,
                input_names=['input'],
                output_names=['output'],
                dynamic_axes={'input': {0: 'batch_size'}, 'output': {0: 'batch_size'}}
            )

            print(f"all model_{j} saved")
        else:
            early_stopping_counter += 1
            print(f"Test loss has not improved; early stopping counter: {early_stopping_counter}")

        if early_stopping_counter >= patience:
            print("Early stopping triggered -> starting next model!")
            print("------------------------------------------------")
            break  # Exit the training loop early

    # Reached only by running through all epochs without exhausting patience
    if early_stopping_counter < patience:
        print("We're out of epochs but patience limit has not been reached -> starting next model!")
        print("-----------------------------------------------------------------------------------")

LSTM

In [None]:
# Train the LSTM configurations in turn. After each epoch the model is scored
# on the test split; whenever that score improves, the weights are
# checkpointed and the model is re-exported to ONNX. Training of a model stops
# early once `patience` consecutive epochs bring no improvement.
# NOTE(review): the test split also drives checkpoint selection and early
# stopping, so the printed test loss is not an independent estimate.
patience = 5

# Create the output folders up front: a missing folder would otherwise only
# surface at the first save, after a full epoch of training has been spent.
for output_prefix in (location_state_LSTM, location_traced_LSTM):
    output_dir = os.path.dirname(output_prefix)
    if output_dir:  # a bare filename prefix has no folder to create
        os.makedirs(output_dir, exist_ok=True)

# Training loops
# NOTE(review): the range starts at 1, so model 0 is never trained by this
# cell (the simple-RNN cell starts at 0) — confirm this is intentional.
for j in range(1, num_models):

    early_stopping_counter = 0
    best_test_loss = float('inf')

    print("-------------------------------------")
    print(f"Training has started for LSTM model {j}")

    # Load dataset and DataLoader (rebuilt per model: the sequence length depends on j)
    train_dataset = VehicleSpeedDataset(training_data_path, extension, seq_length=sequence_length[j], step_size=step_size)
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size[j], shuffle=True, num_workers=6, pin_memory=True)

    # Load test dataset and DataLoader
    test_dataset = VehicleSpeedDataset(test_data_path, extension, seq_length=sequence_length[j], step_size=step_size)
    test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False)  # Batch size = 1 for test evaluation

    # Initialize model, loss function, and optimizer
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = SpeedEstimatorLSTM(input_size, hidden_size[j], num_layers[j], output_size).to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate[j])

    # Dummy input of shape (1, sequence_length, input_size), used only for the ONNX export
    example_input = torch.rand(1, sequence_length[j], input_size).to(device)

    for epoch in range(num_epochs):
        # ---- training pass ----
        model.train()
        total_train_loss = 0

        for batch_idx, (features, speeds) in enumerate(train_dataloader):
            speeds = speeds.squeeze(1)  # Drops dimension 1 of speeds when it has size 1
            features, speeds = features.to(device), speeds.to(device)

            # Forward pass
            outputs = model(features)

            # MSELoss would broadcast mismatched shapes, so check them explicitly
            assert outputs.shape == speeds.shape, f"Shape mismatch: outputs {outputs.shape} vs speeds {speeds.shape}"

            train_loss = criterion(outputs, speeds)

            # Backward pass
            optimizer.zero_grad()
            train_loss.backward()
            optimizer.step()

            total_train_loss += train_loss.item()

        print(f"Model: {j}, Epoch [{epoch+1}/{num_epochs}], Loss: {total_train_loss/len(train_dataloader):.4f}")

        # ---- evaluation pass ----
        model.eval()
        total_test_loss = 0

        with torch.no_grad():  # No need to compute gradients for validation/test
            for features, speeds in test_dataloader:
                speeds = speeds.squeeze(1)
                features, speeds = features.to(device), speeds.to(device)

                # Forward pass
                test_outputs = model(features)
                test_loss = criterion(test_outputs, speeds)

                total_test_loss += test_loss.item()

        avg_test_loss = total_test_loss / len(test_dataloader)

        print(f"Model: {j}, Epoch [{epoch+1}/{num_epochs}], Test Loss: {avg_test_loss:.4f}")

        # Checkpoint: Save model if test loss improves
        if avg_test_loss < best_test_loss:
            print(f"New best model found! Test Loss improved from {best_test_loss:.4f} to {avg_test_loss:.4f}")
            best_test_loss = avg_test_loss
            early_stopping_counter = 0

            # Save model state and optimizer state. The hyperparameter entries
            # are the full per-model lists; "model_index" records which
            # position in those lists belongs to this checkpoint.
            torch.save({
                "model_state_dict": model.state_dict(),
                "optimizer_state_dict": optimizer.state_dict(),
                "model_index": j,
                "sequence_length": sequence_length,
                "input_size": input_size,
                "hidden_size": hidden_size,
                "num_layers": num_layers,
                "output_size": output_size,
                "learning_rate": learning_rate,
                "num_epochs": num_epochs
            }, location_state_LSTM + str(j) + ".pt")

            # TorchScript tracing for MATLAB is disabled in this cell; only the
            # ONNX export below is produced.

            # Export model to ONNX
            onnx_model_path = location_traced_LSTM + str(j) + "_traced.onnx"

            torch.onnx.export(
                model,                     # PyTorch model
                example_input,             # Example input with the model's expected dimensions
                onnx_model_path,           # Output filename
                export_params=True,
                opset_version=11,          # MATLAB supports up to opset 11/12 reliably
                do_constant_folding=True,
                input_names=['input'],
                output_names=['output'],
                dynamic_axes={'input': {0: 'batch_size'}, 'output': {0: 'batch_size'}}
            )

            print(f"all model_{j} saved")
        else:
            early_stopping_counter += 1
            print(f"Test loss has not improved; early stopping counter: {early_stopping_counter}")

        if early_stopping_counter >= patience:
            print("Early stopping triggered -> starting next model!")
            print("------------------------------------------------")
            break  # Exit the training loop early

    # Reached only by running through all epochs without exhausting patience
    if early_stopping_counter < patience:
        print("We're out of epochs but patience limit has not been reached -> starting next model!")
        print("-----------------------------------------------------------------------------------")

GRU

In [6]:
# Train the GRU configurations in turn. After each epoch the model is scored
# on the test split; whenever that score improves, the weights are
# checkpointed and the model is exported both as traced TorchScript and as
# ONNX. Training of a model stops early once `patience` consecutive epochs
# bring no improvement.
# NOTE(review): the test split also drives checkpoint selection and early
# stopping, so the printed test loss is not an independent estimate.
patience = 5

# Create the output folders up front: a missing folder would otherwise only
# surface at the first save, after a full epoch of training has been spent.
for output_prefix in (location_state_GRU, location_traced_GRU):
    output_dir = os.path.dirname(output_prefix)
    if output_dir:  # a bare filename prefix has no folder to create
        os.makedirs(output_dir, exist_ok=True)

# Training loops
# NOTE(review): the range starts at 1, so model 0 is never trained by this
# cell (the simple-RNN cell starts at 0) — confirm this is intentional.
for j in range(1, num_models):

    early_stopping_counter = 0
    best_test_loss = float('inf')

    print("-------------------------------------")
    print(f"Training has started for GRU model {j}")

    # Load dataset and DataLoader (rebuilt per model: the sequence length depends on j)
    train_dataset = VehicleSpeedDataset(training_data_path, extension, seq_length=sequence_length[j], step_size=step_size)
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size[j], shuffle=True, num_workers=6, pin_memory=True)

    # Load test dataset and DataLoader
    test_dataset = VehicleSpeedDataset(test_data_path, extension, seq_length=sequence_length[j], step_size=step_size)
    test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False)  # Batch size = 1 for test evaluation

    # Initialize model, loss function, and optimizer
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = SpeedEstimatorGRU(input_size, hidden_size[j], num_layers[j], output_size).to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate[j])

    # Dummy input of shape (1, sequence_length, input_size), used for tracing and the ONNX export
    example_input = torch.rand(1, sequence_length[j], input_size).to(device)

    for epoch in range(num_epochs):
        # ---- training pass ----
        model.train()
        total_train_loss = 0

        print("-------------------------------------")
        print(f"Epoch [{epoch+1}] has started")

        for batch_idx, (features, speeds) in enumerate(train_dataloader):
            speeds = speeds.squeeze(1)  # Drops dimension 1 of speeds when it has size 1
            features, speeds = features.to(device), speeds.to(device)

            # Forward pass
            outputs = model(features)

            # MSELoss would broadcast mismatched shapes, so check them explicitly
            assert outputs.shape == speeds.shape, f"Shape mismatch: outputs {outputs.shape} vs speeds {speeds.shape}"

            train_loss = criterion(outputs, speeds)

            # Backward pass
            optimizer.zero_grad()
            train_loss.backward()
            optimizer.step()

            total_train_loss += train_loss.item()

        print(f"Model: {j}, Epoch [{epoch+1}/{num_epochs}], Loss: {total_train_loss/len(train_dataloader):.4f}")

        # ---- evaluation pass ----
        model.eval()
        total_test_loss = 0

        with torch.no_grad():  # No need to compute gradients for validation/test
            for features, speeds in test_dataloader:
                speeds = speeds.squeeze(1)
                features, speeds = features.to(device), speeds.to(device)

                # Forward pass
                test_outputs = model(features)
                test_loss = criterion(test_outputs, speeds)

                total_test_loss += test_loss.item()

        avg_test_loss = total_test_loss / len(test_dataloader)

        print(f"Model: {j}, Epoch [{epoch+1}/{num_epochs}], Test Loss: {avg_test_loss:.4f}")

        # Checkpoint: Save model if test loss improves
        if avg_test_loss < best_test_loss:
            print(f"New best model found! Test Loss improved from {best_test_loss:.4f} to {avg_test_loss:.4f}")
            best_test_loss = avg_test_loss
            early_stopping_counter = 0

            # Save model state and optimizer state. The hyperparameter entries
            # are the full per-model lists; "model_index" records which
            # position in those lists belongs to this checkpoint.
            torch.save({
                "model_state_dict": model.state_dict(),
                "optimizer_state_dict": optimizer.state_dict(),
                "model_index": j,
                "sequence_length": sequence_length,
                "input_size": input_size,
                "hidden_size": hidden_size,
                "num_layers": num_layers,
                "output_size": output_size,
                "learning_rate": learning_rate,
                "num_epochs": num_epochs
            }, location_state_GRU + str(j) + ".pt")
            print("model " + location_state_GRU + str(j) + ".pt" + " saved")

            # Save traced TorchScript models for MATLAB. The same traced module
            # is written twice, once through torch.jit.save and once through
            # its own .save method, under two different filenames.
            traced_model = torch.jit.trace(model, example_input)
            torch.jit.save(traced_model, location_traced_GRU + str(j) + "_traced_jit_save.pt")
            print("model " + location_traced_GRU + str(j) + "_traced_jit_save.pt" + " saved")

            traced_model.save(location_traced_GRU + str(j) + "_traced_simple_save.pt")
            print("model " + location_traced_GRU + str(j) + "_traced_simple_save.pt" + " saved")

            # Export model to ONNX
            onnx_model_path = location_traced_GRU + str(j) + "_traced.onnx"

            torch.onnx.export(
                model,                     # PyTorch model
                example_input,             # Example input (same as used for tracing)
                onnx_model_path,           # Output filename
                export_params=True,
                opset_version=11,          # MATLAB supports up to opset 11/12 reliably
                do_constant_folding=True,
                input_names=['input'],
                output_names=['output'],
                dynamic_axes={'input': {0: 'batch_size'}, 'output': {0: 'batch_size'}}
            )
            print("model " + location_traced_GRU + str(j) + "_traced.onnx" + " saved")
            print("---------------------")
            print(f"all model_{j} saved")
        else:
            early_stopping_counter += 1
            print(f"Test loss has not improved; early stopping counter: {early_stopping_counter}")

        if early_stopping_counter >= patience:
            print("Early stopping triggered -> starting next model!")
            print("------------------------------------------------")
            break  # Exit the training loop early

    # Reached only by running through all epochs without exhausting patience
    if early_stopping_counter < patience:
        print("We're out of epochs but patience limit has not been reached -> starting next model!")
        print("-----------------------------------------------------------------------------------")

-------------------------------------
Training has started for GRU model 1
-------------------------------------
Epoch [1] has started
Model: 1, Epoch [1/35], Loss: 12.6353
Model: 1, Epoch [1/35], Test Loss: 6.8296
New best model found! Test Loss improved from inf to 6.8296
model GRU/trained_models/i7/it_2/state_models/model_GRU_1.pt saved
model GRU/trained_models/i7/it_2/traced_models/model_GRU_1_traced_jit_save.pt saved
model GRU/trained_models/i7/it_2/traced_models/model_GRU_1_traced_simple_save.pt saved




verbose: False, log level: Level.ERROR

model GRU/trained_models/i7/it_2/traced_models/model_GRU_1_traced.onnx saved
---------------------
all model_1 saved
-------------------------------------
Epoch [2] has started
Model: 1, Epoch [2/35], Loss: 3.8230
Model: 1, Epoch [2/35], Test Loss: 3.5771
New best model found! Test Loss improved from 6.8296 to 3.5771
model GRU/trained_models/i7/it_2/state_models/model_GRU_1.pt saved
model GRU/trained_models/i7/it_2/traced_models/model_GRU_1_traced_jit_save.pt saved
model GRU/trained_models/i7/it_2/traced_models/model_GRU_1_traced_simple_save.pt saved
verbose: False, log level: Level.ERROR

model GRU/trained_models/i7/it_2/traced_models/model_GRU_1_traced.onnx saved
---------------------
all model_1 saved
-------------------------------------
Epoch [3] has started




Model: 1, Epoch [3/35], Loss: 2.1608
Model: 1, Epoch [3/35], Test Loss: 2.2089
New best model found! Test Loss improved from 3.5771 to 2.2089
model GRU/trained_models/i7/it_2/state_models/model_GRU_1.pt saved
model GRU/trained_models/i7/it_2/traced_models/model_GRU_1_traced_jit_save.pt saved
model GRU/trained_models/i7/it_2/traced_models/model_GRU_1_traced_simple_save.pt saved
verbose: False, log level: Level.ERROR

model GRU/trained_models/i7/it_2/traced_models/model_GRU_1_traced.onnx saved
---------------------
all model_1 saved
-------------------------------------
Epoch [4] has started




Model: 1, Epoch [4/35], Loss: 1.3284
Model: 1, Epoch [4/35], Test Loss: 1.3691
New best model found! Test Loss improved from 2.2089 to 1.3691
model GRU/trained_models/i7/it_2/state_models/model_GRU_1.pt saved
model GRU/trained_models/i7/it_2/traced_models/model_GRU_1_traced_jit_save.pt saved
model GRU/trained_models/i7/it_2/traced_models/model_GRU_1_traced_simple_save.pt saved
verbose: False, log level: Level.ERROR

model GRU/trained_models/i7/it_2/traced_models/model_GRU_1_traced.onnx saved
---------------------
all model_1 saved
-------------------------------------
Epoch [5] has started




Model: 1, Epoch [5/35], Loss: 0.8319
Model: 1, Epoch [5/35], Test Loss: 0.8464
New best model found! Test Loss improved from 1.3691 to 0.8464
model GRU/trained_models/i7/it_2/state_models/model_GRU_1.pt saved
model GRU/trained_models/i7/it_2/traced_models/model_GRU_1_traced_jit_save.pt saved
model GRU/trained_models/i7/it_2/traced_models/model_GRU_1_traced_simple_save.pt saved
verbose: False, log level: Level.ERROR

model GRU/trained_models/i7/it_2/traced_models/model_GRU_1_traced.onnx saved
---------------------
all model_1 saved
-------------------------------------
Epoch [6] has started




KeyboardInterrupt: 