In [1]:
import os

import torch
import torch.nn as nn
import torch.onnx
import torch.optim as optim
from torch.utils.data import DataLoader

from classes import SpeedEstimatorRNN, VehicleSpeedDataset, SpeedEstimatorLSTM

In [2]:
# Report which compute device PyTorch can see before any training starts.
if not torch.cuda.is_available():
    print("CUDA is not available. Training will be performed on the CPU.")
else:
    # One print call, one line per fact; the emitted text matches four separate prints.
    print(
        "CUDA is available! You can use a GPU for training.",
        f"Number of GPUs available: {torch.cuda.device_count()}",
        f"Current GPU being used: {torch.cuda.current_device()}",
        f"GPU Name: {torch.cuda.get_device_name(torch.cuda.current_device())}",
        sep="\n",
    )

CUDA is available! You can use a GPU for training.
Number of GPUs available: 1
Current GPU being used: 0
GPU Name: NVIDIA GeForce RTX 3050 Laptop GPU


In [3]:
# Experiment configuration: dataset locations, the per-model hyperparameter
# sweep, shared training settings, and the output path prefixes.

# --- Dataset locations ---
training_data_path = "data/i7/it_1/1_training"
test_data_path = "data/i7/it_1/2_testing"
extension = "*.csv"  # file pattern passed to VehicleSpeedDataset together with each path

# --- Sweep definition ---
# Number of model variants to train; every list below needs one entry per model.
num_models = 5

# Hyperparameters that vary between models: entry j configures model j.
input_size = 20  # Number of CAN signals per timestep
hidden_size = [256, 256, 512, 512, 768]
num_layers = [3, 4, 3, 4, 3]
learning_rate = [0.0001] * num_models
batch_size = [128] * num_models  # number of sequences in one batch
# NOTE(review): dropout_rate is defined here but is not passed to either model
# constructor in the training cells of this notebook -- confirm whether the
# models are meant to receive it.
dropout_rate = [0.2] * num_models
sequence_length = [800, 800, 800, 800, 1000]

# --- Settings shared by every model ---
step_size = 10  # stride between extracted sequences, i.e. how much consecutive sequences overlap
output_size = 2
num_epochs = 30

# --- Output locations ---
# These are filename prefixes: the training cells append the model index and a
# file extension to them.
location_state = "Simple RNN/trained_models/i7/it_1/state_models/model_"
location_traced = "Simple RNN/trained_models/i7/it_1/traced_models/model_"

location_state_LSTM = "LSTM/trained_models/i7/it_1/state_models/model_LSTM_"
location_traced_LSTM = "LSTM/trained_models/i7/it_1/traced_models/model_LSTM_"

# Fail fast if a hyperparameter list and num_models drift apart; otherwise the
# mismatch only shows up as an IndexError part-way through the training sweep.
_per_model_settings = {
    "hidden_size": hidden_size,
    "num_layers": num_layers,
    "learning_rate": learning_rate,
    "batch_size": batch_size,
    "dropout_rate": dropout_rate,
    "sequence_length": sequence_length,
}
_wrong_length = {
    name: len(values)
    for name, values in _per_model_settings.items()
    if len(values) != num_models
}
if _wrong_length:
    raise ValueError(
        f"Expected {num_models} entries in every per-model hyperparameter list, got: {_wrong_length}"
    )

RNN

In [None]:
# Train every configured SpeedEstimatorRNN variant. After each epoch the model
# is evaluated on the held-out split; whenever that loss improves, the weights
# are checkpointed and re-exported to ONNX. Training of a variant stops early
# after `patience` consecutive epochs without improvement.
#
# NOTE(review): the "test" split drives both checkpoint selection and early
# stopping, so it effectively acts as a validation set. The best test loss
# printed here is therefore optimistically biased as a final performance figure.
patience = 5

# torch.save and torch.onnx.export do not create missing parent folders, so
# make sure both output folders exist before any training time is spent.
for output_prefix in (location_state, location_traced):
    output_dir = os.path.dirname(output_prefix)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

# The device does not depend on the model variant, so it is selected once.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Training loops
for j in range(num_models):

    early_stopping_counter = 0
    best_test_loss = float('inf')

    print("-------------------------------------")
    print(f"Training has started for model {j}")

    # Datasets are rebuilt per model because the sequence length is a per-model setting.
    train_dataset = VehicleSpeedDataset(training_data_path, extension, seq_length=sequence_length[j], step_size=step_size)
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size[j], shuffle=True, num_workers=6, pin_memory=True)

    test_dataset = VehicleSpeedDataset(test_data_path, extension, seq_length=sequence_length[j], step_size=step_size)
    test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False)  # Batch size = 1 for test evaluation

    # Initialize model, loss function, and optimizer
    model = SpeedEstimatorRNN(input_size, hidden_size[j], num_layers[j], output_size).to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate[j])

    # Dummy input with the model's expected dimensions; only used for the ONNX export below.
    example_input = torch.rand(1, sequence_length[j], input_size).to(device)

    for epoch in range(num_epochs):
        # ---- Training pass ----
        model.train()
        total_train_loss = 0

        for features, speeds in train_dataloader:
            speeds = speeds.squeeze(1)  # Remove extra dimension from speeds if present
            features, speeds = features.to(device), speeds.to(device)

            # Forward pass
            outputs = model(features)

            # MSELoss would silently broadcast mismatched shapes, so check them explicitly.
            assert outputs.shape == speeds.shape, f"Shape mismatch: outputs {outputs.shape} vs speeds {speeds.shape}"

            train_loss = criterion(outputs, speeds)

            # Backward pass
            optimizer.zero_grad()
            train_loss.backward()
            optimizer.step()

            total_train_loss += train_loss.item()

        # Reported loss is the mean of the per-batch losses.
        print(f"Model: {j}, Epoch [{epoch+1}/{num_epochs}], Loss: {total_train_loss/len(train_dataloader):.4f}")

        # ---- Evaluation pass ----
        model.eval()
        total_test_loss = 0

        with torch.no_grad():  # No need to compute gradients for validation/test
            for features, speeds in test_dataloader:
                speeds = speeds.squeeze(1)
                features, speeds = features.to(device), speeds.to(device)

                # Forward pass
                test_outputs = model(features)
                test_loss = criterion(test_outputs, speeds)

                total_test_loss += test_loss.item()

        avg_test_loss = total_test_loss / len(test_dataloader)

        print(f"Model: {j}, Epoch [{epoch+1}/{num_epochs}], Test Loss: {avg_test_loss:.4f}")

        # Checkpoint: Save model if test loss improves
        if avg_test_loss < best_test_loss:
            print(f"New best model found! Test Loss improved from {best_test_loss:.4f} to {avg_test_loss:.4f}")
            best_test_loss = avg_test_loss
            early_stopping_counter = 0

            # Save model state and optimizer state. The hyperparameter entries
            # hold the full per-model lists of the sweep; "model_index" tells a
            # reader which entry belongs to this checkpoint.
            torch.save({
                "model_state_dict": model.state_dict(),
                "optimizer_state_dict": optimizer.state_dict(),
                "sequence_length": sequence_length,
                "input_size": input_size,
                "hidden_size": hidden_size,
                "num_layers": num_layers,
                "output_size": output_size,
                "learning_rate": learning_rate,
                "num_epochs": num_epochs,
                "model_index": j,
                "epoch": epoch + 1,
                "best_test_loss": best_test_loss
            }, location_state + str(j) + ".pt")

            # Export model to ONNX. The model is still in eval mode here
            # because model.eval() was called for the evaluation pass above.
            onnx_model_path = location_traced + str(j) + "_traced.onnx"

            torch.onnx.export(
                model,                     # PyTorch model
                example_input,             # Example input used to trace the graph
                onnx_model_path,           # Output filename
                export_params=True,
                opset_version=11,          # MATLAB supports up to opset 11/12 reliably
                do_constant_folding=True,
                input_names=['input'],
                output_names=['output'],
                dynamic_axes={'input': {0: 'batch_size'}, 'output': {0: 'batch_size'}}
            )

            print(f"all model_{j} saved")
        else:
            early_stopping_counter += 1
            print(f"Test loss has not improved; early stopping counter: {early_stopping_counter}")

        if early_stopping_counter >= patience:
            print("Early stopping triggered -> starting next model!")
            print("------------------------------------------------")
            break  # Exit the training loop early

    # Only reached with the counter below the limit when all epochs were used up.
    if early_stopping_counter < patience:
        print("We're out of epochs but patience limit has not been reached -> starting next model!")
        print("-----------------------------------------------------------------------------------")

LSTM

In [5]:
# Train every configured SpeedEstimatorLSTM variant. After each epoch the model
# is evaluated on the held-out split; whenever that loss improves, the weights
# are checkpointed and re-exported to ONNX. Training of a variant stops early
# after `patience` consecutive epochs without improvement.
#
# NOTE(review): the "test" split drives both checkpoint selection and early
# stopping, so it effectively acts as a validation set. The best test loss
# printed here is therefore optimistically biased as a final performance figure.
patience = 5

# torch.save and torch.onnx.export do not create missing parent folders, so
# make sure both output folders exist before any training time is spent.
for output_prefix in (location_state_LSTM, location_traced_LSTM):
    output_dir = os.path.dirname(output_prefix)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

# The device does not depend on the model variant, so it is selected once.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Training loops
for j in range(num_models):

    early_stopping_counter = 0
    best_test_loss = float('inf')

    print("-------------------------------------")
    print(f"Training has started for model {j}")

    # Datasets are rebuilt per model because the sequence length is a per-model setting.
    train_dataset = VehicleSpeedDataset(training_data_path, extension, seq_length=sequence_length[j], step_size=step_size)
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size[j], shuffle=True, num_workers=6, pin_memory=True)

    test_dataset = VehicleSpeedDataset(test_data_path, extension, seq_length=sequence_length[j], step_size=step_size)
    test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False)  # Batch size = 1 for test evaluation

    # Initialize model, loss function, and optimizer
    model = SpeedEstimatorLSTM(input_size, hidden_size[j], num_layers[j], output_size).to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate[j])

    # Dummy input with the model's expected dimensions; only used for the ONNX export below.
    example_input = torch.rand(1, sequence_length[j], input_size).to(device)

    for epoch in range(num_epochs):
        # ---- Training pass ----
        model.train()
        total_train_loss = 0

        for features, speeds in train_dataloader:
            speeds = speeds.squeeze(1)  # Remove extra dimension from speeds if present
            features, speeds = features.to(device), speeds.to(device)

            # Forward pass
            outputs = model(features)

            # MSELoss would silently broadcast mismatched shapes, so check them explicitly.
            assert outputs.shape == speeds.shape, f"Shape mismatch: outputs {outputs.shape} vs speeds {speeds.shape}"

            train_loss = criterion(outputs, speeds)

            # Backward pass
            optimizer.zero_grad()
            train_loss.backward()
            optimizer.step()

            total_train_loss += train_loss.item()

        # Reported loss is the mean of the per-batch losses.
        print(f"Model: {j}, Epoch [{epoch+1}/{num_epochs}], Loss: {total_train_loss/len(train_dataloader):.4f}")

        # ---- Evaluation pass ----
        model.eval()
        total_test_loss = 0

        with torch.no_grad():  # No need to compute gradients for validation/test
            for features, speeds in test_dataloader:
                speeds = speeds.squeeze(1)
                features, speeds = features.to(device), speeds.to(device)

                # Forward pass
                test_outputs = model(features)
                test_loss = criterion(test_outputs, speeds)

                total_test_loss += test_loss.item()

        avg_test_loss = total_test_loss / len(test_dataloader)

        print(f"Model: {j}, Epoch [{epoch+1}/{num_epochs}], Test Loss: {avg_test_loss:.4f}")

        # Checkpoint: Save model if test loss improves
        if avg_test_loss < best_test_loss:
            print(f"New best model found! Test Loss improved from {best_test_loss:.4f} to {avg_test_loss:.4f}")
            best_test_loss = avg_test_loss
            early_stopping_counter = 0

            # Save model state and optimizer state. The hyperparameter entries
            # hold the full per-model lists of the sweep; "model_index" tells a
            # reader which entry belongs to this checkpoint.
            torch.save({
                "model_state_dict": model.state_dict(),
                "optimizer_state_dict": optimizer.state_dict(),
                "sequence_length": sequence_length,
                "input_size": input_size,
                "hidden_size": hidden_size,
                "num_layers": num_layers,
                "output_size": output_size,
                "learning_rate": learning_rate,
                "num_epochs": num_epochs,
                "model_index": j,
                "epoch": epoch + 1,
                "best_test_loss": best_test_loss
            }, location_state_LSTM + str(j) + ".pt")

            # Export model to ONNX. The model is still in eval mode here
            # because model.eval() was called for the evaluation pass above.
            onnx_model_path = location_traced_LSTM + str(j) + "_traced.onnx"

            torch.onnx.export(
                model,                     # PyTorch model
                example_input,             # Example input used to trace the graph
                onnx_model_path,           # Output filename
                export_params=True,
                opset_version=11,          # MATLAB supports up to opset 11/12 reliably
                do_constant_folding=True,
                input_names=['input'],
                output_names=['output'],
                dynamic_axes={'input': {0: 'batch_size'}, 'output': {0: 'batch_size'}}
            )

            print(f"all model_{j} saved")
        else:
            early_stopping_counter += 1
            print(f"Test loss has not improved; early stopping counter: {early_stopping_counter}")

        if early_stopping_counter >= patience:
            print("Early stopping triggered -> starting next model!")
            print("------------------------------------------------")
            break  # Exit the training loop early

    # Only reached with the counter below the limit when all epochs were used up.
    if early_stopping_counter < patience:
        print("We're out of epochs but patience limit has not been reached -> starting next model!")
        print("-----------------------------------------------------------------------------------")

-------------------------------------
Training has started for model 0
Model: 0, Epoch [1/30], Loss: 14.7596
Model: 0, Epoch [1/30], Test Loss: 6.6731
New best model found! Test Loss improved from inf to 6.6731
all model_0 saved




Model: 0, Epoch [2/30], Loss: 3.7189
Model: 0, Epoch [2/30], Test Loss: 3.5021
New best model found! Test Loss improved from 6.6731 to 3.5021
all model_0 saved




Model: 0, Epoch [3/30], Loss: 2.0570
Model: 0, Epoch [3/30], Test Loss: 2.1332
New best model found! Test Loss improved from 3.5021 to 2.1332




all model_0 saved
Model: 0, Epoch [4/30], Loss: 1.2617
Model: 0, Epoch [4/30], Test Loss: 1.2712
New best model found! Test Loss improved from 2.1332 to 1.2712
all model_0 saved




Model: 0, Epoch [5/30], Loss: 0.7557
Model: 0, Epoch [5/30], Test Loss: 0.7590
New best model found! Test Loss improved from 1.2712 to 0.7590




all model_0 saved
Model: 0, Epoch [6/30], Loss: 0.4573
Model: 0, Epoch [6/30], Test Loss: 0.4628
New best model found! Test Loss improved from 0.7590 to 0.4628
all model_0 saved




Model: 0, Epoch [7/30], Loss: 0.2722
Model: 0, Epoch [7/30], Test Loss: 0.2909
New best model found! Test Loss improved from 0.4628 to 0.2909
all model_0 saved




Model: 0, Epoch [8/30], Loss: 0.1691
Model: 0, Epoch [8/30], Test Loss: 0.1848
New best model found! Test Loss improved from 0.2909 to 0.1848
all model_0 saved




Model: 0, Epoch [9/30], Loss: 0.1127
Model: 0, Epoch [9/30], Test Loss: 0.1200
New best model found! Test Loss improved from 0.1848 to 0.1200
all model_0 saved




Model: 0, Epoch [10/30], Loss: 0.0747
Model: 0, Epoch [10/30], Test Loss: 0.0936
New best model found! Test Loss improved from 0.1200 to 0.0936




all model_0 saved
Model: 0, Epoch [11/30], Loss: 0.0501
Model: 0, Epoch [11/30], Test Loss: 0.0726
New best model found! Test Loss improved from 0.0936 to 0.0726




all model_0 saved
Model: 0, Epoch [12/30], Loss: 0.0379
Model: 0, Epoch [12/30], Test Loss: 0.0731
Test loss has not improved; early stopping counter: 1
Model: 0, Epoch [13/30], Loss: 0.0268
Model: 0, Epoch [13/30], Test Loss: 0.0547
New best model found! Test Loss improved from 0.0726 to 0.0547
all model_0 saved




Model: 0, Epoch [14/30], Loss: 0.0237
Model: 0, Epoch [14/30], Test Loss: 0.0462
New best model found! Test Loss improved from 0.0547 to 0.0462
all model_0 saved




Model: 0, Epoch [15/30], Loss: 0.0192
Model: 0, Epoch [15/30], Test Loss: 0.0454
New best model found! Test Loss improved from 0.0462 to 0.0454
all model_0 saved




Model: 0, Epoch [16/30], Loss: 0.0163
Model: 0, Epoch [16/30], Test Loss: 0.0523
Test loss has not improved; early stopping counter: 1
Model: 0, Epoch [17/30], Loss: 0.0148
Model: 0, Epoch [17/30], Test Loss: 0.0488
Test loss has not improved; early stopping counter: 2
Model: 0, Epoch [18/30], Loss: 0.0147
Model: 0, Epoch [18/30], Test Loss: 0.0416
New best model found! Test Loss improved from 0.0454 to 0.0416
all model_0 saved




Model: 0, Epoch [19/30], Loss: 0.0135
Model: 0, Epoch [19/30], Test Loss: 0.0450
Test loss has not improved; early stopping counter: 1
Model: 0, Epoch [20/30], Loss: 0.0133
Model: 0, Epoch [20/30], Test Loss: 0.0427
Test loss has not improved; early stopping counter: 2
Model: 0, Epoch [21/30], Loss: 0.0095
Model: 0, Epoch [21/30], Test Loss: 0.0410
New best model found! Test Loss improved from 0.0416 to 0.0410
all model_0 saved




Model: 0, Epoch [22/30], Loss: 0.0122
Model: 0, Epoch [22/30], Test Loss: 0.0444
Test loss has not improved; early stopping counter: 1
Model: 0, Epoch [23/30], Loss: 0.0087
Model: 0, Epoch [23/30], Test Loss: 0.0463
Test loss has not improved; early stopping counter: 2
Model: 0, Epoch [24/30], Loss: 0.0066
Model: 0, Epoch [24/30], Test Loss: 0.0393
New best model found! Test Loss improved from 0.0410 to 0.0393
all model_0 saved




Model: 0, Epoch [25/30], Loss: 0.0063
Model: 0, Epoch [25/30], Test Loss: 0.0386
New best model found! Test Loss improved from 0.0393 to 0.0386
all model_0 saved




Model: 0, Epoch [26/30], Loss: 0.0105
Model: 0, Epoch [26/30], Test Loss: 0.0450
Test loss has not improved; early stopping counter: 1
Model: 0, Epoch [27/30], Loss: 0.0105
Model: 0, Epoch [27/30], Test Loss: 0.0498
Test loss has not improved; early stopping counter: 2
Model: 0, Epoch [28/30], Loss: 0.0071


KeyboardInterrupt: 

I have developed a model with the first set of parameters (model 0); the remaining models will come later.