# Setup

In [None]:
!pip install optuna
!pip install joblib

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import glob
import numpy as np
import os
import pandas as pd
import random

import matplotlib.pyplot as plt

import optuna
import joblib

In [None]:
# Run on the first CUDA device when one is visible, otherwise fall back to CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    print("Using CPU")

In [None]:
# Mount Google Drive at /content/drive so the dataset directories and the saved
# Optuna study referenced by later cells (paths under /content/drive/...) are
# reachable. Only works inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

# Dataset

In [None]:
class SpeedDataset(Dataset):
    """Windowed accelerometer features paired with averaged GPS speed.

    Every ``.csv`` file found in ``directory_paths`` is read as five
    header-less columns (``timestamp``, ``x_acc``, ``y_acc``, ``z_acc``,
    ``gps_speed``). Each file is cut into sliding windows of
    ``sequence_length`` consecutive rows. One sample is the per-axis mean of
    the absolute accelerations inside a window; its label is the mean GPS
    speed over the same rows.

    Attributes set by ``__init__``:
        data: ``float32`` tensor of shape ``(num_samples, 3)``.
        labels: ``float32`` tensor of shape ``(num_samples,)``.
        timestamps: list with the timestamp of the middle row of every
            window, in the same order as ``data`` / ``labels``.
    """

    def __init__(self, directory_paths, sequence_length):
        """Load all CSV files and build the feature and label tensors.

        Args:
            directory_paths: iterable of directories to scan for ``.csv`` files.
            sequence_length: number of consecutive rows per window (> 0).
        """
        self.timestamps = []
        feature_chunks = []
        label_chunks = []

        # Windows never span two files: each file is processed on its own.
        for frame in self.load_data_from_directories(directory_paths):
            features, labels, timestamps = self.create_magnitude_averages(frame, sequence_length)
            feature_chunks.append(features)
            label_chunks.append(labels)
            self.timestamps.extend(timestamps)

        # Build each tensor from a single contiguous ndarray; converting a
        # Python list of many small ndarrays is very slow in torch.tensor().
        if feature_chunks:
            all_features = np.concatenate(feature_chunks, axis=0)
            all_labels = np.concatenate(label_chunks, axis=0)
        else:
            all_features = np.empty((0, 3))
            all_labels = np.empty((0,))

        self.data = torch.tensor(all_features, dtype=torch.float32)
        self.labels = torch.tensor(all_labels, dtype=torch.float32)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

    def load_data_from_directories(self, directory_paths):
        """Read every ``.csv`` file of every directory into its own DataFrame.

        Files are visited in sorted filename order so that the order of the
        samples (and of ``timestamps``) is reproducible; ``os.listdir`` alone
        returns entries in arbitrary order.

        Returns:
            A list with one DataFrame per file (files are NOT concatenated).
        """
        all_data = []
        for directory_path in directory_paths:
            for filename in sorted(os.listdir(directory_path)):
                if not filename.endswith(".csv"):
                    continue
                file_path = os.path.join(directory_path, filename)
                all_data.append(
                    pd.read_csv(
                        file_path,
                        header=None,
                        names=['timestamp', 'x_acc', 'y_acc', 'z_acc', 'gps_speed'],
                    )
                )
        return all_data

    def _window_averages(self, data, sequence_length, step):
        """Average ``data`` over windows of ``sequence_length`` rows taken every ``step`` rows.

        Returns a ``(features, labels, timestamps)`` tuple of ndarrays with
        shapes ``(k, 3)``, ``(k,)`` and ``(k,)``; ``k`` is 0 when the frame
        has fewer than ``sequence_length`` rows.

        Raises:
            ValueError: if ``sequence_length`` is not positive.
        """
        if sequence_length <= 0:
            raise ValueError("sequence_length must be a positive integer")

        # Pull the columns out once as plain arrays; slicing ndarrays inside
        # the loop is far cheaper than repeated DataFrame.iloc lookups.
        magnitudes = np.abs(data[['x_acc', 'y_acc', 'z_acc']].to_numpy(dtype=float))
        speeds = data['gps_speed'].to_numpy(dtype=float)
        stamps = data['timestamp'].to_numpy()
        middle = sequence_length // 2

        # "+ 1" keeps the final full window (start == len(data) - sequence_length).
        starts = range(0, len(data) - sequence_length + 1, step)

        datapoints = [magnitudes[s:s + sequence_length].mean(axis=0) for s in starts]
        labels = [speeds[s:s + sequence_length].mean() for s in starts]
        timestamps = [stamps[s + middle] for s in starts]

        return (
            np.array(datapoints, dtype=float).reshape(-1, 3),
            np.array(labels, dtype=float),
            np.array(timestamps),
        )

    def create_magnitude_averages(self, data, sequence_length):
        """Create samples from overlapping windows (stride of one row).

        Each sample is the per-axis mean of ``|x_acc|``, ``|y_acc|``,
        ``|z_acc|`` over ``sequence_length`` consecutive rows. The label is
        the mean ``gps_speed`` over those rows and the timestamp is that of
        the window's middle row.
        """
        return self._window_averages(data, sequence_length, step=1)

    # For testing working with non-overlapping sequences
    def create_magnitues_averages_non_overlapping(self, data, sequence_length):
        """Same as ``create_magnitude_averages`` but with back-to-back windows.

        Trailing rows that do not fill a whole window are dropped. The
        misspelled name is kept for backward compatibility.
        """
        return self._window_averages(data, sequence_length, step=sequence_length)

    # Correctly spelled alias of the method above.
    create_magnitude_averages_non_overlapping = create_magnitues_averages_non_overlapping

    def _speed_bucket_counts(self):
        """Count labels per 1 m/s bucket: ``[0-1, 1-2, 2-3, 3-4, 4-5, 5+]``.

        Labels are truncated toward zero, exactly like ``int(label)``.
        Negative values are clamped into the first bucket instead of wrapping
        around to the "5+" bucket through negative list indexing.
        """
        buckets = torch.as_tensor(self.labels).reshape(-1).to(torch.int64)
        buckets = torch.clamp(buckets, min=0, max=5)
        return torch.bincount(buckets, minlength=6).tolist()

    def get_speed_distribution(self):
        """Print the number of labels and show a bar chart of the speed buckets."""
        print("Len labels", len(self.labels))
        speed_counts = self._speed_bucket_counts()

        # Bar chart
        plt.figure(figsize=(10, 4))
        plt.bar([f"{i}-{i+1}" if i < 5 else "5+" for i in range(len(speed_counts))], speed_counts)
        plt.xlabel('Speed (m/s)')
        plt.ylabel('Number of sequences')
        plt.show()


In [None]:
# Directory lists for the train / eval / test splits on the mounted shared drive.
# Each entry is a directory that SpeedDataset scans for .csv files.
train_dir = ['/content/drive/Shareddrives/CS229/Data/FinalSplits/train_axel/',
             '/content/drive/Shareddrives/CS229/Data/FinalSplits/train_jengchi/'


]
eval_dir = ['/content/drive/Shareddrives/CS229/Data/FinalSplits/eval_axel/']
test_dir = ['/content/drive/Shareddrives/CS229/Data/FinalSplits/test_axel/']

# Number of consecutive rows averaged into one sample.
sequence_length = 40
# NOTE: this dataset combines the train AND eval directories; the next cell
# uses it to plot the speed distribution.
train_dataset = SpeedDataset(train_dir + eval_dir, sequence_length)

In [None]:

# Print the number of labels and show a bar chart of how many samples fall
# into each 1 m/s speed bucket.
train_dataset.get_speed_distribution()

## Data Loader

# Model

In [None]:
class LinearRegressionModel(nn.Module):
    """Single affine layer mapping 3 input features to 1 output value."""

    def __init__(self) -> None:
        super().__init__()
        # Attribute name is part of the state_dict keys ("linear.weight", "linear.bias").
        self.linear = nn.Linear(in_features=3, out_features=1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the linear layer; the last dimension of ``x`` must be 3."""
        output = self.linear(x)
        return output

# Optimize Hyperparameters


In [None]:
def objective(trial):
    """Optuna objective: train the linear model and return its best validation MSE.

    Samples a learning rate and a batch size from ``trial``, trains a fresh
    ``LinearRegressionModel`` with SGD on ``train_dir`` and evaluates it on
    ``eval_dir`` after every epoch. Training stops early once the validation
    loss has not improved for ``patience`` consecutive epochs.

    The 1-based epoch that reached the best validation loss is stored on the
    trial as the user attribute ``"epochs"``.

    Args:
        trial: the ``optuna.trial.Trial`` supplying the hyperparameters.

    Returns:
        The lowest validation MSE seen during training. If no epoch ever
        improved on ``inf`` (e.g. the loss was NaN from the start), the last
        validation loss is returned unchanged.
    """
    # Define the hyperparameters to optimize
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-1, log=True)
    batch_size = trial.suggest_categorical("batch_size", [64, 128, 256, 512])
    # sequence_length = trial.suggest_int("sequence_length", 10, 80, step=10)

    sequence_length = 20

    # Instantiate the model
    model = LinearRegressionModel().to(device)

    # Loss function and optimizer
    criterion = nn.MSELoss()
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)

    train_dataset = SpeedDataset(train_dir, sequence_length)
    eval_dataset = SpeedDataset(eval_dir, sequence_length)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # The evaluation set is scored in one pass, so move it to the device once.
    X_eval_tensor = eval_dataset.data.to(device)
    y_eval_tensor = eval_dataset.labels.to(device)

    # Training loop with early stopping
    max_epochs = 200
    patience = 20
    best_eval_loss = float('inf')
    best_epoch = 0  # 0 means "no improving epoch seen yet"
    epochs_without_improvement = 0
    current_eval_loss = float('nan')

    for epoch in range(max_epochs):

        # Training
        model.train()
        for X_batch, y_batch in train_loader:
            X_tensor = X_batch.to(device)
            y_tensor = y_batch.to(device)

            # Forward pass. squeeze(-1) drops only the feature dimension, so a
            # final batch holding a single sample keeps shape (1,) and still
            # matches its target.
            predictions = model(X_tensor).squeeze(-1)
            loss = criterion(predictions, y_tensor)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Validation
        model.eval()
        with torch.no_grad():
            eval_predictions = model(X_eval_tensor).squeeze(-1)
            current_eval_loss = criterion(eval_predictions, y_eval_tensor).item()

        # Check for improvement
        if current_eval_loss < best_eval_loss:
            best_eval_loss = current_eval_loss
            best_epoch = epoch + 1
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1

        # Early stopping check
        if epochs_without_improvement >= patience:
            print(f"Early stopping at epoch {epoch+1}")
            break

        # Print progress
        if (epoch + 1) % 10 == 0:
            print(f"Epoch {epoch+1}/{max_epochs}, Eval Loss: {current_eval_loss:.4f}")

    trial.set_user_attr("epochs", best_epoch)

    # Report the loss early stopping actually selected, not the (worse) loss
    # measured `patience` epochs after it.
    return best_eval_loss if best_epoch else current_eval_loss


In [None]:
# Search learning rate and batch size, minimising the value returned by objective().
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=20)

# Copy the best trial's epoch count onto the study when the objective recorded
# one as a trial user attribute. (The earlier reference to
# `epochs_for_best_trial` was undefined at this point and raised NameError.)
best_trial_epochs = study.best_trial.user_attrs.get("epochs")
if best_trial_epochs is not None:
    study.set_user_attr("epochs", best_trial_epochs)
print("Best Hyperparameters:", study.best_params)

In [None]:
# Persist the finished Optuna study to the shared drive so it can be reloaded
# in a later session without re-running the search.
joblib.dump(study, '/content/drive/Shareddrives/CS229/study_linear.pkl')

# Final training


In [None]:
# Reload the previously saved Optuna study from the shared drive.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load study files from a trusted source.
study = joblib.load('/content/drive/Shareddrives/CS229/study_linear.pkl')

In [None]:
# Final training run.
#
# The hyperparameters below are hard-coded. To take them from the study
# instead, use study.best_params["learning_rate"] / ["batch_size"]
# (sequence_length is not part of the search space at the moment).

train_dir = [
    '/content/drive/Shareddrives/CS229/Data/FinalSplits/train_axel/',
    '/content/drive/Shareddrives/CS229/Data/FinalSplits/train_jengchi/'
]
eval_dir = ['/content/drive/Shareddrives/CS229/Data/FinalSplits/eval_axel/']

sequence_length = 40
lr = 8.0e-05
batch_size = 128

# Instantiate the model
model = LinearRegressionModel().to(device)

# Loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=lr)

train_dataset = SpeedDataset(train_dir, sequence_length)
eval_dataset = SpeedDataset(eval_dir, sequence_length)

# Shuffle like the Optuna objective does: consecutive samples come from
# overlapping windows, so unshuffled batches would be highly correlated.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Load training data
X_train_tensor = train_dataset.data.to(device)
y_train_tensor = train_dataset.labels.to(device)

# Load evaluation data
X_eval_tensor = eval_dataset.data.to(device)
y_eval_tensor = eval_dataset.labels.to(device)

# Training loop with early stopping
max_epochs = 500
patience = 20
best_eval_loss = float('inf')
epochs_without_improvement = 0

# 1-based epoch of the best validation loss. Kept as a one-element list under
# its original name; it must exist before the loop writes to index 0.
epochs_for_best_trial = [0]
# Copy of the weights at the best epoch, restored after training.
best_state = None

train_losses = []
eval_losses = []

for epoch in range(max_epochs):

    # Training
    model.train()
    total_loss = 0.0
    for X_batch, y_batch in train_loader:
        X_tensor = X_batch.to(device)
        y_tensor = y_batch.to(device)

        # Forward pass. squeeze(-1) keeps a single-sample batch as shape (1,).
        predictions = model(X_tensor).squeeze(-1)
        loss = criterion(predictions, y_tensor)
        total_loss += loss.item()

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    average_loss = total_loss / len(train_loader)
    train_losses.append(average_loss)

    # Validation
    model.eval()
    with torch.no_grad():
        eval_predictions = model(X_eval_tensor).squeeze(-1)
        eval_loss = criterion(eval_predictions, y_eval_tensor)
        eval_losses.append(eval_loss.item())

    # Check for improvement
    if eval_loss.item() < best_eval_loss:
        best_eval_loss = eval_loss.item()
        epochs_for_best_trial[0] = epoch + 1
        best_state = {
            name: tensor.detach().clone()
            for name, tensor in model.state_dict().items()
        }
        epochs_without_improvement = 0
    else:
        epochs_without_improvement += 1

    # Early stopping check
    if epochs_without_improvement >= patience:
        print(f"Early stopping at epoch {epoch+1}")
        break

    print(f"Epoch {epoch+1}/{max_epochs}, Train Loss: {average_loss:.4f}, Eval Loss: {eval_loss.item():.4f}")

# Without this, the model used downstream would carry the weights from
# `patience` epochs after the best validation loss.
if best_state is not None:
    model.load_state_dict(best_state)
    print(f"Restored weights from epoch {epochs_for_best_trial[0]} (Eval Loss: {best_eval_loss:.4f})")




In [None]:
# Draw the per-epoch training and validation losses on one set of axes.
loss_fig, loss_ax = plt.subplots(figsize=(12, 6))
for series, series_label, series_color in (
    (train_losses, 'Training Loss', 'blue'),
    (eval_losses, 'Validation Loss', 'red'),
):
    loss_ax.plot(range(len(series)), series, label=series_label, color=series_color)
loss_ax.set_ylim(0, 0.5)  # fixed y-range; anything above 0.5 is clipped from view
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.set_title('Training and Validation Loss Curves')
loss_ax.legend()
loss_ax.grid(True)
plt.show()

# Evaluation

### Calculate Metrics


In [None]:
test_dir = ['/content/drive/Shareddrives/CS229/Data/FinalSplits/test_axel/']
# test_dir = ['/content/drive/Shareddrives/CS229/Data/FinalSplits/test_jengchi/']

test_dataset = SpeedDataset(test_dir, sequence_length)


model.eval()

# Inference only: no_grad avoids building an autograd graph over the whole
# dataset. squeeze(-1) drops just the output feature dimension, giving shape
# (num_samples,) to match the labels.
with torch.no_grad():
    predicted_train = model(train_dataset.data.to(device)).squeeze(-1)
    predicted_test = model(test_dataset.data.to(device)).squeeze(-1)

# Mean squared error, computed on the CPU with tensors on both sides (the
# labels live on the CPU) rather than mixing torch tensors with numpy arrays.
mse_train = torch.mean((predicted_train.cpu() - train_dataset.labels) ** 2).item()
mse_test = torch.mean((predicted_test.cpu() - test_dataset.labels) ** 2).item()

print(f"MSE Train: {mse_train}")
print(f"MSE Test: {mse_test}")

### Plot

In [None]:
# Model output for the test set as a CPU numpy array for plotting.
test_inputs = test_dataset.data.to(device)
predicted_test = model(test_inputs).detach().to("cpu").numpy()

# Overlay ground-truth and predicted speed against the window timestamps.
speed_fig, speed_ax = plt.subplots(figsize=(12, 6))
speed_ax.plot(test_dataset.timestamps, test_dataset.labels, label='Ground Truth GPS Speed', color='blue')
speed_ax.plot(test_dataset.timestamps, predicted_test, label='Predicted GPS Speed', color='red')
# Optional extra trace, currently disabled:
# speed_ax.plot(test_dataset.timestamps, test_dataset.data.sum(dim=1) / 8, label='Combined magnitudes', color='green')
speed_ax.set_xlabel('Timestamp (ms since start)')
speed_ax.set_ylabel('Speed (m/s)')
speed_ax.set_title('Ground Truth vs. Predicted GPS Speed')
speed_ax.legend()
speed_ax.grid(True)
plt.show()