In [1]:
# Set up 
# Note see MOT_transformer_model_module for the full list of imports

import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader
import os
import gc
import warnings

# Silence every warning category for the remainder of the session
warnings.simplefilter('ignore')

# Seed NumPy and PyTorch with the same fixed value so runs are repeatable
np.random.seed(42)
torch.manual_seed(42)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Request synchronous CUDA kernel launches.
# NOTE(review): this is set after torch is imported and after the availability
# probe above - confirm the CUDA runtime still picks it up at this point.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

# Enable auto updates from imported modules:
# load IPython's autoreload extension and switch it to mode 2, so edits made to
# imported modules (e.g. MOT_transformer_model_module) are picked up by later
# cells without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Work from the parent of the directory the kernel started in.
# The starting directory is remembered the first time this cell runs, so that
# re-running the cell (e.g. after an interrupt) always lands in the same place
# instead of climbing one more level up on every execution.
if "notebook_dir" not in globals():
    notebook_dir = os.getcwd()
# The original printed the os.getcwd function object rather than the path
print(notebook_dir)
os.chdir(os.path.join(notebook_dir, os.pardir))
script_path = os.getcwd()
# Root folder that the later cells read their input data from
data_path = os.path.join(script_path, "data")


KeyboardInterrupt: 

In [None]:
# 1. Read in training data
#
# Reads the diesel and petrol training samples, keeps only the columns used for
# training and stacks them into the single `data` frame used by the next cells.

# Only keep the columns we need for training (loop-invariant, so defined once)
categorical_cols = ['fuel_type', 'last_test']
numerical_cols = ['mileage_per_year', 'test_mileage', 'age_year', 'time_between_tests']
training_cols = ['vehicle_id'] + categorical_cols + numerical_cols

print("Reading in training data...")
data_fuels = []
for veh_name in ["car_diesel", "car_petrol"]:
    print(f"\tReading in {veh_name} data...")
    fpath = os.path.join(data_path, "Sample_Data", f"transformer_training_data_{veh_name}.parquet")
    # Ask the parquet reader for just the training columns so unused columns are
    # never loaded into memory; the explicit reselection pins the column order.
    veh_data = pd.read_parquet(fpath, engine='pyarrow', columns=training_cols)[training_cols]

    print(f"\tSample: {veh_data['vehicle_id'].nunique()}")

    data_fuels.append(veh_data)

# Stack the per-fuel frames into one frame with a fresh 0..n-1 index
data = pd.concat(data_fuels, ignore_index=True)

print("Combined Sample: ", data['vehicle_id'].nunique())

# Drop the references to the per-fuel frames now that they have been combined
del data_fuels, veh_data
gc.collect()

print(f"Data loaded. Shape: {data.shape}")
print(data.describe().to_string())
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
data.info()

In [None]:
# 2. Prepare training data

from MOT_transformer_model_module import prepare_training_data, validate_dataset

batch_size = 15_000

# Split the combined frame into train/test datasets; the label encoders and the
# scaler come back alongside them
train_dataset, test_dataset, label_encoders, scaler = prepare_training_data(
    data, test_size=0.2, batch_size=batch_size
)

# Wrap each dataset in a DataLoader; only the training loader shuffles
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, pin_memory=True)

# Run the module's validation routine over both loaders
for heading, loader in (
    ("Validating training data:", train_loader),
    ("\nValidating test data:", test_loader),
):
    print(heading)
    validate_dataset(loader)
print()

# Persist the label encoders, the scaler and both loaders (in that order) so
# that later cells can reload them from disk
for obj, fname in (
    (label_encoders, "label_encoders.pt"),
    (scaler, "scaler.pt"),
    (train_loader, "train_loader.pt"),
    (test_loader, "test_loader.pt"),
):
    torch.save(obj, fname)


In [None]:
# 3. Set up model

from MOT_transformer_model_module import VehicleTransformer

# Model parameters
# One input feature per categorical/numerical column (vehicle_id is dropped
# during sequence creation). Counting the feature lists instead of
# `len(data.columns) - 1` keeps this correct even after `data` has been rebound
# to the wider prediction frame in step 6, which would otherwise silently
# produce a model with the wrong input size.
input_dim = len(categorical_cols) + len(numerical_cols)
d_model = 128
nhead = 8
num_layers = 6
dim_feedforward = 256
num_epochs = 2

# Initialize model and move it to the selected device
model = VehicleTransformer(
    input_dim=input_dim,
    d_model=d_model,
    nhead=nhead,
    num_layers=num_layers,
    dim_feedforward=dim_feedforward
).to(device)


In [None]:
# 4. Train model

from MOT_transformer_model_module import train_model

# Reload the loaders saved in step 2.
# These files hold pickled DataLoader objects rather than plain tensors, so
# weights_only=False is required: torch.load defaults to weights_only=True from
# PyTorch 2.6 and would refuse to unpickle them.
# SECURITY: this unpickles arbitrary Python objects - only load files that this
# notebook wrote itself.
train_loader = torch.load("train_loader.pt", weights_only=False)
test_loader = torch.load("test_loader.pt", weights_only=False)

# Train for num_epochs epochs; the call returns the training/validation metrics,
# a best threshold and the per-threshold results
train_metrics, val_metrics, best_threshold, threshold_results = train_model(
    model, train_loader, test_loader, num_epochs=num_epochs, device=device
)


In [None]:
# 5. Analyze model predictions on test data

from MOT_transformer_model_module import analyze_model_predictions, transformer_figure

# Load the loader, label encoders and scaler - don't need to rerun previous cells.
# These files hold pickled Python objects rather than plain tensors, so
# weights_only=False is required: torch.load defaults to weights_only=True from
# PyTorch 2.6 and would refuse to unpickle them.
# SECURITY: this unpickles arbitrary Python objects - only load files that this
# notebook wrote itself.
test_loader = torch.load("test_loader.pt", weights_only=False)
label_encoders = torch.load("label_encoders.pt", weights_only=False)
scaler = torch.load("scaler.pt", weights_only=False)

# NOTE(review): the threshold is hard-coded to 0.32 instead of using the
# best_threshold returned by train_model - presumably so this cell can run
# without step 4; confirm the value is still the intended one.
results = analyze_model_predictions(model, test_loader, scaler, device, best_threshold=0.32)
fig = transformer_figure(results)


In [None]:
# 6. Read in data for predictions:
#
# Reads the four prediction samples, stacks them into `data`, relabels electric
# (EL) and hybrid (HY) vehicles as diesel (DI) or petrol (PE) using a per-cohort
# mileage cut-off, and clears last_test for tests from 2022 onwards.

# Only keep the columns we need for predictions (loop-invariant, so defined once)
categorical_cols = ['fuel_type', 'last_test']
numerical_cols = ['mileage_per_year', 'test_mileage', 'age_year', 'time_between_tests']
predictions_cols = ['vehicle_id', 'test_year', 'make', 'model',
                    'first_use_year', 'fuel efficiency Wh/mi', 'battery capacity (kWh)',
                    'CO2 g/km', 'mass (kg)'
                    ] + categorical_cols + numerical_cols

print("Reading in prediction data...")
data_fuels = []
for veh_name in ["car_diesel", "car_petrol", "car_bev", "car_hev"]:
    print(f"\tReading in {veh_name} data...")
    fpath = os.path.join(data_path, "Sample_Data", f"transformer_prediction_data_{veh_name}.parquet")
    # Ask the parquet reader for just the needed columns so unused columns are
    # never loaded into memory; the explicit reselection pins the column order.
    veh_data = pd.read_parquet(fpath, engine='pyarrow', columns=predictions_cols)[predictions_cols]

    data_fuels.append(veh_data)

data = pd.concat(data_fuels, ignore_index=True)

print("Combined Sample: ", data['vehicle_id'].nunique())

# Next show the distribution of fuel types
for fuel in ["DI", "PE", "EL", "HY"]:
    n = data.loc[data['fuel_type'] == fuel, 'vehicle_id'].nunique()
    print(f"{fuel}: {n} vehicles")

# Change "EL" and "HY" to "DI" or "PE" based upon mileage
# First create a copy of the original fuel type column
data["original_fuel_type"] = data["fuel_type"]
# Categorise EL and HY based upon mileage criteria: for each first-use year the
# cut-off is the midpoint between the mean final-test mileage of DI and of PE.
for first_use_year in range(2005, 2021):
    # Final-test rows of vehicles first used in this year
    cohort_last = (data["first_use_year"] == first_use_year) & (data["last_test"] == True)

    di_mileage = data.loc[cohort_last & (data["fuel_type"] == "DI"), "test_mileage"].mean()
    pe_mileage = data.loc[cohort_last & (data["fuel_type"] == "PE"), "test_mileage"].mean()
    cut_off = (di_mileage + pe_mileage) / 2

    # A cohort without DI or without PE final tests has a NaN mean; round(NaN)
    # raises ValueError, and no comparison against NaN can match anyway. Skip the
    # cohort and leave its EL/HY vehicles to the default applied after the loop.
    if pd.isna(cut_off):
        print(f"Cut-off for {first_use_year}: not available (no DI and/or PE final tests), skipping")
        continue
    print(f"Cut-off for {first_use_year}: {round(cut_off)} miles")

    at_or_above = cohort_last & (data["test_mileage"] >= cut_off)
    below = cohort_last & (data["test_mileage"] < cut_off)
    is_el = data["fuel_type"] == "EL"
    is_hy = data["fuel_type"] == "HY"

    # Get the ids of the HY and EL that are now DI and PE for this first_use_year
    # (all four id sets are collected before any row is relabelled)
    el_di_ids = data.loc[is_el & at_or_above, "vehicle_id"].unique()
    el_pe_ids = data.loc[is_el & below, "vehicle_id"].unique()
    hy_di_ids = data.loc[is_hy & at_or_above, "vehicle_id"].unique()
    hy_pe_ids = data.loc[is_hy & below, "vehicle_id"].unique()

    # Now set these ids as DI or PE (every row of the vehicle is relabelled)
    data.loc[data["vehicle_id"].isin(el_di_ids), "fuel_type"] = "DI"
    data.loc[data["vehicle_id"].isin(el_pe_ids), "fuel_type"] = "PE"
    data.loc[data["vehicle_id"].isin(hy_di_ids), "fuel_type"] = "DI"
    data.loc[data["vehicle_id"].isin(hy_pe_ids), "fuel_type"] = "PE"

# Finally check if there are any vehicles that have not been reclassified
n_el = data.loc[data["fuel_type"] == "EL", "vehicle_id"].nunique()
n_hy = data.loc[data["fuel_type"] == "HY", "vehicle_id"].nunique()
print(f"EL: {n_el} vehicles not reclassified")
print(f"HY: {n_hy} vehicles not reclassified")
# Default these to DI
data.loc[data["fuel_type"] == "EL", "fuel_type"] = "DI"
data.loc[data["fuel_type"] == "HY", "fuel_type"] = "DI"

# Now print the number of ELs and HYs now classified as DI and PE
el_di = data.loc[(data["original_fuel_type"] == "EL") &
                 (data["fuel_type"] == "DI"), "vehicle_id"].nunique()
el_pe = data.loc[(data["original_fuel_type"] == "EL") &
                 (data["fuel_type"] == "PE"), "vehicle_id"].nunique()
hy_di = data.loc[(data["original_fuel_type"] == "HY") &
                 (data["fuel_type"] == "DI"), "vehicle_id"].nunique()
hy_pe = data.loc[(data["original_fuel_type"] == "HY") &
                 (data["fuel_type"] == "PE"), "vehicle_id"].nunique()
di_original = data.loc[data["original_fuel_type"] == "DI", "vehicle_id"].nunique()
pe_original = data.loc[data["original_fuel_type"] == "PE", "vehicle_id"].nunique()
print(f"EL: {el_di} DI, {el_pe} PE")
print(f"HY: {hy_di} DI, {hy_pe} PE")
print(f"DI: {di_original} original, {el_di + hy_di} reclassified")
print(f"PE: {pe_original} original, {el_pe + hy_pe} reclassified")

# Finally reset last_test to False for all tests from 2022 onwards
data.loc[data['test_year'] >= 2022, 'last_test'] = False

# Drop the references to the per-fuel frames now that they have been combined
del data_fuels, veh_data
gc.collect()

print(f"Data loaded. Shape: {data.shape}")
print(data.describe().to_string())
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
data.info()

In [None]:
# 7. Predictions
#
# Repeatedly simulates the next test for every vehicle still in `data`. After
# each round the vehicles flagged with last_test are exported to their own CSV
# and removed; finally all exported CSVs are combined into simulation_data.csv.
from MOT_transformer_model_module import generate_predictions

# Load the saved state dict
if device.type == 'cuda':
    torch.cuda.empty_cache()
# map_location lets a checkpoint that was saved on a GPU load on a CPU-only machine
model.load_state_dict(torch.load('best_model.pth', map_location=device))
model.eval()  # Put the model in evaluation mode

# Load scaler and label encoders - don't need to run earlier code if loading from here.
# These files hold pickled Python objects rather than plain tensors, so
# weights_only=False is required: torch.load defaults to weights_only=True from
# PyTorch 2.6 and would refuse to unpickle them.
# SECURITY: this unpickles arbitrary Python objects - only load files that this
# notebook wrote itself.
scaler = torch.load("scaler.pt", weights_only=False)
label_encoders = torch.load("label_encoders.pt", weights_only=False)

data['simulated_data'] = False
data['scrap_probability'] = 0.0

# Every simulation file lives in one folder. The original wrote the per-iteration
# files to ./simulation_data but read them back from <data_path>/simulation_data,
# so the aggregation step could not find what the loop had just written.
simulation_dir = os.path.join(data_path, "simulation_data")
os.makedirs(simulation_dir, exist_ok=True)

# First remove vehicles that have already been scrapped
ids = data.loc[data["last_test"]==True, "vehicle_id"].unique()
scrapped_data = data[data["vehicle_id"].isin(ids)]
data = data[~data["vehicle_id"].isin(ids)]
# Output the scrapped data
scrapped_data.to_csv(os.path.join(simulation_dir, "simulation_data_0.csv"), index=False)
# Number of simulation_data_<k>.csv files written so far (k = 0 .. n_exports - 1)
n_exports = 1

# Simulate the next test for all vehicles
max_iterations = 30
i = 1
# NOTE(review): the loop stops once at most one row is left, so a single
# remaining row is neither simulated nor exported - confirm that is intended.
while len(data) > 1:
    print(f"\nIteration {i}")
    print(f"Data shape: {data.shape}")
    print(f"Unique vehicles: {data['vehicle_id'].nunique()}")

    # Generate predictions for next tests
    # batch size much larger than training as we are not backpropagating
    new_data = generate_predictions(model, data, label_encoders, scaler, device, batch_size=50_000, threshold=0.22)

    # Concatenate with existing data and sort
    data = pd.concat([data, new_data], ignore_index=True)
    data = data.sort_values(['vehicle_id', 'test_year']).reset_index(drop=True)

    # Filter out vehicles that have been predicted to be scrapped and export them
    scrapped_ids = data.loc[data["last_test"]==True, "vehicle_id"].unique()
    data_scrapped = data[data["vehicle_id"].isin(scrapped_ids)]
    data_scrapped.to_csv(os.path.join(simulation_dir, f"simulation_data_{i}.csv"), index=False)
    n_exports = i + 1

    # Briefly summarise the scrapped vehicle data
    print(f"Scrapped vehicles: {len(scrapped_ids)}")
    for fuel in ["DI", "PE", "EL", "HY"]:
        n = data_scrapped.loc[data_scrapped["original_fuel_type"]==fuel, "vehicle_id"].nunique()
        print(f"{fuel}: {n}")

    # Final-test rows of the scrapped vehicles, selected with data_scrapped's own
    # column (the original indexed data_scrapped with a mask built from `data`,
    # which only worked through implicit index alignment)
    final_tests = data_scrapped[data_scrapped["last_test"]==True]

    # Show the scrapped vehicles values counts by age
    print("Scrapped vehicles by age:")
    print(final_tests['age_year'].round().value_counts().sort_index())
    # And finally the mileage distribution by fuel type
    print("Mileage by fuel type:")
    for fuel in ["DI", "PE", "EL", "HY"]:
        mean_mileage = final_tests.loc[final_tests['original_fuel_type'] == fuel, 'test_mileage'].mean()
        print(f"{fuel}: {mean_mileage}")

    # Keep remaining vehicles for next round of predictions
    data = data[~data["vehicle_id"].isin(scrapped_ids)]

    # Break if iteration limit reached, exporting the vehicles that are still left
    if i == max_iterations:
        data.to_csv(os.path.join(simulation_dir, "simulation_data_end.csv"), index=False)
        break

    i += 1

# Aggregate all the scrapped data.
# Iterate over the number of files actually written: the original used
# range(0, i), which skipped the last file whenever the loop ended through the
# iteration-limit break (i is not incremented on that path).
print("Aggregating scrapped data...")
frames = []
for file_idx in range(n_exports):
    fname = f"simulation_data_{file_idx}.csv"
    print(f"\tReading in {fname}")
    frames.append(pd.read_csv(os.path.join(simulation_dir, fname)))

print("Outputting aggregated scrapped data...")
df = pd.concat(frames, ignore_index=True)
df.to_csv(os.path.join(simulation_dir, "simulation_data.csv"), index=False)