# Final model

This code recreates the best model using the optimization method (Newton-Raphson Method) found in the previous study (_2_Analysis_and_Optimization.ipynb_), this time using 100% of the data as training.

It is possible to make predictions using the values from the last code.

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats

from IPython.display import display, HTML
from sklearn.preprocessing import LabelEncoder
from matplotlib.lines import Line2D

In [2]:
dataset = pd.read_csv("dataset/dataset_final.csv")

variables = {
    'LapTime': 'Lap time (s)',
    'EventName': 'Track name',
    'Team': 'Team name',
    'Compound': 'Tyre compound',
    'TyreLife': 'Tyre life (laps)',
    'FreshTyre': 'Fresh tyre',
    'FuelLevel': 'Fuel level (kg)',
    'AirTemp': 'Air temperature (°C)',
    'Humidity': 'Humidity (%)',
    'Pressure': 'Air pressure (mbar)',
    'Rainfall': 'Rainfall',
    'TrackTemp': 'Track temperature (°C)'
}


In [8]:
def standardize(X, mean=None, std=None):
    if mean is None: mean = np.mean(X, axis=0)
    if std is None: std = np.std(X, axis=0)
    return (X - mean) / std, mean, std

def destandardize(X_std, mean, std):
    return X_std * std + mean

def one_hot_encode(values, categories=None):
    if categories is None:
        categories = sorted(set(values))
    one_hot = np.zeros((len(values), len(categories)))
    for i, val in enumerate(values):
        if val in categories:
            one_hot[i][categories.index(val)] = 1
    return one_hot, categories

def build_model_data(X, y):
    X = np.c_[np.ones(X.shape[0]), X]
    return X, y

lap_time = dataset['LapTime']
event_name = dataset['EventName']
team = dataset['Team']
compound = dataset['Compound']
tyre_life = dataset['TyreLife']
fuel_level = dataset['FuelLevel']
rainfall = dataset['Rainfall']
track_temp = dataset['TrackTemp']

X_cat_values = np.c_[
    event_name,
    team,
    compound,
    rainfall  
]

X_num = np.c_[
    tyre_life,
    track_temp,
    fuel_level
]

y = lap_time
encoded = []
categories_dict = {}
baseline_categories = []

for col in range(X_cat_values.shape[1]):
    vals = X_cat_values[:, col]
    oh, cats = one_hot_encode(vals)
    oh = oh[:, 1:]
    
    encoded.append(oh)
    categories_dict[col] = cats[1:]
    baseline_categories.append((col, cats[0]))

X_cat_oh = np.hstack(encoded)
X_num_std, mu_X, std_X = standardize(X_num)
y_std, mu_y, std_y = standardize(y)

X_full = np.c_[X_cat_oh, X_num_std]
X, y = build_model_data(X_full, y_std)

cat_keys = ['EventName', 'Team', 'Compound', 'Rainfall']  
cat_labels = [variables[key] for key in cat_keys]

print("Categorical variables (with dropped baseline):")
for idx, baseline in baseline_categories:
    label = cat_labels[idx]
    print(f"'{label}': baseline = '{baseline}'")


Categorical variables (with dropped baseline):
'Track name': baseline = 'Abu Dhabi Grand Prix'
'Team name': baseline = 'Alpine'
'Tyre compound': baseline = 'HARD'
'Rainfall': baseline = 'False'


In [9]:
def loss(w, X, y):
    n = X.shape[0]
    residuals = X @ w - y
    return (1 / (2 * n)) * np.sum(residuals ** 2)

def grad(w, X, y):
    n = X.shape[0]
    residuals = X @ w - y
    return (1 / n) * (X.T @ residuals)

def hess(w, X, y):
    n = X.shape[0]
    return (1 / n) * (X.T @ X)

def print_log(step, f_curr, grad_norm, batch_loss=None):
    if batch_loss is not None:
        print(f"[STEP {step:4d}] batch loss = {batch_loss:.5f} | f(x) = {f_curr:.5f} | ||grad|| = {grad_norm:.2e}")
    else:
        print(f"[STEP {step:4d}] f(x) = {f_curr:.5f} | ||grad|| = {grad_norm:.2e}")


In [10]:
def newton_raphson_method(
    x0, 
    X, 
    y, 
    max_iter=100, 
    tol=1e-4, 
    verbose=True
):
    x = x0.copy()
    x_vals = [x.copy()]
    f_vals = [loss(x, X, y)]

    for step in range(1, max_iter + 1):
        g = grad(x, X, y)
        H = hess(x, X, y)
        grad_norm = np.linalg.norm(g)
        f_curr = loss(x, X, y)

        if np.isnan(f_curr) or f_curr > 1e6:
            print("STOP - Divergence detected")
            break

        if verbose:
            print_log(step, f_curr, grad_norm)

        if grad_norm < tol:
            break

        try:
            delta = np.linalg.solve(H, g)
        except np.linalg.LinAlgError:
            print("STOP - Hessian not invertible")
            break

        x -= delta
        x_vals.append(x.copy())
        f_vals.append(loss(x, X, y))

    return f_vals, x_vals


In [11]:
x0 = np.zeros(X.shape[1])
max_iter = 10000

print("[TRAIN] Newton-Raphson Method:")
f_NR, x_NR = newton_raphson_method(x0, X, y, max_iter=max_iter)
x_final = x_NR[-1]

[TRAIN] Newton-Raphson Method:
[STEP    1] f(x) = 0.50000 | ||grad|| = 3.02e-01
[STEP    2] f(x) = 0.02051 | ||grad|| = 4.45e-16


In [12]:
event_categories = categories_dict[0]
team_categories = categories_dict[1]
compound_categories = categories_dict[2]
rainfall_categories = categories_dict[3]  

event_feature_names = [f"Track name [{cat}]" for cat in event_categories]
team_feature_names = [f"Team name [{cat}]" for cat in team_categories]
compound_feature_names = [f"Compound [{cat}]" for cat in compound_categories]
rainfall_feature_names = [f"Rainfall [{cat}]" for cat in rainfall_categories]

num_keys = ['TyreLife', 'TrackTemp', 'FuelLevel']
num_feature_names = [variables[k] for k in num_keys]

feature_names = (
    ["bias"] +
    event_feature_names +
    team_feature_names +
    compound_feature_names +
    rainfall_feature_names +  
    num_feature_names
)

coeff_df = pd.DataFrame({
    "Feature": feature_names,
    "Coefficient": x_final
})

def style_coeff(val, tol=1e-6):
    if abs(val) < tol:
        color = 'color: grey'
    else:
        color = 'color: green' if val > 0 else 'color: red'
    return f'{color}; font-weight: bold'

coeff_df_styled = (
    coeff_df.style
    .format({'Coefficient': '{:.3f}'})
    .map(style_coeff, subset=['Coefficient'])
)

coeff_df_styled


Unnamed: 0,Feature,Coefficient
0,bias,0.127
1,Track name [Australian Grand Prix],-0.689
2,Track name [Austrian Grand Prix],-1.761
3,Track name [Azerbaijan Grand Prix],1.636
4,Track name [Bahrain Grand Prix],0.77
5,Track name [Belgian Grand Prix],1.985
6,Track name [British Grand Prix],0.292
7,Track name [Canadian Grand Prix],-1.054
8,Track name [Chinese Grand Prix],1.163
9,Track name [Dutch Grand Prix],-1.216


In [15]:
def format_lap_time(seconds):
    minutes = int(seconds // 60)
    sec = int(seconds % 60)
    millis = int((seconds - int(seconds)) * 1000)
    return f"{minutes:02d}:{sec:02d}:{millis:03d}"


In [16]:
def predict_lap_time(
    event, team, compound, rainfall,             
    tyre_life, track_temp, fuel_level            
):
    input_cat = [event, team, compound, rainfall]
    input_encoded = []

    for idx, val in enumerate(input_cat):
        categories = categories_dict[idx]
        one_hot = np.zeros(len(categories))  
        if val in categories:
            one_hot[categories.index(val)] = 1
        input_encoded.extend(one_hot)

    input_num = np.array([tyre_life, track_temp, fuel_level])
    input_num_std = (input_num - mu_X) / std_X

    x_input = np.concatenate([[1], input_encoded, input_num_std])

    y_std_pred = np.dot(x_final, x_input)
    y_pred = destandardize(y_std_pred, mu_y, std_y)

    formatted_time = format_lap_time(y_pred)
    return y_pred, formatted_time


In [22]:
y_value, formatted = predict_lap_time(
    event='Bahrain Grand Prix',
    team='Ferrari',
    compound='MEDIUM',
    rainfall='No',
    tyre_life=4,
    track_temp=30.0,
    fuel_level=95
)

print(f"Predicted lap time: {formatted} (≈ {y_value:.3f} s)")


Predicted lap time: 01:38:793 (≈ 98.794 s)
