In [1]:
import numpy as np
import pandas as pd

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier


In [12]:
# NOTE(review): this cell carries execution count 12 while its neighbours are
# 1 and 2, so it was executed out of order. It loads scikit-learn's
# breast-cancer dataset and shows its feature names; `data` is not referenced
# by any other cell visible in this notebook -- looks like leftover scratch
# work unrelated to the grid-position model. TODO confirm and remove.
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()
data.feature_names


array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error',
       'fractal dimension error', 'worst radius', 'worst texture',
       'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry', 'worst fractal dimension'], dtype='<U23')

In [2]:
import pandas as pd

# Read the pre-built feature table from disk (the path is relative to the
# notebook's working directory) and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="data_features/sg_features_v1.csv")
df.head(n=5)


Unnamed: 0,year,Sector1Time_s,Sector2Time_s,Sector3Time_s,TyreLife,Driver_id,Team_id,Compound_id,final_grid_pos
0,2018,26.84,38.065,33.736,2,1,8,0,11
1,2018,26.468,37.284,32.95,2,2,9,0,4
2,2018,27.077,38.141,34.148,2,4,14,0,14
3,2018,27.253,38.28,34.081,2,5,15,0,15
4,2018,26.772,37.737,33.811,1,7,6,0,8


In [3]:
# Quick structural overview of the loaded frame. A notebook cell only renders
# its final expression, so the shape and the column names are printed
# explicitly; the last line stays bare so it is shown as the cell result.
print(df.shape)
print(list(df.columns))
# Ten columns with the most missing values, worst first.
df.isna().sum().sort_values(ascending=False).head(10)

year              0
Sector1Time_s     0
Sector2Time_s     0
Sector3Time_s     0
TyreLife          0
Driver_id         0
Team_id           0
Compound_id       0
final_grid_pos    0
dtype: int64

In [4]:
# Name of the column the models are asked to predict.
TARGET = "final_grid_pos"

# Separate the label series from the remaining feature columns.
y = df[TARGET]
X = df.drop(TARGET, axis="columns")

# Preview both pieces; the cell result is a 2-tuple of the two head() views.
(X.head(), y.head())


(   year  Sector1Time_s  Sector2Time_s  Sector3Time_s  TyreLife  Driver_id  \
 0  2018         26.840         38.065         33.736         2          1   
 1  2018         26.468         37.284         32.950         2          2   
 2  2018         27.077         38.141         34.148         2          4   
 3  2018         27.253         38.280         34.081         2          5   
 4  2018         26.772         37.737         33.811         1          7   
 
    Team_id  Compound_id  
 0        8            0  
 1        9            0  
 2       14            0  
 3       15            0  
 4        6            0  ,
 0    11
 1     4
 2    14
 3    15
 4     8
 Name: final_grid_pos, dtype: int64)

In [5]:
# Time-based hold-out: rows up to and including LAST_TRAIN_YEAR are used for
# fitting, every later year is held out for evaluation. Both masks derive from
# one constant, so the halves stay disjoint and leave no gap between them if
# the cutoff is ever moved.
LAST_TRAIN_YEAR = 2021

is_train = df["year"] <= LAST_TRAIN_YEAR
is_test = df["year"] > LAST_TRAIN_YEAR

train_df = df[is_train]
test_df  = df[is_test]

# Every row must land in exactly one split; a mismatch means some `year`
# values are missing or not comparable to the cutoff.
assert len(train_df) + len(test_df) == len(df), "rows lost in year-based split"

# NOTE(review): only TARGET is dropped, so `year` stays in the feature matrix
# even though no test-set year ever occurs in the training rows -- TODO
# confirm this is intended.
X_train = train_df.drop(columns=[TARGET])
y_train = train_df[TARGET]

X_test  = test_df.drop(columns=[TARGET])
y_test  = test_df[TARGET]

# (number of training rows, number of test rows)
len(X_train), len(X_test)


(40, 60)

In [6]:
import numpy as np
from sklearn.metrics import mean_absolute_error

def evaluate_regression(y_true, y_pred):
    """Summarise regression error as MAE plus "within ±k" hit rates.

    Both inputs are converted to flat float NumPy arrays before they are
    compared, so lists, arrays and pandas objects are all accepted and values
    are always paired by position (pandas index labels are ignored). All four
    metrics are computed from the same absolute-error vector.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted values, with as many elements as ``y_true``.

    Returns
    -------
    dict
        ``"MAE"`` is the mean absolute error; ``"Within±1"``, ``"Within±2"``
        and ``"Within±3"`` are the fractions (0.0-1.0) of predictions whose
        absolute error is at most 1, 2 and 3 respectively.

    Raises
    ------
    ValueError
        If the inputs are empty, differ in number of elements, or contain
        NaN / infinite values.
    """
    true_arr = np.asarray(y_true, dtype=float).ravel()
    pred_arr = np.asarray(y_pred, dtype=float).ravel()

    if true_arr.size != pred_arr.size:
        raise ValueError(
            f"y_true and y_pred differ in length: {true_arr.size} vs {pred_arr.size}"
        )
    if true_arr.size == 0:
        raise ValueError("y_true and y_pred must not be empty")
    if not (np.isfinite(true_arr).all() and np.isfinite(pred_arr).all()):
        raise ValueError("y_true and y_pred must not contain NaN or infinite values")

    # Plain ndarray subtraction: strictly element-by-element, with no
    # label-based alignment that could inject NaNs or reorder values.
    abs_err = np.abs(pred_arr - true_arr)
    return {
        "MAE": float(abs_err.mean()),
        "Within±1": float((abs_err <= 1).mean()),
        "Within±2": float((abs_err <= 2).mean()),
        "Within±3": float((abs_err <= 3).mean()),
    }


In [7]:
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

# Three gradient-boosting regressors compared under the same seed (42), 300
# boosting rounds and a 0.1 learning rate. Tree depth is set explicitly for
# XGBoost and CatBoost only; LightGBM receives no depth argument.
# NOTE(review): the captured LightGBM training log reports 40 training rows
# but only 2 used features (of the 8 columns passed in) and 5 total bins --
# presumably a consequence of its defaults on such a small sample; TODO
# confirm before trusting the comparison between the three models.
models = dict(
    XGBoost=XGBRegressor(
        n_estimators=300,
        learning_rate=0.1,
        max_depth=4,
        random_state=42,
    ),
    LightGBM=LGBMRegressor(
        n_estimators=300,
        learning_rate=0.1,
        random_state=42,
    ),
    CatBoost=CatBoostRegressor(
        iterations=300,
        learning_rate=0.1,
        depth=6,
        random_seed=42,
        verbose=False,              # silence CatBoost's training output
        allow_writing_files=False,  # ask CatBoost not to write files while training
    ),
)


In [8]:
# Fit every candidate on the training split, score it on the held-out split
# and collect one record per model.
rows = []

for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    metrics = evaluate_regression(y_test, preds)

    record = {"Model": name}
    record.update(metrics)
    rows.append(record)

results = pd.DataFrame(rows)

# Presentation only: hit rates become percentages with one decimal place and
# MAE is rounded to two decimals.
for col in ("Within±1", "Within±2", "Within±3"):
    results[col] = results[col].mul(100).round(1)

results["MAE"] = results["MAE"].round(2)

results


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000178 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5
[LightGBM] [Info] Number of data points in the train set: 40, number of used features: 2
[LightGBM] [Info] Start training from score 10.500000


Unnamed: 0,Model,MAE,Within±1,Within±2,Within±3
0,XGBoost,5.46,10.0,23.3,36.7
1,LightGBM,5.0,10.0,20.0,30.0
2,CatBoost,4.85,13.3,26.7,40.0
