In [2]:
import numpy as np
import pandas as pd
import plotly.express as px
import sklearn.model_selection as skm
from joblib import parallel_backend
from lightgbm import LGBMClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import (
    accuracy_score,
    auc,
    average_precision_score,
    f1_score,
    fbeta_score,
    make_scorer,
    precision_recall_curve,
    precision_score,
    recall_score,
    roc_curve,
)
from sklearn.model_selection import (
    RandomizedSearchCV,
    StratifiedKFold,
    cross_val_predict,
    cross_val_score,
)

In [3]:
# NUM_POSTE   : numéro Météo-France du poste sur 8 chiffres
# NOM_USUEL   : nom usuel du poste
# LAT         : latitude, négative au sud (en degrés et millionièmes de degré)
# LON         : longitude, négative à l’ouest de GREENWICH (en degrés et millionièmes de degré)
# ALTI        : altitude du pied de l'abri ou du pluviomètre si pas d'abri (en m)
# AAAAMMJJ    : date de la mesure (année mois jour)
# RR          : quantité de précipitation tombée en 24 heures (de 06h FU le jour J à 06h FU le jour J+1). La valeur relevée à J+1 est affectée au jour J (en mm et 1/10)
# TN          : température minimale sous abri (en °C et 1/10)
# HTN         : heure de TN (hhmm)
# TX          : température maximale sous abri (en °C et 1/10)
# HTX         : heure de TX (hhmm)
# TM          : moyenne quotidienne des températures horaires sous abri (en °C et 1/10)
# TNTXM       : moyenne quotidienne (TN+TX)/2 (en °C et 1/10)
# TAMPLI      : amplitude thermique quotidienne : écart entre TX et TN quotidiens (TX-TN) (en °C et 1/10)
# TNSOL       : température quotidienne minimale à 10 cm au-dessus du sol (en °C et 1/10)
# TN50        : température quotidienne minimale à 50 cm au-dessus du sol (en °C et 1/10)
# DG          : durée de gel sous abri (T ≤ 0°C) (en mn)
# FFM         : moyenne quotidienne de la force du vent moyenné sur 10 mn, à 10 m (en m/s et 1/10)
# FF2M        : moyenne quotidienne de la force du vent moyenné sur 10 mn, à 2 m (en m/s et 1/10)
# FXY         : maximum quotidien de la force maximale horaire du vent moyenné sur 10 mn, à 10 m (en m/s et 1/10)
# DXY         : direction de FXY (en rose de 360)
# HXY         : heure de FXY (hhmm)
# FXI         : maximum quotidien de la force maximale horaire du vent instantané, à 10 m (en m/s et 1/10)
# DXI         : direction de FXI (en rose de 360)
# HXI         : heure de FXI (hhmm)
# FXI2        : maximum quotidien de la force maximale horaire du vent instantané, à 2 m (en m/s et 1/10)
# DXI2        : direction de FXI2 (en rose de 360)
# HXI2        : heure de FXI2 (hhmm)
# FXI3S       : maximum quotidien de la force maximale horaire du vent moyenné sur 3 s, à 10 m (en m/s et 1/10)
# DXI3S       : direction de FXI3S (en rose de 360)
# HXI3S       : heure de FXI3S (hhmm)
# DRR         : durée des précipitations (en mn)

# A chaque donnée est associé un code qualité (ex: T;QT) :
#  9 : donnée filtrée (la donnée a passé les filtres/contrôles de premiers niveaux)
#  0 : donnée protégée (la donnée a été validée définitivement par le climatologue)
#  1 : donnée validée (la donnée a été validée par contrôle automatique ou par le climatologue)
#  2 : donnée douteuse en cours de vérification (la donnée a été mise en doute par contrôle automatique)

# D'une façon générale, les valeurs fournies sont données avec une précision qui correspond globalement à la résolution de l'appareil de mesure de la valeur.
# Toutefois, il peut arriver, pour des raisons techniques de stokage ou d'extraction des valeurs, que cette règle ne soit pas respectée.
# Du fait d'arrondis, il peut ponctuellement arriver que des valeurs de base à un pas de temps inférieur (par exemple données minutes) ne soient pas exactement cohérentes avec leurs correspondants sur un pas de temps supérieur (par exemple données horaires).

In [4]:
df = pd.read_feather("../data/merged_meteo_red_days_from_20170101.feather")
print(df.columns)
df.head()

ImportError: Missing optional dependency 'pyarrow'.  Use pip or conda to install pyarrow.

In [None]:
# Remove days after 20240311
df = df[df["AAAAMMJJ"] < 20240401]
df.tail()

Unnamed: 0,AAAAMMJJ,TN_BORDEAUX,TN_LILLE,TN_LYON,TN_MARSEILLE,TN_MONTPELLIER,TN_NANTES,TN_NICE,TN_PARIS,TN_REIMS,...,TAMPLI_MONTPELLIER,TAMPLI_NANTES,TAMPLI_NICE,TAMPLI_PARIS,TAMPLI_REIMS,TAMPLI_RENNES,TAMPLI_STRASBOURG,TAMPLI_TOULON,TAMPLI_TOULOUSE,is_red_day
2622,20240307,5.7,3.4,0.3,0.8,0.5,4.0,7.4,3.8,-0.6,...,12.5,10.2,7.2,10.4,13.7,11.0,5.5,13.4,10.7,True
2623,20240308,8.9,2.4,4.8,7.0,8.7,6.9,6.9,3.6,-1.0,...,4.5,5.2,6.4,10.3,15.2,6.5,12.3,5.5,5.6,False
2624,20240309,7.5,5.2,5.9,7.7,9.9,6.2,8.2,6.4,2.0,...,3.5,3.7,4.7,8.1,12.7,4.5,12.3,4.4,6.3,False
2625,20240310,3.3,7.6,5.7,9.2,6.3,0.9,8.4,7.1,5.5,...,8.8,11.9,4.9,4.3,5.5,13.1,5.3,5.0,12.9,False
2626,20240311,4.5,7.6,5.1,6.7,7.6,3.9,8.1,6.4,0.2,...,7.6,10.1,5.4,6.6,13.7,13.9,10.5,8.4,4.3,True


In [None]:
# Count nan values and show nan cols:

print(df.isna().sum())
pd.set_option("display.max_rows", None)
df.columns[df.isna().sum() > 0]

AAAAMMJJ                0
TN_BORDEAUX             0
TN_LILLE                0
TN_LYON                 0
TN_MARSEILLE            0
TN_MONTPELLIER        365
TN_NANTES               0
TN_NICE                 0
TN_PARIS                0
TN_REIMS                0
TN_RENNES               0
TN_STRASBOURG           0
TN_TOULON               0
TN_TOULOUSE             0
TX_BORDEAUX             0
TX_LILLE                0
TX_LYON                 0
TX_MARSEILLE            0
TX_MONTPELLIER        365
TX_NANTES               0
TX_NICE                 0
TX_PARIS                0
TX_REIMS                0
TX_RENNES               0
TX_STRASBOURG           0
TX_TOULON               0
TX_TOULOUSE             0
TNTXM_BORDEAUX          0
TNTXM_LILLE             0
TNTXM_LYON              0
TNTXM_MARSEILLE         0
TNTXM_MONTPELLIER     365
TNTXM_NANTES            0
TNTXM_NICE              0
TNTXM_PARIS             0
TNTXM_REIMS             0
TNTXM_RENNES            0
TNTXM_STRASBOURG        0
TNTXM_TOULON

Index(['TN_MONTPELLIER', 'TX_MONTPELLIER', 'TNTXM_MONTPELLIER',
       'TAMPLI_MONTPELLIER'],
      dtype='object')

In [None]:
# drop montepellier etc
cities_to_drop = ["MONTPELLIER", "REIMS", "RENNES", "NICE"]


df.drop(
    columns=[c for c in df.columns if c.split("_")[-1] in cities_to_drop],
    axis=1,
    inplace=True,
)
assert df.isna().sum().sum() == 0, "There are still NaN values in the dataframe"

Feature engineering

In [None]:
# Add a is week day feature:
daydt = pd.to_datetime(df["AAAAMMJJ"], format="%Y%m%d").dt
df["is_week_day"] = (daydt.dayofweek < 5).astype('category')


# Feature as int
df["is_red_day"] = df["is_red_day"].astype(int)

# if last day was red feature
df["last_day_was_red"] = df["is_red_day"].shift(1).fillna(0).astype('category')

# Red days in last week feature
df["red_days_last_week"] = (
    df["is_red_day"]
    .rolling(window=7, min_periods=1)
    .sum()
    .shift(1)
    .fillna(0)
    .astype(bool)
)
df.drop(columns=["last_day_was_red"], inplace=True)

# Month feature
df["month"] = daydt.month.astype('category')

In [None]:
# Remove data between  01/04 and 01/11 and prior to 2017-04
df = df[~((daydt.month >= 4) & (daydt.month <= 10))]
df = df[~((daydt.year == 2017) & (daydt.month < 4))]

  df = df[~((daydt.year == 2017) & (daydt.month < 4))]


Learning

In [None]:
#  show values count in y
df["is_red_day"].value_counts()

is_red_day
0    892
1    147
Name: count, dtype: int64

In [None]:
def confusion_table(
    y_true: np.ndarray | list, y_pred: np.ndarray | list
) -> pd.DataFrame:
    """Rows: Actual, Columns: Predicted. Aligns inputs by position (not index)."""
    y_true_series = pd.Series(np.asarray(y_true), name="Actual")
    y_pred_series = pd.Series(np.asarray(y_pred), name="Predicted")

    if len(y_true_series) != len(y_pred_series):
        raise ValueError(
            f"y_true and y_pred must have same length, got {len(y_true_series)} and {len(y_pred_series)}"
        )

    ct = pd.crosstab(y_true_series, y_pred_series)
    return ct


def evaluate(model, X, y, plot_roc_rurve=False, threshold=0.5):
    y_probs = model.predict_proba(X)[:, 1]
    y_pred = (y_probs >= threshold).astype(int)

    print(f"Accuracy: {accuracy_score(y, y_pred)}")
    print("Precision", precision_score(y, y_pred))
    print("Recall", recall_score(y, y_pred))
    print("F1 score", f1_score(y, y_pred))
    print("F2 score", fbeta_score(y, y_pred, beta=2))
    print("Average precision (AUPRC)", average_precision_score(y, y_probs))

    display(confusion_table(y, y_pred))

    if plot_roc_rurve:
        y_pred_proba = model.predict_proba(X)[:, 1]
        fpr, tpr, _ = roc_curve(y, y_pred_proba)
        roc_auc = auc(fpr, tpr)
        fig = px.area(
            x=fpr,
            y=tpr,
            title=f"ROC Curve (AUC={roc_auc:.4f})",
            labels=dict(x="False Positive Rate", y="True Positive Rate"),
            width=700,
            height=500,
        )
        fig.show()

In [None]:
X = df.drop(["is_red_day", "AAAAMMJJ"], axis=1)
todrop = []
for c in X.columns:
    if "TAMPLI" in c or "RR" in c or "FFM" in c or "TM" in c or "TNTXM" in c:
        todrop.append(c)


X = X.drop(todrop, axis=1)
y = df["is_red_day"]


X_train, X_test, y_train, y_test = skm.train_test_split(
    X, y, test_size=0.2, random_state=42
)
f2_scorer = make_scorer(fbeta_score, beta=2)

In [None]:
X.columns

Index(['TN_BORDEAUX', 'TN_LILLE', 'TN_LYON', 'TN_MARSEILLE', 'TN_NANTES',
       'TN_PARIS', 'TN_STRASBOURG', 'TN_TOULON', 'TN_TOULOUSE', 'TX_BORDEAUX',
       'TX_LILLE', 'TX_LYON', 'TX_MARSEILLE', 'TX_NANTES', 'TX_PARIS',
       'TX_STRASBOURG', 'TX_TOULON', 'TX_TOULOUSE', 'is_week_day',
       'red_days_last_week', 'month'],
      dtype='object')

In [None]:
# Define the parameter grid


SEED = 42
RUN_HP_TUNING = True

if RUN_HP_TUNING:
    param_grid = {
        "n_estimators": [100, 200, 500],
        "learning_rate": [0.01, 0.05],
        "max_depth": [3, 5, 10],
        "num_leaves": [15, 31],
        "min_child_samples": [20, 50, 100],
        "reg_alpha": [0.1, 1, 10],
        "reg_lambda": [0.1, 1, 10],
        "scale_pos_weight": [5, 10, 20],
    }

    # Perform GridSearchCV
    lgbm = LGBMClassifier(random_state=SEED, force_row_wise=True, verbose=-1, n_jobs=-1)

    kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=SEED)
    random_search = RandomizedSearchCV(
        lgbm,
        param_grid,
        scoring="average_precision",  # This is the PR AUC alias,
        n_iter=50,
        cv=kf,
        verbose=1,
        n_jobs=-1,
        random_state=SEED,
    )

    with parallel_backend("loky"):
        random_search.fit(X_train, y_train)

    # Print the best parameters and evaluate
    print("Best parameters found: ", random_search.best_params_)
    print("Best score:", random_search.best_score_)

Fitting 3 folds for each of 50 candidates, totalling 150 fits
Best parameters found:  {'scale_pos_weight': 5, 'reg_lambda': 1, 'reg_alpha': 10, 'num_leaves': 15, 'n_estimators': 100, 'min_child_samples': 50, 'max_depth': 3, 'learning_rate': 0.05}
Best score: 0.6381369748676083


In [None]:
# Obtained with random search
best_params =  {
    "scale_pos_weight": 5,
    "reg_lambda": 1,
    "reg_alpha": 10,
    "num_leaves": 15,
    "n_estimators": 100,
    "min_child_samples": 50,
    "max_depth": 3,
    "learning_rate": 0.05,
}

lgbm = LGBMClassifier(random_state=0, force_row_wise=True, verbose=-1, **best_params)
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# We use cv to ensure data efficiency
calibrated_lgbm = CalibratedClassifierCV(
    estimator=lgbm,
    method="sigmoid",  # Platt scaling
    cv=kf,
    ensemble=False, # for easier onnx export
)
calibrated_lgbm.fit(X_train, y_train)

# print cv average_precision scores
cv_scores = cross_val_score(
    calibrated_lgbm,
    X_train,
    y_train,
    scoring="average_precision",
    cv=kf,
)
print("CV Average Precision scores:", cv_scores)
print("Mean CV Average Precision:", cv_scores.mean())

CV Average Precision scores: [0.64681238 0.68184614 0.58328588 0.64252494 0.66743472]
Mean CV Average Precision: 0.6443808122003875


In [5]:
def plot_cv_pr_curve(calibrated_lgbm, X_train, y_train, kf):
    """Plot Cross-Validated Precision-Recall Curve using OOF probabilities"""
    y_oof_probs = cross_val_predict(
        calibrated_lgbm, X_train, y_train, cv=kf, method="predict_proba"
    )[:, 1]
    precision, recall, thresholds = precision_recall_curve(y_train, y_oof_probs)
    pr_auc = average_precision_score(y_train, y_oof_probs)

    # We add a 1 to the end of thresholds to match the length of precision/recall
    thresholds_padded = np.append(thresholds, 1)
    pr_curve_df = pd.DataFrame(
        {"Recall": recall, "Precision": precision, "Threshold": thresholds_padded}
    )

    fig = px.area(
        pr_curve_df,
        x="Recall",
        y="Precision",
        title=f"Cross-Validated Precision-Recall Curve (PR-AUC = {pr_auc:.4f})",
        hover_data=["Threshold"],
        labels=dict(
            x="Recall (Ability to catch positives)",
            y="Precision (Accuracy of positive calls)",
        ),
        width=700,
        height=500,
    )

    fig.show()


print("Generating Out-of-Fold probabilities...")
plot_cv_pr_curve(calibrated_lgbm, X_train, y_train, kf)

Generating Out-of-Fold probabilities...


NameError: name 'calibrated_lgbm' is not defined

In [6]:
THRESHOLD = 0.25

print("Train set")
evaluate(calibrated_lgbm, X_train, y_train, threshold=THRESHOLD)

print("Test set")
evaluate(calibrated_lgbm, X_test, y_test, threshold=THRESHOLD)

Train set


NameError: name 'evaluate' is not defined

Explore incorrect preds

In [69]:
# Explore incorrect predictions
y_pred_proba = calibrated_lgbm.predict_proba(X_test)[:, 1]
X_test_indices = X_test.index
df_test = df.loc[X_test_indices].copy()
df_test["y_pred_proba"] = y_pred_proba
df_test["y_pred"] = (y_pred_proba >= THRESHOLD).astype(int)

incorrect_preds = df_test[df_test["is_red_day"] != df_test["y_pred"]]
print(len(incorrect_preds), "incorrect predictions")
assert all(
    incorrect_preds["y_pred"] != incorrect_preds["is_red_day"]
), "y_pred should be different from is_red_day"
incorrect_preds.head(10)

23 incorrect predictions


Unnamed: 0,AAAAMMJJ,TN_BORDEAUX,TN_LILLE,TN_LYON,TN_MARSEILLE,TN_NANTES,TN_PARIS,TN_STRASBOURG,TN_TOULON,TN_TOULOUSE,...,TAMPLI_PARIS,TAMPLI_STRASBOURG,TAMPLI_TOULON,TAMPLI_TOULOUSE,is_red_day,is_week_day,red_days_last_week,month,y_pred_proba,y_pred
2257,20230308,7.9,0.8,1.9,4.7,7.6,5.3,2.9,6.4,3.6,...,9.8,12.7,10.8,17.5,1,True,True,3,0.112434,0
807,20190319,5.8,0.0,1.6,3.4,3.5,4.1,-0.3,3.4,5.7,...,9.5,12.1,11.7,5.7,1,True,False,3,0.113075,0
1853,20220128,-0.9,0.2,-2.7,-1.4,2.1,6.3,0.2,1.0,-2.6,...,1.2,5.8,13.6,5.5,0,True,True,1,0.508723,1
1845,20220120,5.2,1.4,0.7,1.7,2.9,3.8,1.3,4.0,3.9,...,3.3,4.2,10.3,2.9,0,True,True,1,0.563467,1
765,20190205,3.6,2.1,-2.1,1.9,3.7,3.1,-3.3,2.8,3.8,...,5.7,11.5,11.7,7.9,0,True,True,2,0.633103,1
417,20180222,-0.5,-2.4,-0.3,-0.5,-2.1,-1.2,-2.1,3.6,0.6,...,6.2,5.8,7.8,4.6,0,True,False,2,0.624221,1
443,20180320,1.3,-2.1,0.6,2.0,-0.2,-1.3,-2.8,6.7,2.4,...,10.1,7.4,8.8,2.4,0,True,False,3,0.454363,1
334,20171201,0.3,-0.7,0.0,-0.4,-0.5,0.8,0.1,3.7,1.9,...,3.7,2.5,5.2,2.8,0,True,True,12,0.695886,1
1849,20220124,-1.3,0.2,-2.8,0.7,0.5,0.7,-2.0,1.1,2.1,...,6.9,9.6,12.5,10.9,0,True,True,1,0.706879,1
1500,20210209,5.2,-6.5,1.5,5.6,-1.1,-2.9,-3.5,7.4,6.2,...,2.3,1.8,7.3,7.1,0,True,False,2,0.301487,1


Export to onnx

In [None]:
from onnxmltools.convert import convert_lightgbm
import onnx
from onnxmltools.convert.common.data_types import FloatTensorType

# from onnxsim import simplify
from onnx import helper

X_train_np = X_train.to_numpy().astype(np.float32)
initial_types = [("input", FloatTensorType([None, X_train_np.shape[1]]))]
input_names_str = ",".join(X_train.columns)

onnx_model = convert_lightgbm(
    lgbm,
    initial_types=initial_types,
    target_opset=12,
    zipmap=False,
    doc_string=f"Predict if the next day will be a red day.\n\nInput names: {input_names_str}",
)

# Simplify the ONNX model to avoid unsupported operators in app later on
# simp_model, check = simplify(onnx_model)
# assert check, "Simplified ONNX model could not be validated"
simp_model, check = onnx_model, True

model = simp_model


NotFittedError: No classes found. Need to call fit beforehand.

In [107]:
import onnxmltools
from onnxmltools.convert.common.data_types import FloatTensorType

# Convert base LGBM (raw scores)
initial_types = [("input", FloatTensorType([None, X_train.shape[1]]))]

lgbm_onnx = onnxmltools.convert_lightgbm(
    cal.estimator,  # trained LGBM
    initial_types=initial_types,
    target_opset=12,
    zipmap=False,
    doc_string=f"Predict if the next day will be a red day.\n\nInput names: {input_names_str}",
)

In [108]:
from onnx import helper, TensorProto


def get_platt_scaling_params(calibrated_lgbm) -> tuple[float, float]:
    """Get Platt scaling parameters 'a' and 'b' from a CalibratedClassifierCV."""
    cal = calibrated_lgbm.calibrated_classifiers_[0]
    a = cal.calibrators[0].a_
    b = cal.calibrators[0].b_
    return a, b


def add_platt_scaling_to_onnx(
    lgbm_onnx: onnx.ModelProto, calibrated_lgbm: CalibratedClassifierCV
) -> onnx.ModelProto:
    """Add Platt scaling nodes to an existing LGBM ONNX model."""

    a, b = get_platt_scaling_params(calibrated_lgbm)

    # Create constants for 'a' and 'b'
    a_const = helper.make_tensor(
        name="a", data_type=TensorProto.FLOAT, dims=[], vals=[a]
    )
    b_const = helper.make_tensor(
        name="b", data_type=TensorProto.FLOAT, dims=[], vals=[b]
    )

    # Get existing ONNX graph
    graph = lgbm_onnx.graph

    # LGBM outputs both raw scores and probabilities at respectively output[0] and output[1]
    # We'll output name index 1
    output_index = 1
    raw_output_name = graph.output[output_index].name
    sigmoid_output_name = "probability"

    # Multiply node: score * a
    mul_node = helper.make_node(
        "Mul", inputs=[raw_output_name, "a"], outputs=["scaled_score"]
    )

    # Add node: scaled_score + b
    add_node = helper.make_node(
        "Add", inputs=["scaled_score", "b"], outputs=["shifted_score"]
    )

    # Sigmoid node
    sigmoid_node = helper.make_node(
        "Sigmoid", inputs=["shifted_score"], outputs=[sigmoid_output_name]
    )

    # Add nodes to graph
    graph.node.extend([mul_node, add_node, sigmoid_node])
    graph.initializer.extend([a_const, b_const])

    # Replace the old output with the new sigmoid output
    graph.output[output_index].name = sigmoid_output_name


add_platt_scaling_to_onnx(lgbm_onnx, calibrated_lgbm)
model = lgbm_onnx


In [155]:
# ...existing code...
def add_platt_scaling_to_onnx(
    lgbm_onnx: onnx.ModelProto, calibrated_lgbm: CalibratedClassifierCV
) -> onnx.ModelProto:
    """Add Platt scaling nodes to an existing LGBM ONNX model."""

    a, b = get_platt_scaling_params(calibrated_lgbm)

    graph = lgbm_onnx.graph

    # Get the 'scores' output (raw logits) from the original LGBM model (index 2)
    # This is typically a [N, 2] tensor where scores[:, 0] is for class 0 and scores[:, 1] for class 1
    raw_scores_output_name = graph.output[2].name

    # Create constants for 'a' and 'b'
    a_const = helper.make_tensor(
        name="a", data_type=TensorProto.FLOAT, dims=[], vals=[a]
    )
    b_const = helper.make_tensor(
        name="b", data_type=TensorProto.FLOAT, dims=[], vals=[b]
    )

    # Extract the score for the positive class (assuming it's the second column, index 1)
    # Use Gather to select the column for the positive class from the [N, 2] raw scores tensor
    gather_index = helper.make_tensor(
        name="gather_idx", data_type=TensorProto.INT64, dims=[], vals=[1]
    )
    gather_node = helper.make_node(
        "Gather",
        inputs=[raw_scores_output_name, "gather_idx"],
        outputs=["positive_class_raw_score"],
        axis=1  # Gather along the class dimension
    )

    # Multiply node: positive_class_raw_score * a
    mul_node = helper.make_node(
        "Mul", inputs=["positive_class_raw_score", "a"], outputs=["scaled_score"]
    )

    # Add node: scaled_score + b
    add_node = helper.make_node(
        "Add", inputs=["scaled_score", "b"], outputs=["shifted_score"]
    )

    # Sigmoid node to get P(class 1)
    prob_pos_name = "prob_positive_class"
    sigmoid_node = helper.make_node(
        "Sigmoid", inputs=["shifted_score"], outputs=[prob_pos_name]
    )

    # Compute P(class 0) = 1 - P(class 1)
    one_const = helper.make_tensor(
        name="one", data_type=TensorProto.FLOAT, dims=[], vals=[1.0]
    )
    prob_neg_name = "prob_negative_class"
    sub_node = helper.make_node(
        "Sub", inputs=["one", prob_pos_name], outputs=[prob_neg_name]
    )

    # Concatenate P(class 0) and P(class 1) into a [N, 2] tensor
    calibrated_probabilities_name = "probability"  # This will be the new output name
    concat_probs_node = helper.make_node(
        "Concat",
        inputs=[prob_neg_name, prob_pos_name],
        outputs=[calibrated_probabilities_name],
        axis=1,  # Concatenate along the class dimension
    )

    # Add new nodes and initializers to the graph
    graph.node.extend(
        [gather_node, mul_node, add_node, sigmoid_node, sub_node, concat_probs_node]
    )
    graph.initializer.extend([a_const, b_const, gather_index, one_const])

    # Update the model's outputs list
    # The new calibrated probabilities will replace the original 'probabilities' output (index 1)
    # The 'label' output (index 0) and the original 'scores' output (index 2) might still exist.
    
    # Create a new list of outputs, replacing the old probabilities output with the new calibrated one
    new_outputs = []
    for i, output_tensor_info in enumerate(graph.output):
        if i == 1: # This is the original 'probabilities' output
            # Replace with the new calibrated probability output info
            new_output_info = helper.make_tensor_value_info(
                calibrated_probabilities_name, TensorProto.FLOAT, [None, 2] # Explicitly define shape
            )
            new_outputs.append(new_output_info)
        elif i == 2: # This is the original 'scores' (raw logits) output, which is no longer a primary output
            continue # Skip adding it to the new list
        else: # Keep other outputs (like 'label' at index 0)
            new_outputs.append(output_tensor_info)

    graph.output.clear()
    graph.output.extend(new_outputs)

add_platt_scaling_to_onnx(lgbm_onnx, calibrated_lgbm)
model = lgbm_onnx
# ...existing code...

IndexError: list index out of range

In [None]:
# Now we will modify the onnx model to name the input features
graph = model.graph
original_input = graph.input[0]  # Assuming there is only one input

# Extract input name, type, and shape
input_name = original_input.name
input_type = original_input.type.tensor_type.elem_type  # Data type
old_shape = original_input.type.tensor_type.shape.dim

# Check current shape (should be [None, 20])
print("Original Shape:", [dim.dim_value for dim in old_shape])

# Ensure features_names are unique for ONNX inputs
# Convert to list to allow modification if needed
features_names_list = X_train.columns.tolist()
seen = set()
unique_features_names = []
for name in features_names_list:
    original_name = name
    counter = 1
    # Append suffix if name is already seen
    while name in seen:
        name = f"{original_name}_{counter}"
        counter += 1
    seen.add(name)
    unique_features_names.append(name)

# Create separate input tensors (None, 1) using unique names
new_inputs = [
    helper.make_tensor_value_info(unique_features_names[i], input_type, [None, 1])
    for i in range(len(unique_features_names))
]

# Create a new node that concatenates inputs (needed for LightGBM)
concat_node = helper.make_node(
    "Concat",
    inputs=unique_features_names,  # Use the unique names for concatenation
    outputs=["concatenated_input"],
    axis=1,  # feature axis
)

# Replace old input with new inputs and modify the first node to use `concatenated_input`
graph.input.remove(original_input)
graph.input.extend(new_inputs)

# Find the first node that takes the original input and modify it
for node in graph.node:
    for i, input_name_in_node in enumerate(node.input):
        if input_name_in_node == input_name:
            node.input[i] = "concatenated_input"

# Add the new Concat node at the beginning
graph.node.insert(0, concat_node)

# Save the modified model
onnx_model_path = "lgbm_model_red_days_2026_01_06.onnx"
onnx.save(model, onnx_model_path)

print(f"Modified ONNX model saved as {onnx_model_path}")

Original Shape: [0, 21]
Modified ONNX model saved as lgbm_model_red_days_2026_01_06.onnx


In [150]:
# Load model to test it in python using onnxruntime:
import onnxruntime as rt

sess = rt.InferenceSession(onnx_model_path)
input_names = [i.name for i in sess.get_inputs()]
label_name = sess.get_outputs()[1].name

# Test the model
X_test_np = X_test.to_numpy().astype(np.float32)
input_data = {
    input_name: np.expand_dims(X_test_np[..., i], axis=-1)
    for i, input_name in enumerate(input_names)
}
pred_onnx = sess.run([label_name], input_data)[0][:, 1]

y_pred = calibrated_lgbm.predict_proba(X_test)[:, 1]

# Compare the predictions
assert np.allclose(1 - y_pred, pred_onnx), "Predictions should be the same"

AssertionError: Predictions should be the same

In [152]:
sess.run([label_name], input_data)[0]

array([[0.20472258, 0.9839077 ],
       [0.19367915, 0.98496807],
       [0.3743288 , 0.9633797 ],
       [0.31628305, 0.97144794],
       [0.21661091, 0.9827354 ],
       [0.9264866 , 0.55532795],
       [0.3143667 , 0.97169316],
       [0.25991333, 0.97817373],
       [0.8057909 , 0.79138005],
       [0.58903635, 0.9165348 ],
       [0.20089364, 0.9842784 ],
       [0.19967353, 0.98439574],
       [0.6528626 , 0.8932624 ],
       [0.36418235, 0.96488607],
       [0.31546605, 0.9715526 ],
       [0.2002849 , 0.98433703],
       [0.21371666, 0.98302376],
       [0.23757464, 0.98058635],
       [0.32763398, 0.96996975],
       [0.22710988, 0.9816725 ],
       [0.26464838, 0.97764504],
       [0.34724212, 0.9673065 ],
       [0.27716213, 0.9762175 ],
       [0.19957006, 0.98440576],
       [0.19552666, 0.98479253],
       [0.20542544, 0.9838392 ],
       [0.2040357 , 0.9839744 ],
       [0.6550251 , 0.89234793],
       [0.3626784 , 0.96510553],
       [0.19957006, 0.98440576],
       [0.

In [151]:
pred_onnx[0:10],  y_pred[0:10]

(array([0.9839077 , 0.98496807, 0.9633797 , 0.97144794, 0.9827354 ,
        0.55532795, 0.97169316, 0.97817373, 0.79138005, 0.9165348 ],
       dtype=float32),
 array([0.01531868, 0.01559424, 0.02791771, 0.02855204, 0.01588213,
        0.43792225, 0.11243358, 0.01969074, 0.19371337, 0.42989881]))

In [153]:
0.20472258 + 0.9839077 

1.18863028