In [1]:
# 1. Imports
import joblib
import pandas as pd
import numpy as np

# 2. Restore the persisted artifacts: the classifier first, then the scaler
#    that is applied to the numeric columns before prediction.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code --
# only point these paths at artifacts you produced or otherwise trust.
model = joblib.load(filename='../models/lgbm_best_model.pkl')
scaler = joblib.load(filename='../models/scaler.pkl')

# 3. Describe one example patient (EXAMPLE values)
# IMPORTANT: supply every feature the model saw at train time, one-hot
# columns included. Dummy columns that are left out are filled with 0 when
# the frame is aligned to the model further down.
# Each scalar is wrapped in a one-element list so that pd.DataFrame(new_data)
# yields exactly one row.
new_data = {
    feature: [value]
    for feature, value in {
        'age': 60,
        'bmi': 28.1,
        'HbA1c_level': 6.2,
        'blood_glucose_level': 155,
        'hypertension': 0,
        'heart_disease': 0,
        # gender dummies (use the dummy names from your own training data!)
        'gender_Male': 1,    # 1 if Male, 0 otherwise
        'gender_Other': 0,   # 1 if Other, 0 otherwise
        # smoking_history dummies (must mirror the training columns)
        'smoking_history_ever': 0,
        'smoking_history_current': 0,
        'smoking_history_former': 1,
        'smoking_history_never': 0,
        # NOTE(review): this key contains a space; LightGBM may have stored
        # the feature with an underscore instead -- verify against
        # model.feature_name_.
        'smoking_history_not current': 0,
    }.items()
}

# 4. Feature preparation shared by the single-patient and batch examples
def _align_to_model(frame, feature_names):
    """Return a copy of *frame* holding exactly the model's columns, in order.

    Columns the model expects but *frame* lacks are created and filled with 0
    (an absent one-hot dummy means "category not set"). Columns that *frame*
    supplies but the model does not know are never dropped silently: a name
    that differs from a model feature only by spaces vs. underscores is
    renamed (LightGBM replaces spaces in feature names with underscores when
    it trains), and anything still unmatched raises.

    Args:
        frame: Raw patient features, one row per patient.
        feature_names: Feature names the model was trained on, in train order.

    Returns:
        A new DataFrame whose columns equal ``feature_names``.

    Raises:
        ValueError: If *frame* has a column that matches no model feature.
    """
    expected = list(feature_names)
    known = set(expected)
    taken = set(frame.columns)

    renames = {}
    for column in frame.columns:
        if column in known or not isinstance(column, str):
            continue
        candidate = column.replace(' ', '_')
        # Only rename when the target is a real model feature and would not
        # collide with a column that is already present.
        if candidate in known and candidate not in taken:
            renames[column] = candidate
            taken.add(candidate)

    aligned = frame.rename(columns=renames)

    # A plain reindex would discard these columns and score the patient as if
    # the values had never been supplied, so fail loudly instead.
    unknown = [column for column in aligned.columns if column not in known]
    if unknown:
        raise ValueError(
            f"Input columns not recognised by the model: {unknown}. "
            f"Expected feature names: {expected}"
        )

    return aligned.reindex(columns=expected, fill_value=0)


def _prepare_features(frame, feature_names, fitted_scaler, scaled_cols):
    """Align *frame* to the model and scale its numeric columns.

    Args:
        frame: Raw patient features, one row per patient.
        feature_names: Feature names the model was trained on, in train order.
        fitted_scaler: Fitted transformer exposing ``transform``.
        scaled_cols: Columns to pass through ``fitted_scaler``, in the order
            the scaler expects them.

    Returns:
        A new DataFrame ready to be passed to the model.

    Raises:
        ValueError: If *frame* has a column that matches no model feature.
    """
    features = _align_to_model(frame, feature_names)
    features[scaled_cols] = fitted_scaler.transform(features[scaled_cols])
    return features


# Make sure the column names and their order match what the model expects!
model_features = model.feature_name_

# 5. Only these columns are passed through the scaler
numeric_cols = ['age', 'bmi', 'HbA1c_level', 'blood_glucose_level']

X_new = _prepare_features(pd.DataFrame(new_data), model_features, scaler, numeric_cols)

# 6. Predict!
y_pred = model.predict(X_new)
y_proba = model.predict_proba(X_new)[:, 1]  # column 1 = probability of class 1

print("Prediction (0=non-diabetic, 1=diabetic):", y_pred[0])
print("Predicted probability of diabetes:", y_proba[0])

# 7. (Optional) Test more cases in a loop or DataFrame!
# Example for batch predictions
batch = pd.DataFrame([
    {
        'age': 40, 'bmi': 23, 'HbA1c_level': 5.6, 'blood_glucose_level': 120,
        'hypertension': 0, 'heart_disease': 0, 'gender_Male': 0, 'gender_Other': 0,
        'smoking_history_ever': 1, 'smoking_history_current': 0,
        'smoking_history_former': 0, 'smoking_history_never': 0, 'smoking_history_not current': 0
    },
    {
        'age': 72, 'bmi': 32, 'HbA1c_level': 7.1, 'blood_glucose_level': 200,
        'hypertension': 1, 'heart_disease': 1, 'gender_Male': 1, 'gender_Other': 0,
        'smoking_history_ever': 0, 'smoking_history_current': 1,
        'smoking_history_former': 0, 'smoking_history_never': 0, 'smoking_history_not current': 0
    }
])
batch = _prepare_features(batch, model_features, scaler, numeric_cols)
preds = model.predict(batch)
probas = model.predict_proba(batch)[:, 1]
print("\nBatch predictions:")
for i, (pred, proba) in enumerate(zip(preds, probas)):
    print(f"Row {i+1}: Prediction={pred}, Probability={proba:.3f}")


Prediction (0=non-diabetic, 1=diabetic): 1
Predicted probability of diabetes: 0.5568936056816312

Batch predictions:
Row 1: Prediction=0, Probability=0.227
Row 2: Prediction=1, Probability=1.000
