In [1]:
# # This Python 3 environment comes with many helpful analytics libraries installed
# # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# # For example, here are several helpful packages to load

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# # Input data files are available in the read-only "../input/" directory
# # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


In [2]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings("ignore")

# -------------------------------------------------
# Load data
# -------------------------------------------------
# Read the competition train/test tables in one pass.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/competitions/WiDSWorldWide_GlobalDathon26/train.csv",
        "/kaggle/input/competitions/WiDSWorldWide_GlobalDathon26/test.csv",
    )
)

# Identifier and target columns are excluded; every remaining train column is a model input.
NON_FEATURE_COLUMNS = ("event_id", "time_to_hit_hours", "event")
FEATURES = [col for col in train.columns if col not in NON_FEATURE_COLUMNS]

# Same feature columns, in the same order, for both frames.
X, X_test = train[FEATURES], test[FEATURES]

In [3]:
# -------------------------------------------------
# Function to train one horizon model
# -------------------------------------------------
def train_horizon_model(horizon, n_splits=5, random_state=42):
    """Train a cross-validated LightGBM classifier for one horizon.

    Reads the module-level ``train``, ``X`` and ``X_test`` objects. A binary
    label is built for ``horizon``, one LightGBM model is fitted per
    stratified fold, the fold models' test predictions are averaged, and the
    average is passed through a logistic-regression (Platt) calibrator that
    was fitted on the out-of-fold predictions.

    Parameters
    ----------
    horizon : int
        Horizon in hours. For ``horizon == 72`` the label is simply
        ``train["event"]``; otherwise a row is positive only when
        ``event == 1`` and ``time_to_hit_hours <= horizon``.
        NOTE(review): the special case presumably treats 72h as the full
        observation window -- confirm against the data description.
    n_splits : int, default 5
        Number of stratified folds. Also used as the divisor when averaging
        the per-fold test predictions.
    random_state : int, default 42
        Seed for the shuffled ``StratifiedKFold`` split.

    Returns
    -------
    numpy.ndarray
        Calibrated probabilities for the rows of ``X_test``, clipped to
        ``[0, 1]``.
    """
    print(f"\nTraining for {horizon}h")

    if horizon == 72:
        y_h = train["event"].astype(int)
    else:
        y_h = ((train["event"] == 1) &
               (train["time_to_hit_hours"] <= horizon)).astype(int)

    # Nothing below mutates the feature frame, so no defensive copy is needed.
    X_h = X

    print("Class balance:", y_h.value_counts())

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    oof = np.zeros(len(X_h))
    test_preds = np.zeros(len(X_test))
    params = {
        "objective": "binary",
        "metric": "binary_logloss",
        "learning_rate": 0.03,
        "num_leaves": 15,
        "min_data_in_leaf": 10,
        "feature_fraction": 0.8,
        "bagging_fraction": 0.8,
        "bagging_freq": 1,
        "verbosity": -1
    }

    for tr_idx, val_idx in skf.split(X_h, y_h):
        X_tr, X_val = X_h.iloc[tr_idx], X_h.iloc[val_idx]
        y_tr = y_h.iloc[tr_idx]

        train_data = lgb.Dataset(X_tr, label=y_tr)

        # Fixed number of rounds; no validation set / early stopping is used.
        model = lgb.train(
            params,
            train_data,
            num_boost_round=800
        )
        oof[val_idx] = model.predict(X_val)
        # Divide by the actual fold count so the average stays correct if
        # n_splits is changed.
        test_preds += model.predict(X_test) / n_splits

    # Platt scaling: fit a logistic regression on the out-of-fold scores and
    # apply it to the averaged test scores.
    cal = LogisticRegression()
    cal.fit(oof.reshape(-1, 1), y_h)
    test_preds = cal.predict_proba(test_preds.reshape(-1, 1))[:, 1]

    return np.clip(test_preds, 0, 1)
    

In [4]:
# -------------------------------------------------
# Train models
# -------------------------------------------------
# Fit one model per horizon, shortest horizon first.
horizon_probs = [train_horizon_model(hours) for hours in (12, 24, 48, 72)]

# -------------------------------------------------
# Enforce monotonicity
# -------------------------------------------------
# A running maximum down the horizon axis guarantees that a longer horizon
# never gets a lower probability than any shorter one.
prob12, prob24, prob48, prob72 = np.maximum.accumulate(
    np.vstack(horizon_probs), axis=0
)

# -------------------------------------------------
# Submission
# -------------------------------------------------
submission_columns = {"event_id": test["event_id"]}
submission_columns.update(
    zip(
        ("prob_12h", "prob_24h", "prob_48h", "prob_72h"),
        (prob12, prob24, prob48, prob72),
    )
)
submission = pd.DataFrame(submission_columns)
submission.to_csv("submission.csv", index=False)
print("Submission file saved.")


Training for 12h
Class balance: 0    172
1     49
Name: count, dtype: int64

Training for 24h
Class balance: 0    158
1     63
Name: count, dtype: int64

Training for 48h
Class balance: 0    155
1     66
Name: count, dtype: int64

Training for 72h
Class balance: event
0    152
1     69
Name: count, dtype: int64
Submission file saved.


In [5]:
# Show the target balance twice: absolute counts first, then proportions.
event_labels = train["event"]
for as_fraction in (False, True):
    print(event_labels.value_counts(normalize=as_fraction))

event
0    152
1     69
Name: count, dtype: int64
event
0    0.687783
1    0.312217
Name: proportion, dtype: float64
