In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier

# 1. Chargement
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
test_ids = test['id']

# 2. Feature Engineering (Train & Test)
for df in [train, test]:
    df['cholesterol_ratio'] = df['ldl_cholesterol'] / df['hdl_cholesterol']
    df['mean_blood_pressure'] = (df['systolic_bp'] + df['diastolic_bp']) / 2

# 3. Encodage Ordinal
income_map = {'Low': 1, 'Lower-Middle': 2, 'Middle': 3, 'Upper-Middle': 4, 'High': 5}
edu_map = {'No formal': 0, 'Highschool': 1, 'Graduate': 2, 'Postgraduate': 3}
smoke_map = {'Never': 0, 'Current': 1, 'Former': 2}

for df in [train, test]:
    df['income_level'] = df['income_level'].map(income_map)
    df['education_level'] = df['education_level'].map(edu_map)
    df['smoking_status'] = df['smoking_status'].map(smoke_map)

# 4. One-Hot Encoding
nominal_cols = ['gender', 'ethnicity', 'employment_status']
train = pd.get_dummies(train, columns=nominal_cols, drop_first=True, dtype=int)
test = pd.get_dummies(test, columns=nominal_cols, drop_first=True, dtype=int)

# 5. Séparation X/y et Alignement
X_train = train.drop(columns=['id', 'diagnosed_diabetes'])
y_train = train['diagnosed_diabetes']
X_test = test.drop(columns=['id']).reindex(columns=X_train.columns, fill_value=0)

# 6. Standardisation (Fit sur Train uniquement)
num_cols = X_train.select_dtypes(include=['float64', 'int64']).columns
scaler = StandardScaler()
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

# 7. Modèle Final (Vos meilleurs paramètres)
model = GradientBoostingClassifier(
    learning_rate=0.25, 
    n_estimators=300, 
    max_depth=3, 
    subsample=0.8, 
    random_state=42
)
model.fit(X_train, y_train)

# 8. Création du fichier CSV
probs = model.predict_proba(X_test)[:, 1]
submission = pd.DataFrame({'id': test_ids, 'diagnosed_diabetes': probs})
submission.to_csv('submission2.csv', index=False)

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# 1. Chargement des données
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')
test_ids = df_test['id']

# 2. Nettoyage des Outliers (Clipping pour limiter l'impact des valeurs extrêmes)
# On limite le BMI à 60 et la pression systolique à 200 pour éviter le bruit
for df in [df_train, df_test]:
    df['bmi'] = df['bmi'].clip(upper=60)
    df['systolic_bp'] = df['systolic_bp'].clip(upper=200)

# 3. Feature Engineering
for df in [df_train, df_test]:
    df['cholesterol_ratio'] = df['ldl_cholesterol'] / (df['hdl_cholesterol'] + 0.001)
    df['mean_blood_pressure'] = (df['systolic_bp'] + df['diastolic_bp']) / 2
    # Bonus : Indicateur d'activité physique (Minutes / Age)
    df['activity_intensity'] = df['physical_activity_minutes_per_week'] / (df['age'] + 1)

# 4. Encodage Ordinal
income_map = {'Low': 1, 'Lower-Middle': 2, 'Middle': 3, 'Upper-Middle': 4, 'High': 5}
edu_map = {'No formal': 0, 'Highschool': 1, 'Graduate': 2, 'Postgraduate': 3}
smoke_map = {'Never': 0, 'Current': 1, 'Former': 2}

for df in [df_train, df_test]:
    df['income_level'] = df['income_level'].map(income_map)
    df['education_level'] = df['education_level'].map(edu_map)
    df['smoking_status'] = df['smoking_status'].map(smoke_map)

# 5. One-Hot Encoding
nominal_cols = ['gender', 'ethnicity', 'employment_status']
df_train = pd.get_dummies(df_train, columns=nominal_cols, drop_first=True, dtype=int)
df_test = pd.get_dummies(df_test, columns=nominal_cols, drop_first=True, dtype=int)

# 6. Préparation des colonnes et division
X = df_train.drop(columns=['id', 'diagnosed_diabetes'])
y = df_train['diagnosed_diabetes']

# Aligner le test sur le train (colonnes identiques)
X_test_final = df_test.drop(columns=['id']).reindex(columns=X.columns, fill_value=0)

# Split pour validation locale (20%)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# 7. Standardisation stricte
num_cols = X_train.select_dtypes(include=['float64', 'int64']).columns
scaler = StandardScaler()
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_val[num_cols] = scaler.transform(X_val[num_cols])
X_test_final[num_cols] = scaler.transform(X_test_final[num_cols])

# 8. Modèle robuste (Paramètres anti-overfitting)
# On réduit le learning rate et on augmente les arbres
stable_gb = GradientBoostingClassifier(
    learning_rate=0.05,  # Plus lent = plus stable
    n_estimators=800,    # Plus d'arbres pour compenser la lenteur
    max_depth=3,         # On garde des arbres simples
    subsample=0.8,       # Utilise 80% des données par arbre (Stochastic)
    max_features='sqrt', # Sélectionne un sous-ensemble de variables pour chaque split
    random_state=42
)

print("Entraînement en cours...")
stable_gb.fit(X_train, y_train)

# 9. Validation locale
val_probs = stable_gb.predict_proba(X_val)[:, 1]
print(f"Nouveau score AUC-ROC Local : {roc_auc_score(y_val, val_probs):.5f}")

# 10. Génération du fichier pour Kaggle
test_probs = stable_gb.predict_proba(X_test_final)[:, 1]
submission = pd.DataFrame({'id': test_ids, 'diagnosed_diabetes': test_probs})
submission.to_csv('submission3.csv', index=False)

Entraînement en cours...
Nouveau score AUC-ROC Local : 0.70926


In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold

# 1. Chargement
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
test_ids = test['id']

# 2. Feature Engineering minimal (les deux qui marchaient bien)
for df in [train, test]:
    df['cholesterol_ratio'] = df['ldl_cholesterol'] / (df['hdl_cholesterol'] + 0.001)
    df['mean_blood_pressure'] = (df['systolic_bp'] + df['diastolic_bp']) / 2

# 3. Encodage express
train['income_level'] = train['income_level'].map({'Low':1,'Lower-Middle':2,'Middle':3,'Upper-Middle':4,'High':5})
test['income_level'] = test['income_level'].map({'Low':1,'Lower-Middle':2,'Middle':3,'Upper-Middle':4,'High':5})
# On simplifie : on ne garde que les numériques et l'income pour ce test
features = train.select_dtypes(include=[np.number]).columns.drop(['id', 'diagnosed_diabetes']).tolist()

X = train[features]
y = train['diagnosed_diabetes']
X_test = test[features]

# 4. Cross-Validation (5-Folds)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
test_preds = np.zeros(len(X_test))
val_scores = []

print("Démarrage de la Cross-Validation...")
for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    X_tr, X_va = X.iloc[train_idx], X.iloc[val_idx]
    y_tr, y_va = y.iloc[train_idx], y.iloc[val_idx]
    
    # Standardisation
    scaler = StandardScaler()
    X_tr = scaler.fit_transform(X_tr)
    X_va = scaler.transform(X_va)
    X_te = scaler.transform(X_test)
    
    # Modèle équilibré
    model = GradientBoostingClassifier(n_estimators=300, learning_rate=0.1, max_depth=3, random_state=42)
    model.fit(X_tr, y_tr)
    
    # Accumulation des prédictions
    test_preds += model.predict_proba(X_te)[:, 1] / 5
    print(f"Fold {fold+1} terminé.")

# 5. Soumission
submission = pd.DataFrame({'id': test_ids, 'diagnosed_diabetes': test_preds})
submission.to_csv('submission4.csv', index=False)


Démarrage de la Cross-Validation...
Fold 1 terminé.
Fold 2 terminé.
Fold 3 terminé.
Fold 4 terminé.
Fold 5 terminé.


In [5]:

import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

# =====================
# Load data
# =====================
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
sample_sub = pd.read_csv('data/sample_submission.csv')

TARGET = 'diagnosed_diabetes'
ID_COL = 'id'

X = train.drop(columns=[TARGET])
y = train[TARGET]

# =====================
# Identify categorical features
# =====================
cat_features = X.select_dtypes(include=['object']).columns.tolist()
cat_feature_indices = [X.columns.get_loc(col) for col in cat_features]

# =====================
# Cross-validation to validate AUC
# =====================
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
auc_scores = []

oof_preds = np.zeros(len(X))
test_preds = np.zeros(len(test))

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model = CatBoostClassifier(
        iterations=2000,
        learning_rate=0.03,
        depth=8,
        loss_function='Logloss',
        eval_metric='AUC',
        random_seed=42,
        l2_leaf_reg=5,
        subsample=0.8,
        colsample_bylevel=0.8,
        early_stopping_rounds=200,
        verbose=200
    )

    model.fit(
        X_train, y_train,
        eval_set=(X_val, y_val),
        cat_features=cat_feature_indices,
        use_best_model=True
    )

    val_preds = model.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_val, val_preds)
    auc_scores.append(auc)
    oof_preds[val_idx] = val_preds

    test_preds += model.predict_proba(test)[:, 1] / skf.n_splits

    print(f"Fold {fold + 1} AUC: {auc:.5f}")

print("\nMean CV AUC:", np.mean(auc_scores))

# =====================
# Train final model on full data
# =====================
final_model = CatBoostClassifier(
    iterations=int(np.mean([model.best_iteration_ for _ in range(1)])),
    learning_rate=0.03,
    depth=8,
    loss_function='Logloss',
    eval_metric='AUC',
    random_seed=42,
    l2_leaf_reg=5,
    subsample=0.8,
    colsample_bylevel=0.8,
    verbose=200
)

final_model.fit(X, y, cat_features=cat_feature_indices)

final_test_preds = final_model.predict_proba(test)[:, 1]

# =====================
# Create submission
# =====================
submission = sample_sub.copy()
submission[TARGET] = final_test_preds
submission.to_csv('submission5.csv', index=False)



0:	test: 0.6843221	best: 0.6843221 (0)	total: 352ms	remaining: 11m 43s
200:	test: 0.7087498	best: 0.7087498 (200)	total: 48s	remaining: 7m 9s
400:	test: 0.7137586	best: 0.7137586 (400)	total: 1m 39s	remaining: 6m 35s
600:	test: 0.7200688	best: 0.7200688 (600)	total: 2m 34s	remaining: 5m 58s
800:	test: 0.7225045	best: 0.7225045 (800)	total: 3m 33s	remaining: 5m 19s
1000:	test: 0.7239158	best: 0.7239158 (1000)	total: 4m 32s	remaining: 4m 31s
1200:	test: 0.7249532	best: 0.7249532 (1200)	total: 5m 29s	remaining: 3m 38s
1400:	test: 0.7255281	best: 0.7255322 (1399)	total: 6m 26s	remaining: 2m 45s
1600:	test: 0.7259988	best: 0.7260014 (1594)	total: 7m 26s	remaining: 1m 51s
1800:	test: 0.7263739	best: 0.7263773 (1798)	total: 8m 27s	remaining: 56.1s
1999:	test: 0.7266647	best: 0.7266652 (1998)	total: 9m 25s	remaining: 0us

bestTest = 0.7266652358
bestIteration = 1998

Shrink model to first 1999 iterations.
Fold 1 AUC: 0.72667
0:	test: 0.6807939	best: 0.6807939 (0)	total: 333ms	remaining: 11m 5s

In [9]:
# Ultra-optimized AUC model (Leaderboard-oriented)
# Techniques used:
# - CatBoost with ordered boosting
# - Strong regularization
# - CV-based optimal iteration selection
# - Test-time averaging

import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

# =====================
# Load data
# =====================
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
sample_sub = pd.read_csv('data/sample_submission.csv')

TARGET = 'diagnosed_diabetes'
ID_COL = 'id'

X = train.drop(columns=[TARGET])
y = train[TARGET]

# =====================
# Categorical features
# =====================
cat_features = X.select_dtypes(include=['object']).columns.tolist()
cat_feature_indices = [X.columns.get_loc(c) for c in cat_features]

# =====================
# CV setup
# =====================
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

oof = np.zeros(len(X))
test_preds = np.zeros(len(test))
best_iterations = []
auc_scores = []

for fold, (tr_idx, val_idx) in enumerate(skf.split(X, y)):
    X_tr, X_val = X.iloc[tr_idx], X.iloc[val_idx]
    y_tr, y_val = y.iloc[tr_idx], y.iloc[val_idx]

    train_pool = Pool(X_tr, y_tr, cat_features=cat_feature_indices)
    val_pool = Pool(X_val, y_val, cat_features=cat_feature_indices)

    model = CatBoostClassifier(
        iterations=6000,
        learning_rate=0.015,
        depth=9,
        loss_function='Logloss',
        eval_metric='AUC',
        random_seed=42,
        l2_leaf_reg=10,
        colsample_bylevel=0.85,
        boosting_type='Ordered',
        bootstrap_type='Bayesian',
        bagging_temperature=0.5,
        min_data_in_leaf=50,
        grow_policy='SymmetricTree',
        early_stopping_rounds=400,
        verbose=False
    )

    model.fit(train_pool, eval_set=val_pool, use_best_model=True)

    val_pred = model.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_val, val_pred)

    auc_scores.append(auc)
    best_iterations.append(model.best_iteration_)
    oof[val_idx] = val_pred

    test_preds += model.predict_proba(test)[:, 1] / skf.n_splits

    print(f"Fold {fold + 1} | AUC = {auc:.6f} | Best iter = {model.best_iteration_}")

print("\nMean CV AUC:", np.mean(auc_scores))
print("Mean best iteration:", int(np.mean(best_iterations)))

# =====================
# Final model (full data)
# =====================
final_model = CatBoostClassifier(
    iterations=int(np.mean(best_iterations)),
    learning_rate=0.015,
    depth=9,
    loss_function='Logloss',
    eval_metric='AUC',
    random_seed=42,
    l2_leaf_reg=10,
    colsample_bylevel=0.85,
    boosting_type='Ordered',
    bootstrap_type='Bayesian',
    bagging_temperature=0.5,
    min_data_in_leaf=50,
    grow_policy='SymmetricTree',
    verbose=200
)

final_model.fit(X, y, cat_features=cat_feature_indices)

final_test_preds = final_model.predict_proba(test)[:, 1]

# =====================
# Submission
# =====================
submission = sample_sub.copy()
submission[TARGET] = final_test_preds
submission.to_csv('submission6.csv', index=False)

print("Ultra-optimized submission saved ")


KeyboardInterrupt: 

In [None]:
# FAST & STRONG CatBoost model (≈95% of max AUC, 4x faster)
# Competition-safe configuration

import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

# =====================
# Load data
# =====================
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sample_sub = pd.read_csv('sample_submission.csv')

TARGET = 'diagnosed_diabetes'

X = train.drop(columns=[TARGET])
y = train[TARGET]

# =====================
# Categorical features
# =====================
cat_features = X.select_dtypes(include=['object']).columns.tolist()
cat_feature_indices = [X.columns.get_loc(c) for c in cat_features]

# =====================
# CV setup (FAST)
# =====================
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

test_preds = np.zeros(len(test))
auc_scores = []

for fold, (tr_idx, val_idx) in enumerate(skf.split(X, y)):
    X_tr, X_val = X.iloc[tr_idx], X.iloc[val_idx]
    y_tr, y_val = y.iloc[tr_idx], y.iloc[val_idx]

    train_pool = Pool(X_tr, y_tr, cat_features=cat_feature_indices)
    val_pool = Pool(X_val, y_val, cat_features=cat_feature_indices)

    model = CatBoostClassifier(
        iterations=3000,
        learning_rate=0.02,
        depth=8,
        loss_function='Logloss',
        eval_metric='AUC',
        random_seed=42,
        l2_leaf_reg=7,
        boosting_type='Ordered',
        bootstrap_type='Bayesian',
        bagging_temperature=0.3,
        min_data_in_leaf=40,
        early_stopping_rounds=200,
        verbose=200
    )

    model.fit(train_pool, eval_set=val_pool, use_best_model=True)

    val_pred = model.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_val, val_pred)
    auc_scores.append(auc)

    test_preds += model.predict_proba(test)[:, 1] / skf.n_splits

    print(f"Fold {fold + 1} AUC: {auc:.6f}")

print("\nMean CV AUC:", np.mean(auc_scores))

# =====================
# Submission
# =====================
submission = sample_sub.copy()
submission[TARGET] = test_preds
submission.to_csv('submission.csv', index=False)

print("FAST submission saved ✔")
