In [2]:
import shap
import optuna
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
import gc
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.linear_model import LogisticRegression, SGDClassifier
import lightgbm as lgb
from lightgbm import early_stopping, log_evaluation, reset_parameter
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import GridSearchCV, StratifiedKFold
import numpy as np
from time import time
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
import joblib

In [3]:
# Training data; gdown saves it as train.parquet in the working directory
!gdown 'https://drive.google.com/uc?id=11nez3AXDcvc9RAekNz1C_DRIR8zbOX7x'
# Test data; saved as test.parquet
!gdown 'https://drive.google.com/uc?id=1CgRRgcoKaaV8oYeTBxGAXMR1gGnANG9A'
# Pre-trained LightGBM model; saved as model_lgbm.pickle
!gdown 'https://drive.google.com/uc?id=11oSGZsR0l9vNr182wBO2mGuyrDqUsV91'
# Pre-trained random forest model (presumably saved as model_rdf.pickle, the name loaded later; confirm)
!gdown 'https://drive.google.com/uc?id=1-Ar7VUIVzANAMaBqFyax0LWaRX3hIkUu'
# Pre-trained XGBoost model (presumably saved as model_xgb.pickle, the name loaded later; confirm)
!gdown 'https://drive.google.com/uc?id=1k_rmJ0sQoS2IoXdBZOJfl8WUu1yZNZn_'

Downloading...
From (original): https://drive.google.com/uc?id=11nez3AXDcvc9RAekNz1C_DRIR8zbOX7x
From (redirected): https://drive.google.com/uc?id=11nez3AXDcvc9RAekNz1C_DRIR8zbOX7x&confirm=t&uuid=f07f1186-921f-4dcf-85f7-b3fa8ef86f97
To: /kaggle/working/train.parquet
100%|████████████████████████████████████████| 347M/347M [00:21<00:00, 16.2MB/s]
Downloading...
From: https://drive.google.com/uc?id=1CgRRgcoKaaV8oYeTBxGAXMR1gGnANG9A
To: /kaggle/working/test.parquet
100%|██████████████████████████████████████| 19.3M/19.3M [00:01<00:00, 11.7MB/s]
Downloading...
From: https://drive.google.com/uc?id=11oSGZsR0l9vNr182wBO2mGuyrDqUsV91
To: /kaggle/working/model_lgbm.pickle
100%|██████████████████████████████████████| 17.6M/17.6M [00:01<00:00, 12.3MB/s]
Downloading...
From (original): https://drive.google.com/uc?id=1-Ar7VUIVzANAMaBqFyax0LWaRX3hIkUu
From (redirected): https://drive.google.com/uc?id=1-Ar7VUIVzANAMaBqFyax0LWaRX3hIkUu&confirm=t&uuid=f8f51bff-7a2d-4a9c-a1eb-cbad9ccd5ace
To: /kaggle/wo

In [4]:
# Load the three pre-trained base models that feed the stacking step.
# NOTE(review): joblib.load unpickles arbitrary Python objects, so these files
# must come from a trusted source.
model_lgbm, model_xgb, model_rdf = (
    joblib.load(artifact_path)
    for artifact_path in ("model_lgbm.pickle", "model_xgb.pickle", "model_rdf.pickle")
)

  Loading from a raw memory buffer (like pickle in Python, RDS in R) on a CPU-only
  machine. Consider using `save_model/load_model` instead. See:

    https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html

  for more details about differences between saving model and serializing.  Changing `tree_method` to `hist`.


In [5]:
# Read the training table from train.parquet and display it
# (the `flag` column is used as the target in the split below).
df_train = pd.read_parquet("train.parquet")
df_train

Unnamed: 0,id,rn,pre_since_opened,pre_since_confirmed,pre_pterm,pre_fterm,pre_till_pclose,pre_till_fclose,pre_loans_credit_limit,pre_loans_next_pay_summ,...,enc_paym_22,enc_paym_23,enc_paym_24,enc_loans_account_holder_type,enc_loans_credit_status,enc_loans_credit_type,enc_loans_account_cur,pclose_flag,fclose_flag,flag
0,1678548,3,8,7,17,16,9,1,9,2,...,0,0,1,1,3,2,1,0,0,0
1,2834188,11,3,2,0,7,14,8,2,5,...,0,0,1,1,2,4,1,0,0,0
2,811902,11,9,6,11,13,14,8,2,5,...,3,3,4,1,2,4,1,0,0,0
3,836450,1,16,0,13,0,4,9,5,2,...,3,3,4,1,3,4,1,0,0,0
4,1769024,15,9,9,4,8,1,11,1,2,...,3,3,4,1,2,4,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17659828,2103608,4,1,9,4,8,1,11,10,2,...,3,3,4,1,2,3,1,1,1,0
17659829,2169551,7,15,4,12,10,6,13,15,2,...,0,0,1,1,3,2,1,0,0,0
17659830,1613436,1,8,9,16,5,9,9,10,3,...,3,3,4,1,3,4,1,0,0,0
17659831,1604231,6,17,13,12,1,2,4,2,2,...,0,0,1,1,3,0,1,0,0,0


In [6]:
# Hold out 5% of the rows as a validation split, stratified on the target so
# both parts keep the same `flag` distribution. Every other column is a feature.
X_train, X_val, y_train, y_val = train_test_split(
    df_train.drop(columns=["flag"]),
    df_train["flag"],
    test_size=0.05,
    stratify=df_train["flag"],
    random_state=42,
)
# Left disabled: the raw frame could be released here to free memory.
# del df_train
# _ = gc.collect()

In [7]:
# Predictions on the training split: keep column 1 of predict_proba
# (the probability of the second class, presumably flag == 1).
train_pred_xgb = model_xgb.predict_proba(X_train)[:, 1]

In [8]:
# ROC AUC of the XGBoost scores on the training split.
train_auc_xgb = roc_auc_score(y_true=y_train, y_score=train_pred_xgb)
print(f"Train AUC: {train_auc_xgb}\n")

Train AUC: 0.7622346485240862



In [9]:
# LightGBM scores on the training split (column 1 of predict_proba).
train_pred_lgbm = model_lgbm.predict_proba(X_train)[:, 1]

In [10]:
# ROC AUC of the LightGBM scores on the training split.
train_auc_lgbm = roc_auc_score(y_true=y_train, y_score=train_pred_lgbm)
print(f"Train AUC: {train_auc_lgbm}\n")

Train AUC: 0.6973139573473583



In [11]:
# Random forest scores on the training split (column 1 of predict_proba).
# Slow cell: the recorded run took about 12 minutes for this prediction.
train_pred_rdf = model_rdf.predict_proba(X_train)[:, 1]

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:   41.2s
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed:  3.0min
[Parallel(n_jobs=4)]: Done 357 tasks      | elapsed:  6.9min
[Parallel(n_jobs=4)]: Done 600 out of 600 | elapsed: 11.7min finished


In [12]:
# ROC AUC of the random forest scores on the training split.
train_auc_rdf = roc_auc_score(y_true=y_train, y_score=train_pred_rdf)
print(f"Train AUC: {train_auc_rdf}\n")

Train AUC: 0.6287516433066563



In [13]:
# Predictions on the held-out validation split (X_val). The `test_` prefix on
# these variables refers to this split, not to test.parquet.
test_pred_xgb = model_xgb.predict_proba(X_val)[:, 1]

In [14]:
# LightGBM scores on the held-out validation split (column 1 of predict_proba).
test_pred_lgbm = model_lgbm.predict_proba(X_val)[:, 1]

In [15]:
# Random forest scores on the held-out validation split (column 1 of predict_proba).
test_pred_rdf = model_rdf.predict_proba(X_val)[:, 1]

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    1.8s
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed:    8.1s
[Parallel(n_jobs=4)]: Done 357 tasks      | elapsed:   18.5s
[Parallel(n_jobs=4)]: Done 600 out of 600 | elapsed:   31.1s finished


In [16]:
# ROC AUC of each base model on the held-out validation split
# (the variables and the printed labels keep the notebook's "test" naming).
test_auc_xgb = roc_auc_score(y_true=y_val, y_score=test_pred_xgb)
test_auc_lgbm = roc_auc_score(y_true=y_val, y_score=test_pred_lgbm)
test_auc_rdf = roc_auc_score(y_true=y_val, y_score=test_pred_rdf)
print(f"Test AUC LGBM: {test_auc_lgbm}\n Test AUC XGB: {test_auc_xgb}\n Test AUC RDF: {test_auc_rdf}")

Test AUC LGBM: 0.6990493443942094
 Test AUC XGB: 0.6673831795810793
 Test AUC RDF: 0.6270050933079852


In [17]:
# Meta-features for stacking: one column per base model, holding that model's
# predicted score. Both frames share the same column names and order.
meta_feature_names = ("pred_xgb", "pred_lgbm", "pred_rdf")

# Scores on the training split.
X_train_meta = pd.DataFrame(
    dict(zip(meta_feature_names, (train_pred_xgb, train_pred_lgbm, train_pred_rdf)))
)

# Scores on the held-out validation split.
X_test_meta = pd.DataFrame(
    dict(zip(meta_feature_names, (test_pred_xgb, test_pred_lgbm, test_pred_rdf)))
)


In [18]:
# Display the meta-feature frame for a visual sanity check.
X_train_meta

Unnamed: 0,pred_xgb,pred_lgbm,pred_rdf
0,0.505267,0.449046,0.494152
1,0.360627,0.354144,0.413882
2,0.519545,0.502707,0.526026
3,0.569734,0.625406,0.522870
4,0.410131,0.380943,0.446467
...,...,...,...
16776836,0.600328,0.643313,0.539328
16776837,0.317662,0.310273,0.371034
16776838,0.581390,0.592015,0.553295
16776839,0.439044,0.444540,0.444591


In [19]:
# Count missing values per meta-feature column.
X_train_meta.isna().sum()

pred_xgb     0
pred_lgbm    0
pred_rdf     0
dtype: int64

In [20]:
# Drop the original row index so the labels are indexed 0..n-1, like the
# meta-feature frame built from plain prediction arrays.
y_train_meta = y_train.reset_index(drop = True)
y_train_meta

0           0
1           0
2           0
3           0
4           0
           ..
16776836    0
16776837    0
16776838    0
16776839    0
16776840    0
Name: flag, Length: 16776841, dtype: int8

In [21]:
# Same index reset for the held-out validation labels.
y_test_meta = y_val.reset_index(drop = True)
y_test_meta

0         0
1         0
2         0
3         1
4         0
         ..
882987    0
882988    0
882989    0
882990    0
882991    0
Name: flag, Length: 882992, dtype: int8

In [23]:
# Fit the stacking meta-learner: an SGD-trained logistic regression over the
# three base-model scores, and report the wall-clock fitting time.
#
# NOTE(review): X_train_meta holds base-model scores on the training split. If
# the base models were fitted on these same rows (the recorded XGB train vs.
# validation AUC gap suggests so), the meta-features are in-sample and the
# meta-learner will over-trust the most overfit model. Consider out-of-fold
# predictions or a separate hold-out for the meta-learner; confirm.
start = time()
meta_model = SGDClassifier(
    loss="log_loss",        # logistic regression
    penalty="elasticnet",   # mix of L1 and L2 penalties
    l1_ratio=0.01,          # share of L1 in the mix
    alpha=0.01,             # regularisation strength
    tol=1e-3,
    max_iter=1000,
    random_state=42,
).fit(X_train_meta, y_train_meta)
end = time()

print(f"Duration: {end - start}")

Duration: 26.070841073989868


In [24]:
"""
# Définition du modèle de base
sgd = SGDClassifier(
    loss="log_loss",         # régression logistique
    penalty="elasticnet",    # régularisation mixte L1 + L2
    max_iter=1000,
    tol=1e-3,
    random_state=42
)

# Grille des hyperparamètres à tester
param_grid = {
    'alpha': [1e-4, 1e-3, 1e-2, 1e-1],   # force de régularisation
    'l1_ratio': [0.0, 0.15, 0.5, 0.9, 1.0]  # proportion de L1 dans le mélange
}

# Début du chronométrage
start = time()

# Définition de la recherche
grid_search = GridSearchCV(
    estimator=sgd,
    param_grid=param_grid,
    scoring='accuracy',   # ou 'f1', 'roc_auc', selon ton problème
    cv=5,                 # validation croisée à 5 plis
    n_jobs=-1,            # parallélisation complète
    verbose=1
)

# Entraînement
grid_search.fit(X_train_meta, y_train_meta)

end = time()

# Résultats
print(f"Duration: {end - start:.2f} sec")
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-val score: {grid_search.best_score_:.4f}")
"""

'\n# Définition du modèle de base\nsgd = SGDClassifier(\n    loss="log_loss",         # régression logistique\n    penalty="elasticnet",    # régularisation mixte L1 + L2\n    max_iter=1000,\n    tol=1e-3,\n    random_state=42\n)\n\n# Grille des hyperparamètres à tester\nparam_grid = {\n    \'alpha\': [1e-4, 1e-3, 1e-2, 1e-1],   # force de régularisation\n    \'l1_ratio\': [0.0, 0.15, 0.5, 0.9, 1.0]  # proportion de L1 dans le mélange\n}\n\n# Début du chronométrage\nstart = time()\n\n# Définition de la recherche\ngrid_search = GridSearchCV(\n    estimator=sgd,\n    param_grid=param_grid,\n    scoring=\'accuracy\',   # ou \'f1\', \'roc_auc\', selon ton problème\n    cv=5,                 # validation croisée à 5 plis\n    n_jobs=-1,            # parallélisation complète\n    verbose=1\n)\n\n# Entraînement\ngrid_search.fit(X_train_meta, y_train_meta)\n\nend = time()\n\n# Résultats\nprint(f"Duration: {end - start:.2f} sec")\nprint(f"Best parameters: {grid_search.best_params_}")\nprint(f"B

In [25]:
# Meta-learner scores on the rows it was fitted on (column 1 of predict_proba).
train_preds_meta = meta_model.predict_proba(X_train_meta)[:, 1]

In [26]:
# ROC AUC of the meta-learner on its own training rows.
train_auc_meta = roc_auc_score(y_true=y_train_meta, y_score=train_preds_meta)
print(f"Train AUC meta: {train_auc_meta}\n")

Train AUC meta: 0.733061138116982



In [27]:
# Meta-learner scores on the held-out validation meta-features.
test_preds_meta = meta_model.predict_proba(X_test_meta)[:, 1]

In [28]:
# ROC AUC of the meta-learner on the held-out validation split.
test_auc_meta = roc_auc_score(y_true=y_test_meta, y_score=test_preds_meta)
print(f"Test AUC meta: {test_auc_meta}\n")

Test AUC meta: 0.6816909589718987



In [29]:
import joblib

In [30]:
# Persist the fitted meta-learner to meta_model.pickle in the working directory.
joblib.dump(meta_model, "meta_model.pickle")

['meta_model.pickle']

In [31]:
#test_df_ready = pd.read_parquet("/kaggle/input/data-tour-new/test_df_ready.parquet")

In [32]:
# NOTE(review): XGBoost documents iteration_range as a half-open interval, so
# (0, best_iteration) would leave out the best round itself. If this line is
# re-enabled it likely needs best_iteration + 1; confirm against the installed
# XGBoost version.
#test_preds_proba = model_xgb.predict_proba(test_df_ready[X_train.columns], iteration_range=(0, model_xgb.best_iteration))[:, 1]

In [33]:
#test_preds_proba_df = pd.DataFrame({"target": test_preds_proba})
#submission_df = pd.concat([test_df_ready ["id_x_rn"], test_preds_proba_df], axis = 1)
#submission_df.rename(columns={"id_x_rn": "id"}, inplace=True)
#submission_df

In [34]:
#submission_df.to_parquet("sample_submission.parquet")

In [35]:
"""
start = time()
model_lgbm = lgb.LGBMClassifier(boosting_type= 'goss', num_leaves= 128, max_depth= 7, 
                                learning_rate= 0.05, n_estimators= 3000,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_alpha=0.3,
    reg_lambda=2,
    min_child_samples=30,
    class_weight=(1 / y_test_meta.value_counts(normalize=True)).to_dict(),
    random_state=42, verbose = -1)

model_lgbm.fit(
    X_train_meta, y_train_meta,
    eval_set=[(X_test_meta, y_test_meta)],
    eval_metric='auc',
    callbacks=[
        early_stopping(stopping_rounds=150),
        log_evaluation(period=100)
    ]
)
end = time()

print(f"Duration: {end - start}")
"""

'\nstart = time()\nmodel_lgbm = lgb.LGBMClassifier(boosting_type= \'goss\', num_leaves= 128, max_depth= 7, \n                                learning_rate= 0.05, n_estimators= 3000,\n    subsample=0.9,\n    colsample_bytree=0.9,\n    reg_alpha=0.3,\n    reg_lambda=2,\n    min_child_samples=30,\n    class_weight=(1 / y_test_meta.value_counts(normalize=True)).to_dict(),\n    random_state=42, verbose = -1)\n\nmodel_lgbm.fit(\n    X_train_meta, y_train_meta,\n    eval_set=[(X_test_meta, y_test_meta)],\n    eval_metric=\'auc\',\n    callbacks=[\n        early_stopping(stopping_rounds=150),\n        log_evaluation(period=100)\n    ]\n)\nend = time()\n\nprint(f"Duration: {end - start}")\n'