# Bagging

In [1]:
import pandas as pd

credit2 = pd.read_csv('credit2.csv') 

pd.set_option('display.max_columns', None)  # Ajout pour afficher toutes les colonnes

test_size = 0.3

shuffled_data = credit2.sample(frac=1, random_state=42)

split_point = int(len(shuffled_data) * (1 - test_size))

train = shuffled_data.iloc[:split_point].reset_index().rename(columns={'index': 'id'})
test = shuffled_data.iloc[split_point:].reset_index().rename(columns={'index': 'id'})

In [2]:
train

Unnamed: 0,id,accounts,history_credit,object_credit,savings,employment_old,effort_rate,family_status,guarantees,home_old,property,other_credits,home_status,nb_credits,job_type,nb_of_dependants,telephone,age,duration_credit,amount_credit,presence_unpaid
0,521,CC < 0 euros,A32,Video-HIFI,< 500 euros,between 1 and 4 years,2,Female divorced/separated/married,Without guarantor,2,Property,No external credit,owner,1,A173,1,A191,"[0.0, 25.0)","[15.0, 36.0)","[0.0, 4000.0)",1
1,737,CC < 0 euros,A32,New car,< 500 euros,between 1 and 4 years,3,Male single/married/widowed,Without guarantor,4,Non-property,No external credit,owner,1,A172,2,A192,"[25.0, inf)","[15.0, 36.0)","[4000.0, inf)",0
2,740,CC < 0 euros,A31,New car,< 500 euros,for at least 4 years,2,Male single/married/widowed,Without guarantor,3,Non-property,External credits,owner,1,A173,1,A191,"[25.0, inf)","[15.0, 36.0)","[0.0, 4000.0)",0
3,660,CC > 200 euros,A32,Video-HIFI,< 500 euros,between 1 and 4 years,3,Male single/married/widowed,Without guarantor,4,Property,No external credit,Not owner,1,A173,1,A191,"[0.0, 25.0)","[0.0, 15.0)","[0.0, 4000.0)",0
4,411,No account,A34,Used car,< 500 euros,for at least 4 years,3,Male single/married/widowed,Without guarantor,2,Non-property,No external credit,owner,2,A174,1,A192,"[25.0, inf)","[15.0, 36.0)","[4000.0, inf)",0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,262,CC < 0 euros,A34,New car,< 500 euros,for at least 4 years,2,Male single/married/widowed,Without guarantor,4,No property,No external credit,Not owner,3,A174,1,A192,"[25.0, inf)","[15.0, 36.0)","[4000.0, inf)",0
696,610,CC < 0 euros,A32,Interior,< 500 euros,Unemployed or < 1 year,4,Female divorced/separated/married,Without guarantor,3,Not real estate,No external credit,owner,1,A173,1,A191,"[0.0, 25.0)","[0.0, 15.0)","[0.0, 4000.0)",1
697,297,No account,A32,New car,No savings,for at least 4 years,4,Male single/married/widowed,Without guarantor,2,Not real estate,No external credit,owner,1,A172,1,A191,"[25.0, inf)","[0.0, 15.0)","[0.0, 4000.0)",0
698,414,CC < 0 euros,A32,New car,No savings,between 1 and 4 years,4,Female divorced/separated/married,Without guarantor,2,Not real estate,No external credit,owner,1,A173,1,A191,"[25.0, inf)","[15.0, 36.0)","[0.0, 4000.0)",1


In [8]:
import pandas as pd
import numpy as np
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
import time

# Pr√©paration des donn√©es (suppos√© d√©j√† fait)
X_train = credit2.loc[id, vars]
y_train = credit2.loc[id, "presence_unpaid"]
X_test = test[vars]
y_test = test["presence_unpaid"]

# Initialisation : arbre sans √©lagage (maximal depth)
base_tree = DecisionTreeClassifier(
    criterion='gini',
    splitter='best',
    max_depth=None,  # arbres tr√®s profonds
    min_samples_split=2,
    min_samples_leaf=1,
    ccp_alpha=0,     # pas d‚Äô√©lagage (comme cp=0 en R)
    random_state=235
)

# Bagging avec 200 estimateurs (bootstrap aggregating)
bag = BaggingClassifier(
    base_estimator=base_tree,
    n_estimators=200,
    bootstrap=True,
    oob_score=True,
    random_state=235,
    n_jobs=-1
)

# Entra√Ænement
start = time.time()
bag.fit(X_train, y_train)
elapsed = time.time() - start
print(f"‚è±Ô∏è Temps d'entra√Ænement : {elapsed:.2f} sec")

# OOB Error (comme coob=TRUE en R)
print("‚úÖ OOB Score (1 - OOB error):", bag.oob_score_)

# Pr√©diction
test["bag"] = bag.predict_proba(X_test)[:, 1]

# AUC (√©quivalent ROCR::prediction/performance)
auc_score = roc_auc_score(y_test, test["bag"])
print("üìä AUC :", auc_score)


KeyError: 134469602251568

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from scipy.stats import mode
import numpy as np

# üîß 1. Arbre avec min_samples_leaf=5 (√©quivalent de minbucket=5)
base_tree_leaf5 = DecisionTreeClassifier(
    max_depth=None,
    min_samples_leaf=5,
    ccp_alpha=0,  # pas d‚Äô√©lagage
    random_state=235
)

# üß™ 2. Bagging avec 200 arbres
bag1 = BaggingClassifier(
    base_estimator=base_tree_leaf5,
    n_estimators=200,
    bootstrap=True,
    oob_score=True,
    random_state=235,
    n_jobs=-1
)

# üéØ Entra√Ænement
bag1.fit(X_train, y_train)

# üß™ Agr√©gation par moyenne des probabilit√©s (default)
probas_avg = bag1.predict_proba(X_test)[:, 1]

# AUC avec agr√©gation par moyenne
auc_avg = roc_auc_score(y_test, probas_avg)
print("üìä AUC (agr√©gation par moyenne) :", auc_avg)

# üß™ Agr√©gation par vote majoritaire
# predict_classes = majority vote sur chaque √©chantillon
votes = np.asarray([tree.predict(X_test) for tree in bag1.estimators_])
majority_vote = mode(votes, axis=0).mode[0]
# On doit convertir en probas (0.0 ou 1.0 pour AUC)
probas_majority = majority_vote.astype(float)

# AUC avec vote majoritaire
auc_maj = roc_auc_score(y_test, probas_majority)
print("üìä AUC (agr√©gation par vote majoritaire) :", auc_maj)


In [7]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Fixation de la graine al√©atoire pour la reproductibilit√©
np.random.seed(235)

# Supposons que 'credit' est votre DataFrame et 'id' contient les indices d'entra√Ænement
# 'vars' contient les noms des colonnes √† utiliser (incluant 'Cible')

# Pr√©paration des donn√©es
X_train = credit2.loc[id, [col for col in vars if col != 'presence_unpaid']]
y_train = credit2.loc[id, 'presence_unpaid']

# Cr√©ation d'un stump (arbre de d√©cision de profondeur 1)
stump = DecisionTreeClassifier(
    max_depth=1,
    min_samples_split=2,  # √©quivalent √† minsplit=0 en R (minimum 2 √©chantillons)
    min_samples_leaf=1
)

# Bagging de 100 stumps
bagging_stumps = BaggingClassifier(
    estimator=stump,
    n_estimators=100,
    random_state=235,
    oob_score=True,  # √©quivalent √† coob=TRUE
    bootstrap=True
)

# Entra√Ænement du mod√®le de bagging
bag1 = bagging_stumps.fit(X_train, y_train)

# Pour la comparaison : entra√Ænement d'un seul stump
single_stump = DecisionTreeClassifier(
    max_depth=1,
    min_samples_split=2,
    min_samples_leaf=1,
    criterion='gini'  # √©quivalent √† parms=list(split="gini")
)
single_stump.fit(X_train, y_train)

# Pr√©diction sur l'ensemble de test
# Supposons que 'test' est votre DataFrame de test
X_test = test[[col for col in vars if col != 'Cible']]
y_test = test['Cible']

# Probabilit√©s pr√©dites par le stump unique
test_stump_proba = single_stump.predict_proba(X_test)

# Calcul de l'AUC pour le stump unique
auc_stump = roc_auc_score(y_test, test_stump_proba[:, 1])
print(f"AUC du stump unique: {auc_stump:.4f}")

# Bonus : AUC pour le mod√®le de bagging
test_bagging_proba = bag1.predict_proba(X_test)
auc_bagging = roc_auc_score(y_test, test_bagging_proba[:, 1])
print(f"AUC du bagging de stumps: {auc_bagging:.4f}")

# Score OOB (Out-Of-Bag) si disponible
if hasattr(bag1, 'oob_score_'):
    print(f"Score OOB: {bag1.oob_score_:.4f}")

# Visualisation des courbes ROC (optionnel)
plt.figure(figsize=(8, 6))

# ROC pour le stump unique
fpr_stump, tpr_stump, _ = roc_curve(y_test, test_stump_proba[:, 1])
plt.plot(fpr_stump, tpr_stump, label=f'Stump unique (AUC = {auc_stump:.3f})')

# ROC pour le bagging
fpr_bag, tpr_bag, _ = roc_curve(y_test, test_bagging_proba[:, 1])
plt.plot(fpr_bag, tpr_bag, label=f'Bagging de stumps (AUC = {auc_bagging:.3f})')

plt.plot([0, 1], [0, 1], 'k--', label='Al√©atoire')
plt.xlabel('Taux de Faux Positifs')
plt.ylabel('Taux de Vrais Positifs')
plt.title('Courbes ROC - Comparaison Stump vs Bagging')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

TypeError: 'builtin_function_or_method' object is not iterable