In [7]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import roc_auc_score

In [9]:
df=pd.read_csv("Default.csv")
df

Unnamed: 0,default,student,balance,income
0,No,No,729.526495,44361.625074
1,No,Yes,817.180407,12106.134700
2,No,No,1073.549164,31767.138950
3,No,No,529.250605,35704.493940
4,No,No,785.655883,38463.495880
...,...,...,...,...
9995,No,No,711.555020,52992.378910
9996,No,No,757.962918,19660.721770
9997,No,No,845.411989,58636.156980
9998,No,No,1569.009053,36669.112360


In [11]:
X = df.drop(columns='default')  
y = df['default']

numerical_cols = ['balance', 'income']
categorical_cols = ['student']

numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(random_state=42))
])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

kf = KFold(n_splits=10, shuffle=True, random_state=50)

auc_scores = []

for train_index, val_index in kf.split(X_train):
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
    
    model.fit(X_train_fold, y_train_fold)  
    y_val_pred = model.predict_proba(X_val_fold)[:, 1]  
    auc = roc_auc_score(y_val_fold, y_val_pred)  
    auc_scores.append(auc)

auc_df = pd.DataFrame({'Fold': range(1, len(auc_scores) + 1), 'AUC': auc_scores})

In [15]:
auc_df.mean()

Fold    5.500000
AUC     0.653724
dtype: float64

In [17]:
auc_df

Unnamed: 0,Fold,AUC
0,1,0.746299
1,2,0.640909
2,3,0.682618
3,4,0.617987
4,5,0.716347
5,6,0.61928
6,7,0.674189
7,8,0.592569
8,9,0.617749
9,10,0.629294


In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [51]:
modelo = Pipeline(steps=[
    ('preprocesador', preprocessor),
    ('clasificador', RandomForestClassifier(random_state=50))
])
parametros = {
    'clasificador__n_estimators': [5, 40],
    'clasificador__max_depth': [2, 3, 4],
}

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=50)

search = GridSearchCV(
    estimator=modelo,
    param_grid=parametros,
    scoring='roc_auc',
    cv=10,  
    n_jobs=-1, 
    verbose=1
)

search.fit(X_train, y_train)

resultados = pd.DataFrame(search.cv_results_)
auc_df = resultados[['param_clasificador__n_estimators', 'param_clasificador__max_depth',
                     'mean_test_score']].rename(columns={'mean_test_score': 'AUC'})

mejor_modelo = search.best_estimator_

y_test_pred = mejor_modelo.predict_proba(X_test)[:, 1]
auc_test = roc_auc_score(y_test, y_test_pred)

auc_df['AUC_conjunto_prueba'] = auc_test

media_auc = auc_df['AUC'].mean()

desviacion_auc = auc_df['AUC'].std()
media_auc,desviacion_auc

Fitting 10 folds for each of 6 candidates, totalling 60 fits


(0.901264815263645, 0.022442030292519028)

In [41]:
modelo = Pipeline(steps=[
    ('preprocesador', preprocessor),
    ('clasificador', RandomForestClassifier(
    n_estimators=40,
    max_depth=4,
    random_state=50
))
])
modelo.fit(X,y)

In [43]:
y_pred_prob = modelo.predict_proba(X)[:, 1] 

auc = roc_auc_score(y, y_pred_prob)

resultados_auc = pd.DataFrame({'Modelo': ['RandomForest'], 'AUC': [auc]})

resultados_auc

Unnamed: 0,Modelo,AUC
0,RandomForest,0.953417


In [49]:
RandomForestClassifier?

[1;31mInit signature:[0m
[0mRandomForestClassifier[0m[1;33m([0m[1;33m
[0m    [0mn_estimators[0m[1;33m=[0m[1;36m100[0m[1;33m,[0m[1;33m
[0m    [1;33m*[0m[1;33m,[0m[1;33m
[0m    [0mcriterion[0m[1;33m=[0m[1;34m'gini'[0m[1;33m,[0m[1;33m
[0m    [0mmax_depth[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mmin_samples_split[0m[1;33m=[0m[1;36m2[0m[1;33m,[0m[1;33m
[0m    [0mmin_samples_leaf[0m[1;33m=[0m[1;36m1[0m[1;33m,[0m[1;33m
[0m    [0mmin_weight_fraction_leaf[0m[1;33m=[0m[1;36m0.0[0m[1;33m,[0m[1;33m
[0m    [0mmax_features[0m[1;33m=[0m[1;34m'sqrt'[0m[1;33m,[0m[1;33m
[0m    [0mmax_leaf_nodes[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mmin_impurity_decrease[0m[1;33m=[0m[1;36m0.0[0m[1;33m,[0m[1;33m
[0m    [0mbootstrap[0m[1;33m=[0m[1;32mTrue[0m[1;33m,[0m[1;33m
[0m    [0moob_score[0m[1;33m=[0m[1;32mFalse[0m[1;33m,[0m[1;33m
[0m    [0mn_jobs[0m[1;33m=[0m[1;32mN

In [53]:
mejores_parametros = search.best_params_

In [55]:
mejores_parametros

{'clasificador__max_depth': 4, 'clasificador__n_estimators': 40}