In [44]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from skopt import BayesSearchCV
from skopt.space import Integer

In [45]:
data = pd.read_csv("Default.csv")
data.head(3)

Unnamed: 0,default,student,balance,income
0,No,No,729.526495,44361.625074
1,No,Yes,817.180407,12106.1347
2,No,No,1073.549164,31767.13895


In [46]:
model = RandomForestClassifier()

y = data["default"].map({'Yes': 1, 'No': 0})
data["student"] = data["student"].map({'Yes': 1, 'No': 0})

numerical_features = ["balance", "income"]
categorical_features = []

numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    transformers = [
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", model)
    ])

x = data.drop(columns = ["default"])

pipeline.fit(x, y)

In [47]:
cv = 10
scores = cross_val_score(pipeline, x, y, scoring = "roc_auc_ovr", cv = cv)
auc = scores
auc_mean = np.mean(auc)
auc_std = np.std(auc)

print(auc_mean, auc_std)

0.8869517093903628 0.01915931636232144


In [48]:
X_train, X_test, y_train, y_test = train_test_split(x, y)

search_space = {
    'model__n_estimators': Integer(5, 40),
    'model__max_depth': Integer(2, 4)
}

opt = BayesSearchCV(
        estimator=pipeline,
        search_spaces=search_space,
        n_iter=20,
        scoring='roc_auc',
        cv=3,
        random_state=2,
        verbose=0,
        n_jobs=-1
    )

opt.fit(X_train, y_train)

best_model = opt.best_estimator_
y_score = best_model.predict_proba(X_test)[:, 1]
auc_score = roc_auc_score(y_test, y_score)
auc_scores_cv = opt.cv_results_['mean_test_score']
auc_var = np.var(auc_scores_cv)



In [49]:
print("Mejor AUC:", auc_score)
print("Varianza del AUC en CV:", auc_var)
print("Mejores parámetros:", opt.best_params_)

Mejor AUC: 0.9540060319958038
Varianza del AUC en CV: 0.00059464452669441
Mejores parámetros: OrderedDict([('model__max_depth', 4), ('model__n_estimators', 27)])


In [59]:
model = RandomForestClassifier(n_estimators= 27, max_depth= 4)

In [60]:
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", model)
    ])
pipeline.fit(x, y)

y_score = pipeline.predict_proba(x)

auc_score = roc_auc_score(y, y_score[:, 1])
auc_score

0.9573849736775154