<a href="https://colab.research.google.com/github/Argentan/DMA_LAB2/blob/master/tutoriales/06_optimizacion_hiperparametros_practica.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [44]:
import pandas as pd
# Load the preprocessed Titanic table, indexed by PassengerId.
# Prefer the local copy from the repository checkout; when it is not present
# (e.g. the notebook is opened in Colab) download the same file from GitHub.
# Only a missing file triggers the fallback: any other failure (malformed CSV,
# missing index column, user interrupt) is surfaced instead of being hidden.
try:
    data = pd.read_csv("../data/titanic_proc.csv", index_col="PassengerId")
except FileNotFoundError:
    data = pd.read_csv("https://raw.githubusercontent.com/Argentan/DMA_LAB2/master/data/titanic_proc.csv", index_col="PassengerId")
data.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,NumFam,C,Q,S,Age_nul
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,0,3,0,22.0,1,0,0,7.25,0,1,0,0,1,0
2,1,1,1,38.0,1,0,1,71.2833,1,1,1,0,0,0
3,1,3,1,26.0,0,0,2,7.925,0,0,0,0,1,0
4,1,1,1,35.0,1,0,3,53.1,2,1,0,0,1,0
5,0,3,0,35.0,0,0,4,8.05,0,0,0,0,1,0


In [45]:
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split, KFold, ParameterGrid

In [46]:
# Hold out 20% of the rows as the test set; the fixed seed makes the split repeatable.
train, test = train_test_split(
    data,
    test_size=0.2,
    random_state=1,
)

In [47]:
# Split `train` into 5 shuffled, seeded folds and keep, for each fold,
# the index labels (not positions) of the rows used for validation.
splitter = KFold(n_splits=5, shuffle=True, random_state=1)
folds = [
    train.index[valid_positions]
    for _, valid_positions in splitter.split(train)
]

In [53]:
# Search space: each entry pairs an estimator class ("algoritmo") with the
# grid of constructor arguments ("params") to be expanded by ParameterGrid.
candidatos = [
    {
        "algoritmo": LogisticRegression,
        "params": {
            "C": [0.1, 1],
            "penalty": ["l2"],
        },
    },
    {
        "algoritmo": LGBMClassifier,
        "params": {
            "num_leaves": [10, 20],
            "max_depth": [4, 6, 8],
        },
    },
]

In [81]:
# Cross-validated predictions for every (algorithm, hyperparameter) combination.
#   valid_probs: out-of-fold probability for each training row, one column per combination.
#   test_probs:  test-set probability averaged over the per-fold models, one column per combination.

# The test features are identical for every fit, so build them once
# instead of re-dropping the target on every fold of every candidate.
X_test = test.drop("Survived", axis=1)

valid_probs = []
test_probs = []
for candidato in candidatos:
    for params in ParameterGrid(candidato["params"]):
        test_fold_probs = []
        valid_fold_probs = []
        # Column label for this combination: "<ClassName>;<param>_<value>;...".
        name = candidato["algoritmo"].__name__ + ";" + ";".join([f"{k}_{v}" for k, v in params.items()])
        for valid_idx in folds:
            # Rows of the current fold are held out for validation ...
            X_valid = train.loc[valid_idx].drop("Survived", axis=1)

            # ... and every remaining training row is used to fit the model.
            X_train = train.drop(valid_idx).drop("Survived", axis=1)
            y_train = train.loc[X_train.index, "Survived"]

            model = candidato["algoritmo"](**params)
            model.fit(X_train, y_train)

            # Keep the last predict_proba column (probability of the last class in model.classes_).
            p = model.predict_proba(X_test)[:, -1]
            test_fold_probs.append(pd.Series(p, name=name, index=test.index))

            p = model.predict_proba(X_valid)[:, -1]
            valid_fold_probs.append(pd.Series(p, name=name, index=X_valid.index))
        # Test prediction = mean over the fold models; validation prediction = the
        # per-fold predictions stacked so each training row appears once.
        test_probs.append(pd.concat(test_fold_probs, axis=1).mean(axis=1).rename(name))
        valid_probs.append(pd.concat(valid_fold_probs))
# Turn the per-combination lists into DataFrames (rows: passengers, columns: combinations).
valid_probs = pd.concat(valid_probs, axis=1)
test_probs = pd.concat(test_probs, axis=1)

In [84]:
from sklearn.metrics import roc_auc_score

In [89]:
# ROC AUC of each combination's out-of-fold predictions against the true training labels.
y_valid_true = train.loc[valid_probs.index, "Survived"]
valid_res = pd.Series(
    [roc_auc_score(y_valid_true, valid_probs[col]) for col in valid_probs.columns],
    index=valid_probs.columns,
    name="resultados",
)
valid_res

LogisticRegression;C_0.1;penalty_l2         0.851024
LogisticRegression;C_1;penalty_l2           0.854876
LGBMClassifier;max_depth_4;num_leaves_10    0.866498
LGBMClassifier;max_depth_4;num_leaves_20    0.866020
LGBMClassifier;max_depth_6;num_leaves_10    0.863469
LGBMClassifier;max_depth_6;num_leaves_20    0.860842
LGBMClassifier;max_depth_8;num_leaves_10    0.866104
LGBMClassifier;max_depth_8;num_leaves_20    0.855363
Name: resultados, dtype: float64

In [93]:
# ROC AUC of each combination's fold-averaged predictions against the true test labels.
y_test_true = test.loc[test_probs.index, "Survived"]
test_res = pd.Series(
    [roc_auc_score(y_test_true, test_probs[col]) for col in test_probs.columns],
    index=test_probs.columns,
    name="resultados",
)
test_res

LogisticRegression;C_0.1;penalty_l2         0.810933
LogisticRegression;C_1;penalty_l2           0.817136
LGBMClassifier;max_depth_4;num_leaves_10    0.830318
LGBMClassifier;max_depth_4;num_leaves_20    0.833032
LGBMClassifier;max_depth_6;num_leaves_10    0.830576
LGBMClassifier;max_depth_6;num_leaves_20    0.823339
LGBMClassifier;max_depth_8;num_leaves_10    0.826570
LGBMClassifier;max_depth_8;num_leaves_20    0.827604
Name: resultados, dtype: float64

In [91]:
# Label of the combination with the highest validation ROC AUC
# (selection uses the validation scores, not the test scores).
valid_res.idxmax()

'LGBMClassifier;max_depth_4;num_leaves_10'

In [83]:
# Show the fold-averaged test-set probabilities, one column per combination.
test_probs

Unnamed: 0_level_0,LogisticRegression;C_0.1;penalty_l2,LogisticRegression;C_1;penalty_l2,LGBMClassifier;max_depth_4;num_leaves_10,LGBMClassifier;max_depth_4;num_leaves_20,LGBMClassifier;max_depth_6;num_leaves_10,LGBMClassifier;max_depth_6;num_leaves_20,LGBMClassifier;max_depth_8;num_leaves_10,LGBMClassifier;max_depth_8;num_leaves_20
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
863,0.909124,0.957677,0.983129,0.984062,0.969091,0.990346,0.988083,0.993439
224,0.117250,0.070377,0.071113,0.064847,0.062606,0.072216,0.068062,0.039835
85,0.674515,0.799713,0.878911,0.861216,0.901857,0.909612,0.910817,0.901304
681,0.543710,0.633973,0.721996,0.720225,0.753568,0.724417,0.757998,0.746609
536,0.773066,0.855788,0.968034,0.967227,0.974291,0.970883,0.968220,0.970596
...,...,...,...,...,...,...,...,...
797,0.906505,0.955890,0.976618,0.978694,0.952203,0.983881,0.976931,0.987048
816,0.654287,0.642043,0.125009,0.134241,0.141300,0.128170,0.139161,0.125547
630,0.157757,0.105379,0.058120,0.057512,0.043997,0.026526,0.038670,0.020757
422,0.214196,0.169956,0.120113,0.116556,0.108569,0.110131,0.105010,0.081127
