In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
#### Hyperparameter tuning and regularization

# Load seaborn's example 'titanic' dataset and preview the first five rows.
data = sns.load_dataset(name='titanic')
data.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [None]:
# Feature columns used by the models and the label column.
VARS = ['pclass','sex','age','sibsp','parch','fare']
TARGET = ['survived']
x = data[VARS]
# TARGET is a one-element list, so data[TARGET] is a single-column DataFrame.
# Squeeze it to a 1-D Series: scikit-learn estimators expect y with shape (n_samples,)
# and emit a DataConversionWarning when handed a column vector.
y = data[TARGET].squeeze(axis = 1)

In [None]:
### Train/test split

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size = 0.2,
    random_state = 123,
)

In [None]:
### Validation dataset

# Carve a validation set (20%) out of the training rows; the remainder
# (x_train_d / y_train_d) is what the preprocessing steps are fitted on.
x_train_d, x_val, y_train_d, y_val = train_test_split(
    x_train,
    y_train,
    test_size = 0.2,
    random_state = 123,
)

In [None]:
### Categorical variables

from sklearn.preprocessing import OneHotEncoder

# The encoder learns the categories of 'sex' from the reduced training split only;
# every split is then encoded with that same fitted encoder.
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(x_train_d[['sex']])


def _one_hot_sex(split):
    """Return the one-hot encoding of split['sex'] as a DataFrame on split's index."""
    dense = enc.transform(split[['sex']]).toarray()
    return pd.DataFrame(dense, index = split.index, columns = enc.categories_[0])


x_train_cat = _one_hot_sex(x_train_d)
x_val_cat = _one_hot_sex(x_val)
x_test_cat = _one_hot_sex(x_test)
x_train_total = _one_hot_sex(x_train)

In [None]:
### Data preprocessing

from sklearn.impute import KNNImputer

# Columns passed through the imputer.
_imputed_cols = ['pclass','age','sibsp','parch','fare']

# Fit the imputer on the reduced training split only, then apply it to every split.
imputer = KNNImputer()
imputer.fit(x_train_d[_imputed_cols])


def _impute(split):
    """Return split's imputed columns as a DataFrame that keeps split's index."""
    filled = imputer.transform(split[_imputed_cols])
    return pd.DataFrame(filled, index = split.index, columns = _imputed_cols)


x_train_no_missing = _impute(x_train_d)
x_val_no_missing = _impute(x_val)
x_test_no_missing = _impute(x_test)
x_train_no_missing_final = _impute(x_train)

In [None]:
def _join_columns(encoded, imputed):
    """Place the one-hot columns and the imputed columns side by side."""
    return pd.concat([encoded, imputed], axis = 'columns')


# Reassemble the full feature matrix for each split.
x_train_final = _join_columns(x_train_cat, x_train_no_missing)
x_val_final = _join_columns(x_val_cat, x_val_no_missing)
x_test_final = _join_columns(x_test_cat, x_test_no_missing)
x_train_total_final = _join_columns(x_train_total, x_train_no_missing_final)

In [None]:
### Data scaling

from sklearn.preprocessing import StandardScaler

# The scaler is fitted on the reduced training split only and reused for every split.
esc = StandardScaler()
esc.fit(x_train_final)


def _standardize(features):
    """Return the scaled copy of features, keeping its index and column labels."""
    scaled = esc.transform(features)
    return pd.DataFrame(scaled, index = features.index, columns = features.columns)


x_train_final_esc = _standardize(x_train_final)
x_val_final_esc = _standardize(x_val_final)
x_test_final_esc = _standardize(x_test_final)
x_train_final_total_esc = _standardize(x_train_total_final)

In [None]:
### Modelling

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

# Candidate models: three regularised logistic regressions (L2, L1, elastic net)
# and one decision tree.
# random_state is pinned to the same seed as the data splits: the 'saga' solver
# shuffles the data and the tree randomly permutes features at each split, so
# unseeded models give validation scores that change from run to run.
modelo_uno = LogisticRegression(penalty = 'l2', solver = 'saga', C = 0.04, random_state = 123)
modelo_dos = LogisticRegression(penalty = 'l1', solver = 'saga', C = 0.02, random_state = 123)
modelo_tres = LogisticRegression(penalty = 'elasticnet', solver = 'saga', C = 0.04, l1_ratio = 0.8, random_state = 123)
modelo_arbol = DecisionTreeClassifier(max_depth = 19, random_state = 123)


def training(modelo, x_train, y_train, x_val, y_val):
    """Fit a model and report its accuracy on a validation set.

    Parameters
    ----------
    modelo : estimator exposing ``fit(X, y)`` and ``predict(X)``
        Fitted in place.
    x_train, y_train : training features and labels passed to ``fit``.
    x_val, y_val : validation features and true labels used for scoring.

    Returns
    -------
    tuple
        ``(modelo, preds, score)``: the fitted model, its predictions for
        ``x_val`` and the accuracy of those predictions. The score is also
        printed.
    """
    modelo.fit(x_train, y_train)
    preds = modelo.predict(x_val)
    # scikit-learn metrics take (y_true, y_pred), in that order.
    score = accuracy_score(y_val, preds)
    print(score)
    return modelo, preds, score

# Fit each candidate on the reduced training split and print its validation accuracy.
for candidato in (modelo_uno, modelo_dos, modelo_tres):
    training(candidato, x_train_final_esc, y_train_d, x_val_final_esc, y_val)

# The tree goes last, as the cell's final expression, so its returned
# (model, predictions, score) tuple is what the cell displays.
training(modelo_arbol, x_train_final_esc, y_train_d, x_val_final_esc, y_val)

0.8181818181818182
0.8041958041958042
0.8041958041958042
0.7902097902097902


(DecisionTreeClassifier(max_depth=19),
 array([0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1,
        0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0,
        0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0,
        1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
        0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,
        1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0]),
 0.7902097902097902)

In [None]:
### GridSearchCV
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Base estimator for the search; 'penalty' and 'C' are the two settings being searched.
modelo_uno = LogisticRegression(penalty = 'l2', solver = 'saga', C = 0.04)

# Exhaustive grid: both penalties crossed with C from 0.01 up to (excluding) 1 in steps of 0.01.
parameters = dict(
    penalty = ('l1', 'l2'),
    C = np.arange(0.01, 1, 0.01),
)

# The search runs on the full training set (reduced training split + validation rows).
grid_search = GridSearchCV(estimator = modelo_uno, param_grid = parameters)
grid_search.fit(x_train_final_total_esc, y_train)





In [None]:
# Rebuild the configuration selected by the grid search and score it on the test set.
# best_params_ only holds the searched keys ('penalty', 'C'), so the solver has to be
# passed explicitly: the default 'lbfgs' solver rejects penalty='l1' and, for 'l2',
# would fit a different model from the one that was tuned with 'saga'.
# NOTE(review): the search ran on x_train_final_total_esc / y_train while this fit uses
# the reduced split; consider fitting the final model on the full training set as well.
modelo_final = LogisticRegression(solver = 'saga', **grid_search.best_params_)
modelo_final.fit(x_train_final_esc, y_train_d)
preds = modelo_final.predict(x_test_final_esc)
# scikit-learn metrics take (y_true, y_pred), in that order.
accuracy_score(y_test, preds)

0.7932960893854749

In [None]:
### RandomizedSearchCV
# Base estimator for the search; 'penalty' and 'C' are the two settings being sampled.
# random_state is pinned because the 'saga' solver shuffles the data.
modelo_uno = LogisticRegression(penalty = 'l2', solver = 'saga', C = 0.04, random_state = 123)
parameters = {'penalty':('l1', 'l2'), 'C': np.arange(0.01,1,0.01)}

# Seed the sampler as well: without random_state a different subset of candidates is
# drawn on every run, so best_params_ (and the test score derived from it) is not
# reproducible.
random_search = RandomizedSearchCV(modelo_uno, parameters, random_state = 123)
random_search.fit(x_train_final_total_esc, y_train)


In [None]:
# Show the 'penalty' / 'C' combination selected by the randomized search.
random_search.best_params_

{'penalty': 'l2', 'C': 0.41000000000000003}

In [None]:
# Rebuild the configuration selected by the randomized search and score it on the test set.
# best_params_ only holds the searched keys ('penalty', 'C'), so the solver has to be
# passed explicitly: the default 'lbfgs' solver rejects penalty='l1' and, for 'l2',
# would fit a different model from the one that was tuned with 'saga'.
# NOTE(review): the search ran on x_train_final_total_esc / y_train while this fit uses
# the reduced split; consider fitting the final model on the full training set as well.
modelo_final = LogisticRegression(solver = 'saga', **random_search.best_params_)
modelo_final.fit(x_train_final_esc, y_train_d)
preds = modelo_final.predict(x_test_final_esc)
# scikit-learn metrics take (y_true, y_pred), in that order.
accuracy_score(y_test, preds)

0.8044692737430168