# Preparación de los datos

In [256]:
# Importación de las librerías necesarias
import pandas as pd
import numpy as np

### Carga del dataset y análisis
Primero cargamos el dataset utilizado en la práctica y visualizamos las columnas que lo componen, con su tipología y algunos ejemplos de valores.

In [257]:
# Carga y análisis previo del dataset en un DataFrame de Pandas

df = pd.read_csv('train.csv')

print(len(df)) # Visualización del número total de registros

df.head().T # Visualización de las columnas con algunas filas de ejemplo

891


Unnamed: 0,0,1,2,3,4
PassengerId,1,2,3,4,5
Survived,0,1,1,1,0
Pclass,3,1,3,1,3
Name,"Braund, Mr. Owen Harris","Cumings, Mrs. John Bradley (Florence Briggs Th...","Heikkinen, Miss. Laina","Futrelle, Mrs. Jacques Heath (Lily May Peel)","Allen, Mr. William Henry"
Sex,male,female,female,female,male
Age,22.0,38.0,26.0,35.0,35.0
SibSp,1,1,0,1,0
Parch,0,0,0,0,0
Ticket,A/5 21171,PC 17599,STON/O2. 3101282,113803,373450
Fare,7.25,71.2833,7.925,53.1,8.05


In [258]:
# Estadísticas genéricas del dataset, por columnas
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [259]:
# Visualización de los tipos de datos que componen el dataset
df.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [260]:
# Función de reemplazo
replacer = lambda str: str.lower().str.replace(' ','_')

# Aplicación del reemplazo a los nombres de las columnas
#df.columns = replacer(df.columns.str)

# Aplicación del reemplazo a los valores de las columnas de tipo cadena
#for col in list(df.dtypes[df.dtypes == 'object'].index):
    #df[col] = replacer(df[col].str)
df.head().T

Unnamed: 0,0,1,2,3,4
PassengerId,1,2,3,4,5
Survived,0,1,1,1,0
Pclass,3,1,3,1,3
Name,"Braund, Mr. Owen Harris","Cumings, Mrs. John Bradley (Florence Briggs Th...","Heikkinen, Miss. Laina","Futrelle, Mrs. Jacques Heath (Lily May Peel)","Allen, Mr. William Henry"
Sex,male,female,female,female,male
Age,22.0,38.0,26.0,35.0,35.0
SibSp,1,1,0,1,0
Parch,0,0,0,0,0
Ticket,A/5 21171,PC 17599,STON/O2. 3101282,113803,373450
Fare,7.25,71.2833,7.925,53.1,8.05


Tras esta conversión, analizamos el número de valores  **únicos** que tiene cada columna:

In [261]:
df.nunique() # Número de valores únicos por columna

PassengerId    891
Survived         2
Pclass           3
Name           891
Sex              2
Age             88
SibSp            7
Parch            7
Ticket         681
Fare           248
Cabin          147
Embarked         3
dtype: int64

# Creación del modelo

Lo primero que haremos será la extracción de nuestra variable objetivo, que en nuestro caso es *Survived*:

In [262]:
target_name = "Survived" # Variable objetivo
target = df[target_name]

data = df.drop(columns=[target_name, 'Name', 'Ticket', 'Cabin'])

A continuación, instanciamos 2 preprocesadores distintos para las columnas numéricas y para las categóricas, y lo vinculamos a un transformador por columnas:

In [263]:
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numerical_columns_selector = selector(dtype_exclude=object)  # Selector para extraer columnas numéricas
categorical_columns_selector = selector(dtype_include=object)  # Selector para extraer columnas categóricas

numerical_columns = numerical_columns_selector(data)
categorical_columns = categorical_columns_selector(data)

numerical_preprocessor = StandardScaler() # Escalador para columnas numéricas
categorical_preprocessor = OrdinalEncoder(handle_unknown="use_encoded_value",
                                          unknown_value=-1)

preprocessor = ColumnTransformer([
    ('categorical', categorical_preprocessor, categorical_columns),
    ('numerical', numerical_preprocessor, numerical_columns)])

Ahora instanciaremos un modelo y lo vincularemos mediante una *pipeline* a nuestro transformador por columnas. Para el problema expuesto, se elige un modelo de clasificación ***HistGradientBoostingClassifier***:

In [264]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import HistGradientBoostingClassifier

model = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", HistGradientBoostingClassifier(random_state=42) )
])

In [265]:
from scipy.stats import loguniform


class loguniform_int:
    """Integer valued version of the log-uniform distribution"""
    def __init__(self, a, b):
        self._distribution = loguniform(a, b)

    def rvs(self, *args, **kwargs):
        """Random variable sample"""
        return self._distribution.rvs(*args, **kwargs).astype(int)

In [266]:
#from sklearn.model_selection import cross_validate

#cv_result = cross_validate(model, data, target, cv=5)

#np.median(cv_result['test_score'])

from sklearn.model_selection import RandomizedSearchCV

param_distributions = {
    'classifier__l2_regularization': loguniform(1e-6, 1e3),
    'classifier__learning_rate': loguniform(0.001, 10),
    'classifier__max_leaf_nodes': loguniform_int(2, 256),
    'classifier__min_samples_leaf': loguniform_int(1, 100),
    'classifier__max_bins': loguniform_int(2, 255),
}

model_random_search = RandomizedSearchCV(
    model, param_distributions=param_distributions, n_iter=10,
    cv=5, verbose=1,
)
model_random_search.fit(data, target)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


15 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/antoniojesusacostalopez/opt/anaconda3/envs/scikit-learn-course/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/antoniojesusacostalopez/opt/anaconda3/envs/scikit-learn-course/lib/python3.10/site-packages/sklearn/pipeline.py", line 394, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/antoniojesusacostalopez/opt/anaconda3/envs/scikit-learn-course/lib/python3.10/site-packages/sklearn/ensemble/_hist_gradient_boosting/gradie

RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('preprocessor',
                                              ColumnTransformer(transformers=[('categorical',
                                                                               OrdinalEncoder(handle_unknown='use_encoded_value',
                                                                                              unknown_value=-1),
                                                                               ['Sex',
                                                                                'Embarked']),
                                                                              ('numerical',
                                                                               StandardScaler(),
                                                                               ['PassengerId',
                                                                                'Pclass',
                     

In [267]:
#model.fit(data, target)

In [268]:
df = pd.read_csv('test.csv')

#prediction = model.predict(df)
prediction = model_random_search.predict(df)

In [269]:
result = pd.DataFrame()
result['PassengerId'] = df['PassengerId']
result['Survived'] = prediction

result.to_csv('prediction.csv', index=False)