# Objetivo del notebook

Este notebook tiene como finalidad principal el procesamiento conjunto de datos del que se dispone para llevar a cabo el desarrollo de un modelo capaz de precedir la supervivencia de los pasajeros que navegaron en el Titanic, en funcion de diferentes caracteristicas que evaluaremos a continuacion.

# Importar las librerias necesarias

In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler

In [2]:
# Instancio un objeto DataFrame con el conjunto de datos a utilizar
dataset_train__route = "../data/raw/train.csv"
dataset_test__route = "../data/raw/test.csv"

train_df = pd.read_csv(dataset_train__route)
test_df = pd.read_csv(dataset_test__route)

In [3]:
# Muestro los 5 primeros ejemplos del dataset de entrenamiento
train_df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Muestro los 5 primeros ejemplos del dataset de testing
test_df.head(5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


## PREPROCESAMIENTO -----------------------------

## Seleccion de columnas --------------------------------
 
### Dataset entrenamiento

In [5]:
train_df.drop(columns = ['PassengerId', 'Name', 'Ticket'], inplace = True)
test_df.drop(columns = ['PassengerId', 'Name', 'Ticket'], inplace = True)

### Subset de entrenamiento ===================================

## Codificacion OneHot ----------------------------

Lo primero que voy a hacer es procesar algunas de las columnas, que contienen valores categoricos,  a fin de poder utilizar dichos valores en el desarrollo del modelo.

Las variables para las que es necesario aplicar una codificacion OneHot son:

* Pclass

* Sex

* Embarked

### Codificacion OneHot Pclass

In [6]:
valores_posibles_Pclass = train_df['Pclass'].unique()
valores_posibles_Pclass

array([3, 1, 2], dtype=int64)

In [7]:
# Genero un dataframe Dummy con los registros de dicha columna
dummy_pclass_df = pd.get_dummies(train_df['Pclass']).astype(int)

# Elimino una de las columnas del dataframe (en codificacion OneHot, una columna siempre se va a poder predecir con el valor de las otras) para evitar sufrir de multicolinealidad.
dummy_pclass_df.drop(columns = [3], inplace = True)

# Renombro las columnas faltantes
dummy_pclass_df.rename(columns = {1: 'Pclass__1', 2: 'Pclass__2'}, inplace = True)

# Elimino la columna original en el dataframe de entrenamiento
train_df.drop(columns = ['Pclass'], inplace = True)

# Concateno ambos dataframes
train_df = pd.concat([train_df, dummy_pclass_df], axis = 1)

# Muestro los 5 primeros registros del DataFrame
train_df

Unnamed: 0,Survived,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Pclass__1,Pclass__2
0,0,male,22.0,1,0,7.2500,,S,0,0
1,1,female,38.0,1,0,71.2833,C85,C,1,0
2,1,female,26.0,0,0,7.9250,,S,0,0
3,1,female,35.0,1,0,53.1000,C123,S,1,0
4,0,male,35.0,0,0,8.0500,,S,0,0
...,...,...,...,...,...,...,...,...,...,...
886,0,male,27.0,0,0,13.0000,,S,0,1
887,1,female,19.0,0,0,30.0000,B42,S,1,0
888,0,female,,1,2,23.4500,,S,0,0
889,1,male,26.0,0,0,30.0000,C148,C,1,0


### Codificacion OneHot Sex

In [8]:
valores_posibles_Pclass = train_df['Sex'].unique()
valores_posibles_Pclass

array(['male', 'female'], dtype=object)

In [9]:
# Genero un dataframe Dummy con los registros de dicha columna
dummy_sex_df = pd.get_dummies(train_df['Sex']).astype(int)

# Elimino una de las columnas del dataframe (en codificacion OneHot, una columna siempre se va a poder predecir con el valor de las otras) para evitar sufrir de multicolinealidad.
dummy_sex_df.drop(columns = ['female'], inplace = True)

# Renombro las columnas faltantes
dummy_sex_df.rename(columns = {'male': 'Sex__male'}, inplace = True)

# Elimino la columna original en el dataframe de entrenamiento
train_df.drop(columns = ['Sex'], inplace = True)

# Concateno ambos dataframes
train_df = pd.concat([train_df, dummy_sex_df], axis = 1)

# Muestro los 5 primeros registros del DataFrame
train_df

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,Cabin,Embarked,Pclass__1,Pclass__2,Sex__male
0,0,22.0,1,0,7.2500,,S,0,0,1
1,1,38.0,1,0,71.2833,C85,C,1,0,0
2,1,26.0,0,0,7.9250,,S,0,0,0
3,1,35.0,1,0,53.1000,C123,S,1,0,0
4,0,35.0,0,0,8.0500,,S,0,0,1
...,...,...,...,...,...,...,...,...,...,...
886,0,27.0,0,0,13.0000,,S,0,1,1
887,1,19.0,0,0,30.0000,B42,S,1,0,0
888,0,,1,2,23.4500,,S,0,0,0
889,1,26.0,0,0,30.0000,C148,C,1,0,1


### Codificacion OneHot Embarked

In [10]:
valores_posibles_Pclass = train_df['Embarked'].unique()
valores_posibles_Pclass

array(['S', 'C', 'Q', nan], dtype=object)

In [11]:
# Genero un dataframe Dummy con los registros de dicha columna
dummy_embarked_df = pd.get_dummies(train_df['Embarked']).astype(int)

# Elimino una de las columnas del dataframe (en codificacion OneHot, una columna siempre se va a poder predecir con el valor de las otras) para evitar sufrir de multicolinealidad.
dummy_embarked_df.drop(columns = ['Q'], inplace = True)

# Renombro las columnas faltantes
dummy_embarked_df.rename(columns = {'S': 'Embarked__S', 'C': 'Embarked__C'}, inplace = True)

# Elimino la columna original en el dataframe de entrenamiento
train_df.drop(columns = ['Embarked'], inplace = True)

# Concateno ambos dataframes
train_df = pd.concat([train_df, dummy_embarked_df], axis = 1)

# Muestro los 5 primeros registros del DataFrame
train_df

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,Cabin,Pclass__1,Pclass__2,Sex__male,Embarked__C,Embarked__S
0,0,22.0,1,0,7.2500,,0,0,1,0,1
1,1,38.0,1,0,71.2833,C85,1,0,0,1,0
2,1,26.0,0,0,7.9250,,0,0,0,0,1
3,1,35.0,1,0,53.1000,C123,1,0,0,0,1
4,0,35.0,0,0,8.0500,,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...
886,0,27.0,0,0,13.0000,,0,1,1,0,1
887,1,19.0,0,0,30.0000,B42,1,0,0,0,1
888,0,,1,2,23.4500,,0,0,0,0,1
889,1,26.0,0,0,30.0000,C148,1,0,1,1,0


## Normalizacion de los datos

In [12]:
scaler = MinMaxScaler()

X = train_df.drop(columns = ['Survived', 'Cabin'])
column_names = X.columns
column_cabin = train_df['Cabin']
y = train_df['Survived']

X = scaler.fit_transform(X)

train_df = pd.DataFrame(data = X,
                        columns = column_names)
train_df['Cabin'] = column_cabin
train_df['Survived'] = y

# Muestro los 5 primeros registros del dataframe normalizado
train_df

Unnamed: 0,Age,SibSp,Parch,Fare,Pclass__1,Pclass__2,Sex__male,Embarked__C,Embarked__S,Cabin,Survived
0,0.271174,0.125,0.000000,0.014151,0.0,0.0,1.0,0.0,1.0,,0
1,0.472229,0.125,0.000000,0.139136,1.0,0.0,0.0,1.0,0.0,C85,1
2,0.321438,0.000,0.000000,0.015469,0.0,0.0,0.0,0.0,1.0,,1
3,0.434531,0.125,0.000000,0.103644,1.0,0.0,0.0,0.0,1.0,C123,1
4,0.434531,0.000,0.000000,0.015713,0.0,0.0,1.0,0.0,1.0,,0
...,...,...,...,...,...,...,...,...,...,...,...
886,0.334004,0.000,0.000000,0.025374,0.0,1.0,1.0,0.0,1.0,,0
887,0.233476,0.000,0.000000,0.058556,1.0,0.0,0.0,0.0,1.0,B42,1
888,,0.125,0.333333,0.045771,0.0,0.0,0.0,0.0,1.0,,0
889,0.321438,0.000,0.000000,0.058556,1.0,0.0,1.0,1.0,0.0,C148,1


### Columna Cabin

Esta columna, como se dijo en otro notebook, es muy probable que tenga una alta relacion con respecto a la probabilidad de supervivencia de un pasajero dado.

Para procesarla, voy a comenzar mostrando los valores registrados en ella (valores diferentes al resto, unicos)

In [13]:
# Genero una copia del dataframe original

train_df__2 = train_df.copy()

In [14]:
print(train_df__2['Cabin'].unique())

[nan 'C85' 'C123' 'E46' 'G6' 'C103' 'D56' 'A6' 'C23 C25 C27' 'B78' 'D33'
 'B30' 'C52' 'B28' 'C83' 'F33' 'F G73' 'E31' 'A5' 'D10 D12' 'D26' 'C110'
 'B58 B60' 'E101' 'F E69' 'D47' 'B86' 'F2' 'C2' 'E33' 'B19' 'A7' 'C49'
 'F4' 'A32' 'B4' 'B80' 'A31' 'D36' 'D15' 'C93' 'C78' 'D35' 'C87' 'B77'
 'E67' 'B94' 'C125' 'C99' 'C118' 'D7' 'A19' 'B49' 'D' 'C22 C26' 'C106'
 'C65' 'E36' 'C54' 'B57 B59 B63 B66' 'C7' 'E34' 'C32' 'B18' 'C124' 'C91'
 'E40' 'T' 'C128' 'D37' 'B35' 'E50' 'C82' 'B96 B98' 'E10' 'E44' 'A34'
 'C104' 'C111' 'C92' 'E38' 'D21' 'E12' 'E63' 'A14' 'B37' 'C30' 'D20' 'B79'
 'E25' 'D46' 'B73' 'C95' 'B38' 'B39' 'B22' 'C86' 'C70' 'A16' 'C101' 'C68'
 'A10' 'E68' 'B41' 'A20' 'D19' 'D50' 'D9' 'A23' 'B50' 'A26' 'D48' 'E58'
 'C126' 'B71' 'B51 B53 B55' 'D49' 'B5' 'B20' 'F G63' 'C62 C64' 'E24' 'C90'
 'C45' 'E8' 'B101' 'D45' 'C46' 'D30' 'E121' 'D11' 'E77' 'F38' 'B3' 'D6'
 'B82 B84' 'D17' 'A36' 'B102' 'B69' 'E49' 'C47' 'D28' 'E17' 'A24' 'C50'
 'B42' 'C148']


Todos los registros almacenan el identificador de la cubierta en la que se encuentra la habitacion. Tambien se puede apreciar, que en algunos casos, se tiene registradas dos habitaciones para un mismo registro, como en el caso 'B57 B59 B63 B66'. Cuando esto sucede, se debe a que dicho pasajero o grupo de pasajeros tenia acceso a todas las cubiertas mencionadas.

In [15]:
array_cabin = []
for registro in train_df__2['Cabin']:
    if pd.isnull(registro):
        array_cabin.append(np.nan)   # Lo registro dentro del array de cabinas, no lo elimino
    else:
        cubierta_registro = registro[0]
        if cubierta_registro == "A":
            array_cabin.append(1)
        elif cubierta_registro == "B":
            array_cabin.append(2)
        elif cubierta_registro == "C":
            array_cabin.append(3)
        elif cubierta_registro == "D":
            array_cabin.append(4)
        elif cubierta_registro == "E":
            array_cabin.append(5)
        elif cubierta_registro == "F":
            array_cabin.append(6)
        elif cubierta_registro == "G":
            array_cabin.append(7)
        elif cubierta_registro == "T":
            array_cabin.append(8)

In [16]:
array_cabin[:10]

[nan, 3, nan, 3, nan, nan, 5, nan, nan, nan]

In [17]:
train_df__2['Cabin'] = array_cabin

In [18]:
train_df__2

Unnamed: 0,Age,SibSp,Parch,Fare,Pclass__1,Pclass__2,Sex__male,Embarked__C,Embarked__S,Cabin,Survived
0,0.271174,0.125,0.000000,0.014151,0.0,0.0,1.0,0.0,1.0,,0
1,0.472229,0.125,0.000000,0.139136,1.0,0.0,0.0,1.0,0.0,3.0,1
2,0.321438,0.000,0.000000,0.015469,0.0,0.0,0.0,0.0,1.0,,1
3,0.434531,0.125,0.000000,0.103644,1.0,0.0,0.0,0.0,1.0,3.0,1
4,0.434531,0.000,0.000000,0.015713,0.0,0.0,1.0,0.0,1.0,,0
...,...,...,...,...,...,...,...,...,...,...,...
886,0.334004,0.000,0.000000,0.025374,0.0,1.0,1.0,0.0,1.0,,0
887,0.233476,0.000,0.000000,0.058556,1.0,0.0,0.0,0.0,1.0,2.0,1
888,,0.125,0.333333,0.045771,0.0,0.0,0.0,0.0,1.0,,0
889,0.321438,0.000,0.000000,0.058556,1.0,0.0,1.0,1.0,0.0,3.0,1


## Manejo de valores nulos ----------------------------

El segundo paso que voy a llevar a cabo en el preprocesamiento de mi conjunto de datos, es la imputacion de los valores nulos registrados en cada uno de los subsets de los que
dispongo.

En este paso, y para este dataset en concreto, voy a tratar de imputar los valores faltantes mediante un enfoque basado en KNN, puesto que es muy probable que cometamos un error al tratar de rellenar los valores nulos aplicando la media o mediana de las columnas. Esto se debe a que la probabilidad de supervivencia de un pasajero viene dada por diferentes variables, y no es correcto utilizar este tipo de imputacion en estos casos.

Para ser mas precisos, voy a imputar los datos faltantes aplicando un enfoque basado en KNN.

In [19]:
from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors= 5, weights='distance')

# Seleccionar columnas para la imputación, incluyendo características demográficas, de viaje, y las columnas de cubiertas
columns_for_imputation = train_df__2.columns.tolist()  # Ajusta esta lista según sea necesario

# Aplicar la imputación
imputed_data = imputer.fit_transform(train_df__2[columns_for_imputation])

# Convertir los datos imputados de nuevo a un DataFrame
imputed_df = pd.DataFrame(imputed_data, columns=columns_for_imputation)

# Si es necesario, redondear los valores de las columnas multihot de cubiertas después de la imputación
# Esto depende de si decides tratar estas columnas estrictamente como categóricas binarias post-imputación

# Actualizar el DataFrame original con los valores imputados
train_df__2[columns_for_imputation] = imputed_df

In [20]:
train_df__2

Unnamed: 0,Age,SibSp,Parch,Fare,Pclass__1,Pclass__2,Sex__male,Embarked__C,Embarked__S,Cabin,Survived
0,0.271174,0.125,0.000000,0.014151,0.0,0.0,1.0,0.0,1.0,5.714830,0.0
1,0.472229,0.125,0.000000,0.139136,1.0,0.0,0.0,1.0,0.0,3.000000,1.0
2,0.321438,0.000,0.000000,0.015469,0.0,0.0,0.0,0.0,1.0,5.920444,1.0
3,0.434531,0.125,0.000000,0.103644,1.0,0.0,0.0,0.0,1.0,3.000000,1.0
4,0.434531,0.000,0.000000,0.015713,0.0,0.0,1.0,0.0,1.0,5.809954,0.0
...,...,...,...,...,...,...,...,...,...,...,...
886,0.334004,0.000,0.000000,0.025374,0.0,1.0,1.0,0.0,1.0,5.706603,0.0
887,0.233476,0.000,0.000000,0.058556,1.0,0.0,0.0,0.0,1.0,2.000000,1.0
888,0.268979,0.125,0.333333,0.045771,0.0,0.0,0.0,0.0,1.0,6.856632,0.0
889,0.321438,0.000,0.000000,0.058556,1.0,0.0,1.0,1.0,0.0,3.000000,1.0


In [21]:
# Redondeo a 1 solo digito los valores de la columna 'Cabin'
train_df__2['Cabin'] = round(train_df__2['Cabin'])
train_df__2

Unnamed: 0,Age,SibSp,Parch,Fare,Pclass__1,Pclass__2,Sex__male,Embarked__C,Embarked__S,Cabin,Survived
0,0.271174,0.125,0.000000,0.014151,0.0,0.0,1.0,0.0,1.0,6.0,0.0
1,0.472229,0.125,0.000000,0.139136,1.0,0.0,0.0,1.0,0.0,3.0,1.0
2,0.321438,0.000,0.000000,0.015469,0.0,0.0,0.0,0.0,1.0,6.0,1.0
3,0.434531,0.125,0.000000,0.103644,1.0,0.0,0.0,0.0,1.0,3.0,1.0
4,0.434531,0.000,0.000000,0.015713,0.0,0.0,1.0,0.0,1.0,6.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
886,0.334004,0.000,0.000000,0.025374,0.0,1.0,1.0,0.0,1.0,6.0,0.0
887,0.233476,0.000,0.000000,0.058556,1.0,0.0,0.0,0.0,1.0,2.0,1.0
888,0.268979,0.125,0.333333,0.045771,0.0,0.0,0.0,0.0,1.0,7.0,0.0
889,0.321438,0.000,0.000000,0.058556,1.0,0.0,1.0,1.0,0.0,3.0,1.0


## Termino de procesar la columna 'Cabin'...

In [22]:
array_cabin = []
for cubierta_registro in train_df__2['Cabin']:
    if cubierta_registro == 1.0:
        array_cabin.append("A")
    elif cubierta_registro == 2.0:
        array_cabin.append("B")
    elif cubierta_registro == 3.0:
        array_cabin.append("C")
    elif cubierta_registro == 4.0:
        array_cabin.append("D")
    elif cubierta_registro == 5.0:
        array_cabin.append("E")
    elif cubierta_registro == 6.0:
        array_cabin.append("F")
    elif cubierta_registro == 7.0:
        array_cabin.append("G")
    elif cubierta_registro == 8.0:
        array_cabin.append("T")

In [23]:
# Muestro el array generado
array_cabin[:10]

['F', 'C', 'F', 'C', 'F', 'F', 'E', 'F', 'F', 'D']

In [24]:
# Convertimos array_cabin a una Serie para facilitar su manejo
array_cabin_serie = pd.Series(array_cabin)

# Inicializamos las columnas de cubiertas con ceros
for letra_cubierta in 'ABCDEFGT':  # Asumiendo que estas son todas las cubiertas posibles
    train_df["Cabin__{}".format(letra_cubierta)] = 0

# Actualizamos las columnas basándonos en los valores de array_cabin
for i, cubiertas in enumerate(array_cabin_serie):
    if not pd.isnull(cubiertas):
        if isinstance(cubiertas, set):
            # Si el pasajero tiene acceso a múltiples cubiertas
            for cubierta in cubiertas:
                train_df.at[i, "Cabin__{}".format(cubierta)] = 1  # Corrección aquí
        else:
            # Si el pasajero tiene acceso a una única cubierta
            train_df.at[i, "Cabin__{}".format(cubiertas)] = 1  # Corrección aquí


In [25]:
# Elimino la columna original de 'Cabin'
train_df.drop(columns = ['Cabin'], inplace = True)

In [26]:
train_df['Age'] = train_df__2['Age']
train_df

Unnamed: 0,Age,SibSp,Parch,Fare,Pclass__1,Pclass__2,Sex__male,Embarked__C,Embarked__S,Survived,Cabin__A,Cabin__B,Cabin__C,Cabin__D,Cabin__E,Cabin__F,Cabin__G,Cabin__T
0,0.271174,0.125,0.000000,0.014151,0.0,0.0,1.0,0.0,1.0,0,0,0,0,0,0,1,0,0
1,0.472229,0.125,0.000000,0.139136,1.0,0.0,0.0,1.0,0.0,1,0,0,1,0,0,0,0,0
2,0.321438,0.000,0.000000,0.015469,0.0,0.0,0.0,0.0,1.0,1,0,0,0,0,0,1,0,0
3,0.434531,0.125,0.000000,0.103644,1.0,0.0,0.0,0.0,1.0,1,0,0,1,0,0,0,0,0
4,0.434531,0.000,0.000000,0.015713,0.0,0.0,1.0,0.0,1.0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0.334004,0.000,0.000000,0.025374,0.0,1.0,1.0,0.0,1.0,0,0,0,0,0,0,1,0,0
887,0.233476,0.000,0.000000,0.058556,1.0,0.0,0.0,0.0,1.0,1,0,1,0,0,0,0,0,0
888,0.268979,0.125,0.333333,0.045771,0.0,0.0,0.0,0.0,1.0,0,0,0,0,0,0,0,1,0
889,0.321438,0.000,0.000000,0.058556,1.0,0.0,1.0,1.0,0.0,1,0,0,1,0,0,0,0,0


### Subset de testing ===================================

## Codificacion OneHot ----------------------------

### Codificacion OneHot Pclass

In [27]:
valores_posibles_Pclass = test_df['Pclass'].unique()
valores_posibles_Pclass

array([3, 2, 1], dtype=int64)

In [28]:
# Genero un dataframe Dummy con los registros de dicha columna
dummy_pclass_df = pd.get_dummies(test_df['Pclass']).astype(int)

# Elimino una de las columnas del dataframe (en codificacion OneHot, una columna siempre se va a poder predecir con el valor de las otras) para evitar sufrir de multicolinealidad.
dummy_pclass_df.drop(columns = [3], inplace = True)

# Renombro las columnas faltantes
dummy_pclass_df.rename(columns = {1: 'Pclass__1', 2: 'Pclass__2'}, inplace = True)

# Elimino la columna original en el dataframe de entrenamiento
test_df.drop(columns = ['Pclass'], inplace = True)

# Concateno ambos dataframes
test_df = pd.concat([test_df, dummy_pclass_df], axis = 1)

# Muestro los 5 primeros registros del DataFrame
test_df

Unnamed: 0,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Pclass__1,Pclass__2
0,male,34.5,0,0,7.8292,,Q,0,0
1,female,47.0,1,0,7.0000,,S,0,0
2,male,62.0,0,0,9.6875,,Q,0,1
3,male,27.0,0,0,8.6625,,S,0,0
4,female,22.0,1,1,12.2875,,S,0,0
...,...,...,...,...,...,...,...,...,...
413,male,,0,0,8.0500,,S,0,0
414,female,39.0,0,0,108.9000,C105,C,1,0
415,male,38.5,0,0,7.2500,,S,0,0
416,male,,0,0,8.0500,,S,0,0


### Codificacion OneHot Sex

In [29]:
valores_posibles_Pclass = test_df['Sex'].unique()
valores_posibles_Pclass

array(['male', 'female'], dtype=object)

In [30]:
# Genero un dataframe Dummy con los registros de dicha columna
dummy_sex_df = pd.get_dummies(test_df['Sex']).astype(int)

# Elimino una de las columnas del dataframe (en codificacion OneHot, una columna siempre se va a poder predecir con el valor de las otras) para evitar sufrir de multicolinealidad.
dummy_sex_df.drop(columns = ['female'], inplace = True)

# Renombro las columnas faltantes
dummy_sex_df.rename(columns = {'male': 'Sex__male'}, inplace = True)

# Elimino la columna original en el dataframe de entrenamiento
test_df.drop(columns = ['Sex'], inplace = True)

# Concateno ambos dataframes
test_df = pd.concat([test_df, dummy_sex_df], axis = 1)

# Muestro los 5 primeros registros del DataFrame
test_df

Unnamed: 0,Age,SibSp,Parch,Fare,Cabin,Embarked,Pclass__1,Pclass__2,Sex__male
0,34.5,0,0,7.8292,,Q,0,0,1
1,47.0,1,0,7.0000,,S,0,0,0
2,62.0,0,0,9.6875,,Q,0,1,1
3,27.0,0,0,8.6625,,S,0,0,1
4,22.0,1,1,12.2875,,S,0,0,0
...,...,...,...,...,...,...,...,...,...
413,,0,0,8.0500,,S,0,0,1
414,39.0,0,0,108.9000,C105,C,1,0,0
415,38.5,0,0,7.2500,,S,0,0,1
416,,0,0,8.0500,,S,0,0,1


### Codificacion OneHot Embarked

In [31]:
valores_posibles_Pclass = test_df['Embarked'].unique()
valores_posibles_Pclass

array(['Q', 'S', 'C'], dtype=object)

In [32]:
# Genero un dataframe Dummy con los registros de dicha columna
dummy_embarked_df = pd.get_dummies(test_df['Embarked']).astype(int)

# Elimino una de las columnas del dataframe (en codificacion OneHot, una columna siempre se va a poder predecir con el valor de las otras) para evitar sufrir de multicolinealidad.
dummy_embarked_df.drop(columns = ['Q'], inplace = True)

# Renombro las columnas faltantes
dummy_embarked_df.rename(columns = {'S': 'Embarked__S', 'C': 'Embarked__C'}, inplace = True)

# Elimino la columna original en el dataframe de entrenamiento
test_df.drop(columns = ['Embarked'], inplace = True)

# Concateno ambos dataframes
test_df = pd.concat([test_df, dummy_embarked_df], axis = 1)

# Muestro los 5 primeros registros del DataFrame
test_df

Unnamed: 0,Age,SibSp,Parch,Fare,Cabin,Pclass__1,Pclass__2,Sex__male,Embarked__C,Embarked__S
0,34.5,0,0,7.8292,,0,0,1,0,0
1,47.0,1,0,7.0000,,0,0,0,0,1
2,62.0,0,0,9.6875,,0,1,1,0,0
3,27.0,0,0,8.6625,,0,0,1,0,1
4,22.0,1,1,12.2875,,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...
413,,0,0,8.0500,,0,0,1,0,1
414,39.0,0,0,108.9000,C105,1,0,0,1,0
415,38.5,0,0,7.2500,,0,0,1,0,1
416,,0,0,8.0500,,0,0,1,0,1


## Normalizacion de los datos

In [33]:
scaler = MinMaxScaler()

X = test_df.drop(columns = ['Cabin'])
column_names = X.columns
column_cabin = test_df['Cabin']

X = scaler.fit_transform(X)

test_df = pd.DataFrame(data = X,
                       columns = column_names)
test_df['Cabin'] = column_cabin

# Muestro los 5 primeros registros del dataframe normalizado
test_df.head(5)

Unnamed: 0,Age,SibSp,Parch,Fare,Pclass__1,Pclass__2,Sex__male,Embarked__C,Embarked__S,Cabin
0,0.452723,0.0,0.0,0.015282,0.0,0.0,1.0,0.0,0.0,
1,0.617566,0.125,0.0,0.013663,0.0,0.0,0.0,0.0,1.0,
2,0.815377,0.0,0.0,0.018909,0.0,1.0,1.0,0.0,0.0,
3,0.353818,0.0,0.0,0.016908,0.0,0.0,1.0,0.0,1.0,
4,0.287881,0.125,0.111111,0.023984,0.0,0.0,0.0,0.0,1.0,


### Columna Cabin


In [34]:
# Genero una copia del dataframe original

test_df__2 = test_df.copy()

In [35]:
array_cabin = []
for registro in test_df__2['Cabin']:
    if pd.isnull(registro):
        array_cabin.append(np.nan)   # Lo registro dentro del array de cabinas, no lo elimino
    else:
        cubierta_registro = registro[0]
        if cubierta_registro == "A":
            array_cabin.append(1)
        elif cubierta_registro == "B":
            array_cabin.append(2)
        elif cubierta_registro == "C":
            array_cabin.append(3)
        elif cubierta_registro == "D":
            array_cabin.append(4)
        elif cubierta_registro == "E":
            array_cabin.append(5)
        elif cubierta_registro == "F":
            array_cabin.append(6)
        elif cubierta_registro == "G":
            array_cabin.append(7)
        elif cubierta_registro == "T":
            array_cabin.append(8)

In [36]:
test_df__2['Cabin'] = array_cabin

## Manejo de valores nulos ----------------------------

In [37]:
imputer = KNNImputer(n_neighbors= 5, weights='distance')

# Seleccionar columnas para la imputación, incluyendo características demográficas, de viaje, y las columnas de cubiertas
columns_for_imputation = test_df__2.columns.tolist()  # Ajusta esta lista según sea necesario

# Aplicar la imputación
imputed_data = imputer.fit_transform(test_df__2[columns_for_imputation])

# Convertir los datos imputados de nuevo a un DataFrame
imputed_df = pd.DataFrame(imputed_data, columns=columns_for_imputation)

# Si es necesario, redondear los valores de las columnas multihot de cubiertas después de la imputación
# Esto depende de si decides tratar estas columnas estrictamente como categóricas binarias post-imputación

# Actualizar el DataFrame original con los valores imputados
test_df__2[columns_for_imputation] = imputed_df

In [38]:
# Redondeo a 1 solo digito los valores de la columna 'Cabin'
test_df__2['Cabin'] = round(test_df__2['Cabin'])
test_df__2

Unnamed: 0,Age,SibSp,Parch,Fare,Pclass__1,Pclass__2,Sex__male,Embarked__C,Embarked__S,Cabin
0,0.452723,0.000,0.000000,0.015282,0.0,0.0,1.0,0.0,0.0,5.0
1,0.617566,0.125,0.000000,0.013663,0.0,0.0,0.0,0.0,1.0,5.0
2,0.815377,0.000,0.000000,0.018909,0.0,1.0,1.0,0.0,0.0,5.0
3,0.353818,0.000,0.000000,0.016908,0.0,0.0,1.0,0.0,1.0,6.0
4,0.287881,0.125,0.111111,0.023984,0.0,0.0,0.0,0.0,1.0,5.0
...,...,...,...,...,...,...,...,...,...,...
413,0.614269,0.000,0.000000,0.015713,0.0,0.0,1.0,0.0,1.0,6.0
414,0.512066,0.000,0.000000,0.212559,1.0,0.0,0.0,1.0,0.0,3.0
415,0.505473,0.000,0.000000,0.014151,0.0,0.0,1.0,0.0,1.0,5.0
416,0.614269,0.000,0.000000,0.015713,0.0,0.0,1.0,0.0,1.0,6.0


## Termino de procesar la columna 'Cabin'...

In [39]:
array_cabin = []
for cubierta_registro in test_df__2['Cabin']:
    if cubierta_registro == 1.0:
        array_cabin.append("A")
    elif cubierta_registro == 2.0:
        array_cabin.append("B")
    elif cubierta_registro == 3.0:
        array_cabin.append("C")
    elif cubierta_registro == 4.0:
        array_cabin.append("D")
    elif cubierta_registro == 5.0:
        array_cabin.append("E")
    elif cubierta_registro == 6.0:
        array_cabin.append("F")
    elif cubierta_registro == 7.0:
        array_cabin.append("G")
    elif cubierta_registro == 8.0:
        array_cabin.append("T")

In [40]:
# Convertimos array_cabin a una Serie para facilitar su manejo
array_cabin_serie = pd.Series(array_cabin)

# Inicializamos las columnas de cubiertas con ceros
for letra_cubierta in 'ABCDEFGT':  # Asumiendo que estas son todas las cubiertas posibles
    test_df["Cabin__{}".format(letra_cubierta)] = 0

# Actualizamos las columnas basándonos en los valores de array_cabin
for i, cubiertas in enumerate(array_cabin_serie):
    if not pd.isnull(cubiertas):
        if isinstance(cubiertas, set):
            # Si el pasajero tiene acceso a múltiples cubiertas
            for cubierta in cubiertas:
                test_df.at[i, "Cabin__{}".format(cubierta)] = 1  
        else:
            # Si el pasajero tiene acceso a una única cubierta
            test_df.at[i, "Cabin__{}".format(cubiertas)] = 1 

In [41]:
# Elimino la columna original de 'Cabin'
test_df.drop(columns = ['Cabin'], inplace = True)

test_df['Age'] = test_df__2['Age']
test_df['Fare'] = test_df__2['Fare']
test_df

Unnamed: 0,Age,SibSp,Parch,Fare,Pclass__1,Pclass__2,Sex__male,Embarked__C,Embarked__S,Cabin__A,Cabin__B,Cabin__C,Cabin__D,Cabin__E,Cabin__F,Cabin__G,Cabin__T
0,0.452723,0.000,0.000000,0.015282,0.0,0.0,1.0,0.0,0.0,0,0,0,0,1,0,0,0
1,0.617566,0.125,0.000000,0.013663,0.0,0.0,0.0,0.0,1.0,0,0,0,0,1,0,0,0
2,0.815377,0.000,0.000000,0.018909,0.0,1.0,1.0,0.0,0.0,0,0,0,0,1,0,0,0
3,0.353818,0.000,0.000000,0.016908,0.0,0.0,1.0,0.0,1.0,0,0,0,0,0,1,0,0
4,0.287881,0.125,0.111111,0.023984,0.0,0.0,0.0,0.0,1.0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,0.614269,0.000,0.000000,0.015713,0.0,0.0,1.0,0.0,1.0,0,0,0,0,0,1,0,0
414,0.512066,0.000,0.000000,0.212559,1.0,0.0,0.0,1.0,0.0,0,0,1,0,0,0,0,0
415,0.505473,0.000,0.000000,0.014151,0.0,0.0,1.0,0.0,1.0,0,0,0,0,1,0,0,0
416,0.614269,0.000,0.000000,0.015713,0.0,0.0,1.0,0.0,1.0,0,0,0,0,0,1,0,0


In [42]:
# Nulos del conjunto de entrenamiento
train_df.isna().sum()

Age            0
SibSp          0
Parch          0
Fare           0
Pclass__1      0
Pclass__2      0
Sex__male      0
Embarked__C    0
Embarked__S    0
Survived       0
Cabin__A       0
Cabin__B       0
Cabin__C       0
Cabin__D       0
Cabin__E       0
Cabin__F       0
Cabin__G       0
Cabin__T       0
dtype: int64

In [43]:
# Nulos del conjunto de testing
test_df.isna().sum()

Age            0
SibSp          0
Parch          0
Fare           0
Pclass__1      0
Pclass__2      0
Sex__male      0
Embarked__C    0
Embarked__S    0
Cabin__A       0
Cabin__B       0
Cabin__C       0
Cabin__D       0
Cabin__E       0
Cabin__F       0
Cabin__G       0
Cabin__T       0
dtype: int64

## Guardo los conjuntos de datos 

In [44]:
# Una vez procesados los conjuntos de entrenamiento y testing, proceso a almacenarlos como archivos .csv
import os

datasets__route = "../data/processed/"

if not os.path.exists(datasets__route):
    os.mkdir(datasets__route)
    
# Guardo el conjunto de entrenamiento
train_df.to_csv(os.path.join(datasets__route, "train.csv"), index = False)
print('Conjunto de entrenamiento guardado con exito.')

# Guardo el conjunto de testing
test_df.to_csv(os.path.join(datasets__route, "test.csv"), index = False)
print('Conjunto de testing guardado con exito.')

Conjunto de entrenamiento guardado con exito.
Conjunto de testing guardado con exito.
