In [86]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import warnings

# Ignore every warning of category RuntimeWarning from this point on.
# NOTE(review): presumably meant to hide the NumPy warning raised when a median
# is taken over an all-NaN group during the Age imputation - confirm, because a
# blanket filter also hides any other RuntimeWarning.
warnings.filterwarnings("ignore", category=RuntimeWarning)

**Predecir la supervivencia o no de los pasajeros**

* Tipo de predicción:
    * Clasificación binaria
* Métrica principal:
    * Exactitud (Accuracy)


**Features**
| Variable  | Definición                                            | Key                                      |
|-----------|--------------------------------------------------------|------------------------------------------|
| survival  | Supervivencia                                          | 0 = No, 1 = Sí                            |
| pclass    | Clase del boleto                                       | 1 = 1ra, 2 = 2da, 3 = 3ra                 |
| sex       | Sexo                                                   | (No aplica: valores tipo texto: male/female) |
| age       | Edad en años                                           | (Valores numéricos)                      |
| sibsp     | Nº de hermanos/esposos a bordo                         | (Valores numéricos)                      |
| parch     | Nº de padres/hijos a bordo                             | (Valores numéricos)                      |
| ticket    | Número del boleto                                      | (Texto o numérico)                       |
| fare      | Tarifa del pasajero                                    | (Valores numéricos)                      |
| cabin     | Número de camarote                                     | (Texto: puede estar vacío)              |
| embarked  | Puerto de embarque                                     | C = Cherbourg, Q = Queenstown, S = Southampton |

  

| Variable | Definición                                                                 | Valores / Claves                                  | Observaciones                                                                 |
|----------|-----------------------------------------------------------------------------|---------------------------------------------------|--------------------------------------------------------------------------------|
| pclass   | Clase del boleto como aproximación al estatus socioeconómico (SES)         | 1 = Superior, 2 = Medio, 3 = Inferior              | Útil como proxy de riqueza y acceso                                           |
| age      | Edad del pasajero                                                          | Fraccionaria si < 1; si es estimada, tiene la forma xx.5 | La edad puede no ser exacta; niños < 1 año representados con decimales        |
| sibsp    | Nº de hermanos/esposos a bordo                                              | Numérico                                           | Sibling = hermano/a o hermanastro/a, Spouse = esposo/a; no incluye prometidos |
| parch    | Nº de padres/hijos a bordo                                                  | Numérico                                           | Parent = madre/padre, Child = hijo/a o hijastro/a; niñeras no se cuentan      |

In [87]:
# Display settings: never truncate columns or rows when a DataFrame is shown
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# Plot settings: seaborn grid style first, then the matplotlib style sheet
# NOTE(review): the style sheet is applied last, so it presumably overrides the
# overlapping 'whitegrid' settings - confirm which look is intended.
sns.set_style('whitegrid')
plt.style.use('seaborn-v0_8-darkgrid')

In [88]:
# Read the raw train and test CSV files
train_df, test_df = (
    pd.read_csv('../data/raw/train.csv'),
    pd.read_csv('../data/raw/test.csv'),
)
print('Data loaded')

Data loaded


**Variable: Título (Ordinal Encoding)**

La variable Título se codificará usando Ordinal Encoding para asignar valores numéricos que reflejen una jerarquía de estatus.

Prioridad 3 -> (Estatus Alto): Countess, Lady, Jonkheer

Prioridad 2 -> (Mujeres y Niños): Miss, Mrs., Mme, Mlle, Ms., Master, Dona

Prioridad 1 -> (Títulos Profesionales/Militares): Dr., Rev., Col., Capt., Sir, Major, Don

Prioridad 0 -> (Hombre Joven/Señor): Mr. (también cualquier título que no aparezca en las listas anteriores)

In [89]:
# Keep the target aside, then stack train (without the target) on top of test
# so the following cells transform both sets at once
df_train_target = train_df['Survived']
df_train_test = pd.concat(
    [train_df.drop(columns='Survived'), test_df],
    ignore_index=True,
)
df_train_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Pclass       1309 non-null   int64  
 2   Name         1309 non-null   object 
 3   Sex          1309 non-null   object 
 4   Age          1046 non-null   float64
 5   SibSp        1309 non-null   int64  
 6   Parch        1309 non-null   int64  
 7   Ticket       1309 non-null   object 
 8   Fare         1308 non-null   float64
 9   Cabin        295 non-null    object 
 10  Embarked     1307 non-null   object 
dtypes: float64(2), int64(4), object(5)
memory usage: 112.6+ KB


In [90]:
# Missing embarkation ports are imputed with the mode (most frequent port)
df_train_test['Embarked'] = df_train_test['Embarked'].fillna(df_train_test['Embarked'].mode()[0])
# Missing fares are imputed with the median fare
df_train_test['Fare'] = df_train_test['Fare'].fillna(df_train_test['Fare'].median())
# Cabin: label missing values as "Unknown", then keep only the first character
# of the cabin string as a new "Deck" feature ("Unknown" therefore becomes "U")
df_train_test['Cabin'] = df_train_test['Cabin'].fillna("Unknown")
df_train_test["Deck"] = df_train_test['Cabin'].str[0]
# The raw cabin string is no longer needed once the deck letter is extracted
df_train_test = df_train_test.drop('Cabin', axis=1)
print('Embarked feature imputed')
print('Fare feature imputed')
print('Cabin feature imputed')

Embarked feature imputed
Fare feature imputed
Cabin feature imputed


In [91]:
def filter_title(name = ""):
    """Return the title embedded in a passenger name, or "" if there is none.

    The title is the first run of ASCII letters that is preceded by a space
    and immediately followed by a period, e.g. "Braund, Mr. Owen Harris"
    gives "Mr".
    """
    found = re.search(' ([A-Za-z]+)\\.', name)
    return found.group(1) if found else ""

# Collect the distinct titles extracted from the passenger names
result = df_train_test['Name'].map(filter_title).unique()
print('Titulos dentro del dataFrame: ')
print(result)
print(len(result))

Titulos dentro del dataFrame: 
['Mr' 'Mrs' 'Miss' 'Master' 'Don' 'Rev' 'Dr' 'Mme' 'Ms' 'Major' 'Lady'
 'Sir' 'Mlle' 'Col' 'Capt' 'Countess' 'Jonkheer' 'Dona']
18


In [92]:
# Ordinal encoder used for the "Title" feature
def generate_title_ordenc(title):
    """Return the ordinal status rank (0-3) assigned to a passenger title.

    3 -> Countess, Lady, Jonkheer
    2 -> Miss, Mrs, Mme, Mlle, Ms, Master, Dona
    1 -> Dr, Rev, Col, Capt, Sir, Major, Don
    0 -> any other value (e.g. "Mr")
    """
    # Checked from the highest rank down; the first group containing the title wins
    ranked_groups = (
        (3, ('Countess', 'Lady', 'Jonkheer')),
        (2, ('Miss', 'Mrs', 'Mme', 'Mlle', 'Ms', 'Master', 'Dona')),
        (1, ('Dr', 'Rev', 'Col', 'Capt', 'Sir', 'Major', 'Don')),
    )
    for rank, titles in ranked_groups:
        if title in titles:
            return rank
    return 0

# Store the extracted title as a feature, then discard the raw name column
df_train_test['Title'] = df_train_test['Name'].map(filter_title)
df_train_test = df_train_test.drop(columns='Name')
print('Title Generated')

Title Generated


In [93]:
# Pclass is ordinal-encoded as well, giving the highest value to first class:
# 1st class -> 2, 2nd class -> 1, 3rd class -> 0
def generate_class_ordenc(class_):
    """Return the ordinal code for a ticket class: 1 -> 2, 2 -> 1, anything else -> 0."""
    if class_ == 1:
        return 2
    return 1 if class_ == 2 else 0
# Overwrite Pclass with its ordinal code. Re-running this cell would encode the
# already-encoded values again (2 -> 1, 1 -> 2), so run it only once per load.
df_train_test['Pclass'] = df_train_test['Pclass'].map(generate_class_ordenc)

In [94]:
# Missing ages are filled with the median age of the passengers that share the
# same title and ticket class (Title, Pclass)
df_train_test['Age'] = df_train_test['Age'].fillna(
    df_train_test.groupby(['Title', 'Pclass'])['Age'].transform('median')
)
# A group with no known age at all has no median, so its rows are still missing
# here; they fall back to the overall median age
df_train_test['Age'] = df_train_test['Age'].fillna(
    df_train_test['Age'].median()
)
print('Age imputed with median groupby Title, Pclass')

Age imputed with median groupby Title, Pclass


In [95]:
# Encode "Sex" as one binary column: female -> 1, male -> 0
df_train_test['Sex'] = (
    df_train_test['Sex']
    .map(dict(male=0, female=1))
    .astype(int)
)
print('Sex Binaria Simple (0 o 1)')

Sex Binaria Simple (0 o 1)


In [96]:
# Feature conversion
# SibSp and Parch are replaced by derived family features:
# Family_Size = SibSp + Parch + 1 (the passenger)
df_train_test['Family_Size'] = df_train_test['SibSp'].add(df_train_test['Parch']).add(1)
df_train_test = df_train_test.drop(columns=['SibSp', 'Parch'])
# Is_Alone is 1 when the family size is exactly 1, otherwise 0
df_train_test['Is_Alone'] = df_train_test['Family_Size'].eq(1).astype(int)
# Ticket handling: extract the agency-style prefix that precedes the ticket number
def filter_prefix(ticket):
    """Return the normalized leading prefix of a ticket, or 'NoAgency'.

    The prefix is the run of letters, dots and slashes at the very start of
    the ticket string; dots and slashes are then removed and the remainder is
    upper-cased (e.g. "STON/O2. 3101282" gives "STONO"). Tickets that do not
    start with any of those characters give 'NoAgency'.
    """
    leading = re.match(r'([A-Za-z\./]+)', ticket)
    if leading is None:
        return 'NoAgency'
    without_punctuation = leading.group(1).replace('.', '').replace('/', '')
    return without_punctuation.upper()
# Ticket-derived features: the normalized prefix, and how many rows share the
# exact same ticket string
df_train_test['Ticket_Agency'] = df_train_test['Ticket'].map(filter_prefix)
ticket_counts = df_train_test['Ticket'].value_counts()
df_train_test['Ticket_Count'] = df_train_test['Ticket'].map(ticket_counts)
df_train_test = df_train_test.drop(columns='Ticket')

# One-hot encode the ticket prefix, the embarkation port and the deck, one
# column at a time; the first level of each is dropped and dummies are int8
for source_column, dummy_prefix in (
    ('Ticket_Agency', 'Ag'),
    ('Embarked', 'Embarked'),
    ('Deck', 'Deck'),
):
    df_train_test = pd.get_dummies(
        df_train_test,
        columns=[source_column],
        prefix=dummy_prefix,
        drop_first=True,
        dtype=np.int8,
    )

# Replace the textual title with its ordinal rank
df_train_test['Title'] = df_train_test['Title'].map(generate_title_ordenc)
df_train_test

Unnamed: 0,PassengerId,Pclass,Sex,Age,Fare,Title,Family_Size,Is_Alone,Ticket_Count,Ag_AQ,Ag_AS,Ag_C,Ag_CA,Ag_CASOTON,Ag_FA,Ag_FC,Ag_FCC,Ag_LINE,Ag_LP,Ag_NoAgency,Ag_PC,Ag_PP,Ag_PPP,Ag_SC,Ag_SCA,Ag_SCAH,Ag_SCOW,Ag_SCPARIS,Ag_SOC,Ag_SOP,Ag_SOPP,Ag_SOTONO,Ag_SOTONOQ,Ag_SP,Ag_STONO,Ag_STONOQ,Ag_SWPP,Ag_WC,Ag_WEP,Embarked_Q,Embarked_S,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Deck_U
0,1,0,0,22.0,7.25,0,2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1
1,2,2,1,38.0,71.2833,2,2,0,2,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,3,0,1,26.0,7.925,2,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1
3,4,2,1,35.0,53.1,2,2,0,2,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0
4,5,0,0,35.0,8.05,0,1,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1
5,6,0,0,26.0,8.4583,0,1,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1
6,7,2,0,54.0,51.8625,0,1,1,2,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0
7,8,0,0,2.0,21.075,2,5,0,5,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1
8,9,0,1,27.0,11.1333,2,3,0,3,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1
9,10,1,1,14.0,30.0708,2,2,0,2,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [97]:
# Split the combined frame back apart: the first len(train_df) rows are the
# training rows (the target is re-attached by index), the rest is the test set
df_train_res = df_train_test.iloc[:len(train_df)].assign(Survived=df_train_target)
df_test_res = df_train_test.iloc[len(train_df):].copy()
# Write the processed files. to_csv runs with its defaults, so the row index is
# written as the first, unnamed column of each file.
df_train_res.to_csv('../data/processed/train_processed.csv')
df_test_res.to_csv('../data/processed/test_processed.csv')