In [1]:
# Importando bibliotecas necesarias
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder


In [3]:
# Load the training dataset from the CSV file in the working directory.
df_train = pd.read_csv(filepath_or_buffer='train.csv')

In [4]:
# Missing-value treatment (similar to Notebook #1).
#
# Results are assigned back to the frame instead of calling
# `fillna(inplace=True)` on `df_train[col]`: that chained in-place call acts on
# an intermediate object, raises a FutureWarning in pandas 2.x and leaves
# `df_train` unchanged under Copy-on-Write.

# Age: fill gaps with the column median.
age_median = df_train['Age'].median()
df_train['Age'] = df_train['Age'].fillna(age_median)

# Cabin: keep only a 0/1 "value present" flag, then drop the raw column.
# The guard lets the cell be re-run after 'Cabin' has already been removed.
if 'Cabin' in df_train.columns:
    df_train['Cabin_present'] = df_train['Cabin'].notnull().astype(int)
    df_train = df_train.drop(columns=['Cabin'])

# Embarked: fill gaps with the most frequent value.
embarked_mode = df_train['Embarked'].mode()[0]
df_train['Embarked'] = df_train['Embarked'].fillna(embarked_mode)

In [5]:
# Feature / target selection.
# The target is the 'Survived' column; the feature frame is everything else
# except the 'Name' and 'Ticket' columns.
y = df_train['Survived']
X = df_train.drop(columns=['Survived', 'Name', 'Ticket'])

In [6]:
# Numeric and categorical feature groups consumed by the preprocessing step.
# 'PassengerId' is deliberately left out of both groups: it is a row
# identifier, not a predictor, and scaling it as a numeric feature would let a
# downstream model fit on arbitrary row order.
numeric_features = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
categorical_features = ['Sex', 'Embarked', 'Cabin_present']

In [7]:
# Preprocessing pipelines.

# Numeric columns: standardize with StandardScaler.
numeric_steps = [('scaler', StandardScaler())]
numeric_transformer = Pipeline(numeric_steps)

# Categorical columns: one-hot encode; categories not seen during fit are
# ignored at transform time instead of raising.
categorical_steps = [('encoder', OneHotEncoder(handle_unknown='ignore'))]
categorical_transformer = Pipeline(categorical_steps)

# Route each column group to its own transformer.
column_routes = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(column_routes)

In [8]:
# Apply the preprocessing: fit the ColumnTransformer on X and transform X.
# NOTE(review): the fit uses every row of X, so the statistics learned here
# include rows that are later assigned to the test split.
X_preprocessed = preprocessor.fit_transform(X)


In [9]:
# Train/test split.
# The raw feature frame is split first and a fresh copy of the preprocessor is
# fitted on the training rows only, so scaling statistics and encoder
# categories are not learned from the held-out rows (data leakage).
# The same random_state and row count give the same row partition as
# splitting the already-transformed matrix.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# clone() yields an unfitted copy, leaving `preprocessor` itself untouched.
split_preprocessor = clone(preprocessor)
X_train = split_preprocessor.fit_transform(X_train_raw)
X_test = split_preprocessor.transform(X_test_raw)

In [14]:
# Save the processed dataset: wrap the transformed feature matrix in a
# DataFrame, append the target as a 'Survived' column (index reset so it
# lines up with the new frame's default index), and write it to CSV without
# the index column.
target_aligned = y.reset_index(drop=True)
processed_df = pd.DataFrame(X_preprocessed).assign(Survived=target_aligned)
processed_df.to_csv('processed_dataset.csv', index=False)