In [6]:
import pandas as pd
import numpy as np
import os
import pickle

train = pd.read_csv('../data/train.csv')
test  = pd.read_csv('../data/test.csv')

train_len = len(train)
all_data = pd.concat([train, test], sort=False).reset_index(drop=True)

# Embarked → moda
all_data['Embarked'].fillna(all_data['Embarked'].mode()[0], inplace=True)

# Age → mediana por Pclass y Sex (uso transform para mantener el índice)
all_data['Age'] = all_data.groupby(['Pclass','Sex'])['Age'] \
                          .transform(lambda grp: grp.fillna(grp.median()))

# Fare → mediana
all_data['Fare'].fillna(all_data['Fare'].median(), inplace=True)


all_data['Cabin'] = all_data['Cabin'].fillna('Missing')
all_data['Deck']  = all_data['Cabin'].str[0]

# FamilySize e IsAlone
all_data['FamilySize'] = all_data['SibSp'] + all_data['Parch'] + 1
all_data['IsAlone']    = (all_data['FamilySize']==1).astype(int)
# Title desde Name
all_data['Title'] = all_data['Name'].str.extract(r',\s*([^\.]+)\.', expand=False)
# Simplificar rare titles
rare_titles = ['Lady','Countess','Capt','Col','Don','Dr','Major','Rev','Sir','Jonkheer','Dona']
all_data['Title'] = all_data['Title'].replace(rare_titles, 'Rare')

all_data = pd.get_dummies(all_data,
                          columns=['Sex','Embarked','Deck','Title'],
                          drop_first=True)

x = all_data.iloc[:train_len].drop(['PassengerId','Survived','Name','Ticket','Cabin'], axis=1)
y = train['Survived']
X_test = all_data.iloc[train_len:].drop(['PassengerId','Survived','Name','Ticket','Cabin'], axis=1)
test = pd.read_csv('../data/test.csv')

os.makedirs('models', exist_ok=True)

with open('models/preprocessed.pkl','wb') as f:
    pickle.dump((x, y, X_test, test), f)
print("Guardado preprocessed.pkl con x, y, X_test y test")

df_pre = x.copy()
df_pre['Survived'] = y

df_pre.sample(100, random_state=42) \
      .to_csv('../data/titanic_preprocessed_sample.csv', index=False)

print("Guardado data/titanic_preprocessed_sample.csv (100 filas)")

Guardado preprocessed.pkl con x, y, X_test y test
Guardado data/titanic_preprocessed_sample.csv (100 filas)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  all_data['Embarked'].fillna(all_data['Embarked'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  all_data['Fare'].fillna(all_data['Fare'].median(), inplace=True)
