In [8]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns

In [9]:
from sklearn.model_selection import train_test_split 

In [10]:
# Load the passenger CSV into `df`, then preview its first five rows.
df = pd.read_csv(filepath_or_buffer='titanic_1000rows_with_missing.csv')
df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,Fare,Cabin
0,1.0,0,,Person_0,female,17.2,49.16,
1,2.0,1,3.0,,male,,43.4,
2,3.0,0,,Person_2,,,32.19,D12
3,4.0,0,,Person_3,male,,,D12
4,5.0,0,,,female,40.8,74.36,D12


In [11]:
# Split into train/test sets: 'Survived' is the target, every other column
# stays in the feature frames. 30% of the rows are held out for testing and
# the fixed random_state keeps the split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    df.drop(columns='Survived'),
    df['Survived'],
    random_state=2,
    test_size=0.3,
)

# Preview the training features.
x_train.head(5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,Fare,Cabin
90,91.0,1.0,Person_90,male,26.7,,
305,306.0,,Person_305,male,53.7,41.75,C23
126,127.0,3.0,Person_126,,44.2,,
963,964.0,3.0,Person_963,male,22.6,65.29,
163,164.0,1.0,Person_163,female,0.2,,D12


In [12]:
# Percentage of missing values in each training-feature column.
x_train.isna().mean().mul(100)

PassengerId    21.000000
Pclass         19.714286
Name           20.714286
Sex            21.000000
Age            20.285714
Fare           20.428571
Cabin          60.142857
dtype: float64

In [16]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer  # was missing: SimpleImputer is used below and raised NameError on a fresh kernel
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Define column types
# NOTE(review): PassengerId looks like a row identifier rather than a
# predictive feature — confirm it should be fed to the model.
# 'Name' appears in neither list, so the ColumnTransformer below drops it
# (its default is remainder='drop').
numerical_features = ['PassengerId', 'Pclass', 'Age', 'Fare']
categorical_features = ['Sex', 'Cabin']

# Numerical pipeline: fill missing values with the column median, then
# standardize.
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())  # Optional: scale numerical features
])

# Categorical pipeline: replace missing values with the literal category
# 'Missing', then one-hot encode. handle_unknown='ignore' keeps transform()
# from failing on categories that were not seen during fit.
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Combine pipelines: each sub-pipeline is applied only to its own columns and
# the results are concatenated column-wise.
preprocessor = ColumnTransformer([
    ('num', numerical_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features)
])

# Apply preprocessing. The preprocessor is fitted on the training split only;
# reuse it with preprocessor.transform(x_test) so that no test-set statistics
# leak into the imputers, scaler or encoder.
x_train_processed = preprocessor.fit_transform(x_train)

print(f"Processed data shape: {x_train_processed.shape}")

Processed data shape: (700, 12)
