In [33]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns

In [34]:
from sklearn.model_selection import train_test_split 

In [35]:
# Load the Titanic sample from the working directory and preview its first rows.
df = pd.read_csv(filepath_or_buffer='titanic_1000rows_with_missing.csv')
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,Fare,Cabin
0,1.0,0,,Person_0,female,17.2,49.16,
1,2.0,1,3.0,,male,,43.4,
2,3.0,0,,Person_2,,,32.19,D12
3,4.0,0,,Person_3,male,,,D12
4,5.0,0,,,female,40.8,74.36,D12


In [36]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
# 'Survived' is the target, every other column is a feature.
x_train, x_test, y_train, y_test = train_test_split(
    df.drop(columns='Survived'),
    df['Survived'],
    random_state=2,
    test_size=0.3,
)

# Preview the training features.
x_train.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,Fare,Cabin
90,91.0,1.0,Person_90,male,26.7,,
305,306.0,,Person_305,male,53.7,41.75,C23
126,127.0,3.0,Person_126,,44.2,,
963,964.0,3.0,Person_963,male,22.6,65.29,
163,164.0,1.0,Person_163,female,0.2,,D12


In [37]:
# Percentage of missing values per training column.
x_train.isna().mean().mul(100)

PassengerId    21.000000
Pclass         19.714286
Name           20.714286
Sex            21.000000
Age            20.285714
Fare           20.428571
Cabin          60.142857
dtype: float64

In [41]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer  # was missing: the cell raised NameError on a fresh kernel
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Columns handled by each branch of the preprocessor.
# Columns not listed here (e.g. 'Name') are dropped, which is ColumnTransformer's default.
numerical_features = ['PassengerId', 'Pclass', 'Age', 'Fare']
categorical_features = ['Sex', 'Cabin']

# Numerical branch: fill gaps with the column median, then standardise.
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())  # Optional: scale numerical features
])

# Categorical branch: treat missing values as their own 'Missing' category,
# then one-hot encode. Categories unseen during fit are encoded as all zeros.
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Route each column group through its own pipeline.
preprocessor = ColumnTransformer([
    ('num', numerical_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features)
])

# 1. Fit on the training features only and transform them (returns a NumPy array).
x_train_array = preprocessor.fit_transform(x_train)

# 2. Recover the generated column names ('num__*' / 'cat__*').
feature_names = preprocessor.get_feature_names_out()

# 3. Wrap the array in a DataFrame.
# NOTE(review): this frame gets a fresh 0..n-1 index, not x_train's original
# index; pass index=x_train.index if it must align with y_train by label.
x_train_processed = pd.DataFrame(x_train_array, columns=feature_names)

# 4. View DataFrame
x_train_processed

Unnamed: 0,num__PassengerId,num__Pclass,num__Age,num__Fare,cat__Sex_Missing,cat__Sex_female,cat__Sex_male,cat__Cabin_B45,cat__Cabin_C23,cat__Cabin_D12,cat__Cabin_E31,cat__Cabin_Missing
0,-1.579412,-1.313571,-0.297405,-0.023544,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,-0.751688,0.024857,1.805774,-0.423251,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,-1.440816,1.363284,1.065767,-0.023544,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1.781530,1.363284,-0.616777,0.531989,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,-1.298371,-1.313571,-2.361637,-0.023544,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
695,0.002887,0.024857,-0.359722,0.176919,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
696,0.002887,1.363284,-0.359722,-1.770894,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
697,-0.027912,0.024857,-0.429828,-0.023544,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
698,0.102984,0.024857,-0.671304,-0.134325,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
