In [83]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn.impute import KNNImputer,SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings('ignore')

In [84]:
df = pd.read_csv('train.csv')
print(f"{np.round(df.isna().mean(), 4)*100} %")
df['family'] = df['SibSp'] + df['Parch']
df = df.drop(columns=['Name', 'Cabin', 'Ticket', 'SibSp', 'Parch'])

df.head(3)

PassengerId     0.00
Survived        0.00
Pclass          0.00
Name            0.00
Sex             0.00
Age            19.87
SibSp           0.00
Parch           0.00
Ticket          0.00
Fare            0.00
Cabin          77.10
Embarked        0.22
dtype: float64 %


Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Fare,Embarked,family
0,1,0,3,male,22.0,7.25,S,1
1,2,1,1,female,38.0,71.2833,C,1
2,3,1,3,female,26.0,7.925,S,0


In [85]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Sex', 'Age', 'Fare', 'Embarked',
       'family'],
      dtype='object')

In [86]:
x = df.drop(columns='Survived')
y = df[['Survived']]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

ki = KNNImputer(n_neighbors=3, weights='distance')


num_cols = ['PassengerId', 'Pclass', 'Age', 'Fare', 'Embarked', 'family']
cat_cols = ['Sex', 'Embarked']


In [87]:
# encoding
cat_trf = ColumnTransformer([
    ('c1', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'), cat_cols)
], remainder='passthrough')


pipe = Pipeline([
    ('p1', cat_trf),
    # ('p2', StandardScaler()),
    ('p3', KNNImputer(n_neighbors=5))
])


pipe.fit(x_train)
# 2. NOW you can extract the names (because it's fitted)
ohe_cols = pipe.named_steps['p1'].named_transformers_['c1'].get_feature_names_out(cat_cols)
# 3. Get passthrough columns
passthrough_cols = [col for col in x_train.columns if col not in cat_cols]
# 4. Combine
all_cols = np.concatenate([ohe_cols, passthrough_cols])
# 5. Transform and Create DataFrame
x_train_trf = pd.DataFrame(pipe.transform(x_train), columns=all_cols)
x_train_trf


Unnamed: 0,Sex_male,Embarked_Q,Embarked_S,Embarked_nan,PassengerId,Pclass,Age,Fare,family
0,1.0,0.0,1.0,0.0,332.0,1.0,45.5,28.5000,0.0
1,1.0,0.0,1.0,0.0,734.0,2.0,23.0,13.0000,0.0
2,1.0,0.0,1.0,0.0,383.0,3.0,32.0,7.9250,0.0
3,1.0,0.0,1.0,0.0,705.0,3.0,26.0,7.8542,1.0
4,0.0,0.0,1.0,0.0,814.0,3.0,6.0,31.2750,6.0
...,...,...,...,...,...,...,...,...,...
707,0.0,0.0,1.0,0.0,107.0,3.0,21.0,7.6500,0.0
708,1.0,0.0,1.0,0.0,271.0,1.0,27.6,31.0000,0.0
709,1.0,0.0,1.0,0.0,861.0,3.0,41.0,14.1083,2.0
710,0.0,0.0,1.0,0.0,436.0,1.0,14.0,120.0000,3.0


In [88]:
cols = cat_trf.named_transformers_['c1'].get_feature_names_out()
cols


array(['Sex_male', 'Embarked_Q', 'Embarked_S', 'Embarked_nan'],
      dtype=object)

In [95]:
# making prediction on non-scaled data

pipe_no_scale = Pipeline([
    ('p1', cat_trf),          
    ('p3', KNNImputer(n_neighbors=5))
])


x_train_no_scale = pipe_no_scale.fit_transform(x_train) 
x_test_no_scale = pipe_no_scale.transform(x_test)


lr = LogisticRegression()
lr.fit(x_train_no_scale, y_train)
pred_1 = lr.predict(x_test_no_scale)

print(f"ACCURACY OF NON-SCALED DATA : {np.round(accuracy_score(pred_1, y_test),2)*100} %")


ACCURACY OF NON-SCALED DATA : 78.0 %


In [90]:
# making prediction on scaled data
lr = LogisticRegression()

x_test_trf = pipe.transform(x_test)

lr.fit(x_train_trf, y_train)
pred_2 = lr.predict(x_test_trf)

print(f"ACCURACY OF SCALED DATA : {np.round(accuracy_score(pred_2, y_test),2)*100} %")

ACCURACY OF SCALED DATA : 78.0 %
