In [285]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, Binarizer
from sklearn.linear_model import SGDClassifier
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.impute import SimpleImputer, KNNImputer
import seaborn as sns
import pandas as pd

In [286]:
titanic = sns.load_dataset('titanic')
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [287]:
titanic.dropna(inplace=True)
y = titanic['survived']
X = titanic.drop('survived', axis=1)

In [288]:
# numerical_features = ['pclass', 'age', 'fare']
# categorical_features = ["sex", "deck", "alone"]

numerical_features = make_column_selector(dtype_include=np.number)
categorical_features = make_column_selector(dtype_include=object)

In [289]:
"""
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

categorical_pipeline = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(handle_unknown='ignore')
)

encoded_features = categorical_pipeline.fit_transform(X[categorical_features]).toarray()

# Get the feature names after one-hot encoding
feature_names = categorical_pipeline.named_steps['onehotencoder'].get_feature_names_out(input_features=categorical_features)

# Create a DataFrame with the encoded features and feature names
encoded_df = pd.DataFrame(encoded_features, columns=feature_names)

print(encoded_df.head())
"""

"\ncategorical_features = X.select_dtypes(include=['object']).columns.tolist()\n\ncategorical_pipeline = make_pipeline(\n    SimpleImputer(strategy='most_frequent'),\n    OneHotEncoder(handle_unknown='ignore')\n)\n\nencoded_features = categorical_pipeline.fit_transform(X[categorical_features]).toarray()\n\n# Get the feature names after one-hot encoding\nfeature_names = categorical_pipeline.named_steps['onehotencoder'].get_feature_names_out(input_features=categorical_features)\n\n# Create a DataFrame with the encoded features and feature names\nencoded_df = pd.DataFrame(encoded_features, columns=feature_names)\n\nprint(encoded_df.head())\n"

In [290]:

numerical_pipeline = make_pipeline(
    SimpleImputer(missing_values=np.nan, strategy='mean'),
    StandardScaler()
)

categorical_pipeline = make_pipeline(
    SimpleImputer(missing_values=np.nan, strategy='most_frequent'),
    OneHotEncoder(handle_unknown='ignore')
)

print(categorical_pipeline.fit_transform(X[categorical_features]).toarray())

[[1. 0. 1. ... 0. 0. 1.]
 [1. 0. 0. ... 1. 0. 1.]
 [0. 1. 0. ... 1. 1. 0.]
 ...
 [1. 0. 1. ... 0. 0. 1.]
 [1. 0. 0. ... 1. 0. 1.]
 [0. 1. 1. ... 0. 0. 1.]]


In [291]:
preprocessor = make_column_transformer(
    (numerical_pipeline, numerical_features),
    (categorical_pipeline, categorical_features)
)


In [292]:
model = make_pipeline(preprocessor, SGDClassifier())
model.fit(X, y)

print(model.predict(X[:5]))
print(model.score(X, y))

[1 1 0 1 1]
1.0


# Pipelines Union

In [293]:
from sklearn.pipeline import make_union

In [294]:
numerical_features = X[['age', 'fare']]

In [295]:
pipeline = make_union(StandardScaler(), Binarizer(threshold=30))

In [296]:
both_result = pipeline.fit_transform(numerical_features)
print(both_result[:5])
print(both_result.shape)


[[ 0.15208196 -0.10011013  1.          1.        ]
 [-0.03987502 -0.33848477  1.          1.        ]
 [ 1.17585249 -0.35470782  1.          1.        ]
 [-2.02343043 -0.81567192  0.          0.        ]
 [ 1.43179512 -0.68654298  1.          0.        ]]
(182, 4)


# Why Imputer() over dropna() : To use optimise GridSearch params

In [297]:
titanic = sns.load_dataset('titanic')
X = titanic[["pclass", "age"]]
y = titanic["survived"]

model = make_pipeline(KNNImputer(), SGDClassifier())

params = {
    'knnimputer__n_neighbors': [1, 2, 3, 4, 5],
}

In [298]:
grid = GridSearchCV(model, param_grid=params, cv=5)

In [299]:
grid.fit(X, y)

In [300]:
grid.best_params_

{'knnimputer__n_neighbors': 2}

In [301]:
grid.best_score_

0.6790659720042683