In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
data = sns.load_dataset("titanic").drop(columns=["alive"])
X = data.drop(columns=["survived"])
y = data["survived"].fillna(0)

In [3]:
def make_transformer(func):
    class FunctionTransformer(BaseEstimator, TransformerMixin):
        def fit(self, X, y=None):
            return self
        
        def transform(self, X):
            return func(X)
    
    return FunctionTransformer()

@make_transformer
def data_preparing(df):
    df = df.copy()
    bins = [0, 12, 20, 40, 60, np.inf]
    labels = ["child", "teen", "young", "middle", "senior"]
    df["age_group"] = pd.cut(df["age"], bins=bins, labels=labels)
    return df

In [4]:
# Розділяємо ознаки на числові та категорійні
num_features = ["age", "fare"]
cat_features = ["sex", "class", "embark_town"]

# Конвеєр для числових ознак
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Конвеєр для категорійних ознак
cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

# Комбінуємо в один трансформер
preprocessor = ColumnTransformer([
    ("num", num_pipeline, num_features),
    ("cat", cat_pipeline, cat_features),
])

In [5]:
model = RandomForestClassifier(n_estimators=128, random_state=42)

In [6]:
full_pipeline = Pipeline([
    ("age_binner", data_preparing),
    ("preprocessor", preprocessor),
    ("classifier", model)
])

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
full_pipeline.fit(X_train, y_train)
y_pred = full_pipeline.predict(X_test)

In [8]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.7989
