In [7]:
import pandas as pd
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from feature_engine.imputation import MeanMedianImputer, CategoricalImputer
from feature_engine.encoding import OneHotEncoder
from feature_engine.transformation import LogTransformer

# Load the Titanic dataset through seaborn's dataset loader
df = sns.load_dataset("titanic")

# Preview the first rows of the raw data
print(df.head())
# Tip: df.isnull().sum() reports the number of missing values per column.

# Keep only the modelling features plus the target column; .copy() gives an
# independent frame so later operations never touch the raw dataset.
df = df[["age", "fare", "embarked", "sex", "survived"]].copy()

# Split the features and the target
X = df.drop("survived", axis=1)
y = df["survived"]

# The preprocessing pipeline log-transforms "fare", so zero fares are replaced
# with a small positive value first. The replacement is value-based and
# stateless, so applying it once before the split yields exactly the same
# train/test rows as patching each split separately. Using ``assign`` (which
# returns a new frame) avoids assigning into the frames returned by
# ``train_test_split``, a pattern that can trigger pandas'
# SettingWithCopyWarning, and leaves ``X`` itself unmodified.
X_train, X_test, y_train, y_test = train_test_split(
    X.assign(fare=X["fare"].replace(0, 0.01)),
    y,
    random_state=42,
)

# Preprocessing steps built from Feature-engine transformers. Order matters:
# imputation runs first, then the log transform of "fare", and finally the
# one-hot encoding (no ``variables`` is given to the encoder, so it selects
# the columns to encode on its own).
preprocessing_steps = [
    (
        "impute_num",
        MeanMedianImputer(variables=["age", "fare"], imputation_method="median"),
    ),
    (
        "imput_cat",
        CategoricalImputer(variables=["embarked"], imputation_method="frequent"),
    ),
    ("log_transform", LogTransformer(variables=["fare"])),
    ("encode_cat", OneHotEncoder(drop_last=True)),
]

pipeline = Pipeline(steps=preprocessing_steps)

# Learn the preprocessing parameters on the training split only, then apply
# the already-fitted pipeline to the test split.
X_train_clean = pipeline.fit_transform(X_train)
X_test_clean = pipeline.transform(X_test)

# Fit a random forest on the preprocessed training data; the fixed seed keeps
# the run reproducible. ``fit`` returns the estimator itself.
model = RandomForestClassifier(random_state=42).fit(X_train_clean, y_train)

# Score the held-out split and print per-class precision/recall/F1
y_pred = model.predict(X_test_clean)
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  
              precision    recall  f1-score   support

           0       0.80      0.77      0.79       134
           1       0.67      0.72      0.70        89

    accuracy                           0.75       223
 