## Setup

In [1]:
%cd ..

/Users/cesarchalco/Documents/projects/Machine-Learning-Engineering-with-Python-Second-Edition/Chapter03/practicing_sklearn_pipelines


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [2]:
import pandas as pd
import sklearn as sk
import numpy as np

In [3]:
SEED = 42

In [4]:
TARGET = "Personality"

## Data collection

In [5]:
data = pd.read_csv("data/personality_dataset.csv")

In [6]:
data.sample(5, random_state=SEED)

Unnamed: 0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
141,10.0,Yes,3.0,3.0,Yes,5.0,3.0,Introvert
1557,4.505816,No,3.963354,5.0,No,14.0,5.0,Extrovert
485,5.0,Yes,0.0,0.0,Yes,0.0,2.0,Introvert
1712,10.0,Yes,2.0,2.0,Yes,2.0,1.0,Introvert
2250,7.0,Yes,0.0,0.0,Yes,2.0,1.0,Introvert


In [7]:
target_mapping = {
    "Introvert": 1,
    "Extrovert": 0
}

In [8]:
data["Personality"] = data["Personality"].map(target_mapping)

## Load and split

In [9]:
X_train, X_test, y_train, y_test = sk.model_selection.train_test_split(
    data.drop(columns=[TARGET]),
    data[TARGET],
    test_size=0.2,
    random_state=SEED
)

## Advanced preprocessing

### Creating lists

In [10]:
num_features = X_train.select_dtypes(include=[np.number]).columns.tolist()
cat_features = X_train.select_dtypes(exclude=[np.number]).columns.tolist()

In [11]:
X_train.shape[1], len(num_features), len(cat_features)

(7, 5, 2)

In [12]:
needed_feats_for_engagement_score = [
    "Social_event_attendance",
    "Going_outside",
    "Post_frequency",
    "Time_spent_Alone"
]

In [13]:
needed_feats_for_introvert_trend = [
    "Time_spent_Alone",
    "Stage_fear"
]

In [14]:
needed_feats_for_recovery_ratio = [
    "Friends_circle_size",
    "Drained_after_socializing"
]

### Defining pipelines

1) Impute numerical features with median

In [15]:
imputator = sk.compose.ColumnTransformer(
    transformers=[
        ("impute", sk.impute.SimpleImputer(strategy="median"), num_features),
    ],
    remainder="passthrough",
    verbose_feature_names_out=False,
    force_int_remainder_cols=False
).set_output(transform="pandas")

2) Create custom classes for the feature creation

In [16]:
needed_feats_for_engagement_score

['Social_event_attendance',
 'Going_outside',
 'Post_frequency',
 'Time_spent_Alone']

In [17]:
class EngagementScoreCalculator(sk.base.BaseEstimator, sk.base.TransformerMixin):
    def __init__(self):
        super().__init__()

    def fit(self, X, y=None):
        self.social_event_attendance_idx = 0
        self.going_outside_idx = 1
        self.post_frequency_idx = 2
        self.time_spent_alone_idx = 3
        return self

    def transform(self, X):
        numerator = X.iloc[:, self.social_event_attendance_idx] + X.iloc[:, self.going_outside_idx] + X.iloc[:, self.post_frequency_idx]
        denominator = 3 * (X.iloc[:, self.time_spent_alone_idx] + 1)
        engagement_score = numerator / denominator
        return engagement_score
    
    def get_feature_names_out(self, input_features=None):
        return ["Engagement_Score"]

In [18]:
needed_feats_for_introvert_trend

['Time_spent_Alone', 'Stage_fear']

In [19]:
class IntrovertTrendCalculator(sk.base.BaseEstimator, sk.base.TransformerMixin):
    def __init__(self):
        super().__init__()

    def fit(self, X, y=None):
        self.time_spent_alone_idx = 0
        self.stage_fear_idx = 1
        return self

    def transform(self, X):
        introvert_trend_flag = np.where(
            (X.iloc[:, self.time_spent_alone_idx] > 6) & (X.iloc[:, self.stage_fear_idx] == "Yes"),
            1,
            0
        )
        return introvert_trend_flag
    
    def get_feature_names_out(self, input_features=None):
        return ["Introvert_Trend"]

In [20]:
needed_feats_for_recovery_ratio

['Friends_circle_size', 'Drained_after_socializing']

In [21]:
class RecoveryRatioCalculator(sk.base.BaseEstimator, sk.base.TransformerMixin):
    def __init__(self):
        super().__init__()

    def fit(self, X, y=None):
        self.friends_circle_size_idx = 0
        self.drained_after_socializing_idx = 1
        return self

    def transform(self, X):
        flag_drained_after_socializing = np.where(
            X.iloc[:, self.drained_after_socializing_idx] == "Yes",
            1,
            0
        )
        recovery_ratio = X.iloc[:, self.friends_circle_size_idx] / (flag_drained_after_socializing + 1)
        return recovery_ratio

    def get_feature_names_out(self, input_features=None):
        return ["Recovery_Ratio"]

In [22]:
used_feats = list(set(needed_feats_for_engagement_score + needed_feats_for_introvert_trend + needed_feats_for_recovery_ratio))

In [23]:
feats_creator = sk.compose.ColumnTransformer(
    transformers=[
        ("engagement_score", EngagementScoreCalculator(), needed_feats_for_engagement_score),
        ("introvert_trend", IntrovertTrendCalculator(), needed_feats_for_introvert_trend),
        ("recovery_ratio", RecoveryRatioCalculator(), needed_feats_for_recovery_ratio),
        ("passthrough", "passthrough", used_feats)
    ],
    remainder="passthrough",
    verbose_feature_names_out=False,
    force_int_remainder_cols=False
).set_output(transform="pandas")

3) Define the scaling and categorical mapping

In [24]:
yesno_to_binary = sk.preprocessing.FunctionTransformer(
    func=lambda X: X.map(lambda v: 1 if v == "Yes" else 0),
    validate=False
)

In [25]:
scaler_flagger = sk.compose.ColumnTransformer(
    transformers=[
        ("scale", sk.preprocessing.StandardScaler(), sk.compose.make_column_selector(dtype_include=np.number)),
        ("map", yesno_to_binary, cat_features)
    ],
    verbose_feature_names_out=True,
    force_int_remainder_cols=False
).set_output(transform="pandas")

4) Assemble all the preprocessing pipeline

In [26]:
preprocesssing_pipe = sk.pipeline.Pipeline(
    steps=[
        ("imputator", imputator),
        ("feats_creator", feats_creator),
        ("scaler_flagger", scaler_flagger)
    ],
    verbose=True
).set_output(transform="pandas")

In [27]:
preprocesssing_pipe

5) Fit and test the pipeline

In [28]:
preprocesssing_pipe.fit(X_train, y_train)

[Pipeline] ......... (step 1 of 3) Processing imputator, total=   0.0s
[Pipeline] ..... (step 2 of 3) Processing feats_creator, total=   0.0s
[Pipeline] .... (step 3 of 3) Processing scaler_flagger, total=   0.0s


In [29]:
X_train.head(5)

Unnamed: 0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency
2078,3.0,No,5.0,6.0,No,10.0,9.0
163,6.0,Yes,3.0,3.0,Yes,1.0,0.0
1938,10.0,Yes,2.0,2.0,Yes,1.0,0.0
252,9.0,Yes,1.0,2.0,Yes,3.0,1.0
2232,10.0,Yes,0.0,0.0,Yes,4.0,1.0


In [30]:
preprocesssing_pipe.transform(X_train).head(5)

Unnamed: 0,scale__Engagement_Score,scale__Introvert_Trend,scale__Recovery_Ratio,scale__Going_outside,scale__Friends_circle_size,scale__Time_spent_Alone,scale__Social_event_attendance,scale__Post_frequency,map__Stage_fear,map__Drained_after_socializing
2078,0.053326,-0.662726,0.942602,1.348934,0.889367,-0.437476,0.363759,1.876505,0,0
163,-0.676836,-0.662726,-1.081621,-0.001941,-1.242188,0.434649,-0.332327,-1.229353,1,1
1938,-0.763814,1.508919,-1.081621,-0.452233,-1.242188,1.597482,-0.680371,-1.229353,1,1
252,-0.757406,1.508919,-0.868545,-0.452233,-0.768509,1.306774,-1.028414,-0.884258,1,1
2232,-0.811882,1.508919,-0.762007,-1.352816,-0.531669,1.597482,-1.376457,-0.884258,1,1


6) Add an estimator

In [31]:
import lightgbm as lgb

In [32]:
params = {
    "objective": "binary",
    "metric": "binary_logloss",
    "boosting_type": "gbdt",
    "num_leaves": 31,
    "learning_rate": 0.01,
    "random_state": SEED,
    "is_unbalance": False
}

In [33]:
preprocessing_and_estimator_pipe = sk.pipeline.Pipeline(
    steps=[
        ("preprocessing", preprocesssing_pipe),
        ("estimator", lgb.LGBMClassifier(**params))
    ],
    verbose=True
).set_output(transform="pandas")

In [34]:
preprocessing_and_estimator_pipe.fit(X_train, y_train)

[Pipeline] ......... (step 1 of 3) Processing imputator, total=   0.0s
[Pipeline] ..... (step 2 of 3) Processing feats_creator, total=   0.0s
[Pipeline] .... (step 3 of 3) Processing scaler_flagger, total=   0.0s
[Pipeline] ..... (step 1 of 2) Processing preprocessing, total=   0.0s
[LightGBM] [Info] Number of positive: 1131, number of negative: 1189
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000177 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 220
[LightGBM] [Info] Number of data points in the train set: 2320, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.487500 -> initscore=-0.050010
[LightGBM] [Info] Start training from score -0.050010
[Pipeline] ......... (step 2 of 2) Processing estimator, total=   0.3s


In [35]:
pred_probs_y_test = preprocessing_and_estimator_pipe.predict_proba(X_test)[:, 1]

In [36]:
pred_y_test = np.where(pred_probs_y_test >= 0.5, 1, 0)

In [37]:
sk.metrics.roc_auc_score(y_test, pred_probs_y_test)

np.float64(0.9511768068988518)