In [1]:
from joblib import load
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
import xgboost as xgb
from pathlib import Path
import pandas as pd

TRAINING_DATA_PATH = Path('dataset', 'bank_06.pkl')
raw_data = pd.read_pickle(TRAINING_DATA_PATH)
RANDOM_STATE_ID = 100577770


In [2]:
# create identical pipeline from project 1

num_cols = ['age', 'balance', 'day', 'duration', 'campaign', 'previous', 'pdays_duration', 'prev_contacted']
cat_cols = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']

class PdaysTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.median_pdays = None
    
    def fit(self, X, y=None):
        known_pdays = X.loc[X['pdays'] != -1, 'pdays']
        self.median_pdays = known_pdays.median() if len(known_pdays) > 0 else 0
        return self
    
    def transform(self, X):
        X = X.copy()
        X['prev_contacted'] = (X['pdays'] != -1).astype(int)
        X['pdays_duration'] = X['pdays'].replace(-1, self.median_pdays)
        X.drop('pdays', axis=1, inplace=True)
        return X

# Column transformer for scaling and encoding
column_processor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols)
    ]
)

best_params = {
    'max_depth' : 7, 
    'learning_rate': 0.0683962886025413,
    'n_estimators' : 181
}

pipeline = [
    ('pdays_transform', PdaysTransformer()),
    ('preprocessor', column_processor),
    ('feature_select', "passthrough"),
    ('classifier', xgb.XGBClassifier(
        max_depth=best_params['max_depth'],
        learning_rate=best_params['learning_rate'],
        n_estimators=best_params['n_estimators'],
        random_state=RANDOM_STATE_ID
    ))
]

Define our full pipeline without feature selection, identital to assignment #1. Containing our preprocessing in PdaysTransformer and column_processor, and our model XGBoost in xgb.XGBClassifier.

Next we create our feature selection transformers using F-score and Mutual Information methods, then it to the pipeline.

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
import copy

# create deep copies for each selector type. This ensures that two different xgbClassifier instances are fitted
pipeline_fscore = Pipeline(copy.deepcopy(pipeline))
pipeline_mutualinfo = Pipeline(copy.deepcopy(pipeline))

k_range = list(range(1,18))

feature_select_params_fscore = [
    {
        "feature_select": [SelectKBest(f_classif)],
        "feature_select__k": k_range
    }
]

feature_select_params_mutualinfo = [
    {
        "feature_select": [SelectKBest(mutual_info_classif)],
        "feature_select__k": k_range
    }
]

In [10]:
# use grid search to find optimal k-features to select
from sklearn.model_selection import GridSearchCV

# use Grid Search to evaluate both methods of feature selection for a variety of k values
fscore_grid = GridSearchCV(
    pipeline_fscore,
    feature_select_params_fscore,
    n_jobs=1,
    verbose=1
)

mutualinfo_grid = GridSearchCV(
    pipeline_mutualinfo,
    feature_select_params_mutualinfo,
    n_jobs=1,
    verbose=1
)

# split data to be trained
X = raw_data.drop("deposit", axis=1)
y = raw_data["deposit"].map({"yes": 1, "no": 0})

In [11]:
# run the pipeline with f_score feature selection
fscore_grid.fit(X, y)

Fitting 5 folds for each of 17 candidates, totalling 85 fits


0,1,2
,estimator,"Pipeline(step...=None, ...))])"
,param_grid,"[{'feature_select': [SelectKBest()], 'feature_select__k': [1, 2, ...]}]"
,scoring,
,n_jobs,1
,refit,True
,cv,
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,score_func,<function f_c...t 0x14239d010>
,k,16

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [None]:
#Get the best pipeline
best_pipeline = fscore_grid.best_estimator_

best_selector = best_pipeline.named_steps['feature_select']

#Getting a mask: assigning True for selected features and False for dropped features
feature_mask = best_selector.get_support()

#Get feature names from the Preprocessor
preprocessor = best_pipeline.named_steps['preprocessor']
feature_names_out = preprocessor.get_feature_names_out()

#Created a loop condition to group selected features together and dropped features together
selected_features = [f for f, s in zip(feature_names_out, feature_mask) if s]
dropped_features = [f for f, s in zip(feature_names_out, feature_mask) if not s]

print("Selected Features:", selected_features)
print("Dropped Features:", dropped_features)

Selected Features: ['num__duration', 'num__campaign', 'num__previous', 'num__prev_contacted', 'cat__job_None', 'cat__education_None', 'cat__housing_no', 'cat__housing_yes', 'cat__contact_cellular', 'cat__contact_unknown', 'cat__month_mar', 'cat__month_may', 'cat__month_oct', 'cat__month_sep', 'cat__poutcome_success', 'cat__poutcome_unknown']
Dropped Features: ['num__age', 'num__balance', 'num__day', 'num__pdays_duration', 'cat__job_admin.', 'cat__job_blue-collar', 'cat__job_entrepreneur', 'cat__job_housemaid', 'cat__job_management', 'cat__job_retired', 'cat__job_self-employed', 'cat__job_services', 'cat__job_student', 'cat__job_technician', 'cat__job_unemployed', 'cat__job_unknown', 'cat__marital_divorced', 'cat__marital_married', 'cat__marital_single', 'cat__education_primary', 'cat__education_secondary', 'cat__education_tertiary', 'cat__education_unknown', 'cat__default_no', 'cat__default_yes', 'cat__loan_no', 'cat__loan_yes', 'cat__contact_telephone', 'cat__month_apr', 'cat__month_a

In [None]:
# check fscore stats
print("best k:", fscore_grid.best_params_['feature_select__k'])
print("score of best k:", fscore_grid.best_score_)
print("Selected features:", )


best k: 16
score of best k: 0.7857272727272726


In [13]:
# run the pipeline with mutual info feature selection
mutualinfo_grid.fit(X,y)

Fitting 5 folds for each of 17 candidates, totalling 85 fits


0,1,2
,estimator,"Pipeline(step...=None, ...))])"
,param_grid,"[{'feature_select': [SelectKBest(s... 0x142382c40>)], 'feature_select__k': [1, 2, ...]}]"
,scoring,
,n_jobs,1
,refit,True
,cv,
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,score_func,<function mut...t 0x142382c40>
,k,16

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [23]:
#Get the best pipeline
best_pipeline = mutualinfo_grid.best_estimator_

best_selector = best_pipeline.named_steps['feature_select']

#Getting a mask: assigning True for selected features and False for dropped features
feature_mask = best_selector.get_support()

#Get feature names from the Preprocessor
preprocessor = best_pipeline.named_steps['preprocessor']
feature_names_out = preprocessor.get_feature_names_out()

#Created a loop condition to group selected features together and dropped features together
selected_features = [f for f, s in zip(feature_names_out, feature_mask) if s]
dropped_features = [f for f, s in zip(feature_names_out, feature_mask) if not s]

print("Selected Features:", selected_features)
print("Dropped Features:", dropped_features)

Selected Features: ['num__age', 'num__balance', 'num__day', 'num__duration', 'num__previous', 'num__pdays_duration', 'num__prev_contacted', 'cat__housing_no', 'cat__housing_yes', 'cat__contact_cellular', 'cat__contact_unknown', 'cat__month_may', 'cat__month_oct', 'cat__month_sep', 'cat__poutcome_success', 'cat__poutcome_unknown']
Dropped Features: ['num__campaign', 'cat__job_admin.', 'cat__job_blue-collar', 'cat__job_entrepreneur', 'cat__job_housemaid', 'cat__job_management', 'cat__job_retired', 'cat__job_self-employed', 'cat__job_services', 'cat__job_student', 'cat__job_technician', 'cat__job_unemployed', 'cat__job_unknown', 'cat__job_None', 'cat__marital_divorced', 'cat__marital_married', 'cat__marital_single', 'cat__education_primary', 'cat__education_secondary', 'cat__education_tertiary', 'cat__education_unknown', 'cat__education_None', 'cat__default_no', 'cat__default_yes', 'cat__loan_no', 'cat__loan_yes', 'cat__contact_telephone', 'cat__month_apr', 'cat__month_aug', 'cat__month_d

In [None]:
# check mutual info stats
print("best k:", mutualinfo_grid.best_params_['feature_select__k'])
print("score of best k:", mutualinfo_grid.best_score_)

best k: 16
score of best k: 0.7952727272727274


<div style="max-width: 1000px; font-family: 'Georgia', serif; font-size: 14px; line-height: 1.6;">

# **Comments**

The second pipeline using mutualinfo for ranking performed the best compared with the first pipeline using f-score. The mutualinfo method achieved a best score of 0.795. The f-score achieved a best score of 0.7857. However, results are not improved compared to the first assignment. In the previous assignment, our tuned XGBoost model achieved a best score of 0.8645. This is equivalent to a 8.04% decrease in performance when using our best pipeline (mutualinfo) compared with our best tuned XGBoost model. 


### **Both pipelines have the same number of features (k=16) but they are not all identical.**

#### **These are the features present in the second list but absent from the first list:**

Features included in the first pipeline (f_score) and not in the second one (mutualinfo): 

> - num__campaign, cat__job_None, cat__education_None, cat__month_mar


Features included in the second pipeline (mutualinfo) and not in the first one (f_score):

> - num__age, num__balance, num__day, num__pdays_duration


####  **We now perform an analysis of the features kept by the mutualinfo pipeline:**


The main factors influencing the prediction of whether the client as subscribed to a term deposit appear to be the age of the client, his average yearly balance, the last contact duration with the client and the number of contacts performed befre this campaign are the main drivers in predicting the target variable. This sounds reasonable as firstly, age usually is an important factor for financial wealth (older clients usually have more wealth than younger ones as they had time to accumulate it). Then, yearly balance also is a strong indicator of a client's financial wealth. A relatively high yearly balance means that he/she will have more money for savings and potentially more likely to subscribed to a term deposit. Finally, the final two features mentioned here can be interpreted as indicators of customer satisfication and trust in the bank. This most likely affects the decision of a client to subscribe to a term deposit with the bank. 

On a similar note, previous success appears to be an important driver. The selection of num__previous, num__prev_contacted, and cat__poutcome_success suggests that history repeats itself.This means that a customer who bought from the bank in a past campaign is a good candidate for buying in the next campaign (we could link that with customer sastifaction).

 We also notice a possible seasonality in the features as only certain specific months are selected (may, october, september) and the rest dropped. This could be because clients have higher spending at specific moments in the year and more money readily available at other times (during celebrations, after going on vacation).
 

### **Comments on the drop in performance using SelectKBest**


As mentioned above, we notice a significant drop in performance from using SelectKBest compared to the XGBoost model in the first assigment. This could be for several reasons. 

Firstly, SelectKBest is a univariate method. This means that it looks at each feature in isolation when deciding whether to keep it or not. XGBoost, on the contrary is multivariate and builds trees that find interactions between features. We could think of this in the following way: being married might not be significant on its own and the month of October might not be significant on its own again. However, interacting the two variables (a married customer in October) might display greater signifance. The idea here is to consider the link between the month of October (9 months after Valentine's day) and married couples whom therefore are more likely to expect a baby in that month and thus not subscribe to a term deposit as they will need all funds for their baby. This is just one example among many but what matters is that the SelectKBest method fails to see these correlations and thus drops potentially meaningful targets. 

We also have a problem of high cardinality in our model as we used the one-hot encoding method for our categorical variables. This means that we have many columns filled with 0s. It is possible that SelectKBest assigns a lower score to these sparse columns and gives them low statistical importance because of that. For example, even if job_student only applies to 2% of the data, that 2% might be highly predictable. XGBoost could have isolated that group while SelectKBest saw it as statistically insignificant noise and removed it. 

Finally, a last reason that could help explain the drop in performance is that XGBoost already has a sort of in-built feature selection process. If a feature is irrelevant, and adds noise, XGBoost will not include it in its splits. When using SelectKBest, we are essentially already cleaning the data and removing features considered as noise by the method. However, we are essentially removing options for the XGBoost model to make connections that SelectKBest could not have made and thus leading to potentially poorer performance. 




