Bank Marketing Data Set
The data relates to the direct marketing campaigns of a Portuguese banking institution. The classification goal is to predict whether the client will subscribe to a term deposit (variable y).


In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import f1_score, roc_auc_score, precision_score, classification_report, precision_recall_curve, confusion_matrix
from sklearn.pipeline import FeatureUnion

In [2]:
# The file is semicolon-delimited. `sep` must be passed by keyword: pandas 2.0
# made every read_csv argument after the path keyword-only, so a positional
# ';' raises a TypeError there.
df = pd.read_csv("./bank.csv", sep=';')
df

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4516,33,services,married,secondary,no,-333,yes,no,cellular,30,jul,329,5,-1,0,unknown,no
4517,57,self-employed,married,tertiary,yes,-3313,yes,yes,unknown,9,may,153,1,-1,0,unknown,no
4518,57,technician,married,secondary,no,295,no,no,cellular,19,aug,151,11,-1,0,unknown,no
4519,28,blue-collar,married,secondary,no,1137,no,no,cellular,6,feb,129,4,211,3,other,no


In [3]:
# Class balance of the raw label column `y`.
df['y'].value_counts()

no     4000
yes     521
Name: y, dtype: int64

In [4]:
# Give the label column the name `target`.
df = df.rename(columns={'y': 'target'})
df

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,target
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4516,33,services,married,secondary,no,-333,yes,no,cellular,30,jul,329,5,-1,0,unknown,no
4517,57,self-employed,married,tertiary,yes,-3313,yes,yes,unknown,9,may,153,1,-1,0,unknown,no
4518,57,technician,married,secondary,no,295,no,no,cellular,19,aug,151,11,-1,0,unknown,no
4519,28,blue-collar,married,secondary,no,1137,no,no,cellular,6,feb,129,4,211,3,other,no


In [5]:
# Encode the label with an exact-match mapping ('no' -> 0, 'yes' -> 1).
# A regex replace matches substrings and relies on pandas inferring an integer
# dtype afterwards; mapping and casting makes both steps explicit, and any
# unexpected label becomes NaN so the int cast raises instead of passing silently.
# The dtype guard keeps the cell safe to re-run once the column is numeric.
if not pd.api.types.is_integer_dtype(df['target']):
    df['target'] = df['target'].map({'no': 0, 'yes': 1}).astype('int64')

In [6]:
# Remove the contact, day and month columns from the working frame.
df = df.drop(columns=['contact', 'day', 'month'])
df

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,duration,campaign,pdays,previous,poutcome,target
0,30,unemployed,married,primary,no,1787,no,no,79,1,-1,0,unknown,0
1,33,services,married,secondary,no,4789,yes,yes,220,1,339,4,failure,0
2,35,management,single,tertiary,no,1350,yes,no,185,1,330,1,failure,0
3,30,management,married,tertiary,no,1476,yes,yes,199,4,-1,0,unknown,0
4,59,blue-collar,married,secondary,no,0,yes,no,226,1,-1,0,unknown,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4516,33,services,married,secondary,no,-333,yes,no,329,5,-1,0,unknown,0
4517,57,self-employed,married,tertiary,yes,-3313,yes,yes,153,1,-1,0,unknown,0
4518,57,technician,married,secondary,no,295,no,no,151,11,-1,0,unknown,0
4519,28,blue-collar,married,secondary,no,1137,no,no,129,4,211,3,other,0


In [7]:
# Summary of column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        4521 non-null   int64 
 1   job        4521 non-null   object
 2   marital    4521 non-null   object
 3   education  4521 non-null   object
 4   default    4521 non-null   object
 5   balance    4521 non-null   int64 
 6   housing    4521 non-null   object
 7   loan       4521 non-null   object
 8   duration   4521 non-null   int64 
 9   campaign   4521 non-null   int64 
 10  pdays      4521 non-null   int64 
 11  previous   4521 non-null   int64 
 12  poutcome   4521 non-null   object
 13  target     4521 non-null   int64 
dtypes: int64(7), object(7)
memory usage: 494.6+ KB


In [8]:
# Hold out a test set; features are every column except `target`.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop('target', axis=1),
    df['target'],
    random_state=0,
)

In [9]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    """Transformer that picks out ``X[column]`` from its input."""

    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        """Nothing to learn; returns the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return the entry of ``X`` stored under ``self.column``."""
        selected = X[self.column]
        return selected
    

class NumberSelector(BaseEstimator, TransformerMixin):
    """Transformer that selects ``X[[key]]``, i.e. indexes with a one-item list."""

    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        """Nothing to learn; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by the single-element list ``[self.key]``."""
        wanted = [self.key]
        return X[wanted]
    
    
class OHEEncoder(BaseEstimator, TransformerMixin):
    """One-hot encoder with a fixed output layout.

    ``fit`` remembers the dummy column names produced by ``pd.get_dummies`` in
    ``self.columns``; ``transform`` always returns exactly those columns, in
    that order.
    """

    def __init__(self, key):
        self.key = key
        self.columns = []

    def fit(self, X, y=None):
        """Record the dummy column names generated for ``X``."""
        self.columns = list(pd.get_dummies(X, prefix=self.key).columns)
        return self

    def transform(self, X):
        """Encode ``X`` and align the result to the fitted columns.

        Fitted columns absent from this batch are added and filled with 0;
        dummy columns that were not seen during ``fit`` are left out.
        """
        encoded = pd.get_dummies(X, prefix=self.key)
        present = set(encoded.columns)

        for name in self.columns:
            if name in present:
                continue
            encoded[name] = 0
        return encoded[self.columns]


# Numeric columns are passed through as-is; categorical ones are one-hot encoded.
continuos_cols = ['age', 'balance', 'campaign','duration','pdays','previous']
cat_cols = ['job','marital','education', 'poutcome','default','housing','loan']


# (name, transformer) pairs: categorical columns first, then numeric ones.
final_transformers = []

for cat_col in cat_cols:
    cat_transformer = Pipeline(steps=[
        ('selector', FeatureSelector(column=cat_col)),
        ('ohe', OHEEncoder(key=cat_col)),
    ])
    final_transformers += [(cat_col, cat_transformer)]

for cont_col in continuos_cols:
    cont_transformer = Pipeline(steps=[('selector', NumberSelector(key=cont_col))])
    final_transformers += [(cont_col, cont_transformer)]


In [10]:
# Combine the per-column transformers into a single feature-building step.
feats = FeatureUnion(final_transformers)

In [11]:
# Baseline: random forest trained on the fully labelled training set.
pipeline = Pipeline(steps=[
    ('features', feats),
    ('classifier', RandomForestClassifier(random_state=42)),
])
pipeline.fit(X=X_train, y=y_train)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('job',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='job')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='job'))])),
                                                ('marital',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='marital')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='marital'))])),
                                                ('education',
                                                 Pipeline(steps=[('selector',
                       

In [12]:
# Keep column 1 of predict_proba (the probability of class 1) for each test row.
preds = pipeline.predict_proba(X_test)[:, 1]
preds[:10]

array([0.02, 0.05, 0.39, 0.04, 0.  , 0.3 , 0.01, 0.49, 0.08, 0.67])

In [13]:
# One row of evaluation metrics will be collected here per model.
metrics = pd.DataFrame(columns=['model', 'thresh', 'F-Score', 'Precision', 'Recall', 'ROC AUC'])


precision, recall, thresholds = precision_recall_curve(y_test, preds)

# F-score at every operating point. Where precision + recall == 0 a plain
# division yields NaN, and np.argmax would then pick that NaN as the "best"
# score, so such points are scored 0 instead.
pr_sum = precision + recall
fscore = np.divide(2 * precision * recall, pr_sum, out=np.zeros_like(pr_sum), where=pr_sum > 0)
# locate the index of the largest f score; precision/recall have one more
# entry than thresholds (the final point has no threshold), so that last
# point is excluded to keep thresholds[ix] valid
ix = np.argmax(fscore[:-1])
print(f'Best Threshold={thresholds[ix]}, F-Score={fscore[ix]:.3f}, Precision={precision[ix]:.3f}, Recall={recall[ix]:.3f}')

roc_auc = roc_auc_score(y_test, preds)
roc_auc



Best Threshold=0.3, F-Score=0.572, Precision=0.546, Recall=0.601


0.8621765401287271

In [14]:
# DataFrame.append was removed in pandas 2.0, so the row is built as a
# one-row frame and concatenated instead.
rf_row = pd.DataFrame([{
    'model': 'RF',
    'thresh': thresholds[ix],
    'F-Score': fscore[ix],
    'Precision': precision[ix],
    'Recall': recall[ix],
    'ROC AUC': roc_auc
}])
# Empty frames are skipped so concat never infers dtypes from a frame with no rows.
metrics = pd.concat(
    [part for part in (metrics, rf_row) if not part.empty],
    ignore_index=True,
)

metrics

Unnamed: 0,model,thresh,F-Score,Precision,Recall,ROC AUC
0,RF,0.3,0.572414,0.546053,0.601449,0.862177


In [15]:
# Copy of the training features with the true label attached as `target`.
mod_data = X_train.assign(target=y_train)
mod_data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,duration,campaign,pdays,previous,poutcome,target
4384,52,management,single,tertiary,no,4,no,yes,30,9,-1,0,unknown,0
2560,44,blue-collar,married,secondary,no,1071,yes,no,143,3,357,1,failure,0
1470,55,blue-collar,married,primary,no,4103,no,no,291,1,-1,0,unknown,0
1771,37,management,married,tertiary,no,347,yes,no,282,1,-1,0,unknown,0
2604,57,management,married,unknown,no,0,no,no,585,1,-1,0,unknown,0


In [16]:
# get the indices of the positives samples
pos_ind = mod_data[mod_data['target'] == 1].sample(frac=1, random_state=42).index

# leave just 25% of the positives marked
perc = 0.25
pos_sample_len = int(np.ceil(perc * len(pos_ind)))

print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]

Using 96/383 as positives and unlabeling the rest


In [17]:
# Mark every row as -1, then set only the sampled positives to 1.
mod_data['class_test'] = -1
mod_data.loc[pos_sample, 'class_test'] = 1
# iloc[:, -1] counts the values of the frame's last column.
print('target variable:\n', mod_data.iloc[:,-1].value_counts())

target variable:
 -1    3294
 1      96
Name: class_test, dtype: int64


In [18]:
# Shuffle the rows with a fixed seed.
mod_data = mod_data.sample(frac=1, random_state=42)


# Split on the class_test flag: rows marked -1 vs rows marked 1.
data_N = mod_data.loc[mod_data['class_test'].eq(-1)]
data_P = mod_data.loc[mod_data['class_test'].eq(1)]

# Take as many -1 rows as there are rows marked 1.
neg_sample = data_N.iloc[:len(data_P)]
pos_sample = data_P.copy()

print(neg_sample.shape, pos_sample.shape)
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1, random_state=42)

(96, 15) (96, 15)


In [19]:
# Recode the -1 rows as class 0 so the classifier sees 0/1 labels.
sample_train.loc[sample_train['class_test'].eq(-1), 'class_test'] = 0

pipeline = Pipeline(steps=[
    ('features', feats),
    ('classifier', RandomForestClassifier(random_state=42)),
])


# Train on the features only: both label columns are dropped from the inputs.
pipeline.fit(
    sample_train.drop(['class_test', 'target'], axis=1),
    sample_train['class_test'],
)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('job',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='job')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='job'))])),
                                                ('marital',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='marital')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='marital'))])),
                                                ('education',
                                                 Pipeline(steps=[('selector',
                       

In [20]:
# Keep column 1 of predict_proba (the probability of class 1) for each test row.
preds = pipeline.predict_proba(X_test)[:, 1]
preds[:10]

array([0.41, 0.54, 0.49, 0.09, 0.13, 0.5 , 0.2 , 0.66, 0.43, 0.73])

In [21]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)

# F-score at every operating point. Where precision + recall == 0 a plain
# division yields NaN, and np.argmax would then pick that NaN as the "best"
# score, so such points are scored 0 instead.
pr_sum = precision + recall
fscore = np.divide(2 * precision * recall, pr_sum, out=np.zeros_like(pr_sum), where=pr_sum > 0)
# locate the index of the largest f score; precision/recall have one more
# entry than thresholds (the final point has no threshold), so that last
# point is excluded to keep thresholds[ix] valid
ix = np.argmax(fscore[:-1])
print(f'Best Threshold={thresholds[ix]}, F-Score={fscore[ix]:.3f}, Precision={precision[ix]:.3f}, Recall={recall[ix]:.3f}')

Best Threshold=0.6, F-Score=0.457, Precision=0.387, Recall=0.558


In [22]:
# ROC AUC of the current predictions against the true test labels.
roc_auc = roc_auc_score(y_test, preds)
roc_auc

0.8327750777179386

In [23]:
# DataFrame.append was removed in pandas 2.0, so the row is built as a
# one-row frame and concatenated instead.
pul_row = pd.DataFrame([{
    'model': 'PUL',
    'thresh': thresholds[ix],
    'F-Score': fscore[ix],
    'Precision': precision[ix],
    'Recall': recall[ix],
    'ROC AUC': roc_auc
}])
# Empty frames are skipped so concat never infers dtypes from a frame with no rows.
metrics = pd.concat(
    [part for part in (metrics, pul_row) if not part.empty],
    ignore_index=True,
)

In [24]:
# Show the collected metrics rows.
metrics

Unnamed: 0,model,thresh,F-Score,Precision,Recall,ROC AUC
0,RF,0.3,0.572414,0.546053,0.601449,0.862177
1,PUL,0.6,0.456973,0.386935,0.557971,0.832775
