In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

In [2]:
df = pd.read_csv('data/bank_marketing.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 17 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  campaign        41188 non-null  int64  
 11  previous        41188 non-null  int64  
 12  poutcome        41188 non-null  object 
 13  cons.price.idx  41188 non-null  float64
 14  cons.conf.idx   41188 non-null  float64
 15  euribor3m       41188 non-null  float64
 16  y               41188 non-null  object 
dtypes: float64(3), int64(3), object

In [4]:
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,campaign,previous,poutcome,cons.price.idx,cons.conf.idx,euribor3m,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,1,0,nonexistent,93.994,-36.4,4.857,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,1,0,nonexistent,93.994,-36.4,4.857,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,1,0,nonexistent,93.994,-36.4,4.857,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,1,0,nonexistent,93.994,-36.4,4.857,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,1,0,nonexistent,93.994,-36.4,4.857,no


#### Renaming ugly values

In [5]:
df.rename(columns={'cons.price.idx':'cons_price_idx', 'cons.conf.idx':'cons_conf_idx'}, inplace=True)
df['education'] = df['education'].map(lambda x: x.replace('.','_'))
df['job'] = df['job'].map(lambda x: x.replace('.',""))

#### extract to csv(for dash)

In [6]:
df.to_csv('data/fixed_names_df.csv')

In [7]:
df[df['y'] == 'yes'].shape[0]

4640

In [8]:
df[df['y'] == 'yes'].shape[0] / df.shape[0]

0.11265417111780131

In [9]:
df['y'] = df['y'].map(lambda x: 1 if x=='yes' else 0)

In [10]:
X = df.loc[:,df.columns != 'y']
y = df['y']

In [11]:
X = pd.get_dummies(X)

In [12]:
X.head()

Unnamed: 0,age,campaign,previous,cons_price_idx,cons_conf_idx,euribor3m,job_admin,job_blue-collar,job_entrepreneur,job_housemaid,...,month_oct,month_sep,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success
0,56,1,0,93.994,-36.4,4.857,0,0,0,1,...,0,0,0,1,0,0,0,0,1,0
1,57,1,0,93.994,-36.4,4.857,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
2,37,1,0,93.994,-36.4,4.857,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
3,40,1,0,93.994,-36.4,4.857,1,0,0,0,...,0,0,0,1,0,0,0,0,1,0
4,56,1,0,93.994,-36.4,4.857,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0


In [13]:
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2)

In [14]:
sum(y_train == 1) / len(y_train)

0.11274658573596358

In [15]:
sum(y_test == 1) / len(y_test)

0.11228453508133042

#### Baseline model with default values for GradientBoostingClassifier

In [16]:
gb_clf = GradientBoostingClassifier()
gb_clf.fit(X_train, y_train)
y_score_gb = gb_clf.predict_proba(X_test)[:,1]
auc = metrics.roc_auc_score(y_test, y_score_gb)
print(f"AUC for baseline model: {auc}")

AUC for baseline model: 0.804908992131746


#### Naive try of fine tuning the parameters

In [18]:
learning_rates = [1, 0.5, 0.25, 0.1, 0.05, 0.01]
n_estimators = [16, 32, 64, 128, 256, 512]
max_depth = [2, 3, 4, 5, 6, 7]
for eta in learning_rates:
    for n in n_estimators:
        for depth in max_depth:
            gb_clf = GradientBoostingClassifier(learning_rate=eta, n_estimators=n, max_depth=depth)
            gb_clf.fit(X_train, y_train)
            y_score_gb = gb_clf.predict_proba(X_test)[:,1]
            auc = metrics.roc_auc_score(y_test, y_score_gb)
            print(f"AUC for learning rate={eta}, n estimators={n}, max depth={depth} model: {auc}")

AUC for learning rate=1, n estimators=16, max depth=2 model: 0.7885880383323293
AUC for learning rate=1, n estimators=16, max depth=3 model: 0.8022610604587904
AUC for learning rate=1, n estimators=16, max depth=4 model: 0.7860216349263252
AUC for learning rate=1, n estimators=16, max depth=5 model: 0.7761373488899812
AUC for learning rate=1, n estimators=16, max depth=6 model: 0.772705474515949
AUC for learning rate=1, n estimators=16, max depth=7 model: 0.7546767880967252
AUC for learning rate=1, n estimators=32, max depth=2 model: 0.7893474412467986
AUC for learning rate=1, n estimators=32, max depth=3 model: 0.7984491889674441
AUC for learning rate=1, n estimators=32, max depth=4 model: 0.7874547732471977
AUC for learning rate=1, n estimators=32, max depth=5 model: 0.7735909025393506
AUC for learning rate=1, n estimators=32, max depth=6 model: 0.7372871709395707
AUC for learning rate=1, n estimators=32, max depth=7 model: 0.7305460472095232
AUC for learning rate=1, n estimators=64,

#### The best one is learning rate=0.05, n estimators=128, max depth=6

In [21]:
gb_clf = GradientBoostingClassifier(learning_rate=0.05, n_estimators=128, max_depth=6)
gb_clf.fit(X_train, y_train)
y_score_gb = gb_clf.predict_proba(X_test)[:,1]
auc = metrics.roc_auc_score(y_test, y_score_gb)
print(f"AUC for final model: {auc}")

AUC for final model: 0.8122002653549214


#### Baseline model with default values for RandomForestClassifier

In [23]:
rf_clf = RandomForestClassifier()
rf_clf.fit(X_train,y_train)
y_score_rf = rf_clf.predict_proba(X_test)[:,1]
auc = metrics.roc_auc_score(y_test, y_score_rf)
print(f"AUC for baseline model: {auc}")

AUC for baseline model: 0.7784619762658872


#### the baseline model of GB is better than the baseline model of RF, so lets choose the GB model

##### JOBLIB

In [24]:
import joblib

In [25]:
joblib.dump(gb_clf, 'GradientBoostingClassifier')

['GradientBoostingClassifier']