In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_auc_score
from sklearn.tree import export_text
from sklearn.metrics import f1_score
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
%matplotlib inline

In [2]:
df = pd.read_csv('bank-full.csv',sep=';')
print("\nNumber of samples:",df.shape[0],"and number of features:",df.shape[1],"\n")
# read the data and display the first 5 rows
#df.head()


Number of samples: 45211 and number of features: 17 



In [3]:
df_test = pd.read_csv('bank.csv',sep=';')
print("\nNumber of samples:",df_test.shape[0],"and number of features:",df_test.shape[1],"\n")
# read the data and display the first 5 rows
df_test.head()


Number of samples: 4521 and number of features: 17 



Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no


In [4]:
df.y = (df.y == 'yes').astype(int) #changing our output (y) from categorical to numerical
df_test.y = (df_test.y == 'yes').astype(int) #changing our output (y) from categorical to numerical

In [5]:
df['balance'] = df['balance'].mask(df['balance'] < 0, 0)
balance_logs = np.log1p(df.balance)
df['balance_logs'] = balance_logs




df_test["balance"].replace({-1: 0}, inplace=True) 
balance_logs = np.log1p(df_test.balance)
df_test['balance_logs'] = balance_logs
df_test.head()

  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y,balance_logs
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,0,7.488853
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,0,8.474286
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,0,7.2086
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,0,7.297768
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,0,0.0


In [6]:
del df['balance']# we delete balance and replacing it with balance_logs because it will do better in the model.
df=df[['age', 'job', 'marital', 'education', 'default', 'balance_logs', 'housing', 'loan',
       'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous',
       'poutcome', 'y']]# reordering DataFrame


del df_test['balance']# we delete balance and replacing it with balance_logs because it will do better in the model.
df_test=df_test[['age', 'job', 'marital', 'education', 'default', 'balance_logs', 'housing', 'loan',
       'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous',
       'poutcome', 'y']]# reordering DataFrame
df_test.head()

Unnamed: 0,age,job,marital,education,default,balance_logs,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,7.488853,no,no,cellular,19,oct,79,1,-1,0,unknown,0
1,33,services,married,secondary,no,8.474286,yes,yes,cellular,11,may,220,1,339,4,failure,0
2,35,management,single,tertiary,no,7.2086,yes,no,cellular,16,apr,185,1,330,1,failure,0
3,30,management,married,tertiary,no,7.297768,yes,yes,unknown,3,jun,199,4,-1,0,unknown,0
4,59,blue-collar,married,secondary,no,0.0,yes,no,unknown,5,may,226,1,-1,0,unknown,0


In [7]:
cols = ['age', 'job', 'marital', 'education', 'default', 'balance_logs', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome']

In [8]:
y_train=df.y.values
y_test= df_test.y.values

In [9]:
from sklearn.feature_extraction import DictVectorizer
train_dict = df[cols].fillna(0).to_dict(orient='records')

dv = DictVectorizer(sparse=False)
dv.fit(train_dict)

X_train = dv.transform(train_dict)

In [10]:
test_dict = df_test[cols].fillna(0).to_dict(orient='records')
X_test = dv.transform(test_dict)

In [12]:
features = dv.get_feature_names()
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=features)
dval = xgb.DMatrix(X_test, label=y_test, feature_names=features)

In [13]:
watchlist = [(dtrain, 'train'), (dval, 'val')]

In [14]:
#%%capture output

xgb_params = {
    'eta': 0.1, 
    'max_depth': 5,
    'min_child_weight': 30,
    
    'objective': 'binary:logistic',
    'eval_metric': 'auc',

    'nthread': 8,
    'seed': 1,
    'verbosity': 1,
}

model = xgb.train(xgb_params, dtrain, num_boost_round=200,
                  verbose_eval=5,
                  evals=watchlist)

[0]	train-auc:0.86276	val-auc:0.85808
[5]	train-auc:0.87928	val-auc:0.88056
[10]	train-auc:0.90485	val-auc:0.89506
[15]	train-auc:0.90954	val-auc:0.90323
[20]	train-auc:0.91277	val-auc:0.90689
[25]	train-auc:0.91705	val-auc:0.91063
[30]	train-auc:0.91974	val-auc:0.91339
[35]	train-auc:0.92245	val-auc:0.91563
[40]	train-auc:0.92449	val-auc:0.91733
[45]	train-auc:0.92679	val-auc:0.91948
[50]	train-auc:0.92887	val-auc:0.92143
[55]	train-auc:0.93060	val-auc:0.92335
[60]	train-auc:0.93306	val-auc:0.92626
[65]	train-auc:0.93469	val-auc:0.92839
[70]	train-auc:0.93595	val-auc:0.93012
[75]	train-auc:0.93686	val-auc:0.93110
[80]	train-auc:0.93784	val-auc:0.93230
[85]	train-auc:0.93841	val-auc:0.93295
[90]	train-auc:0.93938	val-auc:0.93424
[95]	train-auc:0.94008	val-auc:0.93490
[100]	train-auc:0.94075	val-auc:0.93557
[105]	train-auc:0.94169	val-auc:0.93670
[110]	train-auc:0.94228	val-auc:0.93741
[115]	train-auc:0.94266	val-auc:0.93800
[120]	train-auc:0.94304	val-auc:0.93837
[125]	train-auc:0.9433

In [15]:
y_pred1 = model.predict(dval)
y_pred2 = model.predict(dtrain)

auc1=roc_auc_score(y_test, y_pred1)
auc2=roc_auc_score(y_train, y_pred2)

print ('auc_val = %.4f\t, auc_train = %.4f' %(auc1, auc2))



auc_val = 0.9450	, auc_train = 0.9484


In [16]:
import pickle

In [17]:
output_file = 'model_1.bin'

In [18]:
with open(output_file, 'wb') as f_out:
    pickle.dump((dv, model), f_out)

Loading the model

In [19]:
import pickle
import xgboost as xgb
import numpy as np

In [20]:
model_file = 'model_1.bin'

In [21]:
with open (model_file, 'rb') as f_in:
    dv, model = pickle.load(f_in)

In [22]:
dv, model

(DictVectorizer(sparse=False), <xgboost.core.Booster at 0x23fa4f11220>)

In [23]:
customer = {
    'age' : 30,
    'job' : 'unemployed',
    'martial' : 'married',
    'education' : 'primary',
    'default' : 'no',
    'balance_logs' : np.log1p(3.25),
    'housing' : 'no',
    'loan' : 'no',
    'contact' : 'cellular',
    'day' : 19,
    'month' : 'oct',
    'duration' : 79,
    'campaign' : 1,
    'pdays' : -1,
    'previous' : 0,
    'poutcome' : 'unknown'    
}

In [24]:
X=dv.transform([customer])

In [25]:
x=xgb.DMatrix(X, label=([0]), feature_names=dv.get_feature_names())

In [26]:
pred=model.predict(x)

In [27]:
pred[0] 

0.046055023