## Bank Marketing

Data Set Information:
The data is related to direct marketing campaigns of a Portuguese banking institution. The marketing campaigns were based on phone calls. Often, more than one contact with the same client was required in order to assess whether the client would subscribe (yes/no) to a term deposit (variable y).


Bank Marketing Data Set

Source: [Moro et al., 2014] S. Moro, P. Cortez and P. Rita. A Data-Driven Approach to Predict the Success of Bank Telemarketing. Decision Support Systems, Elsevier, 62:22-31, June 2014




In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [4]:
# Load the raw bank-marketing sample from the Excel workbook (path is relative to the working directory).
bank = pd.read_excel('bank_dirty_500.xlsx')

In [5]:
# `bank` is already a DataFrame; take an explicit copy so later cleaning steps
# on `df` can never alter the raw data held in `bank`.
df = bank.copy()

In [8]:
# Number of missing values in each column
df.isna().sum()

age                0
job                1
marital            3
education          7
default            5
housing            9
loan               7
contact           15
month             10
day_of_week        6
duration           8
campaign           4
pdays              5
previous           4
poutcome           5
emp.var.rate       4
cons.price.idx     6
cons.conf.idx      1
euribor3m          1
nr.employed        0
y                  0
dtype: int64

In [32]:
# Data source: https://www.kaggle.com/c/data-driven-decision-making-spring-2015/data
# Preview the first rows among those with a non-missing `duration`.
# The result is only displayed, not assigned, so `df` is left unchanged by this cell.
# NOTE(review): the earlier note here said "removed duration", but this code removes no column - confirm the intent.
df.dropna(subset=['duration']).head()

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,...,month_nov,month_oct,month_sep,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_nonexistent,poutcome_success,y_yes
0,0.844444,0.015393,0.0,1.0,0.166667,0.333333,0.26968,0.192469,0.14398,0.512287,...,0,0,0,0,0,1,0,0,0,0
1,0.222222,0.0011,0.727273,1.0,0.0,1.0,0.669135,0.338912,0.999538,1.0,...,0,0,0,0,1,0,0,1,0,0
2,0.222222,0.099505,0.045455,1.0,0.166667,0.083333,0.199532,0.74477,0.015253,0.226465,...,0,0,0,1,0,0,0,0,0,1
3,0.288889,0.004123,0.0,1.0,0.0,0.9375,0.698753,0.60251,0.973423,0.859735,...,0,0,0,0,0,0,0,1,0,0
4,0.355556,0.0547,0.090909,1.0,0.0,1.0,0.669135,0.338912,0.998151,1.0,...,0,0,0,1,0,0,0,1,0,0


In [11]:
# One-hot encode the categorical columns; drop_first=True drops the first level of each
# variable so the indicator columns are not perfectly collinear.
# NOTE(review): with the default dummy_na=False, a row whose categorical value is missing
# gets all-zero indicators, i.e. it looks like the dropped baseline level - confirm this is intended.
df = pd.get_dummies(df, drop_first=True)
df.head()

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,...,month_nov,month_oct,month_sep,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_nonexistent,poutcome_success,y_yes
0,59,61.0,1.0,999.0,1.0,-1.8,92.893,-46.2,1.266,5099.1,...,0,0,0,0,0,1,0,0,0,0
1,31,9.0,17.0,999.0,0.0,1.4,93.918,-42.7,4.968,5228.1,...,0,0,0,0,1,0,0,1,0,0
2,31,367.0,2.0,999.0,1.0,-3.0,92.713,-33.0,0.709,5023.5,...,0,0,0,1,0,0,0,0,0,1
3,34,20.0,1.0,999.0,0.0,1.1,93.994,-36.4,4.855,5191.0,...,0,0,0,0,0,0,0,1,0,0
4,37,204.0,3.0,999.0,0.0,1.4,93.918,-42.7,4.962,5228.1,...,0,0,0,1,0,0,0,1,0,0


In [12]:
# Mean-impute missing values column by column.
# `sklearn.preprocessing.Imputer` was removed in scikit-learn 0.22; `SimpleImputer`
# (scikit-learn >= 0.20) is its replacement and always works per column.
# Other available strategies: 'median', 'most_frequent'.
try:
    from sklearn.impute import SimpleImputer
    imr = SimpleImputer(missing_values=np.nan, strategy='mean')
except ImportError:  # scikit-learn < 0.20 only ships the legacy class
    from sklearn.preprocessing import Imputer
    imr = Imputer(missing_values='NaN', strategy='mean', axis=0)  # axis=0 -> impute along columns

# Force a float matrix so indicator columns (uint8/bool) do not turn `.values` into an object array.
feature_matrix = df.values.astype(float)
imr = imr.fit(feature_matrix)
imputed_data = imr.transform(feature_matrix)
# NOTE(review): `imputed_data` is only displayed here; the visible cells that follow drop
# incomplete rows from `df` instead of using this imputed matrix.
imputed_data

array([[ 59.,  61.,   1., ...,   0.,   0.,   0.],
       [ 31.,   9.,  17., ...,   1.,   0.,   0.],
       [ 31., 367.,   2., ...,   0.,   0.,   1.],
       ...,
       [ 60., 105.,   3., ...,   1.,   0.,   0.],
       [ 26., 200.,   2., ...,   1.,   0.,   0.],
       [ 37., 288.,   1., ...,   1.,   0.,   0.]])

In [31]:
# Keep only fully observed rows: a row is removed if any of its values is missing.
df = df.dropna(axis='index', how='any')
df.head()

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,...,month_nov,month_oct,month_sep,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_nonexistent,poutcome_success,y_yes
0,0.844444,0.015393,0.0,1.0,0.166667,0.333333,0.26968,0.192469,0.14398,0.512287,...,0,0,0,0,0,1,0,0,0,0
1,0.222222,0.0011,0.727273,1.0,0.0,1.0,0.669135,0.338912,0.999538,1.0,...,0,0,0,0,1,0,0,1,0,0
2,0.222222,0.099505,0.045455,1.0,0.166667,0.083333,0.199532,0.74477,0.015253,0.226465,...,0,0,0,1,0,0,0,0,0,1
3,0.288889,0.004123,0.0,1.0,0.0,0.9375,0.698753,0.60251,0.973423,0.859735,...,0,0,0,0,0,0,0,1,0,0
4,0.355556,0.0547,0.090909,1.0,0.0,1.0,0.669135,0.338912,0.998151,1.0,...,0,0,0,1,0,0,0,1,0,0


In [16]:
# Pick the genuinely numeric columns (int64 / float64) for scaling; the one-hot
# indicator columns have a different dtype and are deliberately left out.
# Explicit widths are used because the string aliases 'int' / 'float' resolve to the
# platform C types (e.g. 32-bit int on Windows with NumPy 1.x), which would silently
# skip int64 columns such as `age`.
types = df.dtypes
list_of_float = list(types[types == np.float64].index)
list_of_int = list(types[types == np.int64].index)
# Integer columns first, then float columns; later cells rely on this order.
list_of_float_int = list_of_int + list_of_float

df1 = df[list_of_float_int]
df1.head()

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
0,59,61.0,1.0,999.0,1.0,-1.8,92.893,-46.2,1.266,5099.1
1,31,9.0,17.0,999.0,0.0,1.4,93.918,-42.7,4.968,5228.1
2,31,367.0,2.0,999.0,1.0,-3.0,92.713,-33.0,0.709,5023.5
3,34,20.0,1.0,999.0,0.0,1.1,93.994,-36.4,4.855,5191.0
4,37,204.0,3.0,999.0,0.0,1.4,93.918,-42.7,4.962,5228.1


In [17]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Rescale every numeric column of `df1` to the [0, 1] range.
# NOTE(review): the scaler is fitted on all rows of `df1`, i.e. before any train/test
# split is made, so the min/max used for scaling also come from rows that later land
# in the test folds.
mms = MinMaxScaler()
df1_norm = mms.fit(df1).transform(df1)
print(df1_norm)

[[0.84444444 0.01539307 0.         ... 0.19246862 0.14397966 0.51228733]
 [0.22222222 0.00109951 0.72727273 ... 0.33891213 0.99953779 1.        ]
 [0.22222222 0.09950522 0.04545455 ... 0.74476987 0.01525306 0.22646503]
 ...
 [0.35555556 0.08548653 0.04545455 ... 0.60251046 0.97388491 0.85973535]
 [0.11111111 0.05360088 0.04545455 ... 0.33891213 0.99768893 1.        ]
 [0.35555556 0.07778999 0.         ... 0.37656904 0.99792004 1.        ]]


In [19]:
# Overwrite the numeric columns of `df` with their min-max scaled values.
# `df1_norm` is a plain array, so values are matched to `list_of_float_int` by position;
# this is consistent because `df1` was selected with that same column list.
df[list_of_float_int] = df1_norm
df.head()

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,...,month_nov,month_oct,month_sep,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_nonexistent,poutcome_success,y_yes
0,0.844444,0.015393,0.0,1.0,0.166667,0.333333,0.26968,0.192469,0.14398,0.512287,...,0,0,0,0,0,1,0,0,0,0
1,0.222222,0.0011,0.727273,1.0,0.0,1.0,0.669135,0.338912,0.999538,1.0,...,0,0,0,0,1,0,0,1,0,0
2,0.222222,0.099505,0.045455,1.0,0.166667,0.083333,0.199532,0.74477,0.015253,0.226465,...,0,0,0,1,0,0,0,0,0,1
3,0.288889,0.004123,0.0,1.0,0.0,0.9375,0.698753,0.60251,0.973423,0.859735,...,0,0,0,0,0,0,0,1,0,0
4,0.355556,0.0547,0.090909,1.0,0.0,1.0,0.669135,0.338912,0.998151,1.0,...,0,0,0,1,0,0,0,1,0,0


In [20]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split

# Number of random train/test splits (seeds 0 .. Number_trials-1) that each train_* function averages over.
Number_trials=50

def train_knn(X, y):
    """Estimate kNN test accuracy and the best ``n_neighbors`` over repeated splits.

    For every seed in ``range(Number_trials)`` the data is split 75/25 with
    ``train_test_split`` and a ``KNeighborsClassifier`` is fitted for each
    ``n_neighbors`` in 1..44. Test accuracies are averaged across seeds and the
    setting with the highest mean accuracy is reported.

    Parameters
    ----------
    X : feature matrix accepted by scikit-learn estimators.
    y : target labels aligned with ``X``.

    Returns
    -------
    list
        ``['kNN', best mean test accuracy, 'N_Neighbor = <k>', 'NA']``.
    """
    neighbors_settings = range(1, 45)  # loop-invariant, so built once
    score_test = []

    for seed in range(Number_trials):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=seed)

        # Only the test accuracy feeds the returned summary, so the (costly)
        # training-set scoring is not computed.
        score_test.append([
            KNeighborsClassifier(n_neighbors=n_neighbors).fit(X_train, y_train).score(X_test, y_test)
            for n_neighbors in neighbors_settings
        ])

    # Mean test accuracy per n_neighbors setting, averaged over all seeds
    score = np.mean(score_test, axis=0)
    best_k = neighbors_settings[int(np.argmax(score))]
    return ['kNN', np.amax(score), 'N_Neighbor = {0}'.format(best_k), 'NA']

def train_logistic(X, y, reg):
    """Grid-search ``C`` for a regularized logistic regression over repeated splits.

    For every seed in ``range(Number_trials)`` the data is split 75/25 and one
    model per value of ``C`` is fitted. Test accuracies are averaged across seeds.
    The "top predictor" is the feature with the largest absolute coefficient,
    using the coefficients of the ``C == 0.01`` models averaged over all seeds
    (not the coefficients at the best ``C``).

    Parameters
    ----------
    X : feature matrix; must expose ``.columns`` when the global ``scaler`` is 'OFF'.
    y : target labels aligned with ``X``.
    reg : penalty passed to ``LogisticRegression`` ('l1' or 'l2').

    Returns
    -------
    list
        ``['Logistic (<reg>)', best mean test accuracy, 'C = <best C>', top predictor]``.

    Raises
    ------
    ValueError
        If the module-level ``scaler`` flag is neither 'ON' nor 'OFF'.
    """
    # Fail fast instead of hitting an unbound `top_predictor` after the whole grid has run.
    if scaler not in ('ON', 'OFF'):
        raise ValueError("scaler must be 'ON' or 'OFF', got {0!r}".format(scaler))

    C = [1e-8, 1e-4, 1e-3, 1e-2, 0.1, 0.2,0.4, 0.75, 1, 1.5, 3, 5, 10, 15,  20, 100, 300, 1000, 5000]

    score_test = []
    weighted_coefs = []

    for seed in range(Number_trials):
        test_accuracy = []
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=seed)
        for alpha_run in C:
            # liblinear supports both 'l1' and 'l2'. It was the default solver in older
            # scikit-learn; newer releases default to lbfgs, which rejects penalty='l1'.
            lr = LogisticRegression(C=alpha_run, penalty=reg, solver='liblinear').fit(X_train, y_train)
            test_accuracy.append(lr.score(X_test, y_test))
            if alpha_run == 0.01:
                weighted_coefs.append(lr.coef_)  # one coefficient set per trial

        score_test.append(test_accuracy)

    # Average the C == 0.01 coefficients over all trials, and the accuracies per C value
    mean_coefs = np.mean(weighted_coefs, axis=0)
    score = np.mean(score_test, axis=0)

    if scaler == 'ON':
        top_predictor = 'DEBUGGING'
    else:
        top_predictor = X.columns[np.argmax(np.abs(mean_coefs))]

    return ['Logistic ({0})'.format(reg), np.amax(score), \
            'C = {0}'.format(C[np.argmax(score)]), top_predictor]

def train_svm(X, y, reg):
    """Grid-search ``C`` for a linear SVM over repeated train/test splits.

    For every seed in ``range(Number_trials)`` the data is split 75/25 and one
    ``LinearSVC`` per value of ``C`` is fitted. Test accuracies are averaged across
    seeds. The "top predictor" is the feature with the largest absolute
    coefficient, using the coefficients of the ``C == 0.01`` models averaged over
    all seeds (not the coefficients at the best ``C``).

    Parameters
    ----------
    X : feature matrix; must expose ``.columns`` when the global ``scaler`` is 'OFF'.
    y : target labels aligned with ``X``.
    reg : 'l1' or 'l2' penalty.

    Returns
    -------
    list
        ``['Linear SVM (<reg>)', best mean test accuracy, 'C = <best C>', top predictor]``.

    Raises
    ------
    ValueError
        If ``reg`` is not 'l1'/'l2', or the module-level ``scaler`` flag is neither
        'ON' nor 'OFF'.
    """
    # Validate up front so a bad argument fails immediately rather than as an
    # unbound-variable error in the middle (or at the end) of the grid search.
    if reg == 'l1':
        # The l1 penalty is only used here with squared hinge loss in the primal form.
        svc_options = {'loss': 'squared_hinge', 'dual': False}
    elif reg == 'l2':
        svc_options = {}
    else:
        raise ValueError("reg must be 'l1' or 'l2', got {0!r}".format(reg))
    if scaler not in ('ON', 'OFF'):
        raise ValueError("scaler must be 'ON' or 'OFF', got {0!r}".format(scaler))

    C = [1e-8, 1e-4, 1e-3, 1e-2, 0.1, 0.2,0.4, 0.75, 1, 1.5, 3, 5, 10, 15,  20, 100, 300, 1000, 5000]

    score_test = []
    weighted_coefs = []

    for seed in range(Number_trials):
        test_accuracy = []
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=seed)
        for alpha_run in C:
            svc = LinearSVC(C=alpha_run, penalty=reg, **svc_options).fit(X_train, y_train)
            test_accuracy.append(svc.score(X_test, y_test))
            if alpha_run == 0.01:
                weighted_coefs.append(svc.coef_)  # one coefficient set per trial

        score_test.append(test_accuracy)

    # Average the C == 0.01 coefficients over all trials, and the accuracies per C value
    mean_coefs = np.mean(weighted_coefs, axis=0)
    score = np.mean(score_test, axis=0)

    if scaler == 'ON':
        top_predictor = 'DEBUGGING'
    else:
        top_predictor = X.columns[np.argmax(np.abs(mean_coefs))]

    return ['Linear SVM ({0})'.format(reg), np.amax(score), \
            'C = {0}'.format(C[np.argmax(score)]), top_predictor]

In [21]:
# %%time
# `scaler` is a module-level flag read inside train_logistic / train_svm: with 'OFF'
# they report the top predictor by column name taken from X.columns.
scaler = 'OFF'

# Target is the encoded `y_yes` indicator; every other column is a feature.
y = df['y_yes']
X = df.drop(['y_yes'], axis=1)

a = train_knn(X, y)
b = train_logistic(X, y, reg='l2')
c = train_logistic(X, y, reg='l1')
d = train_svm(X, y, reg='l2')
e = train_svm(X, y, reg='l1')

# One summary row per model, in the same order as fitted above
print(a, b, c, d, e, sep='\n')

['kNN', 0.8830508474576271, 'N_Neighbor = 11', 'NA']
['Logistic (l2)', 0.8928813559322033, 'C = 100', 'pdays']
['Logistic (l1)', 0.9057627118644064, 'C = 1.5', 'pdays']
['Linear SVM (l2)', 0.8930508474576273, 'C = 5', 'pdays']
['Linear SVM (l1)', 0.9033898305084749, 'C = 0.4', 'pdays']


In [22]:
# Collect the per-model summaries into one comparison table.
cols = ['Machine Learning Method', 'Test Accuracy', 'Best Parameter', 'Top Predictor Variable']

# Build the frame once from all rows instead of enlarging an empty frame row by row.
df2 = pd.DataFrame([a, b, c, d, e], columns=cols)

df2

Unnamed: 0,Machine Learning Method,Test Accuracy,Best Parameter,Top Predictor Variable
0,kNN,0.883051,N_Neighbor = 11,
1,Logistic (l2),0.892881,C = 100,pdays
2,Logistic (l1),0.905763,C = 1.5,pdays
3,Linear SVM (l2),0.893051,C = 5,pdays
4,Linear SVM (l1),0.90339,C = 0.4,pdays


Best model: L1-regularized logistic regression (C = 1.5), with a mean test accuracy of 90.58% over 50 random train/test splits.