#### COGS 118 Project - Bank Marketing

In [108]:
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns


In [109]:
Bank_Marketing = pd.read_csv('bank-full.csv', sep=";")

### Data Exploration

In [110]:
Bank_Marketing

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,977,3,-1,0,unknown,yes
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0,unknown,yes
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3,success,yes
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0,unknown,no


In [111]:
# Check Null Values
print(Bank_Marketing.isnull().sum())

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64


In [112]:
# Check data types
print(Bank_Marketing.dtypes)

age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object


## Convert our Output Variable to Binary

In [113]:
# Convert target variable to binary
Bank_Marketing['y'] = Bank_Marketing['y'].map({'yes': 1, 'no': 0})

In [114]:
Bank_Marketing['y']

0        0
1        0
2        0
3        0
4        0
        ..
45206    1
45207    1
45208    1
45209    0
45210    0
Name: y, Length: 45211, dtype: int64

## Select Relavant columns

In [115]:
Bank_Marketing.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'y'],
      dtype='object')

In [129]:
columns = ['age','housing','education','balance','loan','duration','y']
Bank = Bank_Marketing[columns]

In [130]:
Bank

Unnamed: 0,age,housing,education,balance,loan,duration,y
0,58,yes,tertiary,2143,no,261,0
1,44,yes,secondary,29,no,151,0
2,33,yes,secondary,2,yes,76,0
3,47,yes,unknown,1506,no,92,0
4,33,no,unknown,1,no,198,0
...,...,...,...,...,...,...,...
45206,51,no,tertiary,825,no,977,1
45207,71,no,primary,1729,no,456,1
45208,72,no,secondary,5715,no,1127,1
45209,57,no,secondary,668,no,508,0


In [131]:
Bank['housing'] = Bank['housing'].map({'yes': 1, 'no': 0, 'unknown': np.nan})
Bank['loan'] = Bank['loan'].map({'yes': 1, 'no': 0, 'unknown': np.nan})


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Bank['housing'] = Bank['housing'].map({'yes': 1, 'no': 0, 'unknown': np.nan})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Bank['loan'] = Bank['loan'].map({'yes': 1, 'no': 0, 'unknown': np.nan})


In [132]:
Bank = Bank.replace('unknown',np.nan)
Bank = Bank.replace('primary',0)
Bank = Bank.replace('secondary',0.5)
Bank = Bank.replace('tertiary',1)


In [137]:
rows_b = Bank.shape[0]
Bank = Bank.dropna()
rows_a = Bank.shape[0]
print(rows_b,rows_a)

45211 43354


In [138]:
Bank

Unnamed: 0,age,housing,education,balance,loan,duration,y
0,58,1.0,1.0,2143,0.0,261,0
1,44,1.0,0.5,29,0.0,151,0
2,33,1.0,0.5,2,1.0,76,0
5,35,1.0,1.0,231,0.0,139,0
6,28,1.0,1.0,447,1.0,217,0
...,...,...,...,...,...,...,...
45206,51,0.0,1.0,825,0.0,977,1
45207,71,0.0,0.0,1729,0.0,456,1
45208,72,0.0,0.5,5715,0.0,1127,1
45209,57,0.0,0.5,668,0.0,508,0
