In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Load the raw training set from data/train_set.csv and display it.
df = pd.read_csv(os.path.join('data', 'train_set.csv'))
df

Unnamed: 0,ID,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,1,43,management,married,tertiary,no,291,yes,no,unknown,9,may,150,2,-1,0,unknown,0
1,2,42,technician,divorced,primary,no,5076,yes,no,cellular,7,apr,99,1,251,2,other,0
2,3,47,admin.,married,secondary,no,104,yes,yes,cellular,14,jul,77,2,-1,0,unknown,0
3,4,28,management,single,secondary,no,-994,yes,yes,cellular,18,jul,174,2,-1,0,unknown,0
4,5,42,technician,divorced,secondary,no,2974,yes,no,unknown,21,may,187,5,-1,0,unknown,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25312,25313,55,blue-collar,divorced,primary,no,8180,no,no,cellular,14,may,854,2,360,1,failure,1
25313,25314,52,services,married,secondary,no,961,no,yes,cellular,18,feb,222,1,553,4,failure,1
25314,25315,35,blue-collar,divorced,primary,no,300,yes,no,unknown,13,may,945,2,-1,0,unknown,1
25315,25316,37,entrepreneur,divorced,tertiary,no,66,no,no,cellular,18,nov,1164,2,-1,0,unknown,1


In [3]:
# Min-max normalization
def normalized(data, df, col):
    """Write a min-max scaled copy of ``df[col]`` into ``data[col]``.

    Every value ``x`` becomes ``(x - min) / (max - min)``, where ``min`` and
    ``max`` are computed over ``df[col]``. ``data`` is modified in place and
    nothing is returned.

    Args:
        data: DataFrame that receives the scaled column (mutated in place).
        df: DataFrame holding the raw column.
        col: Name of the column to read from ``df`` and write to ``data``.

    A column whose values are all equal has a zero range; instead of dividing
    by zero, every non-missing value is mapped to 0.0.
    """
    column = df[col]
    lowest = column.min()
    span = column.max() - lowest

    # Shifted values as floats; missing entries stay NaN.
    shifted = (column - lowest).astype(float)

    if span == 0:
        # Constant column: there is no range to scale by, so keep the
        # all-zero shifted values rather than raising ZeroDivisionError.
        data[col] = shifted
        return

    # Vectorized division replaces the per-element Python lambda.
    data[col] = shifted / float(span)

In [4]:
# Replace categories via a lookup table
def replace(data, df, col, d):
    """Encode ``df[col]`` through the mapping ``d`` and store it in ``data[col]``.

    Args:
        data: DataFrame that receives the encoded column (mutated in place).
        df: DataFrame holding the raw column.
        col: Name of the column to read from ``df`` and write to ``data``.
        d: Mapping from raw value to encoded value.

    Raises:
        KeyError: if ``df[col]`` contains a value that is not a key of ``d``.
    """
    def lookup(raw_value):
        # Plain subscript on purpose: an unmapped category must fail loudly.
        return d[raw_value]

    encoded = df[col].map(lookup)
    data[col] = encoded

In [5]:
# one-hot
def one_hot(data, df, col):
    """Return ``data`` with one-hot indicator columns for ``df[col]`` appended.

    The indicator columns are named ``<col>0``, ``<col>1``, ... in the column
    order produced by ``pd.get_dummies``. ``data`` itself is not modified; the
    caller must rebind the result (``data = one_hot(data, df, col)``).

    Args:
        data: DataFrame to extend.
        df: DataFrame holding the raw categorical column.
        col: Name of the column in ``df`` to encode; also the name prefix.

    Returns:
        A new DataFrame: ``data`` followed by the indicator columns.
    """
    # Pin the dtype so the indicators are 0/1 integers on every pandas
    # version (newer releases default to bool).
    one_hot_pd = pd.get_dummies(df[col], dtype=np.uint8)
    # Count the columns actually produced: get_dummies drops missing values,
    # so counting distinct raw values would mismatch when NaN/None is present.
    one_hot_pd.columns = [col + str(i) for i in range(one_hot_pd.shape[1])]
    return pd.concat([data, one_hot_pd], axis=1)

In [6]:
# Start from an empty frame; the cells below add the encoded feature columns to it.
data = pd.DataFrame()
data

In [8]:
# Min-max scale 'age' from df into data (normalized() writes the column in place).
normalized(data, df, 'age')
data

Unnamed: 0,age
0,0.324675
1,0.311688
2,0.376623
3,0.129870
4,0.311688
...,...
25312,0.480519
25313,0.441558
25314,0.220779
25315,0.246753


In [9]:
# One-hot encode 'job'. one_hot() returns a new frame, so `data` is rebound to the result.
# NOTE(review): re-running this cell on the same `data` appends the job columns a second time.
data = one_hot(data, df, 'job')
data

In [10]:
# Encode marital status as integer codes via an explicit lookup table.
# replace() raises KeyError for any value that is missing from the dict.
marital_dict = {'single': -1, 'married': 0, 'divorced': 1}
replace(data, df, 'marital', marital_dict)
data

Unnamed: 0,age,job0,job1,job2,job3,job4,job5,job6,job7,job8,job9,job10,job11,marital
0,0.324675,0,0,0,0,1,0,0,0,0,0,0,0,0
1,0.311688,0,0,0,0,0,0,0,0,0,1,0,0,1
2,0.376623,1,0,0,0,0,0,0,0,0,0,0,0,0
3,0.129870,0,0,0,0,1,0,0,0,0,0,0,0,-1
4,0.311688,0,0,0,0,0,0,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25312,0.480519,0,1,0,0,0,0,0,0,0,0,0,0,1
25313,0.441558,0,0,0,0,0,0,0,1,0,0,0,0,0
25314,0.220779,0,1,0,0,0,0,0,0,0,0,0,0,1
25315,0.246753,0,0,1,0,0,0,0,0,0,0,0,0,1


In [11]:
# Encode education level as integers 1-3 (primary < secondary < tertiary); 'unknown' maps to 0.
education_dict = {'unknown': 0, 'primary': 1, 'secondary': 2, 'tertiary': 3}
replace(data, df, 'education', education_dict)
data

Unnamed: 0,age,job0,job1,job2,job3,job4,job5,job6,job7,job8,job9,job10,job11,marital,education
0,0.324675,0,0,0,0,1,0,0,0,0,0,0,0,0,3
1,0.311688,0,0,0,0,0,0,0,0,0,1,0,0,1,1
2,0.376623,1,0,0,0,0,0,0,0,0,0,0,0,0,2
3,0.129870,0,0,0,0,1,0,0,0,0,0,0,0,-1,2
4,0.311688,0,0,0,0,0,0,0,0,0,1,0,0,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25312,0.480519,0,1,0,0,0,0,0,0,0,0,0,0,1,1
25313,0.441558,0,0,0,0,0,0,0,1,0,0,0,0,0,2
25314,0.220779,0,1,0,0,0,0,0,0,0,0,0,0,1,1
25315,0.246753,0,0,1,0,0,0,0,0,0,0,0,0,1,3


In [12]:
# Binary-encode the 'default' column: no -> 0, yes -> 1.
default_dict = {'no': 0, 'yes': 1}
replace(data, df, 'default', default_dict)
data

Unnamed: 0,age,job0,job1,job2,job3,job4,job5,job6,job7,job8,job9,job10,job11,marital,education,default
0,0.324675,0,0,0,0,1,0,0,0,0,0,0,0,0,3,0
1,0.311688,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0
2,0.376623,1,0,0,0,0,0,0,0,0,0,0,0,0,2,0
3,0.129870,0,0,0,0,1,0,0,0,0,0,0,0,-1,2,0
4,0.311688,0,0,0,0,0,0,0,0,0,1,0,0,1,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25312,0.480519,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0
25313,0.441558,0,0,0,0,0,0,0,1,0,0,0,0,0,2,0
25314,0.220779,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0
25315,0.246753,0,0,1,0,0,0,0,0,0,0,0,0,1,3,0


In [13]:
# Min-max scale 'balance'. The raw column contains negative values (e.g. -994 in the
# df preview); subtracting the column minimum shifts them into [0, 1] as well.
normalized(data, df, 'balance')
data

Unnamed: 0,age,job0,job1,job2,job3,job4,job5,job6,job7,job8,job9,job10,job11,marital,education,default,balance
0,0.324675,0,0,0,0,1,0,0,0,0,0,0,0,0,3,0,0.075445
1,0.311688,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0.118888
2,0.376623,1,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0.073748
3,0.129870,0,0,0,0,1,0,0,0,0,0,0,0,-1,2,0,0.063779
4,0.311688,0,0,0,0,0,0,0,0,0,1,0,0,1,2,0,0.099804
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25312,0.480519,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0.147068
25313,0.441558,0,0,0,0,0,0,0,1,0,0,0,0,0,2,0,0.081528
25314,0.220779,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0.075527
25315,0.246753,0,0,1,0,0,0,0,0,0,0,0,0,1,3,0,0.073403


In [14]:
# Binary-encode the 'housing' column: no -> 0, yes -> 1.
housing_dict = {'no': 0, 'yes': 1}
replace(data, df, 'housing', housing_dict)
data

Unnamed: 0,age,job0,job1,job2,job3,job4,job5,job6,job7,job8,job9,job10,job11,marital,education,default,balance,housing
0,0.324675,0,0,0,0,1,0,0,0,0,0,0,0,0,3,0,0.075445,1
1,0.311688,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0.118888,1
2,0.376623,1,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0.073748,1
3,0.129870,0,0,0,0,1,0,0,0,0,0,0,0,-1,2,0,0.063779,1
4,0.311688,0,0,0,0,0,0,0,0,0,1,0,0,1,2,0,0.099804,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25312,0.480519,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0.147068,0
25313,0.441558,0,0,0,0,0,0,0,1,0,0,0,0,0,2,0,0.081528,0
25314,0.220779,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0.075527,1
25315,0.246753,0,0,1,0,0,0,0,0,0,0,0,0,1,3,0,0.073403,0


In [15]:
# Binary-encode the 'loan' column: no -> 0, yes -> 1.
loan_dict = {'no': 0, 'yes': 1}
replace(data, df, 'loan', loan_dict)
data

Unnamed: 0,age,job0,job1,job2,job3,job4,job5,job6,job7,job8,job9,job10,job11,marital,education,default,balance,housing,loan
0,0.324675,0,0,0,0,1,0,0,0,0,0,0,0,0,3,0,0.075445,1,0
1,0.311688,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0.118888,1,0
2,0.376623,1,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0.073748,1,1
3,0.129870,0,0,0,0,1,0,0,0,0,0,0,0,-1,2,0,0.063779,1,1
4,0.311688,0,0,0,0,0,0,0,0,0,1,0,0,1,2,0,0.099804,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25312,0.480519,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0.147068,0,0
25313,0.441558,0,0,0,0,0,0,0,1,0,0,0,0,0,2,0,0.081528,0,1
25314,0.220779,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0.075527,1,0
25315,0.246753,0,0,1,0,0,0,0,0,0,0,0,0,1,3,0,0.073403,0,0


In [16]:
# Encode the contact channel as integer codes; 'unknown' gets its own code (-1).
contact_dict = {'unknown': -1, 'cellular': 0, 'telephone': 1}
replace(data, df, 'contact', contact_dict)
data

Unnamed: 0,age,job0,job1,job2,job3,job4,job5,job6,job7,job8,job9,job10,job11,marital,education,default,balance,housing,loan,contact
0,0.324675,0,0,0,0,1,0,0,0,0,0,0,0,0,3,0,0.075445,1,0,-1
1,0.311688,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0.118888,1,0,0
2,0.376623,1,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0.073748,1,1,0
3,0.129870,0,0,0,0,1,0,0,0,0,0,0,0,-1,2,0,0.063779,1,1,0
4,0.311688,0,0,0,0,0,0,0,0,0,1,0,0,1,2,0,0.099804,1,0,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25312,0.480519,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0.147068,0,0,0
25313,0.441558,0,0,0,0,0,0,0,1,0,0,0,0,0,2,0,0.081528,0,1,0
25314,0.220779,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0.075527,1,0,-1
25315,0.246753,0,0,1,0,0,0,0,0,0,0,0,0,1,3,0,0.073403,0,0,0


In [17]:
# Min-max scale the 'day' column into data.
normalized(data, df, 'day')
data

Unnamed: 0,age,job0,job1,job2,job3,job4,job5,job6,job7,job8,...,job10,job11,marital,education,default,balance,housing,loan,contact,day
0,0.324675,0,0,0,0,1,0,0,0,0,...,0,0,0,3,0,0.075445,1,0,-1,0.266667
1,0.311688,0,0,0,0,0,0,0,0,0,...,0,0,1,1,0,0.118888,1,0,0,0.200000
2,0.376623,1,0,0,0,0,0,0,0,0,...,0,0,0,2,0,0.073748,1,1,0,0.433333
3,0.129870,0,0,0,0,1,0,0,0,0,...,0,0,-1,2,0,0.063779,1,1,0,0.566667
4,0.311688,0,0,0,0,0,0,0,0,0,...,0,0,1,2,0,0.099804,1,0,-1,0.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25312,0.480519,0,1,0,0,0,0,0,0,0,...,0,0,1,1,0,0.147068,0,0,0,0.433333
25313,0.441558,0,0,0,0,0,0,0,1,0,...,0,0,0,2,0,0.081528,0,1,0,0.566667
25314,0.220779,0,1,0,0,0,0,0,0,0,...,0,0,1,1,0,0.075527,1,0,-1,0.400000
25315,0.246753,0,0,1,0,0,0,0,0,0,...,0,0,1,3,0,0.073403,0,0,0,0.566667


In [19]:
# Map three-letter month abbreviations to their calendar numbers 1-12.
# NOTE(review): unlike the numeric columns, the result is not passed through normalized(),
# so 'month' stays on a 1-12 scale — confirm this is intended.
month_dict = {'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
             'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12}
replace(data, df, 'month', month_dict)
data

Unnamed: 0,age,job0,job1,job2,job3,job4,job5,job6,job7,job8,...,job11,marital,education,default,balance,housing,loan,contact,day,month
0,0.324675,0,0,0,0,1,0,0,0,0,...,0,0,3,0,0.075445,1,0,-1,0.266667,5
1,0.311688,0,0,0,0,0,0,0,0,0,...,0,1,1,0,0.118888,1,0,0,0.200000,4
2,0.376623,1,0,0,0,0,0,0,0,0,...,0,0,2,0,0.073748,1,1,0,0.433333,7
3,0.129870,0,0,0,0,1,0,0,0,0,...,0,-1,2,0,0.063779,1,1,0,0.566667,7
4,0.311688,0,0,0,0,0,0,0,0,0,...,0,1,2,0,0.099804,1,0,-1,0.666667,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25312,0.480519,0,1,0,0,0,0,0,0,0,...,0,1,1,0,0.147068,0,0,0,0.433333,5
25313,0.441558,0,0,0,0,0,0,0,1,0,...,0,0,2,0,0.081528,0,1,0,0.566667,2
25314,0.220779,0,1,0,0,0,0,0,0,0,...,0,1,1,0,0.075527,1,0,-1,0.400000,5
25315,0.246753,0,0,1,0,0,0,0,0,0,...,0,1,3,0,0.073403,0,0,0,0.566667,11


In [20]:
# Min-max scale the 'duration' column into data.
normalized(data, df, 'duration')
data

In [22]:
# Min-max scale the 'campaign' column into data.
normalized(data, df, 'campaign')
data

Unnamed: 0,age,job0,job1,job2,job3,job4,job5,job6,job7,job8,...,education,default,balance,housing,loan,contact,day,month,duration,campaign
0,0.324675,0,0,0,0,1,0,0,0,0,...,3,0,0.075445,1,0,-1,0.266667,5,0.038650,0.018519
1,0.311688,0,0,0,0,0,0,0,0,0,...,1,0,0.118888,1,0,0,0.200000,4,0.025509,0.000000
2,0.376623,1,0,0,0,0,0,0,0,0,...,2,0,0.073748,1,1,0,0.433333,7,0.019840,0.018519
3,0.129870,0,0,0,0,1,0,0,0,0,...,2,0,0.063779,1,1,0,0.566667,7,0.044834,0.018519
4,0.311688,0,0,0,0,0,0,0,0,0,...,2,0,0.099804,1,0,-1,0.666667,5,0.048183,0.074074
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25312,0.480519,0,1,0,0,0,0,0,0,0,...,1,0,0.147068,0,0,0,0.433333,5,0.220046,0.018519
25313,0.441558,0,0,0,0,0,0,0,1,0,...,2,0,0.081528,0,1,0,0.566667,2,0.057202,0.000000
25314,0.220779,0,1,0,0,0,0,0,0,0,...,1,0,0.075527,1,0,-1,0.400000,5,0.243494,0.018519
25315,0.246753,0,0,1,0,0,0,0,0,0,...,3,0,0.073403,0,0,0,0.566667,11,0.299923,0.018519


In [25]:
# Min-max scale the 'pdays' column into data.
# NOTE(review): many raw rows hold -1 (see the df preview), presumably a "no previous
# contact" sentinel — confirm. It is scaled together with the real values and becomes 0.0.
normalized(data, df, 'pdays')
data

Unnamed: 0,age,job0,job1,job2,job3,job4,job5,job6,job7,job8,...,default,balance,housing,loan,contact,day,month,duration,campaign,pdays
0,0.324675,0,0,0,0,1,0,0,0,0,...,0,0.075445,1,0,-1,0.266667,5,0.038650,0.018519,0.000000
1,0.311688,0,0,0,0,0,0,0,0,0,...,0,0.118888,1,0,0,0.200000,4,0.025509,0.000000,0.294737
2,0.376623,1,0,0,0,0,0,0,0,0,...,0,0.073748,1,1,0,0.433333,7,0.019840,0.018519,0.000000
3,0.129870,0,0,0,0,1,0,0,0,0,...,0,0.063779,1,1,0,0.566667,7,0.044834,0.018519,0.000000
4,0.311688,0,0,0,0,0,0,0,0,0,...,0,0.099804,1,0,-1,0.666667,5,0.048183,0.074074,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25312,0.480519,0,1,0,0,0,0,0,0,0,...,0,0.147068,0,0,0,0.433333,5,0.220046,0.018519,0.422222
25313,0.441558,0,0,0,0,0,0,0,1,0,...,0,0.081528,0,1,0,0.566667,2,0.057202,0.000000,0.647953
25314,0.220779,0,1,0,0,0,0,0,0,0,...,0,0.075527,1,0,-1,0.400000,5,0.243494,0.018519,0.000000
25315,0.246753,0,0,1,0,0,0,0,0,0,...,0,0.073403,0,0,0,0.566667,11,0.299923,0.018519,0.000000


In [26]:
# Min-max scale the 'previous' column into data.
normalized(data, df, 'previous')
data

Unnamed: 0,age,job0,job1,job2,job3,job4,job5,job6,job7,job8,...,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous
0,0.324675,0,0,0,0,1,0,0,0,0,...,0.075445,1,0,-1,0.266667,5,0.038650,0.018519,0.000000,0.000000
1,0.311688,0,0,0,0,0,0,0,0,0,...,0.118888,1,0,0,0.200000,4,0.025509,0.000000,0.294737,0.007273
2,0.376623,1,0,0,0,0,0,0,0,0,...,0.073748,1,1,0,0.433333,7,0.019840,0.018519,0.000000,0.000000
3,0.129870,0,0,0,0,1,0,0,0,0,...,0.063779,1,1,0,0.566667,7,0.044834,0.018519,0.000000,0.000000
4,0.311688,0,0,0,0,0,0,0,0,0,...,0.099804,1,0,-1,0.666667,5,0.048183,0.074074,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25312,0.480519,0,1,0,0,0,0,0,0,0,...,0.147068,0,0,0,0.433333,5,0.220046,0.018519,0.422222,0.003636
25313,0.441558,0,0,0,0,0,0,0,1,0,...,0.081528,0,1,0,0.566667,2,0.057202,0.000000,0.647953,0.014545
25314,0.220779,0,1,0,0,0,0,0,0,0,...,0.075527,1,0,-1,0.400000,5,0.243494,0.018519,0.000000,0.000000
25315,0.246753,0,0,1,0,0,0,0,0,0,...,0.073403,0,0,0,0.566667,11,0.299923,0.018519,0.000000,0.000000


In [27]:
# One-hot encode 'poutcome' into poutcome0, poutcome1, ... and rebind `data` to the result.
# NOTE(review): re-running this cell on the same `data` appends the poutcome columns again.
data = one_hot(data, df, 'poutcome')
data

Unnamed: 0,age,job0,job1,job2,job3,job4,job5,job6,job7,job8,...,day,month,duration,campaign,pdays,previous,poutcome0,poutcome1,poutcome2,poutcome3
0,0.324675,0,0,0,0,1,0,0,0,0,...,0.266667,5,0.038650,0.018519,0.000000,0.000000,0,0,0,1
1,0.311688,0,0,0,0,0,0,0,0,0,...,0.200000,4,0.025509,0.000000,0.294737,0.007273,0,1,0,0
2,0.376623,1,0,0,0,0,0,0,0,0,...,0.433333,7,0.019840,0.018519,0.000000,0.000000,0,0,0,1
3,0.129870,0,0,0,0,1,0,0,0,0,...,0.566667,7,0.044834,0.018519,0.000000,0.000000,0,0,0,1
4,0.311688,0,0,0,0,0,0,0,0,0,...,0.666667,5,0.048183,0.074074,0.000000,0.000000,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25312,0.480519,0,1,0,0,0,0,0,0,0,...,0.433333,5,0.220046,0.018519,0.422222,0.003636,1,0,0,0
25313,0.441558,0,0,0,0,0,0,0,1,0,...,0.566667,2,0.057202,0.000000,0.647953,0.014545,1,0,0,0
25314,0.220779,0,1,0,0,0,0,0,0,0,...,0.400000,5,0.243494,0.018519,0.000000,0.000000,0,0,0,1
25315,0.246753,0,0,1,0,0,0,0,0,0,...,0.566667,11,0.299923,0.018519,0.000000,0.000000,0,0,0,1


In [28]:
# Copy column 'y' over from df unchanged (presumably the prediction target — confirm).
data['y'] = df['y']
data

Unnamed: 0,age,job0,job1,job2,job3,job4,job5,job6,job7,job8,...,month,duration,campaign,pdays,previous,poutcome0,poutcome1,poutcome2,poutcome3,y
0,0.324675,0,0,0,0,1,0,0,0,0,...,5,0.038650,0.018519,0.000000,0.000000,0,0,0,1,0
1,0.311688,0,0,0,0,0,0,0,0,0,...,4,0.025509,0.000000,0.294737,0.007273,0,1,0,0,0
2,0.376623,1,0,0,0,0,0,0,0,0,...,7,0.019840,0.018519,0.000000,0.000000,0,0,0,1,0
3,0.129870,0,0,0,0,1,0,0,0,0,...,7,0.044834,0.018519,0.000000,0.000000,0,0,0,1,0
4,0.311688,0,0,0,0,0,0,0,0,0,...,5,0.048183,0.074074,0.000000,0.000000,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25312,0.480519,0,1,0,0,0,0,0,0,0,...,5,0.220046,0.018519,0.422222,0.003636,1,0,0,0,1
25313,0.441558,0,0,0,0,0,0,0,1,0,...,2,0.057202,0.000000,0.647953,0.014545,1,0,0,0,1
25314,0.220779,0,1,0,0,0,0,0,0,0,...,5,0.243494,0.018519,0.000000,0.000000,0,0,0,1,1
25315,0.246753,0,0,1,0,0,0,0,0,0,...,11,0.299923,0.018519,0.000000,0.000000,0,0,0,1,1
