# Project - Predicting whether a customer will subscribe to a term deposit

## Importing necessary libraries 

In [149]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PowerTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.metrics import plot_confusion_matrix


## Loading and Processing Data

In [150]:
# Load the campaign data; the file uses ';' as its field separator.
df_bank = pd.read_csv(filepath_or_buffer='bank-additional-full.csv', sep=';')

### Renaming the Columns

In [151]:
# Map the raw dataset headers to the shorter names used in the rest of the notebook.
# Columns not listed here keep their original names.
COLUMN_RENAMES = {
    'default': 'c_default',
    'housing': 'h_loan',
    'loan': 'p_loan',
    'contact': 'cont_type',
    'month': 'lc_month',
    'day_of_week': 'lc_day',
    'duration': 'lc_duration',
    'campaign': 'no_of_contact',
    'pdays': 'no_day_after_lc',
    'previous': 'no_of_contact_prev',
    'emp.var.rate': 'emp_var_rate',
    'cons.price.idx': 'cons_price_index',
    'cons.conf.idx': 'cons_conf_idx',
    'euribor3m': 'i_euribor3',
    'nr.employed': 'no_employee',
}

df_bank = df_bank.rename(columns=COLUMN_RENAMES)

In [152]:
# Column count: 20 features plus 1 target variable.
df_bank.shape[1]

21

### Deleting Columns

In [153]:
# lc_duration is only known after the call has ended, at which point y is known too,
# so it is removed from the feature set.
df_bank.drop(columns=['lc_duration'], inplace=True)

In [154]:
# c_default is removed: just 3 records have a credit default,
# so the column doesn't add extra information.
df_bank.drop(columns=['c_default'], inplace=True)

In [155]:
# Column count after removing lc_duration and c_default.
df_bank.shape[1]

19

In [156]:
# Preview the frame after the two column drops.
df_bank

Unnamed: 0,age,job,marital,education,h_loan,p_loan,cont_type,lc_month,lc_day,no_of_contact,no_day_after_lc,no_of_contact_prev,poutcome,emp_var_rate,cons_price_index,cons_conf_idx,i_euribor3,no_employee,y
0,56,housemaid,married,basic.4y,no,no,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,no,no,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,yes,no,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,yes,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,retired,married,professional.course,yes,no,cellular,nov,fri,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes
41184,46,blue-collar,married,professional.course,no,no,cellular,nov,fri,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41185,56,retired,married,university.degree,yes,no,cellular,nov,fri,2,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41186,44,technician,married,professional.course,no,no,cellular,nov,fri,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes


### Replacing 'unknown' values with mode of the education column

In [157]:
# Replace 'unknown' education with the most frequent known level.
# The mode is computed once, over known values only, so 'unknown' itself can never
# be chosen as the fill value and the lookup is not repeated per row.
education_known = df_bank.loc[df_bank['education'] != 'unknown', 'education']
if not education_known.empty:
    df_bank['education'] = df_bank['education'].replace('unknown', education_known.mode()[0])

### Label encoding of ordinal education values

In [158]:
# Ordinal codes for education, from lowest to highest attainment.
# The codes are kept as strings here; a later cell casts the column to int.
EDUCATION_CODES = {
    'illiterate': '1',
    'basic.4y': '2',
    'basic.6y': '3',
    'basic.9y': '4',
    'high.school': '5',
    'professional.course': '6',
    'university.degree': '7',
}

education = df_bank['education'].map(EDUCATION_CODES)

# map() yields NaN for any label missing from the table (e.g. a leftover 'unknown',
# or codes from re-running this cell). Fail with a clear message instead of
# writing NaN into the column.
unmapped = df_bank.loc[education.isna(), 'education'].unique()
if len(unmapped) > 0:
    raise ValueError(
        'Unexpected education values: {}'.format(sorted(str(v) for v in unmapped))
    )

df_bank['education'] = education

### Dropping duplicates

In [159]:
# Remove rows that are exact copies of an earlier row; the first occurrence is kept.
df_bank.drop_duplicates(keep='first', inplace=True)

In [166]:
# Preview the frame after encoding education and removing duplicate rows.
df_bank

Unnamed: 0,age,job,marital,education,h_loan,p_loan,cont_type,lc_month,lc_day,no_of_contact,no_day_after_lc,no_of_contact_prev,poutcome,emp_var_rate,cons_price_index,cons_conf_idx,i_euribor3,no_employee,y
0,56,housemaid,married,2,no,no,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,5,no,no,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,5,yes,no,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,3,no,no,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,5,no,yes,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,retired,married,6,yes,no,cellular,nov,fri,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes
41184,46,blue-collar,married,6,no,no,cellular,nov,fri,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41185,56,retired,married,7,yes,no,cellular,nov,fri,2,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41186,44,technician,married,6,no,no,cellular,nov,fri,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes


In [167]:
# The education codes were stored as strings; convert them to integers.
df_bank['education'] = df_bank['education'].astype(int)

In [172]:
# Preview the frame after casting education to int.
df_bank

Unnamed: 0,age,job,marital,education,h_loan,p_loan,cont_type,lc_month,lc_day,no_of_contact,no_day_after_lc,no_of_contact_prev,poutcome,emp_var_rate,cons_price_index,cons_conf_idx,i_euribor3,no_employee,y
0,56,housemaid,married,2,no,no,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,5,no,no,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,5,yes,no,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,3,no,no,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,5,no,yes,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,retired,married,6,yes,no,cellular,nov,fri,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes
41184,46,blue-collar,married,6,no,no,cellular,nov,fri,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41185,56,retired,married,7,yes,no,cellular,nov,fri,2,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41186,44,technician,married,6,no,no,cellular,nov,fri,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes


In [176]:
# Replace 'unknown' marital status with the most frequent known status
# (there were just 73 unknowns). The mode is computed once, over known values only,
# so 'unknown' itself can never be chosen as the fill value.
marital_known = df_bank.loc[df_bank['marital'] != 'unknown', 'marital']
if not marital_known.empty:
    df_bank['marital'] = df_bank['marital'].replace('unknown', marital_known.mode()[0])

In [177]:
# Confirm that no 'unknown' marital values remain after the imputation.
df_bank['marital'].value_counts()

married     23758
single      10962
divorced     4455
Name: marital, dtype: int64

In [193]:
# Distribution of job categories.
# NOTE(review): 'unknown' jobs (325 rows in the recorded output) are not imputed the way
# education and marital were — confirm this is intentional.
df_bank['job'].value_counts()

admin.           9817
blue-collar      8743
technician       6369
services         3775
management       2810
retired          1678
entrepreneur     1402
self-employed    1384
housemaid        1028
unemployed        992
student           852
unknown           325
Name: job, dtype: int64

In [124]:
# Numeric subset of the data: every column with a numeric dtype.
df_bank_num = df_bank.select_dtypes(include='number')

In [125]:
# Preview the numeric subset.
df_bank_num

Unnamed: 0,age,no_of_contact,no_day_after_lc,no_of_contact_prev,emp_var_rate,cons_price_index,cons_conf_idx,i_euribor3,no_employee
0,56,1,999,0,1.1,93.994,-36.4,4.857,5191.0
1,57,1,999,0,1.1,93.994,-36.4,4.857,5191.0
2,37,1,999,0,1.1,93.994,-36.4,4.857,5191.0
3,40,1,999,0,1.1,93.994,-36.4,4.857,5191.0
4,56,1,999,0,1.1,93.994,-36.4,4.857,5191.0
...,...,...,...,...,...,...,...,...,...
41183,73,1,999,0,-1.1,94.767,-50.8,1.028,4963.6
41184,46,1,999,0,-1.1,94.767,-50.8,1.028,4963.6
41185,56,2,999,0,-1.1,94.767,-50.8,1.028,4963.6
41186,44,1,999,0,-1.1,94.767,-50.8,1.028,4963.6


In [126]:
# Non-numeric subset: everything left once the numeric columns are removed.
df_bank_cat = df_bank.drop(columns=df_bank_num.columns)

In [127]:
# Missing-value check on the numeric columns: counts each distinct row-wise
# combination of NaN flags, so a single all-False row means there are no NaNs.
df_bank_num.isna().value_counts()

age    no_of_contact  no_day_after_lc  no_of_contact_prev  emp_var_rate  cons_price_index  cons_conf_idx  i_euribor3  no_employee
False  False          False            False               False         False             False          False       False          39404
dtype: int64

In [128]:
# Client-attribute columns, selected by name rather than by position.
# With c_default dropped, the first seven columns of df_bank would also include
# cont_type (the contact communication type), which is not a client attribute.
CLIENT_COLUMNS = ['age', 'job', 'marital', 'education', 'h_loan', 'p_loan']
df_bank_client = df_bank[CLIENT_COLUMNS]

In [129]:
# Save the client subset to disk (the row index is written as the first column).
df_bank_client.to_csv(path_or_buf='client_data.csv')

In [130]:
# Preview the numeric subset again.
df_bank_num

Unnamed: 0,age,no_of_contact,no_day_after_lc,no_of_contact_prev,emp_var_rate,cons_price_index,cons_conf_idx,i_euribor3,no_employee
0,56,1,999,0,1.1,93.994,-36.4,4.857,5191.0
1,57,1,999,0,1.1,93.994,-36.4,4.857,5191.0
2,37,1,999,0,1.1,93.994,-36.4,4.857,5191.0
3,40,1,999,0,1.1,93.994,-36.4,4.857,5191.0
4,56,1,999,0,1.1,93.994,-36.4,4.857,5191.0
...,...,...,...,...,...,...,...,...,...
41183,73,1,999,0,-1.1,94.767,-50.8,1.028,4963.6
41184,46,1,999,0,-1.1,94.767,-50.8,1.028,4963.6
41185,56,2,999,0,-1.1,94.767,-50.8,1.028,4963.6
41186,44,1,999,0,-1.1,94.767,-50.8,1.028,4963.6


In [131]:
# Inspect the age column of the numeric subset.
df_bank_num['age']

0        56
1        57
2        37
3        40
4        56
         ..
41183    73
41184    46
41185    56
41186    44
41187    74
Name: age, Length: 39404, dtype: int64

In [132]:
# Preview the non-numeric subset.
# NOTE(review): the recorded output still lists c_default and text education labels,
# although earlier cells drop c_default and encode education — the output appears to
# come from an older run; re-run to refresh it.
df_bank_cat

Unnamed: 0,job,marital,education,c_default,h_loan,p_loan,cont_type,lc_month,lc_day,poutcome,y
0,housemaid,married,basic.4y,no,no,no,telephone,may,mon,nonexistent,no
1,services,married,high.school,unknown,no,no,telephone,may,mon,nonexistent,no
2,services,married,high.school,no,yes,no,telephone,may,mon,nonexistent,no
3,admin.,married,basic.6y,no,no,no,telephone,may,mon,nonexistent,no
4,services,married,high.school,no,no,yes,telephone,may,mon,nonexistent,no
...,...,...,...,...,...,...,...,...,...,...,...
41183,retired,married,professional.course,no,yes,no,cellular,nov,fri,nonexistent,yes
41184,blue-collar,married,professional.course,no,no,no,cellular,nov,fri,nonexistent,no
41185,retired,married,university.degree,no,yes,no,cellular,nov,fri,nonexistent,no
41186,technician,married,professional.course,no,no,no,cellular,nov,fri,nonexistent,yes


In [134]:
# Frequency of each education level in the non-numeric subset.
# NOTE(review): once education has been cast to int it belongs to df_bank_num, not
# df_bank_cat, so this lookup presumably raises KeyError on a top-to-bottom run — confirm.
df_bank_cat['education'].value_counts()

university.degree      13247
high.school             9121
basic.9y                5785
professional.course     5018
basic.4y                3993
basic.6y                2222
illiterate                18
Name: education, dtype: int64

In [139]:
# Preview the non-numeric subset; the recorded output shows education as ordinal codes.
df_bank_cat

Unnamed: 0,job,marital,education,c_default,h_loan,p_loan,cont_type,lc_month,lc_day,poutcome,y
0,housemaid,married,2,no,no,no,telephone,may,mon,nonexistent,no
1,services,married,5,unknown,no,no,telephone,may,mon,nonexistent,no
2,services,married,5,no,yes,no,telephone,may,mon,nonexistent,no
3,admin.,married,3,no,no,no,telephone,may,mon,nonexistent,no
4,services,married,5,no,no,yes,telephone,may,mon,nonexistent,no
...,...,...,...,...,...,...,...,...,...,...,...
41183,retired,married,6,no,yes,no,cellular,nov,fri,nonexistent,yes
41184,blue-collar,married,6,no,no,no,cellular,nov,fri,nonexistent,no
41185,retired,married,7,no,yes,no,cellular,nov,fri,nonexistent,no
41186,technician,married,6,no,no,no,cellular,nov,fri,nonexistent,yes


In [195]:
# Replace 'unknown' marital status in the non-numeric subset with the most frequent
# known status. The original line was not valid Python (a comprehension without
# brackets) and called .mod() where .mode() was meant.
marital_cat_known = df_bank_cat.loc[df_bank_cat['marital'] != 'unknown', 'marital']
if not marital_cat_known.empty:
    df_bank_cat['marital'] = df_bank_cat['marital'].replace('unknown', marital_cat_known.mode()[0])

married     23869
single      10997
divorced     4459
unknown        79
Name: marital, dtype: int64

- age (numeric)
- job : type of job (categorical: 'admin.','blue-collar','entrepreneur','housemaid','management','retired','self-employed','services','student','technician','unemployed','unknown')
- marital : marital status (categorical: 'divorced','married','single','unknown'; note: 'divorced' means divorced or widowed)
- education (categorical: 'basic.4y','basic.6y','basic.9y','high.school','illiterate','professional.course','university.degree','unknown')
- default: has credit in default? (categorical: 'no','yes','unknown')
- housing: has housing loan? (categorical: 'no','yes','unknown')
- loan: has personal loan? (categorical: 'no','yes','unknown')
- contact: contact communication type (categorical: 'cellular','telephone')
- month: last contact month of year (categorical: 'jan', 'feb', 'mar', ..., 'nov', 'dec')
- day_of_week: last contact day of the week (categorical: 'mon','tue','wed','thu','fri')
- duration: last contact duration, in seconds (numeric). Important note: this attribute highly affects the output target (e.g., if duration=0 then y='no'). Yet, the duration is not known before a call is performed. Also, after the end of the call y is obviously known. Thus, this input should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model.
- campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)
- pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric; 999 means client was not previously contacted)
- previous: number of contacts performed before this campaign and for this client (numeric)
- poutcome: outcome of the previous marketing campaign (categorical: 'failure','nonexistent','success')
- emp.var.rate: employment variation rate - quarterly indicator (numeric)
- cons.price.idx: consumer price index - monthly indicator (numeric)
- cons.conf.idx: consumer confidence index - monthly indicator (numeric)
- euribor3m: euribor 3 month rate - daily indicator (numeric)
- nr.employed: number of employees - quarterly indicator (numeric)