## Missing value imputation for "poutcome", "education", "job" and "contact"

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
pd.set_option('display.max_columns', None)

In [3]:
df = pd.read_csv('F:/capstone_project/Group_2_bank/bank-full - Copy.csv')
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [4]:
df.shape

(45211, 17)

### 1. for poutcome

In [5]:
df.loc[(df['previous'] !=0) & (df['poutcome']=='unknown') , 'poutcome'] = 'to_be_changed'

In [6]:
df[df.loc[:,'poutcome'] =='to_be_changed'] 

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
40658,61,retired,married,tertiary,no,3140,yes,yes,cellular,6,aug,975,4,98,1,to_be_changed,yes
41821,39,management,married,tertiary,no,184,no,no,cellular,15,oct,206,1,168,5,to_be_changed,no
42042,26,admin.,single,secondary,no,338,no,no,cellular,29,oct,209,1,188,2,to_be_changed,yes
43978,30,technician,single,secondary,no,254,yes,yes,cellular,24,jun,167,3,416,2,to_be_changed,no
45021,37,management,married,secondary,no,209,no,no,cellular,14,oct,183,3,528,7,to_be_changed,no


In [7]:
df.loc[df['poutcome'] =='unknown', 'poutcome' ] = 'not_contacted'

In [8]:
df_pout = df[(df['job'] != 'unknown') & (df['education'] !='unknown') & (df['contact'] !='unknown')]

In [9]:
df_pout.drop('y', axis = 1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [10]:
df_pout_temp = pd.get_dummies(df_pout.loc[:,'age':'previous'], drop_first = True)

In [11]:
df_pout_temp = pd.concat([df_pout_temp,df_pout['poutcome']], axis = 1)

In [12]:
df_pout_temp_new = df_pout_temp[df_pout_temp['poutcome']!='to_be_changed']

In [13]:
X_pout = df_pout_temp_new.drop('poutcome', axis = 1)
y_pout = df_pout_temp_new['poutcome']

In [14]:
X_pout.shape, y_pout.shape

((30902, 36), (30902,))

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
X_pout_train, X_pout_test, y_pout_train, y_pout_test = train_test_split(X_pout, y_pout, test_size=0.3, random_state=3)

In [17]:
X_pout_train.shape, X_pout_test.shape, y_pout_train.shape, y_pout_test.shape

((21631, 36), (9271, 36), (21631,), (9271,))

In [18]:
import lightgbm as lgb

In [19]:
lgbc = lgb.LGBMClassifier(objective = 'multi:softmax')

In [20]:
lgbc.fit(X_pout_train, y_pout_train)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31,
               objective='multi:softmax', random_state=None, reg_alpha=0.0,
               reg_lambda=0.0, silent=True, subsample=1.0,
               subsample_for_bin=200000, subsample_freq=0)

In [21]:
y_pout_pred = lgbc.predict(X_pout_test)

In [23]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [24]:
# for evaluation of model
print('confusion matrix:\n',confusion_matrix (y_pout_test, y_pout_pred))
print('overall accuracy: ',accuracy_score(y_pout_test, y_pout_pred))
print('Classification Report:\n',classification_report(y_pout_test, y_pout_pred))

confusion matrix:
 [[1214    0   52  103]
 [   0 6938    0    0]
 [ 418    0   77   60]
 [ 158    0   16  235]]
overall accuracy:  0.9129543738539532
Classification Report:
                precision    recall  f1-score   support

      failure       0.68      0.89      0.77      1369
not_contacted       1.00      1.00      1.00      6938
        other       0.53      0.14      0.22       555
      success       0.59      0.57      0.58       409

     accuracy                           0.91      9271
    macro avg       0.70      0.65      0.64      9271
 weighted avg       0.91      0.91      0.90      9271



In [25]:
X_pout_for_imp = df_pout_temp[df_pout_temp['poutcome']=='to_be_changed'].drop('poutcome', axis=1)
X_pout_for_imp.shape

(5, 36)

In [26]:
y_pout_pred_for_imp = lgbc.predict(X_pout_for_imp)

In [27]:
y_pout_pred_for_imp

array(['success', 'failure', 'success', 'failure', 'failure'],
      dtype=object)

In [28]:
df.loc[df['poutcome']=='to_be_changed','poutcome'] = y_pout_pred_for_imp

In [29]:
df.iloc[[40658,41821,42042,43978,45021],:]

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
40658,61,retired,married,tertiary,no,3140,yes,yes,cellular,6,aug,975,4,98,1,success,yes
41821,39,management,married,tertiary,no,184,no,no,cellular,15,oct,206,1,168,5,failure,no
42042,26,admin.,single,secondary,no,338,no,no,cellular,29,oct,209,1,188,2,success,yes
43978,30,technician,single,secondary,no,254,yes,yes,cellular,24,jun,167,3,416,2,failure,no
45021,37,management,married,secondary,no,209,no,no,cellular,14,oct,183,3,528,7,failure,no


### 2. for education

In [31]:
df_edu = df.drop(columns = ['job','contact','y'], axis= 1)

In [32]:
df_edu = pd.concat([pd.get_dummies(data = df_edu.drop('education', axis = 1), drop_first = True), df['education']],
                   axis =1)

In [34]:
df_edu_new = df_edu[df_edu['education']!='unknown']
df_edu_new.shape

(43354, 27)

In [35]:
X_edu = df_edu_new.drop('education', axis = 1)
y_edu = df_edu_new['education']

In [36]:
X_edu_train, X_edu_test, y_edu_train, y_edu_test = train_test_split(X_edu, y_edu, test_size=0.3, random_state=3)

In [37]:
X_edu_train.shape, X_edu_test.shape, y_edu_train.shape, y_edu_test.shape

((30347, 26), (13007, 26), (30347,), (13007,))

In [38]:
lgbc.fit(X_edu_train, y_edu_train)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31,
               objective='multi:softmax', random_state=None, reg_alpha=0.0,
               reg_lambda=0.0, silent=True, subsample=1.0,
               subsample_for_bin=200000, subsample_freq=0)

In [39]:
y_edu_pred = lgbc.predict(X_edu_test)

In [40]:
# for evaluation of model
print('confusion matrix:\n',confusion_matrix (y_edu_test, y_edu_pred))
print('overall accuracy: ',accuracy_score(y_edu_test, y_edu_pred))
print('Classification Report:\n',classification_report(y_edu_test, y_edu_pred))

confusion matrix:
 [[ 111 1865   84]
 [ 101 6088  749]
 [  39 2894 1076]]
overall accuracy:  0.5593142154224648
Classification Report:
               precision    recall  f1-score   support

     primary       0.44      0.05      0.10      2060
   secondary       0.56      0.88      0.68      6938
    tertiary       0.56      0.27      0.36      4009

    accuracy                           0.56     13007
   macro avg       0.52      0.40      0.38     13007
weighted avg       0.54      0.56      0.49     13007



In [42]:
X_edu_for_imp = df_edu[df_edu['education']=='unknown'].drop('education', axis=1)
X_edu_for_imp.shape

(1857, 26)

In [43]:
y_edu_pred_for_imp = lgbc.predict(X_edu_for_imp)

In [44]:
pd.Series(y_edu_pred_for_imp).value_counts()

secondary    1569
tertiary      202
primary        86
dtype: int64

In [45]:
df.loc[df['education']=='unknown','education'] = y_edu_pred_for_imp

### 3. for job

In [47]:
df_job = df.drop(columns = ['contact','y'], axis= 1)

In [48]:
df_job = pd.concat([pd.get_dummies(data = df_job.drop('job', axis = 1), drop_first = True), df['job']] ,axis =1)

In [50]:
df_job_new = df_job[df_job['job']!='unknown']
df_job_new.shape

(44923, 29)

In [51]:
X_job = df_job_new.drop('job', axis = 1)
y_job = df_job_new['job']

In [52]:
X_job_train, X_job_test, y_job_train, y_job_test = train_test_split(X_job, y_job, test_size=0.3, random_state=3)

In [53]:
X_job_train.shape, X_job_test.shape, y_job_train.shape, y_job_test.shape

((31446, 28), (13477, 28), (31446,), (13477,))

In [54]:
lgbc.fit(X_job_train, y_job_train)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31,
               objective='multi:softmax', random_state=None, reg_alpha=0.0,
               reg_lambda=0.0, silent=True, subsample=1.0,
               subsample_for_bin=200000, subsample_freq=0)

In [55]:
y_job_pred = lgbc.predict(X_job_test)

In [56]:
# for evaluation of model
print('confusion matrix:\n',confusion_matrix (y_job_test, y_job_pred))
print('overall accuracy: ',accuracy_score(y_job_test, y_job_pred))
print('Classification Report:\n',classification_report(y_job_test, y_job_pred))

confusion matrix:
 [[ 275  689    1    1  184   73    1   47   24  260    6]
 [ 202 2196    0   16   51  105    1   40   22  225    3]
 [  28  168    0    2  208   18    0    8    0   40    2]
 [  16  194    0   27   49   62    0    1    1   35    0]
 [  62  233    0    1 2324   68    1    4    9  104    1]
 [  18  107    0   11   65  409    0    0    0   28    1]
 [  26  114    0    2  261   15    0    5    5   58    1]
 [ 178  776    0    1   81   34    0   45   12  181    5]
 [  32   34    0    0   78    0    1    9  149   19    0]
 [ 224  730    0    4  552   76    1   38   20  595   11]
 [  41  138    0    1   85   25    0    1    6   66   14]]
overall accuracy:  0.44772575498998296
Classification Report:
                precision    recall  f1-score   support

       admin.       0.25      0.18      0.21      1561
  blue-collar       0.41      0.77      0.53      2861
 entrepreneur       0.00      0.00      0.00       474
    housemaid       0.41      0.07      0.12       385
   

In [57]:
X_job_for_imp = df_job[df_job['job']=='unknown'].drop('job', axis=1)

In [58]:
y_job_pred_for_imp = lgbc.predict(X_job_for_imp)

In [59]:
pd.Series(y_job_pred_for_imp).value_counts()

blue-collar    84
retired        56
technician     52
management     49
admin.         31
student         6
housemaid       6
services        2
unemployed      2
dtype: int64

In [60]:
df.loc[df['job']=='unknown','job'] = y_job_pred_for_imp

### 4. for contact

In [62]:
df_contact = df.drop(columns = ['y'], axis= 1)

In [63]:
df_contact = pd.concat([pd.get_dummies(data = df_contact.drop('contact', axis = 1), drop_first = True), df['contact']]
                       ,axis =1)

In [64]:
df_contact_new = df_contact[df_contact['contact']!='unknown']
df_contact_new.shape

(32191, 39)

In [65]:
X_contact = df_contact_new.drop('contact', axis = 1)
y_contact = df_contact_new['contact']

In [66]:
X_contact_train, X_contact_test, y_contact_train, y_contact_test = train_test_split(X_contact, y_contact, 
                                                                                    test_size=0.3, random_state=3)

In [67]:
X_contact_train.shape, X_contact_test.shape, y_contact_train.shape, y_contact_test.shape

((22533, 38), (9658, 38), (22533,), (9658,))

In [68]:
lgbc2 = lgb.LGBMClassifier()
lgbc2.fit(X_contact_train,y_contact_train)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [69]:
y_contact_pred = lgbc2.predict(X_contact_test)

In [70]:
# for evaluation of model
print('confusion matrix:\n',confusion_matrix (y_contact_test, y_contact_pred))
print('overall accuracy: ',accuracy_score(y_contact_test, y_contact_pred))
print('Classification Report:\n',classification_report(y_contact_test, y_contact_pred))

confusion matrix:
 [[8737   57]
 [ 795   69]]
overall accuracy:  0.9117829778422033
Classification Report:
               precision    recall  f1-score   support

    cellular       0.92      0.99      0.95      8794
   telephone       0.55      0.08      0.14       864

    accuracy                           0.91      9658
   macro avg       0.73      0.54      0.55      9658
weighted avg       0.88      0.91      0.88      9658



In [71]:
X_contact_for_imp = df_contact[df_contact['contact']=='unknown'].drop('contact', axis=1)

In [72]:
y_contact_pred_for_imp = lgbc2.predict(X_contact_for_imp)

In [73]:
pd.Series(y_contact_pred_for_imp).value_counts()

cellular     12881
telephone      139
dtype: int64

In [74]:
df.loc[df['contact']=='unknown','contact'] = y_contact_pred_for_imp

### All 4 done