   Input variables:
   ## bank client data:
   1 - age (numeric)
   
   2 - job : type of job (categorical: "admin.","unknown","unemployed","management","housemaid","entrepreneur","student",
                                       "blue-collar","self-employed","retired","technician","services") 
                                       
   3 - marital : marital status (categorical: "married","divorced","single"; note: "divorced" means divorced or widowed)
   
   4 - education (categorical: "unknown","secondary","primary","tertiary")
   
   5 - default: has credit in default? (binary: "yes","no")
   
   6 - balance: average yearly balance, in euros (numeric) 
   
   7 - housing: has housing loan? (binary: "yes","no")
   
   8 - loan: has personal loan? (binary: "yes","no")
   
   ## related with the last contact of the current campaign:
   
   9 - contact: contact communication type (categorical: "unknown","telephone","cellular") 
   
  10 - day: last contact day of the month (numeric)
  
  11 - month: last contact month of year (categorical: "jan", "feb", "mar", ..., "nov", "dec")
  
  12 - duration: last contact duration, in seconds (numeric)
  
   ## other attributes:
   
  13 - campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)
  
  14 - pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric, -1 means client was not previously contacted)
  
  15 - previous: number of contacts performed before this campaign and for this client (numeric)
  
  16 - poutcome: outcome of the previous marketing campaign (categorical: "unknown","other","failure","success")

  Output variable (desired target):

  17 - y: has the client subscribed to a term deposit? (binary: "yes","no")

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
sns.set()
import warnings
warnings.filterwarnings("ignore")
import statsmodels
from sklearn.preprocessing import (LabelEncoder,
OrdinalEncoder,
StandardScaler)
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Load the semicolon-separated bank client dataset and preview the first rows.
df = pd.read_csv('bank-full.csv', sep=';')

df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


## Data Preprocessing

#### Convert the Age feature into groups.

In [3]:
# https://thefinancialbrand.com/61246/age-consumer-behavior-patterns-banking/

def _age_bucket(age):
    """Return the age-band label for a single age value."""
    if age < 25:
        return '17-24'
    if age < 35:
        return '25-34'
    if age < 45:
        return '35-44'
    if age < 55:
        return '45-54'
    if age < 65:
        return '55-64'
    return '65+'


df['age_group'] = df['age'].apply(_age_bucket)

# An age of exactly 65 falls into the 65+ band.

#### Convert the Pdays feature into groups.

In [4]:
# -1 is an indicator while every other value is an actual day count,
# so the values are grouped first.

df['pdays'].unique()

array([ -1, 151, 166,  91,  86, 143, 147,  89, 140, 176, 101, 174, 170,
       167, 195, 165, 129, 188, 196, 172, 118, 119, 104, 171, 117, 164,
       132, 131, 123, 159, 186, 111, 115, 116, 173, 178, 110, 152,  96,
       103, 150, 175, 193, 181, 185, 154, 145, 138, 126, 180, 109, 158,
       168,  97, 182, 127, 130, 194, 125, 105, 102,  26, 179,  28, 183,
       155, 112, 120, 137, 124, 187, 190, 113, 162, 134, 169, 189,   8,
       144, 191, 184, 177,   5,  99, 133,  93,  92,  10, 100, 156, 198,
       106, 153, 146, 128,   7, 121, 160, 107,  90,  27, 197, 136, 139,
       122, 157, 149, 135,  30, 114,  98, 192, 163,  34,  95, 141,  31,
       199,  94, 108,  29, 268, 247, 253, 226, 244, 239, 245, 204, 231,
       238, 258, 230, 254, 265,  71, 223, 246, 250, 266, 240, 205, 261,
       259, 241, 260, 234, 251, 225, 161, 237, 262, 248, 255, 220, 227,
       206, 224, 249, 235, 228, 263,   2, 270, 232, 252, 207, 200, 269,
       233, 256, 273, 272, 242, 264, 208, 214, 222, 271, 203, 22

In [5]:
# Try binning without the -1 value.
# The distinct pdays values are taken from the data itself rather than from a
# literal pasted out of an earlier cell's output, so they cannot go stale when
# the dataset changes.
# NOTE(review): these are distinct values, not the full column, so quantiles
# computed from them ignore how often each value occurs.

cari_bin = df.loc[df['pdays'] != -1, 'pdays'].unique()

In [6]:
# Split the collected pdays values into four equal-frequency bins and show
# the resulting interval edges.
pd.qcut(cari_bin, q=4)

[(143.25, 282.5], (143.25, 282.5], (0.999, 143.25], (0.999, 143.25], (0.999, 143.25], ..., (427.75, 871.0], (427.75, 871.0], (427.75, 871.0], (427.75, 871.0], (427.75, 871.0]]
Length: 558
Categories (4, interval[float64]): [(0.999, 143.25] < (143.25, 282.5] < (282.5, 427.75] < (427.75, 871.0]]

In [7]:
def _pdays_bucket(days):
    """Return the pdays band label for a single pdays value."""
    if days < 0:
        return 'Not Previously Contacted'
    if days < 144:
        return '1 to 143 days'
    if days < 283:
        return '144 to 282 days'
    if days < 428:
        return '283 to 427 days'
    return 'More than 428 days'


df['pdays_group'] = df['pdays'].apply(_pdays_bucket)

# A value of exactly 428 lands in 'More than 428 days'.

## Data Cleaning: Handling 'unknown' Values in Features

- A rule of thumb is that if you are removing less than 5 percent of the observations, you are free to
  just remove all observations that have missing values (source: Udemy, 365 Data Science).
  
- Schafer (1999) asserted that a missing rate of 5% or less is inconsequential.

- Features with a large proportion of missing values will be dropped.

#### Unknown value in Job Feature.

In [8]:
df['job'].unique()

array(['management', 'technician', 'entrepreneur', 'blue-collar',
       'unknown', 'retired', 'admin.', 'services', 'self-employed',
       'unemployed', 'housemaid', 'student'], dtype=object)

In [9]:
df[df['job']=='unknown']['job'].count()

288

In [10]:
# Share of rows whose job is 'unknown', computed from the data instead of a
# count copied by hand from an earlier cell's output.
unknown_job_pct = float((df['job'] == 'unknown').mean() * 100)
print(f'unknown pada kolom job sebesar {round(unknown_job_pct, 2)} %')

unknown pada kolom job sebesar 0.64 %


In [11]:
# Mark 'unknown' jobs as missing values.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0, and
# replace() handles the whole column without a per-row lambda.
df['job'] = df['job'].replace('unknown', np.nan)

In [12]:
df['job'].isnull().sum()

288

#### Unknown value in Education Feature.

In [13]:
df['education'].unique()

array(['tertiary', 'secondary', 'unknown', 'primary'], dtype=object)

In [14]:
df[df['education']=='unknown']['education'].count()

1857

In [15]:
# Share of rows whose education is 'unknown', computed from the data instead
# of a count copied by hand from an earlier cell's output.
unknown_education_pct = float((df['education'] == 'unknown').mean() * 100)
print(f'unknown pada kolom education sebesar {round(unknown_education_pct, 2)} %')

unknown pada kolom education sebesar 4.11 %


In [16]:
# Mark 'unknown' education levels as missing values.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0, and
# replace() handles the whole column without a per-row lambda.
df['education'] = df['education'].replace('unknown', np.nan)

In [17]:
df['education'].isnull().sum()

1857

#### Unknown value in poutcome Feature.

In [18]:
df['poutcome'].unique()

array(['unknown', 'failure', 'other', 'success'], dtype=object)

In [19]:
df[df['poutcome']=='unknown']['poutcome'].count()

36959

In [20]:
# Share of rows whose poutcome is 'unknown', computed from the data instead
# of a count copied by hand from an earlier cell's output.
unknown_poutcome_pct = float((df['poutcome'] == 'unknown').mean() * 100)
print(f'unknown pada kolom poutcome sebesar {round(unknown_poutcome_pct, 2)} %')

unknown pada kolom poutcome sebesar 81.75 %


- In this case, I will drop this feature so that it is not used in the modelling stage.

#### Unknown value in Contact Feature.

In [21]:
df['contact'].unique()

array(['unknown', 'cellular', 'telephone'], dtype=object)

In [22]:
df[df['contact']=='unknown']['contact'].count()

13020

In [23]:
# Share of rows whose contact is 'unknown', computed from the data instead
# of a count copied by hand from an earlier cell's output.
unknown_contact_pct = float((df['contact'] == 'unknown').mean() * 100)
print(f'unknown pada kolom contact sebesar {round(unknown_contact_pct, 2)} %')

unknown pada kolom contact sebesar 28.8 %


- In this case, to fill in the missing values of the Contact feature, I will try using classification, because the number of missing values is neither very large nor very small.

#### Drop.

In [24]:
# Remove every row that contains a missing value, then list the
# per-column count of missing values that remain.

df = df.dropna(axis='index', how='any')

df.isna().sum()

age            0
job            0
marital        0
education      0
default        0
balance        0
housing        0
loan           0
contact        0
day            0
month          0
duration       0
campaign       0
pdays          0
previous       0
poutcome       0
y              0
age_group      0
pdays_group    0
dtype: int64

In [25]:
# Remove the poutcome column from the frame.

df = df.drop('poutcome', axis='columns')

In [26]:
# Remove the raw age and pdays columns; their grouped counterparts
# (age_group, pdays_group) stay in the frame.
df = df.drop(['age', 'pdays'], axis='columns')

## Standardization

In [27]:
df_std_mv = df.copy()

In [28]:
scaler_mv = StandardScaler()

In [29]:
# Learn the per-column mean and standard deviation of the numeric features.
numeric_cols = ['balance', 'day', 'duration', 'campaign', 'previous']
scaler_mv.fit(df_std_mv[numeric_cols])

StandardScaler(copy=True, with_mean=True, with_std=True)

In [30]:
# Overwrite the numeric features with their standardized values.
scaled_cols = ['balance', 'day', 'duration', 'campaign', 'previous']
df_std_mv[scaled_cols] = scaler_mv.transform(df_std_mv[scaled_cols])

df_std_mv

Unnamed: 0,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,previous,y,age_group,pdays_group
0,management,married,tertiary,no,0.259354,yes,no,unknown,-1.301418,may,0.010368,-0.573827,-0.250730,no,55-64,Not Previously Contacted
1,technician,single,secondary,no,-0.435568,yes,no,unknown,-1.301418,may,-0.415726,-0.573827,-0.250730,no,35-44,Not Previously Contacted
2,entrepreneur,married,secondary,no,-0.444443,yes,yes,unknown,-1.301418,may,-0.706245,-0.573827,-0.250730,no,25-34,Not Previously Contacted
5,management,married,tertiary,no,-0.369166,yes,no,unknown,-1.301418,may,-0.462209,-0.573827,-0.250730,no,35-44,Not Previously Contacted
6,management,single,tertiary,no,-0.298161,yes,yes,unknown,-1.301418,may,-0.160070,-0.573827,-0.250730,no,25-34,Not Previously Contacted
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,technician,married,tertiary,no,-0.173904,no,no,cellular,0.143343,nov,2.783852,0.078925,-0.250730,yes,45-54,Not Previously Contacted
45207,retired,divorced,primary,no,0.123262,no,no,cellular,0.143343,nov,0.765716,-0.247451,-0.250730,yes,65+,Not Previously Contacted
45208,retired,married,secondary,no,1.433555,no,no,cellular,0.143343,nov,3.364890,0.731677,1.035364,yes,65+,144 to 282 days
45209,blue-collar,married,secondary,no,-0.225513,no,no,telephone,0.143343,nov,0.967143,0.405301,-0.250730,no,55-64,Not Previously Contacted


## Label Encoder

In [31]:
df_encode = df_std_mv.copy()

In [32]:
# Shared label encoder for the nominal columns:
# job, marital, default, housing, pdays_group, loan and y.

labelencoder_X = LabelEncoder()

In [33]:
# Integer-encode each nominal column in turn. The single encoder is refit for
# every column, so afterwards it only holds the classes of the last one ('y').
for column in ('job', 'marital', 'default', 'housing',
               'pdays_group', 'loan', 'y'):
    df_encode[column] = labelencoder_X.fit_transform(df_encode[column])

In [34]:
# Explicit category orders, so the encoded integers follow the natural
# ranking of each feature rather than alphabetical order.
education_levels = ['primary', 'secondary', 'tertiary']
month_order = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
               'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
age_bands = ['17-24', '25-34', '35-44', '45-54', '55-64', '65+']

labelOE_ed = OrdinalEncoder(categories=[education_levels])
labelOE_mo = OrdinalEncoder(categories=[month_order])
labelOE_age = OrdinalEncoder(categories=[age_bands])

In [35]:
# Apply each ordinal encoder to its own column.
for column, encoder in (('education', labelOE_ed),
                        ('month', labelOE_mo),
                        ('age_group', labelOE_age)):
    df_encode[column] = encoder.fit_transform(df_encode[[column]])

In [36]:
df_encode

Unnamed: 0,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,previous,y,age_group,pdays_group
0,4,1,2.0,0,0.259354,1,0,unknown,-1.301418,4.0,0.010368,-0.573827,-0.250730,0,4.0,4
1,9,2,1.0,0,-0.435568,1,0,unknown,-1.301418,4.0,-0.415726,-0.573827,-0.250730,0,2.0,4
2,2,1,1.0,0,-0.444443,1,1,unknown,-1.301418,4.0,-0.706245,-0.573827,-0.250730,0,1.0,4
5,4,1,2.0,0,-0.369166,1,0,unknown,-1.301418,4.0,-0.462209,-0.573827,-0.250730,0,2.0,4
6,4,2,2.0,0,-0.298161,1,1,unknown,-1.301418,4.0,-0.160070,-0.573827,-0.250730,0,1.0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,9,1,2.0,0,-0.173904,0,0,cellular,0.143343,10.0,2.783852,0.078925,-0.250730,1,3.0,4
45207,5,0,0.0,0,0.123262,0,0,cellular,0.143343,10.0,0.765716,-0.247451,-0.250730,1,5.0,4
45208,5,1,1.0,0,1.433555,0,0,cellular,0.143343,10.0,3.364890,0.731677,1.035364,1,5.0,1
45209,1,1,1.0,0,-0.225513,0,0,telephone,0.143343,10.0,0.967143,0.405301,-0.250730,0,4.0,4


In [37]:
# The ordinal encoders emit floats; cast those columns to integers.

for column in ('education', 'month', 'age_group'):
    df_encode[column] = df_encode[column].astype('int64')

In [38]:
df_encode.head(3)

Unnamed: 0,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,previous,y,age_group,pdays_group
0,4,1,2,0,0.259354,1,0,unknown,-1.301418,4,0.010368,-0.573827,-0.25073,0,4,4
1,9,2,1,0,-0.435568,1,0,unknown,-1.301418,4,-0.415726,-0.573827,-0.25073,0,2,4
2,2,1,1,0,-0.444443,1,1,unknown,-1.301418,4,-0.706245,-0.573827,-0.25073,0,1,4


## Prepare Data for Test Dataset

In [39]:
# Rows whose contact channel is unknown; these are the ones to be imputed.
is_unknown_contact = df_encode['contact'] == 'unknown'
df_tes_miss = df_encode.loc[is_unknown_contact]

df_tes_miss

Unnamed: 0,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,previous,y,age_group,pdays_group
0,4,1,2,0,0.259354,1,0,unknown,-1.301418,4,0.010368,-0.573827,-0.250730,0,4,4
1,9,2,1,0,-0.435568,1,0,unknown,-1.301418,4,-0.415726,-0.573827,-0.250730,0,2,4
2,2,1,1,0,-0.444443,1,1,unknown,-1.301418,4,-0.706245,-0.573827,-0.250730,0,1,4
5,4,1,2,0,-0.369166,1,0,unknown,-1.301418,4,-0.462209,-0.573827,-0.250730,0,2,4
6,4,2,2,0,-0.298161,1,1,unknown,-1.301418,4,-0.160070,-0.573827,-0.250730,0,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45061,6,2,1,0,-0.106187,0,0,unknown,0.504533,9,-0.973522,-0.573827,-0.250730,0,1,4
45062,5,1,0,0,-0.201188,0,0,unknown,0.504533,9,-0.981269,-0.573827,-0.250730,0,4,4
45122,2,2,2,0,-0.358975,1,1,unknown,1.226913,9,-0.934786,-0.573827,-0.250730,0,2,4
45135,1,1,0,0,-0.019732,0,0,unknown,1.467707,9,-0.725612,-0.573827,-0.250730,0,3,4


In [40]:
df_mv = df_encode.copy()

In [41]:
# Mark 'unknown' contact values as missing values.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0, and
# replace() handles the whole column without a per-row lambda.
df_mv['contact'] = df_mv['contact'].replace('unknown', np.nan)

In [42]:
df_mv = df_mv.dropna(axis=0)

In [43]:
df_mv.head()

Unnamed: 0,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,previous,y,age_group,pdays_group
12657,4,2,1,0,-0.433596,0,0,cellular,-1.421815,6,-0.012873,-0.573827,-0.25073,0,1,4
12658,1,1,0,0,-0.291916,0,0,cellular,-1.421815,6,0.149817,-0.573827,-0.25073,0,3,4
12659,1,1,1,0,-0.410585,0,1,cellular,-1.421815,6,1.586916,-0.247451,-0.25073,0,2,4
12660,9,2,1,0,-0.438855,0,0,telephone,-1.421815,6,-0.748854,-0.247451,-0.25073,0,1,4
12661,9,2,1,0,-0.403682,1,1,cellular,-1.421815,6,0.688245,0.405301,-0.25073,0,1,4


In [44]:
# Binary-encode the contact channel: 'cellular' becomes 0 and every other
# value becomes 1.

df_mv['contact'] = (df_mv['contact'] != 'cellular').astype('int64')

In [45]:
df_mv.head(5)

Unnamed: 0,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,previous,y,age_group,pdays_group
12657,4,2,1,0,-0.433596,0,0,0,-1.421815,6,-0.012873,-0.573827,-0.25073,0,1,4
12658,1,1,0,0,-0.291916,0,0,0,-1.421815,6,0.149817,-0.573827,-0.25073,0,3,4
12659,1,1,1,0,-0.410585,0,1,0,-1.421815,6,1.586916,-0.247451,-0.25073,0,2,4
12660,9,2,1,0,-0.438855,0,0,1,-1.421815,6,-0.748854,-0.247451,-0.25073,0,1,4
12661,9,2,1,0,-0.403682,1,1,0,-1.421815,6,0.688245,0.405301,-0.25073,0,1,4


## Splitting Dataset for Predicting Missing Value

In [46]:
# The label is the encoded contact channel; the features are every other
# column of df_mv (this keeps the outcome column `y` among the predictors).
y1 = df_mv['contact']

x1 = df_mv.drop('contact', axis='columns')

In [47]:
# Hold out 20% of the rows with a known contact channel for evaluation;
# the fixed random_state makes the split reproducible.
xtrain1,xtest1,ytrain1,ytest1 =  train_test_split(x1, y1, train_size=.8, random_state=42)

In [48]:
len(xtrain1), len(xtest1), len(ytrain1), len(ytest1)

(24725, 6182, 24725, 6182)

## Modelling

In [49]:
# Neighbourhood size: the square root of the number of training rows,
# rounded to an integer.
k = round(len(xtrain1) ** .5)
k

157

In [50]:
# Train a k-nearest-neighbours classifier on the known contact labels and
# print its score on the held-out split.
model_KNN = KNeighborsClassifier(n_neighbors=k).fit(xtrain1, ytrain1)
knn_score = model_KNN.score(xtest1, ytest1)
print(f'KNN asli  : {knn_score}')

KNN asli  : 0.912649627952119


In [51]:
df_tes_miss

Unnamed: 0,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,previous,y,age_group,pdays_group
0,4,1,2,0,0.259354,1,0,unknown,-1.301418,4,0.010368,-0.573827,-0.250730,0,4,4
1,9,2,1,0,-0.435568,1,0,unknown,-1.301418,4,-0.415726,-0.573827,-0.250730,0,2,4
2,2,1,1,0,-0.444443,1,1,unknown,-1.301418,4,-0.706245,-0.573827,-0.250730,0,1,4
5,4,1,2,0,-0.369166,1,0,unknown,-1.301418,4,-0.462209,-0.573827,-0.250730,0,2,4
6,4,2,2,0,-0.298161,1,1,unknown,-1.301418,4,-0.160070,-0.573827,-0.250730,0,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45061,6,2,1,0,-0.106187,0,0,unknown,0.504533,9,-0.973522,-0.573827,-0.250730,0,1,4
45062,5,1,0,0,-0.201188,0,0,unknown,0.504533,9,-0.981269,-0.573827,-0.250730,0,4,4
45122,2,2,2,0,-0.358975,1,1,unknown,1.226913,9,-0.934786,-0.573827,-0.250730,0,2,4
45135,1,1,0,0,-0.019732,0,0,unknown,1.467707,9,-0.725612,-0.573827,-0.250730,0,3,4


## Transfer Data

In [52]:
# Predict the contact channel for the rows where it is unknown.
# - Selecting x1.columns feeds the model exactly the feature columns (and
#   column order) it was fitted on, and keeps this cell safe to re-run once
#   'predicted_contact' already exists in df_tes_miss.
# - assign() builds a new frame instead of writing into a filtered slice of
#   df_encode, which is what triggers pandas' SettingWithCopyWarning.
df_tes_miss = df_tes_miss.assign(
    predicted_contact=model_KNN.predict(df_tes_miss[x1.columns])
)

In [53]:
df_tes_miss['predicted_contact'].unique()

# Every prediction is 0.

array([0])

In [54]:
transfer = df_tes_miss['predicted_contact'].tolist()

In [55]:
# Write the predictions into the rows whose contact is still 'unknown'.
# `transfer` is a plain list, so values are matched to rows by position.
unknown_rows = df['contact'] == 'unknown'
df.loc[unknown_rows, 'contact'] = transfer

In [56]:
# Decode the numeric predictions back to their labels
# (0 = cellular, 1 = telephone). Both codes are mapped so that a telephone
# prediction is never left in the column as a bare integer 1.

df['contact'] = df['contact'].replace({0: 'cellular', 1: 'telephone'})

In [57]:
# No 'unknown' values remain.

df[df['contact']=='unknown']

Unnamed: 0,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,previous,y,age_group,pdays_group


## Transfer File

In [58]:
# Save the cleaned frame to CSV without writing the row index.
df.to_csv('Bank_Clean.csv', index=False)