# Importing Library

In [107]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Mengubah parameter default matplotlib
Parameter default matplotlib dapat diubah dengan rcParams sebagai berikut


In [108]:
from matplotlib import rcParams
rcParams['figure.figsize'] = 12, 4
rcParams['lines.linewidth'] = 3
rcParams['xtick.labelsize'] = 'x-large'
rcParams['ytick.labelsize'] = 'x-large'

## Load dataset

In [109]:
df = pd.read_csv('botak_kotor.csv')

In [110]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7917 entries, 0 to 7916
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   umur           7838 non-null   float64
 1   jenis_kelamin  7840 non-null   object 
 2   pekerjaan      7760 non-null   object 
 3   provinsi       7831 non-null   object 
 4   gaji           7843 non-null   float64
 5   is_menikah     7845 non-null   float64
 6   is_keturunan   7828 non-null   float64
 7   berat          7861 non-null   float64
 8   tinggi         7843 non-null   float64
 9   sampo          7858 non-null   object 
 10  is_merokok     7847 non-null   float64
 11  pendidikan     7847 non-null   object 
 12  stress         7853 non-null   float64
 13  botak_prob     7838 non-null   float64
dtypes: float64(9), object(5)
memory usage: 866.0+ KB


# 1. Evaluasi Kualitas Data

## 1.a Missing Value

In [111]:
### checking
df.isna().sum()

umur              79
jenis_kelamin     77
pekerjaan        157
provinsi          86
gaji              74
is_menikah        72
is_keturunan      89
berat             56
tinggi            74
sampo             59
is_merokok        70
pendidikan        70
stress            64
botak_prob        79
dtype: int64

In [112]:
cats = ['jenis_kelamin', 'pekerjaan', 'sampo', 'pendidikan', 'provinsi']
nums = ['umur', 'gaji', 'is_menikah', 'is_keturunan', 'berat', 'tinggi', 'is_merokok', 'stress', 'botak_prob']

In [113]:
df[nums].describe()

Unnamed: 0,umur,gaji,is_menikah,is_keturunan,berat,tinggi,is_merokok,stress,botak_prob
count,7838.0,7843.0,7845.0,7828.0,7861.0,7843.0,7847.0,7853.0,7838.0
mean,39.461725,8853770.0,0.979095,0.20465,55.957306,167.252704,0.497133,5.456259,0.573851
std,9.942286,4499844.0,0.143076,0.403471,9.594065,10.933646,0.500024,2.860623,0.173386
min,-1.0,1500000.0,0.0,0.0,40.0,125.491784,0.0,1.0,0.064633
25%,33.0,5735468.0,1.0,0.0,49.312715,159.758874,0.0,3.0,0.45236
50%,39.0,7850662.0,1.0,0.0,53.839669,167.155287,0.0,5.0,0.567954
75%,46.0,10835890.0,1.0,0.0,60.259654,174.43962,1.0,8.0,0.686782
max,77.0,50000000.0,1.0,1.0,150.0,226.454577,1.0,10.0,1.0


In [114]:
df[cats].describe()

Unnamed: 0,jenis_kelamin,pekerjaan,sampo,pendidikan,provinsi
count,7840,7760,7858,7847,7831
unique,2,4,5,6,34
top,Laki-laki,Pegawai swasta,Deadbuoy,S1,Kupang
freq,5367,3160,1618,4309,261


## Missing Data

In [115]:
df.isna().sum()

umur              79
jenis_kelamin     77
pekerjaan        157
provinsi          86
gaji              74
is_menikah        72
is_keturunan      89
berat             56
tinggi            74
sampo             59
is_merokok        70
pendidikan        70
stress            64
botak_prob        79
dtype: int64

### Drop

In [116]:
df.dropna(subset=['botak_prob'], inplace=True)

### Imputasi Numeric

In [117]:
df['is_menikah'] = df.apply(lambda x: x['umur'] > 30 if np.isnan(x['is_menikah']) else x['is_menikah'], axis=1)
df['is_keturunan'].fillna(0, inplace=True)
df['is_merokok'].fillna(1, inplace=True)

df['umur'].fillna(df['umur'].mean(), inplace=True)
df['gaji'].fillna(df['gaji'].mean(), inplace=True)
df['tinggi'].fillna(df['berat'].mean(), inplace=True)
df['berat'].fillna(df['tinggi'].mean(), inplace=True)
df['stress'].fillna(df['stress'].mean(), inplace=True)

### Imputasi Categorical

In [146]:
df['jenis_kelamin'].fillna(df['jenis_kelamin'].mode()[0], inplace=True)
df['provinsi'].fillna(df['provinsi'].mode()[0], inplace=True)
df['sampo'].fillna(df['sampo'].mode()[0], inplace=True)
df['pendidikan'].fillna(df['pendidikan'].mode()[0], inplace=True)
df['pekerjaan'].fillna(df['pekerjaan'].mode()[0], inplace=True)

In [119]:
df.isna().sum()

umur             0
jenis_kelamin    0
pekerjaan        0
provinsi         0
gaji             0
is_menikah       0
is_keturunan     0
berat            0
tinggi           0
sampo            0
is_merokok       0
pendidikan       0
stress           0
botak_prob       0
dtype: int64

## 1.b Inconsitency Data

In [120]:
# minimum value have negative number
df['umur'].describe()

count    7838.000000
mean       39.472484
std         9.887590
min        -1.000000
25%        33.000000
50%        39.472484
75%        46.000000
max        77.000000
Name: umur, dtype: float64

In [121]:
## change negative number by minimum
df.loc[df['umur'] ==-1,'umur'] = df['umur'].mean()

In [122]:
df['umur'].describe()

count    7838.000000
mean       39.503465
std         9.823920
min         2.000000
25%        33.000000
50%        39.472484
75%        46.000000
max        77.000000
Name: umur, dtype: float64

## 1.c Duplicated Data

In [123]:
df.duplicated(subset=['umur','sampo', 'berat']).sum()

90

In [124]:
print(df.duplicated().sum())
df.drop_duplicates(inplace=True)
print(df.duplicated().sum())

81
0


In [154]:
df = df.reset_index(drop=True)

# 2. Normalization and Standardization

In [155]:
df.describe()

Unnamed: 0,umur,jenis_kelamin,pekerjaan,provinsi,gaji,is_keturunan,berat,tinggi,sampo,is_merokok,pendidikan,stress,botak_prob,berat_norm,tinggi_norm,umur_std,gaji_std
count,7757.0,7680.0,7680.0,7680.0,7757.0,7757.0,7757.0,7757.0,7680.0,7757.0,7680.0,7757.0,7757.0,7757.0,7757.0,7757.0,7757.0
mean,39.499312,0.310417,1.363542,16.266406,8839568.0,0.202527,56.742044,166.202713,1.965495,0.50303,1.552214,5.446902,0.573762,0.132634,0.646626,-5.189223e-16,-2.450448e-16
std,9.826394,0.462694,0.778085,9.701314,4464131.0,0.401909,13.35917,15.240704,1.429229,0.500023,1.939207,2.853937,0.173528,0.105834,0.089386,1.000064,1.000064
min,2.0,0.0,0.0,0.0,1500000.0,0.0,40.0,55.950123,0.0,0.0,0.0,1.0,0.064633,0.0,0.0,-3.816428,-1.644226
25%,33.0,0.0,1.0,8.0,5751593.0,0.0,49.331837,159.518068,1.0,0.0,0.0,3.0,0.451812,0.073929,0.607421,-0.6614563,-0.6917751
50%,39.472484,0.0,1.0,16.0,7886925.0,0.0,53.914947,167.003566,2.0,1.0,0.0,5.0,0.567963,0.110237,0.651323,-0.00273039,-0.2134132
75%,46.0,1.0,2.0,25.0,10775210.0,0.0,60.50164,174.358056,3.0,1.0,4.0,8.0,0.686788,0.162419,0.694457,0.6615964,0.4336262
max,77.0,1.0,3.0,33.0,50000000.0,1.0,166.227238,226.454577,4.0,1.0,5.0,10.0,1.0,1.0,1.0,3.816568,9.220851


## 2a. Normalization

In [156]:
from sklearn.preprocessing import MinMaxScaler

In [157]:
df['berat_norm'] = MinMaxScaler().fit_transform(df['berat'].values.reshape(len(df), 1))
df['tinggi_norm'] = MinMaxScaler().fit_transform(df['tinggi'].values.reshape(len(df), 1))

In [158]:
df.describe()

Unnamed: 0,umur,jenis_kelamin,pekerjaan,provinsi,gaji,is_keturunan,berat,tinggi,sampo,is_merokok,pendidikan,stress,botak_prob,berat_norm,tinggi_norm,umur_std,gaji_std
count,7757.0,7680.0,7680.0,7680.0,7757.0,7757.0,7757.0,7757.0,7680.0,7757.0,7680.0,7757.0,7757.0,7757.0,7757.0,7757.0,7757.0
mean,39.499312,0.310417,1.363542,16.266406,8839568.0,0.202527,56.742044,166.202713,1.965495,0.50303,1.552214,5.446902,0.573762,0.132634,0.646626,-5.189223e-16,-2.450448e-16
std,9.826394,0.462694,0.778085,9.701314,4464131.0,0.401909,13.35917,15.240704,1.429229,0.500023,1.939207,2.853937,0.173528,0.105834,0.089386,1.000064,1.000064
min,2.0,0.0,0.0,0.0,1500000.0,0.0,40.0,55.950123,0.0,0.0,0.0,1.0,0.064633,0.0,0.0,-3.816428,-1.644226
25%,33.0,0.0,1.0,8.0,5751593.0,0.0,49.331837,159.518068,1.0,0.0,0.0,3.0,0.451812,0.073929,0.607421,-0.6614563,-0.6917751
50%,39.472484,0.0,1.0,16.0,7886925.0,0.0,53.914947,167.003566,2.0,1.0,0.0,5.0,0.567963,0.110237,0.651323,-0.00273039,-0.2134132
75%,46.0,1.0,2.0,25.0,10775210.0,0.0,60.50164,174.358056,3.0,1.0,4.0,8.0,0.686788,0.162419,0.694457,0.6615964,0.4336262
max,77.0,1.0,3.0,33.0,50000000.0,1.0,166.227238,226.454577,4.0,1.0,5.0,10.0,1.0,1.0,1.0,3.816568,9.220851


## 2b. Standardization

In [159]:
from sklearn.preprocessing import StandardScaler

In [160]:
df['umur_std'] = StandardScaler().fit_transform(df['umur'].values.reshape(len(df), 1))
df['gaji_std'] = StandardScaler().fit_transform(df['gaji'].values.reshape(len(df), 1))

In [161]:
df.describe()

Unnamed: 0,umur,jenis_kelamin,pekerjaan,provinsi,gaji,is_keturunan,berat,tinggi,sampo,is_merokok,pendidikan,stress,botak_prob,berat_norm,tinggi_norm,umur_std,gaji_std
count,7757.0,7680.0,7680.0,7680.0,7757.0,7757.0,7757.0,7757.0,7680.0,7757.0,7680.0,7757.0,7757.0,7757.0,7757.0,7757.0,7757.0
mean,39.499312,0.310417,1.363542,16.266406,8839568.0,0.202527,56.742044,166.202713,1.965495,0.50303,1.552214,5.446902,0.573762,0.132634,0.646626,-5.189223e-16,-2.450448e-16
std,9.826394,0.462694,0.778085,9.701314,4464131.0,0.401909,13.35917,15.240704,1.429229,0.500023,1.939207,2.853937,0.173528,0.105834,0.089386,1.000064,1.000064
min,2.0,0.0,0.0,0.0,1500000.0,0.0,40.0,55.950123,0.0,0.0,0.0,1.0,0.064633,0.0,0.0,-3.816428,-1.644226
25%,33.0,0.0,1.0,8.0,5751593.0,0.0,49.331837,159.518068,1.0,0.0,0.0,3.0,0.451812,0.073929,0.607421,-0.6614563,-0.6917751
50%,39.472484,0.0,1.0,16.0,7886925.0,0.0,53.914947,167.003566,2.0,1.0,0.0,5.0,0.567963,0.110237,0.651323,-0.00273039,-0.2134132
75%,46.0,1.0,2.0,25.0,10775210.0,0.0,60.50164,174.358056,3.0,1.0,4.0,8.0,0.686788,0.162419,0.694457,0.6615964,0.4336262
max,77.0,1.0,3.0,33.0,50000000.0,1.0,166.227238,226.454577,4.0,1.0,5.0,10.0,1.0,1.0,1.0,3.816568,9.220851


# 3. Feature Encoding

## 3.a One Hot Encoding

In [162]:
for cat in cats:
    onehots = pd.get_dummies(df[cat], prefix=cat)
    df_onehot = df.join(onehots)

In [163]:
df.sample(5)

Unnamed: 0,umur,jenis_kelamin,pekerjaan,provinsi,gaji,is_menikah,is_keturunan,berat,tinggi,sampo,is_merokok,pendidikan,stress,botak_prob,berat_norm,tinggi_norm,umur_std,gaji_std
6631,38.0,1.0,2.0,0.0,6735476.0,1,0.0,61.63423,180.256574,3.0,1.0,0.0,6.0,0.543912,0.171391,0.729051,-0.15259,-0.471363
1927,42.0,0.0,2.0,30.0,4697662.0,1,0.0,61.998865,164.950347,0.0,0.0,1.0,2.0,0.519725,0.17428,0.639281,0.254503,-0.927879
7501,45.0,0.0,1.0,21.0,11046090.0,1,1.0,57.369429,163.507327,3.0,0.0,1.0,1.0,1.0,0.137604,0.630818,0.559823,0.494309
3918,33.0,0.0,1.0,0.0,8126573.0,1,0.0,49.092423,159.047866,1.0,1.0,0.0,6.0,0.599375,0.072032,0.604663,-0.661456,-0.159727
2771,33.0,0.0,0.0,3.0,3485606.0,1,1.0,48.494338,168.520657,2.0,1.0,4.0,7.0,0.826226,0.067294,0.66022,-0.661456,-1.199407


In [164]:
df_onehot

Unnamed: 0,umur,jenis_kelamin,pekerjaan,provinsi,gaji,is_menikah,is_keturunan,berat,tinggi,sampo,...,provinsi_24.0,provinsi_25.0,provinsi_26.0,provinsi_27.0,provinsi_28.0,provinsi_29.0,provinsi_30.0,provinsi_31.0,provinsi_32.0,provinsi_33.0
0,27.0,1.0,1.0,5.0,7.957453e+06,1,0.0,54.315053,170.428542,3.0,...,0,0,0,0,0,0,0,0,0,0
1,53.0,1.0,1.0,3.0,7.633003e+06,1,0.0,72.873404,165.530097,3.0,...,0,0,0,0,0,0,0,0,0,0
2,37.0,1.0,2.0,3.0,6.637625e+06,1,0.0,46.321533,154.599388,2.0,...,0,0,0,0,0,0,0,0,0,0
3,36.0,1.0,3.0,22.0,3.624871e+06,1,0.0,51.539781,167.340481,0.0,...,0,0,0,0,0,0,0,0,0,0
4,38.0,0.0,0.0,20.0,6.031808e+06,1,0.0,60.726909,165.514773,1.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7752,52.0,,,,4.286318e+06,1,0.0,46.893444,169.880171,,...,0,0,0,0,0,0,0,0,0,0
7753,45.0,,,,8.646146e+06,1,0.0,53.878714,170.122857,,...,0,0,0,0,0,0,0,0,0,0
7754,24.0,,,,6.850066e+06,1,0.0,64.171294,161.384804,,...,0,0,0,0,0,0,0,0,0,0
7755,29.0,,,,1.204960e+07,1,0.0,55.217752,179.602130,,...,0,0,0,0,0,0,0,0,0,0


## 3b. Label Encoding

In [165]:
from sklearn.preprocessing import OrdinalEncoder
encoder = OrdinalEncoder()

In [166]:
ordinals = pd.DataFrame(encoder.fit_transform(df[cats]),columns = cats)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [167]:
ordinals

Unnamed: 0,jenis_kelamin,pekerjaan,sampo,pendidikan,provinsi
0,1.0,1.0,3.0,0.0,5.0
1,1.0,1.0,3.0,0.0,3.0
2,1.0,2.0,2.0,0.0,3.0
3,1.0,3.0,0.0,3.0,22.0
4,0.0,0.0,1.0,1.0,20.0
...,...,...,...,...,...
7752,0.0,2.0,0.0,0.0,12.0
7753,0.0,2.0,0.0,0.0,12.0
7754,0.0,2.0,0.0,0.0,12.0
7755,0.0,2.0,0.0,0.0,12.0


In [150]:
df[cats]=ordinals

In [151]:
df

Unnamed: 0,umur,jenis_kelamin,pekerjaan,provinsi,gaji,is_menikah,is_keturunan,berat,tinggi,sampo,is_merokok,pendidikan,stress,botak_prob,berat_norm,tinggi_norm,umur_std,gaji_std
0,27.0,1.0,1.0,5.0,7.957453e+06,1,0.0,54.315053,170.428542,3.0,1.0,0.0,5.0,0.605974,0.113407,0.671410,-1.272096,-0.197613
1,53.0,1.0,1.0,3.0,7.633003e+06,1,0.0,72.873404,165.530097,3.0,0.0,0.0,7.0,0.532860,0.260430,0.642681,1.374009,-0.270297
2,37.0,1.0,2.0,3.0,6.637625e+06,1,0.0,46.321533,154.599388,2.0,0.0,0.0,4.0,0.418442,0.050081,0.578573,-0.254363,-0.493284
3,36.0,1.0,3.0,22.0,3.624871e+06,1,0.0,51.539781,167.340481,0.0,1.0,3.0,9.0,0.804050,0.091421,0.653299,-0.356136,-1.168208
4,38.0,0.0,0.0,20.0,6.031808e+06,1,0.0,60.726909,165.514773,1.0,1.0,1.0,1.0,0.368371,0.164203,0.642591,-0.152590,-0.629001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7830,52.0,,,,4.286318e+06,1,0.0,46.893444,169.880171,,1.0,,2.0,0.773141,0.054611,0.668194,1.272236,-1.020029
7831,45.0,,,,8.646146e+06,1,0.0,53.878714,170.122857,,0.0,,2.0,0.604058,0.109950,0.669617,0.559823,-0.043331
7832,24.0,,,,6.850066e+06,1,0.0,64.171294,161.384804,,0.0,,8.0,0.502413,0.191490,0.618369,-1.577416,-0.445693
7833,29.0,,,,1.204960e+07,1,0.0,55.217752,179.602130,,1.0,,6.0,0.562664,0.120558,0.725213,-1.068549,0.719119


## Imbalanced Dataset

In [142]:
df['botak_class'] = df['botak_prob'] > 0.8
df['botak_class'].value_counts()

False    6495
True      756
Name: botak_class, dtype: int64

In [143]:
X = df[[col for col in df.columns if (str(df[col].dtype) != 'object') and col not in ['botak_prob', 'botak_class']]]
y = df['botak_class'].values
print(X.shape)
print(y.shape)

(7251, 64)
(7251,)


In [150]:
from imblearn import under_sampling, over_sampling
X_under, y_under = under_sampling.RandomUnderSampler(0.5).fit_resample(X, y)
X_over, y_over = over_sampling.RandomOverSampler(0.5).fit_resample(X, y)



In [152]:
pd.Series(y_over).value_counts()

False    6495
True     3247
dtype: int64