In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


from feature_engine.imputation import CategoricalImputer
from feature_engine.outliers import Winsorizer
from feature_engine.encoding import (
    CountFrequencyEncoder
)

In [2]:
url = 'dataset/train.csv'
train = pd.read_csv(url)
test = pd.read_csv('dataset/test.csv')
train.head()

Unnamed: 0,ID,Tahun Kelahiran,Kelas Pekerjaan,fnlwgt,Pendidikan,Jenjang Pendidikan,Status,Pekerjaan,Hubungan,Etnis,sex,pendapatan,pengeluaran,hours per week,Asal Negara,jumlah_anak,income
0,478,1992,Swasta,37210,Sarjana,Sarjana,Menikah,Eksekutif Manager,Suami,1,Male,0,0,45,United-States,2,>50K
1,479,1981,Swasta,101950,Magister,Magister,Belum-menikah,Eksekutif Manager,Tidak dalam Keluarga,1,Female,0,0,45,United-States,0,<=50K
2,480,2004,,122244,SMA Grad,SMA Grad,Belum-menikah,,Tidak dalam Keluarga,1,Female,0,0,28,United-States,0,<=50K
3,481,1984,Pemerintah daerah,24763,Diploma,Kuliah,Cerai,Transportasi dan Pengangkutan,Belum Menikah,1,Male,6849,0,40,United-States,0,<=50K
4,482,2000,Swasta,113936,Sarjana,Sarjana,Belum-menikah,Spesialis Profesional,Punya Anak,1,Male,0,0,40,United-States,0,<=50K


## Data Preprocessing

In [3]:
# Dropping unnecessary column
train.drop('ID', axis=1, inplace=True)
train.head(2)

Unnamed: 0,Tahun Kelahiran,Kelas Pekerjaan,fnlwgt,Pendidikan,Jenjang Pendidikan,Status,Pekerjaan,Hubungan,Etnis,sex,pendapatan,pengeluaran,hours per week,Asal Negara,jumlah_anak,income
0,1992,Swasta,37210,Sarjana,Sarjana,Menikah,Eksekutif Manager,Suami,1,Male,0,0,45,United-States,2,>50K
1,1981,Swasta,101950,Magister,Magister,Belum-menikah,Eksekutif Manager,Tidak dalam Keluarga,1,Female,0,0,45,United-States,0,<=50K


In [4]:
# function to check missing & duplicated values
def check_data(df):
    print('-' *50)
    
    missing_data = df.isnull().sum().sum()
    
    print(f'There are {missing_data} missing values in the dataset.')
    print(f'There are {df.duplicated().sum().sum()} duplicated values in the dataset.')
    print('-' *50)
    
    if missing_data > 0:
        print('Missing values:')
        print(df.isnull().sum().sort_values(ascending=False))
        print('-' *50)


In [5]:
check_data(train)

--------------------------------------------------
There are 2956 missing values in the dataset.
There are 35 duplicated values in the dataset.
--------------------------------------------------
Missing values:
Pekerjaan             1481
Kelas Pekerjaan       1475
Tahun Kelahiran          0
fnlwgt                   0
Pendidikan               0
Jenjang Pendidikan       0
Status                   0
Hubungan                 0
Etnis                    0
sex                      0
pendapatan               0
pengeluaran              0
hours per week           0
Asal Negara              0
jumlah_anak              0
income                   0
dtype: int64
--------------------------------------------------


## Handling Missing Values

In [6]:
train.Pekerjaan.nunique()

14

In [7]:
train['Kelas Pekerjaan'].nunique()

8

In [8]:
categorical_imputer = CategoricalImputer(
    imputation_method="frequent",
    variables=['Kelas Pekerjaan', 'Pekerjaan'],
)

train = categorical_imputer.fit_transform(train)
test = categorical_imputer.transform(test)

KeyError: "['income'] not in index"

In [35]:
check_data(train)

--------------------------------------------------
There are 2956 missing values in the dataset.
There are 0 duplicated values in the dataset.
--------------------------------------------------
Missing values:
Pekerjaan             1481
Kelas Pekerjaan       1475
ID                       0
sex                      0
jumlah_anak              0
Asal Negara              0
hours per week           0
pengeluaran              0
pendapatan               0
Hubungan                 0
Etnis                    0
Tahun Kelahiran          0
Status                   0
Jenjang Pendidikan       0
Pendidikan               0
fnlwgt                   0
income                   0
dtype: int64
--------------------------------------------------


## Feature Engineering