In [1]:
import pandas as pd
import sklearn
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the dataset; cells containing the literal string '#NAME?' are read as missing (NaN).
df = pd.read_csv('adul_fix_maybe.csv', na_values=['#NAME?'])

In [3]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(32561, 15)

In [4]:
# Column dtypes, to tell the numeric columns apart from the `object` ones.
df.dtypes

age                int64
workclass         object
fnlwgt             int64
education         object
education_num      int64
marital_status    object
occupation        object
relationship      object
race              object
sex               object
capita_gain        int64
capital_loss       int64
hours_per_weel     int64
native_country    object
income            object
dtype: object

In [5]:
# Preview the first five rows of the raw data.
print(df.head(5))

   age         workclass  fnlwgt  education  education_num  \
0   39         State-gov   77516  Bachelors             13   
1   50  Self-emp-not-inc   83311  Bachelors             13   
2   38           Private  215646    HS-grad              9   
3   53           Private  234721       11th              7   
4   28           Private  338409  Bachelors             13   

       marital_status         occupation   relationship   race     sex  \
0       Never-married       Adm-clerical  Not-in-family  White    Male   
1  Married-civ-spouse    Exec-managerial        Husband  White    Male   
2            Divorced  Handlers-cleaners  Not-in-family  White    Male   
3  Married-civ-spouse  Handlers-cleaners        Husband  Black    Male   
4  Married-civ-spouse     Prof-specialty           Wife  Black  Female   

   capita_gain  capital_loss  hours_per_weel native_country income  
0         2174             0              40  United-States  <=50K  
1            0             0              13

In [6]:
# `income` is the outcome variable and is used as the label; count the rows in each class.
df['income'].value_counts()

<=50K    24720
>50K      7841
Name: income, dtype: int64

In [7]:
# Count the missing (NA) values in each column.
df.isna().sum()

age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capita_gain       0
capital_loss      0
hours_per_weel    0
native_country    0
income            0
dtype: int64

In [8]:
# Null count per column, sorted so the columns with the most missing values come first.
df.isnull().sum().sort_values(ascending = False)

income            0
native_country    0
hours_per_weel    0
capital_loss      0
capita_gain       0
sex               0
race              0
relationship      0
occupation        0
marital_status    0
education_num     0
education         0
fnlwgt            0
workclass         0
age               0
dtype: int64

In [9]:
# Encode the label: '<=50K' -> 0, every other value ('>50K') -> 1.
# The guard makes this cell safe to re-run: once the column is numeric, comparing it
# with '<=50K' again would silently turn every label into 1.
if not pd.api.types.is_numeric_dtype(df['income']):
    df['income'] = (df['income'] != '<=50K').astype('int64')

# X holds the feature columns, y the outcome variable.
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.
X = df.drop(columns='income')
y = df['income']

In [10]:
# Preview the first five rows of the feature frame X.
print((X).head(5))

   age         workclass  fnlwgt  education  education_num  \
0   39         State-gov   77516  Bachelors             13   
1   50  Self-emp-not-inc   83311  Bachelors             13   
2   38           Private  215646    HS-grad              9   
3   53           Private  234721       11th              7   
4   28           Private  338409  Bachelors             13   

       marital_status         occupation   relationship   race     sex  \
0       Never-married       Adm-clerical  Not-in-family  White    Male   
1  Married-civ-spouse    Exec-managerial        Husband  White    Male   
2            Divorced  Handlers-cleaners  Not-in-family  White    Male   
3  Married-civ-spouse  Handlers-cleaners        Husband  Black    Male   
4  Married-civ-spouse     Prof-specialty           Wife  Black  Female   

   capita_gain  capital_loss  hours_per_weel native_country  
0         2174             0              40  United-States  
1            0             0              13  United-State

In [11]:
# Row count per class of the label y.
y.value_counts()

0    24720
1     7841
Name: income, dtype: int64

In [12]:
# First five values of the label y.
y.head(5)

0    0
1    0
2    0
3    0
4    0
Name: income, dtype: int64

# Data Cleaning

In [13]:
# `education` is a categorical column; preview its first five values.
print(X['education'].head(5))

0    Bachelors
1    Bachelors
2      HS-grad
3         11th
4    Bachelors
Name: education, dtype: object


In [14]:
# Use pandas' get_dummies to one-hot encode `education` (preview only; X is not modified here).
print(pd.get_dummies(X['education']).head(5))

   10th  11th  12th  1st-4th  5th-6th  7th-8th  9th  Assoc-acdm  Assoc-voc  \
0     0     0     0        0        0        0    0           0          0   
1     0     0     0        0        0        0    0           0          0   
2     0     0     0        0        0        0    0           0          0   
3     0     1     0        0        0        0    0           0          0   
4     0     0     0        0        0        0    0           0          0   

   Bachelors  Doctorate  HS-grad  Masters  Preschool  Prof-school  \
0          1          0        0        0          0            0   
1          1          0        0        0          0            0   
2          0          0        1        0          0            0   
3          0          0        0        0          0            0   
4          1          0        0        0          0            0   

   Some-college  
0             0  
1             0  
2             0  
3             0  
4             0  


In [15]:
# Report how many distinct values each object-dtype (categorical) column of X has.
for col_name in X.columns:
    if X[col_name].dtypes != 'object':
        continue  # only object-dtype columns are reported
    unique_cat = len(X[col_name].unique())
    message = "Kolom '{col_name}' has '{unique_cat}' Uniques Categories"
    print(message.format(col_name=col_name, unique_cat=unique_cat))

Kolom 'workclass' has '9' Uniques Categories
Kolom 'education' has '16' Uniques Categories
Kolom 'marital_status' has '7' Uniques Categories
Kolom 'occupation' has '15' Uniques Categories
Kolom 'relationship' has '6' Uniques Categories
Kolom 'race' has '5' Uniques Categories
Kolom 'sex' has '2' Uniques Categories
Kolom 'native_country' has '42' Uniques Categories


In [16]:
# Names of the categorical columns that will be one-hot encoded.
todummy_list = [
    'workclass',
    'education',
    'marital_status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native_country',
]

In [17]:
# Five most frequent values of `native_country`.
print(X['native_country'].value_counts().sort_values(ascending=False).head(5))

United-States    29170
Mexico             643
?                  583
Philippines        198
Germany            137
Name: native_country, dtype: int64


In [18]:
# Collapse `native_country` to two levels: keep 'United-States', relabel everything else as 'Other'.
X['native_country'] = X['native_country'].where(
    X['native_country'] == 'United-States', 'Other'
)

# Show the resulting level counts, largest first.
print(X['native_country'].value_counts().sort_values(ascending=False))

United-States    29170
Other             3391
Name: native_country, dtype: int64


In [19]:
# Display the current contents of the `native_country` column.
X['native_country']

0        United-States
1        United-States
2        United-States
3        United-States
4                Other
             ...      
32556    United-States
32557    United-States
32558    United-States
32559    United-States
32560    United-States
Name: native_country, Length: 32561, dtype: object

In [20]:
# Preview the indicator columns get_dummies produces for `native_country` (X is not modified here).
print(pd.get_dummies(X['native_country']))

       Other  United-States
0          0              1
1          0              1
2          0              1
3          0              1
4          1              0
...      ...            ...
32556      0              1
32557      0              1
32558      0              1
32559      0              1
32560      0              1

[32561 rows x 2 columns]


In [21]:
# One-hot encode every categorical variable that will be used by the model.
def dummy_df(df, todummy_list):
    """Replace each listed column with its one-hot indicator columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; a new frame is returned.
    todummy_list : list of str
        Names of the columns to encode. Each must exist in ``df``.

    Returns
    -------
    pandas.DataFrame
        The untouched columns followed by the indicator columns of each
        encoded column, named ``<column>_<value>``, in ``todummy_list`` order.

    Raises
    ------
    KeyError
        If a name in ``todummy_list`` is not a column of ``df``.
    """
    for x in todummy_list:
        # dummy_na=False: missing values get no indicator column of their own.
        dummies = pd.get_dummies(df[x], prefix=x, dummy_na=False)
        # `columns=` instead of the positional axis (`drop(x, 1)`), which
        # raises TypeError on pandas >= 2.0.
        df = df.drop(columns=x)
        df = pd.concat([df, dummies], axis=1)
    return df

In [22]:
# One-hot encode the categorical columns and preview the result.
# NOTE: X is overwritten, so this cell cannot be re-run on its own: the listed
# columns are gone after the first run and dummy_df would raise KeyError.
X = dummy_df(X, todummy_list)
print(X.head(5))

   age  fnlwgt  education_num  capita_gain  capital_loss  hours_per_weel  \
0   39   77516             13         2174             0              40   
1   50   83311             13            0             0              13   
2   38  215646              9            0             0              40   
3   53  234721              7            0             0              40   
4   28  338409             13            0             0              40   

   workclass_?  workclass_Federal-gov  workclass_Local-gov  \
0            0                      0                    0   
1            0                      0                    0   
2            0                      0                    0   
3            0                      0                    0   
4            0                      0                    0   

   workclass_Never-worked  ...  relationship_Wife  race_Amer-Indian-Eskimo  \
0                       0  ...                  0                        0   
1             

# Fixing/Handling Missing Values

In [23]:
# Five columns of X with the highest null counts.
X.isnull().sum().sort_values(ascending = False).head(5)
# No column contains null values, so the workflow can continue to the next step.

native_country_United-States    0
education_Bachelors             0
education_1st-4th               0
education_5th-6th               0
education_7th-8th               0
dtype: int64

### Tukey IQR

In [38]:
def find_outliers_tukey(x, k=1.5):
    """Flag outliers in a Series using Tukey's IQR fences.

    A value is an outlier when it lies below ``Q1 - k*IQR`` or above
    ``Q3 + k*IQR``, where ``IQR = Q3 - Q1``.

    Parameters
    ----------
    x : pandas.Series
        Numeric values to screen.
    k : float, default 1.5
        Fence multiplier. The default reproduces the classic Tukey rule.

    Returns
    -------
    tuple of (list, list)
        Index labels of the outliers and the matching values, both in the
        order in which they appear in ``x``.
    """
    q1 = np.percentile(x, 25)
    q3 = np.percentile(x, 75)

    iqr = q3 - q1  # interquartile range

    floor = q1 - k * iqr    # lower fence, below Q1
    ceiling = q3 + k * iqr  # upper fence, above Q3

    # Select with the boolean mask instead of looking the labels up again:
    # with a non-unique index, a label lookup would also return the
    # non-outlier rows that happen to share a label with an outlier.
    is_outlier = (x < floor) | (x > ceiling)
    outliers_indices = list(x.index[is_outlier])
    outliers_values = list(x[is_outlier])

    return outliers_indices, outliers_values

In [39]:
# Apply the Tukey rule to `age` and print the flagged values in ascending order.
tukey_indices, tukey_values = find_outliers_tukey(X['age'])
print(np.sort(tukey_values))

[79 79 79 79 79 79 79 79 79 79 79 79 79 79 79 79 79 79 79 79 79 79 80 80
 80 80 80 80 80 80 80 80 80 80 80 80 80 80 80 80 80 80 80 80 81 81 81 81
 81 81 81 81 81 81 81 81 81 81 81 81 81 81 81 81 82 82 82 82 82 82 82 82
 82 82 82 82 83 83 83 83 83 83 84 84 84 84 84 84 84 84 84 84 85 85 85 86
 87 88 88 88 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90
 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90]


In [40]:
from sklearn.preprocessing import scale
from statsmodels.nonparametric.kde import KDEUnivariate