# Data Preprocessing

## Load Test Data

In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import Imputer, StandardScaler, OneHotEncoder, LabelEncoder

# Load the raw data file.
path = "../Resources/data/DataPreprocess.csv"
df1 = pd.read_csv(path)

# Split the table into data and label: every column except the last is an
# independent (explanatory) variable, the last column is the dependent
# (response) variable, i.e. the answer.
# Original author's note on df1.shape: 10 observations, 4 variables.
x, y = df1.values[:, :-1], df1.values[:, -1]
x, y

(array([['France', 44.0, 72000.0],
        ['Spain', 27.0, 48000.0],
        ['Germany', 30.0, 54000.0],
        ['Spain', 38.0, 61000.0],
        ['Germany', 40.0, nan],
        ['France', 35.0, 58000.0],
        ['Spain', nan, 52000.0],
        ['France', 48.0, 79000.0],
        ['Germany', 50.0, 83000.0],
        ['France', 37.0, 67000.0]], dtype=object),
 array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
       dtype=object))

## Handling Missing Values with Scikit-Learn

In [12]:
from sklearn.impute import SimpleImputer

# Replace missing entries in the numeric columns of x (columns 1 and 2) with
# the mean of their column.
# `Imputer(missing_values='NaN', axis=0)` was deprecated in scikit-learn 0.20
# and removed in 0.22. `SimpleImputer` is the column-wise replacement; it takes
# the real NaN sentinel (np.nan) rather than the string 'NaN' and has no `axis`.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# x is an object array (column 0 holds strings), so give the imputer an
# explicit float copy of the numeric columns.
numeric_cols = x[:, 1:3].astype(float)
imputer = imputer.fit(numeric_cols)
x[:, 1:3] = imputer.transform(numeric_cols)
x



array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

## Feature Scaling

In [13]:
from sklearn.preprocessing import StandardScaler  # class that provides standardization

# Standardize the numeric columns of x (columns 1 and 2) and write them back.
# fit_transform already returns the scaled values, so assign them directly
# instead of discarding that result and running transform a second time.
sc_x = StandardScaler()
x[:, 1:3] = sc_x.fit_transform(x[:, 1:3])
x

array([['France', 0.758874361590019, 0.7494732544921677],
       ['Spain', -1.7115038793306814, -1.4381784072687531],
       ['Germany', -1.2755547779917342, -0.8912654918285229],
       ['Spain', -0.1130238410878753, -0.253200423814921],
       ['Germany', 0.17760889313808945, 6.632191985654332e-16],
       ['France', -0.5489729424268225, -0.5266568815350361],
       ['Spain', 0.0, -1.0735697969752662],
       ['France', 1.3401398300419485, 1.3875383225057696],
       ['Germany', 1.6307725642679132, 1.7521469327992565],
       ['France', -0.2583402082008577, 0.29371249162530916]], dtype=object)

## Label Encoding

In [14]:
from sklearn.preprocessing import LabelEncoder

# Turn each distinct value in column 0 of x into an integer code and
# overwrite that column with the codes.
le = LabelEncoder()
new_x = le.fit_transform(x[:, 0])
x[:, 0] = new_x

pd.DataFrame(x)

Unnamed: 0,0,1,2
0,0,0.758874,0.749473
1,2,-1.7115,-1.43818
2,1,-1.27555,-0.891265
3,2,-0.113024,-0.2532
4,1,0.177609,6.63219e-16
5,0,-0.548973,-0.526657
6,2,0.0,-1.07357
7,0,1.34014,1.38754
8,1,1.63077,1.75215
9,0,-0.25834,0.293712


## One-hot Encoding

In [15]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode column 0 of x and place the indicator columns in front of the
# remaining columns, which is the layout `categorical_features=[0]` produced.
# That argument was deprecated in scikit-learn 0.20 and removed in 0.22, so the
# single categorical column is encoded explicitly instead.
ohe = OneHotEncoder(categories='auto')
category_onehot = ohe.fit_transform(x[:, [0]]).toarray()

# x is an object array, so cast the untouched columns to float; the result is
# then a plain float matrix, as the original `.toarray()` call returned.
x = np.hstack([category_onehot, x[:, 1:].astype(float)])

pd.DataFrame(x)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Unnamed: 0,0,1,2,3,4
0,1.0,0.0,0.0,0.758874,0.7494733
1,0.0,0.0,1.0,-1.711504,-1.438178
2,0.0,1.0,0.0,-1.275555,-0.8912655
3,0.0,0.0,1.0,-0.113024,-0.2532004
4,0.0,1.0,0.0,0.177609,6.632192e-16
5,1.0,0.0,0.0,-0.548973,-0.5266569
6,0.0,0.0,1.0,0.0,-1.07357
7,1.0,0.0,0.0,1.34014,1.387538
8,0.0,1.0,0.0,1.630773,1.752147
9,1.0,0.0,0.0,-0.25834,0.2937125
