# 핸즈온 머신러닝
## 2.5 머신러닝을 위한 데이터 정제
> 데이터 출처 : https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques/data            

-------
데이터 불러오기

In [1]:
from pathlib import Path

import pandas as pd

# Folder holding the Kaggle "House Prices" CSV files. The location is kept in
# one place so the notebook can be pointed at another download directory by
# editing a single line instead of three.
DATA_DIR = Path(r"C:\Users\SAMSUNG\Desktop\공부파일들\카글\house-prices-advanced-regression-techniques")

# Load the sample submission, the test set and the training set.
sub_mission = pd.read_csv(DATA_DIR / "sample_submission.csv")
test = pd.read_csv(DATA_DIR / "test.csv")
train = pd.read_csv(DATA_DIR / "train.csv")

------

### 2.5.1 데이터 정제

In [2]:
import sklearn.impute as skimp

# Imputer configured to replace missing values with the median.
# Nothing is computed until it is fitted on data.
imputer = skimp.SimpleImputer(strategy="median")

In [3]:
import numpy as np

# Keep only the numeric columns of the training frame.
train_numeric = train.select_dtypes(include=np.number)

In [4]:
# Count NaNs per numeric column and show the four largest counts.
# In this dataset only three columns (LotFrontage, GarageYrBlt, MasVnrArea)
# contain NaN values; the fourth row already has a count of 0.
(
    train_numeric.isna()
    .sum()
    .to_frame()
    .sort_values(by=0, ascending=False)
    .head(4)
)

Unnamed: 0,0
LotFrontage,259
GarageYrBlt,81
MasVnrArea,8
Id,0


In [5]:
# Replace every NaN in the numeric columns with that column's median.
# fit_transform returns a plain array, so the column labels AND the row index
# are restored explicitly; without `index=` the result would silently get a
# fresh 0..n-1 index and could misalign with `train` if its index is ever
# not the default one.
pre_train_numeric = pd.DataFrame(
    imputer.fit_transform(X=train_numeric),
    columns=train_numeric.columns,
    index=train_numeric.index,
)

### 2.5.2 텍스트와 범주형 데이터 다루기

In [6]:
import sklearn.preprocessing as skpre

# scikit-learn's one-hot encoder, introduced here as an alternative to
# pandas.get_dummies.
encoder = skpre.OneHotEncoder()

In [7]:
# Preview the first three rows of the object-dtype (text/categorical) columns.
train.select_dtypes(include=np.object_).head(n=3)

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
1,RL,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
2,RL,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal


In [8]:
# The encoder needs 2-D input, hence the double brackets that keep 'MSZoning'
# as a one-column DataFrame rather than a Series.
encod_result = encoder.fit_transform(train[['MSZoning']])
# The result is handled as a sparse matrix, so it has to be converted with
# .toarray() before it can be inspected; in exchange, sparse storage saves
# memory and speeds up computation.

In [9]:
# OneHotEncoder.categories_ is a list with one array of categories per input
# feature. Passing the whole list as `columns` makes pandas build a one-level
# MultiIndex, so take the array of the only encoded feature ('MSZoning') to
# get ordinary flat column labels.
pd.DataFrame(encod_result.toarray(), columns=encoder.categories_[0])

Unnamed: 0,C (all),FV,RH,RL,RM
0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...
1455,0.0,0.0,0.0,1.0,0.0
1456,0.0,0.0,0.0,1.0,0.0
1457,0.0,0.0,0.0,1.0,0.0
1458,0.0,0.0,0.0,1.0,0.0


위의 방법이 `pandas.get_dummies`보다 좋은 점은 `encoder`가 학습한 카테고리를 기억한다는 점이다.  
아래처럼 `handle_unknown="ignore"`를 지정하면 학습 때 보지 못한(알 수 없는) 카테고리를 무시할 수 있다

In [10]:
# Recreate the encoder so that unknown categories are ignored,
# then fit it on the single 'MSZoning' column.
encoder = skpre.OneHotEncoder(handle_unknown="ignore")
encoder.fit(X=train[["MSZoning"]])

In [11]:
# Names of the input columns the encoder was fitted on
# (here only 'MSZoning', as the output below shows).
encoder.feature_names_in_

array(['MSZoning'], dtype=object)

In [12]:
# Names of the generated one-hot output columns; in the output below each one
# has the form "<input column>_<category>".
encoder.get_feature_names_out()

array(['MSZoning_C (all)', 'MSZoning_FV', 'MSZoning_RH', 'MSZoning_RL',
       'MSZoning_RM'], dtype=object)

### 2.5.5 변환 파이프라인
> 변환과정을 순서대로 처리해주는 것을 도와줌

In [14]:
import sklearn.pipeline as skpip

In [15]:
# Numeric preprocessing chain: median imputation followed by standard scaling.
# Each step is given as a (name, transformer) pair.
num_pipline = skpip.Pipeline(
    steps=[
        ("impute", skimp.SimpleImputer(strategy="median")),
        ("scale", skpre.StandardScaler()),
    ]
)

위와 같이 `Pipeline`은 `(이름, 변환기)` 순서쌍의 리스트를 받아 순서대로 처리한다

`LotFrontage` feature에 적용한 예시를 보자

In [20]:
# Fit the pipeline on the single 'LotFrontage' column (kept 2-D via double
# brackets) and show the imputed, scaled values.
num_pipline.fit_transform(X=train[['LotFrontage']])

array([[-0.22087509],
       [ 0.46031974],
       [-0.08463612],
       ...,
       [-0.1754621 ],
       [-0.08463612],
       [ 0.23325479]])

위의 `변환 이름짓기` 과정을 생략하고 싶으면 `make_pipeline`을 이용하자

`columns`별로 적용하고 싶은 `scale`이 다르다면 아래의 방법을 고려해보자

In [25]:
import sklearn.compose as skcom

In [68]:
# Step 1: split the column names by dtype.
# NOTE(review): the numeric group holds every numeric column of `train`,
# including 'Id' (see num__Id in the output) and presumably the 'SalePrice'
# target -- confirm whether those should really be scaled as features.
num_columns = train.select_dtypes(include=np.number).columns
cate_columns = train.select_dtypes(include='object').columns

# Step 2: build one preprocessing pipeline per column group.
# Numeric columns: fill NaNs with the median, then apply standard scaling.
scaler = skpip.make_pipeline(
    skimp.SimpleImputer(strategy='median'),
    skpre.StandardScaler(),
)
# Categorical columns: fill NaNs with the most frequent value, then one-hot
# encode, ignoring unknown categories.
encoder = skpip.make_pipeline(
    skimp.SimpleImputer(strategy="most_frequent"),
    skpre.OneHotEncoder(handle_unknown="ignore"),
)

# Step 3: route each column group to its pipeline; columns belonging to
# neither group are passed through unchanged.
column_scale = skcom.ColumnTransformer(
    transformers=[
        ("num", scaler, num_columns),
        ("cate", encoder, cate_columns),
    ],
    remainder="passthrough",
)

# Step 4: fit on the training frame and transform it.
pip_result = column_scale.fit_transform(train)

결과보기

In [72]:
# ColumnTransformer returns a sparse matrix only when the stacked output is
# sparse enough (governed by its sparse_threshold setting); otherwise the
# result is already a dense ndarray, which has no .toarray() method.
# Densify only when needed so this cell works in both cases.
pd.DataFrame(
    pip_result.toarray() if hasattr(pip_result, "toarray") else pip_result,
    columns=column_scale.get_feature_names_out(),
)

Unnamed: 0,num__Id,num__MSSubClass,num__LotFrontage,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__YearRemodAdd,num__MasVnrArea,num__BsmtFinSF1,...,cate__SaleType_ConLw,cate__SaleType_New,cate__SaleType_Oth,cate__SaleType_WD,cate__SaleCondition_Abnorml,cate__SaleCondition_AdjLand,cate__SaleCondition_Alloca,cate__SaleCondition_Family,cate__SaleCondition_Normal,cate__SaleCondition_Partial
0,-1.730865,0.073375,-0.220875,-0.207142,0.651479,-0.517200,1.050994,0.878668,0.514104,0.575425,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,-1.728492,-0.872563,0.460320,-0.091886,-0.071836,2.179628,0.156734,-0.429577,-0.570750,1.171992,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,-1.726120,0.073375,-0.084636,0.073480,0.651479,-0.517200,0.984752,0.830215,0.325915,0.092907,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,-1.723747,0.309859,-0.447940,-0.096897,0.651479,-0.517200,-1.863632,-0.720298,-0.570750,-0.499274,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
4,-1.721374,0.073375,0.641972,0.375148,1.374795,-0.517200,0.951632,0.733308,1.366489,0.463568,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1.721374,0.073375,-0.357114,-0.260560,-0.071836,-0.517200,0.918511,0.733308,-0.570750,-0.973018,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1456,1.723747,-0.872563,0.687385,0.266407,-0.071836,0.381743,0.222975,0.151865,0.087911,0.759659,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1457,1.726120,0.309859,-0.175462,-0.147810,0.651479,3.078570,-1.002492,1.024029,-0.570750,-0.369871,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1458,1.728492,-0.872563,-0.084636,-0.080160,-0.795151,0.381743,-0.704406,0.539493,-0.570750,-0.865548,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
