### Pipeline과 ColumnTransformer로 워크플로 구축

In [1]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer  # applies different transformers to different column subsets
from sklearn.datasets import fetch_openml  # downloads a dataset from OpenML
from sklearn.impute import SimpleImputer  # missing-value imputation (KNNImputer lives in the same module)
from sklearn.linear_model import LinearRegression  # linear regression model
from sklearn.metrics import mean_squared_error  # evaluation metric (MSE)
from sklearn.model_selection import train_test_split  # train / test split
from sklearn.pipeline import Pipeline  # chains transformers (and a final estimator) into one object
from sklearn.preprocessing import StandardScaler, OneHotEncoder  # standardization / one-hot encoding

In [3]:
# 1. Load the data
# Ames Housing dataset, fetched from OpenML under the name "house_prices".
# as_frame=True makes `housing.data` / `housing.target` pandas objects.
housing = fetch_openml(name="house_prices", as_frame=True)

# Re-attach the target column to the feature frame so the whole table
# (features + SalePrice) can be inspected in one place.
df = housing.data.join(housing.target)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [4]:
# Separate the target to predict (y) from the predictor columns (X).
y = df['SalePrice']
X = df.drop(columns=['SalePrice'])

# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# 2. Preprocessors
# Numeric columns: fill missing values with the mean, then apply standard scaling.
numeric_features = ['LotArea', 'OverallQual']

mean_imputer = SimpleImputer(strategy='mean')
standard_scaler = StandardScaler()

# Steps run in list order: impute first so the scaler never sees NaN.
numeric_transformer = Pipeline(
    steps=[('imputer', mean_imputer), ('scaler', standard_scaler)]
)

In [6]:
# Categorical columns: fill missing values with the literal 'missing', then one-hot encode.
categorical_features = ['BldgType', 'Neighborhood']

missing_label_imputer = SimpleImputer(strategy='constant', fill_value='missing')
# handle_unknown='ignore': categories not seen during fit do not raise at transform time.
onehot_encoder = OneHotEncoder(handle_unknown='ignore')

categorical_transformer = Pipeline(
    steps=[('imputer', missing_label_imputer), ('onehot', onehot_encoder)]
)