## Regression & Pipeline

### 1. Pipeline과 ColumnTransformer로 워크플로 구축

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split # 데이터 분할
from sklearn.pipeline import Pipeline # 노드를 엣지로 묶어주는 역할
from sklearn.impute import SimpleImputer # impute : SimpleImputer, KNNImputer - 결측치 처리(대체, imputation)
from sklearn.preprocessing import StandardScaler, OneHotEncoder # 표준화 스케일링, 원-핫 인코딩
from sklearn.compose import ColumnTransformer # 여러 특성을 한 번에 처리
from sklearn.linear_model import LinearRegression # 선형 회귀 모델
from sklearn.metrics import mean_squared_error # 평가 지표 - 평균 제곱 오차(MSE)

In [2]:
# 1. Load the data
# The Ames Housing dataset is fetched from OpenML under the name
# "house_prices"; as_frame=True returns pandas objects.
from sklearn.datasets import fetch_openml

housing = fetch_openml(name="house_prices", as_frame=True)

# Put the features and the target side by side in one DataFrame
# (both share the same row index, so this is a plain column-wise concat).
df = pd.concat([housing.data, housing.target], axis=1)

# Print column names, non-null counts and dtypes to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

  warn(


In [3]:
# Separate the dependent variable (y) from the independent variables (X).
y = df['SalePrice']
X = df.drop(columns='SalePrice')

# Hold out 20% of the rows as a test set; random_state is fixed so the
# split is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
# 2. Preprocessors
# Numeric columns: fill missing values with the column mean, then apply
# standard scaling.
numeric_features = ['LotArea', 'OverallQual']
numeric_transformer = Pipeline(
    [
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]
)

In [5]:
# Categorical columns: fill missing values with the literal string
# 'missing', then one-hot encode.
categorical_features = ['BldgType', 'Neighborhood']
categorical_transformer = Pipeline(
    [
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        # handle_unknown='ignore': do not raise on categories that were not
        # seen while fitting (e.g. ones that only appear in the test split).
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

In [6]:
# 3. Combine both preprocessing pipelines with a ColumnTransformer
# Each entry is (name, transformer, columns): the numeric columns are routed
# to the numeric pipeline and the categorical columns to the categorical one.
preprocessor = ColumnTransformer(
    [
        ('num', numeric_transformer, numeric_features),
        # Keeping the groups separate lets each type be processed on its own.
        ('cat', categorical_transformer, categorical_features),
    ]
)

In [9]:
# 4. Final pipeline: preprocessing followed by the regression model.
# The step names ('preprocessor', 'regressor') are used later to look the
# fitted steps up again.
model_pipeline = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('regressor', LinearRegression()),
    ]
)

In [10]:
# 5. Train and predict
# fit() runs the preprocessing and the regressor on the training split and
# returns the fitted pipeline, so predict() can be chained directly; the
# test split then goes through the same fitted preprocessing steps.
y_pred = model_pipeline.fit(X_train, y_train).predict(X_test)

In [12]:
# 6. Evaluation
# RMSE = square root of the mean squared error between the true and the
# predicted values; it is expressed in the same unit as the target.
rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))
print(f"rmse: ${rmse:,.2f}")

rmse: $42,440.26


In [13]:
# Inspect the inside of the pipeline: look up the fitted regressor step by
# its name and print its learned coefficients.
print(model_pipeline['regressor'].coef_)

[ 10437.71138441  43696.47893969  22429.1364199   20638.21840701
  21201.04894159 -40309.46755048 -23958.93621802  12308.15148191
 -11749.25562852 -15379.83215541 -33311.68924116  10862.32739463
  -8211.92435629  10578.89404831 -30551.32259458 -15610.12997536
 -49371.08927998  22846.10688408 -17612.75316139 -24488.98841576
  11381.24376929 -14533.03812268  68148.45703577  63366.74715794
 -37592.98887534 -22534.18179254 -22219.21558285 -11249.25911409
   9409.88026696  69977.12745214  -3474.83414097  39011.56694589]


### 2. 선형 회귀 (Linear Regression)

In [19]:
import pandas as pd
import plotly.express as px
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [20]:
# 1. Load the data
# The Ames Housing dataset is fetched from OpenML under the name
# "house_prices"; as_frame=True returns pandas objects.
from sklearn.datasets import fetch_openml

housing = fetch_openml(name="house_prices", as_frame=True)

# Put the features and the target side by side in one DataFrame
# (both share the same row index, so this is a plain column-wise concat).
df = pd.concat([housing.data, housing.target], axis=1)

# Print column names, non-null counts and dtypes to spot missing values.
df.info()

  warn(


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [22]:
# Choose the dependent and independent variables.
y = df['SalePrice']
# Selecting with a list of column names keeps X two-dimensional (a
# one-column DataFrame), which is the input shape the estimator expects.
X = df.loc[:, ['GrLivArea']]

# Hold out 30% of the rows as a test set; random_state is fixed so the
# split is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)