# 🚢 0. 라이브러리 import

In [None]:
# Silence all warnings so the notebook output stays readable
import warnings
warnings.filterwarnings(action='ignore')

import platform

import pandas as pd
import numpy as np
import matplotlib as mlp
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Keep Korean labels from rendering as broken glyphs in plots.
# Use 'Malgun Gothic' on Windows and 'AppleGothic' on macOS; any other platform
# keeps the original 'Malgun Gothic' setting, so change the fallback here if a
# different Korean font is installed in your environment.
_korean_font_by_os = {'Windows': 'Malgun Gothic', 'Darwin': 'AppleGothic'}
mlp.rcParams['font.family'] = _korean_font_by_os.get(platform.system(), 'Malgun Gothic')
# Draw negative signs with the ASCII hyphen instead of the Unicode minus glyph
mlp.rcParams['axes.unicode_minus'] = False

# 🧐 1. 데이터 확인

In [None]:
# Read the boarding dataset from the CSV file in the working directory
data = pd.read_csv(filepath_or_buffer='boarding_final_data.csv')

# Print the column names, non-null counts, dtypes and memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 804689 entries, 0 to 804688
Data columns (total 11 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   요금           804689 non-null  int64  
 1   승차거리         804689 non-null  float64
 2   대기시간         804689 non-null  float64
 3   공휴일          804689 non-null  int64  
 4   평균기온(°C)     804689 non-null  float64
 5   일강수량(mm)     804689 non-null  float64
 6   안개 계속시간(hr)  804689 non-null  float64
 7   차고지          804689 non-null  int64  
 8   접수시간         804689 non-null  int64  
 9   출발지구_출발지동    804689 non-null  object 
 10  목적지구_목적지동    804689 non-null  object 
dtypes: float64(5), int64(4), object(2)
memory usage: 67.5+ MB


In [None]:
# Show the loaded DataFrame (the notebook renders the cell's last expression)
data

Unnamed: 0,요금,승차거리,대기시간,공휴일,평균기온(°C),일강수량(mm),안개 계속시간(hr),차고지,접수시간,출발지구_출발지동,목적지구_목적지동
0,1500,2967.0,32.950000,1,-4.2,0.0,0.00,0,4,노원구_중계4동,노원구_하계1동
1,1500,3808.0,123.600000,1,-4.2,0.0,0.00,0,2,서대문구_홍제제1동,서대문구_충현동
2,1700,5390.0,122.900000,1,-4.2,0.0,0.00,1,3,노원구_월계2동,성북구_안암동
3,1500,1106.0,30.366667,1,-4.2,0.0,0.00,1,6,노원구_하계1동,노원구_하계1동
4,1500,4702.0,147.833333,1,-4.2,0.0,0.00,0,4,서대문구_남가좌제2동,마포구_망원제1동
...,...,...,...,...,...,...,...,...,...,...,...
804684,2000,6885.0,62.583333,0,2.1,4.7,1.18,1,20,강서구_방화제2동,양천구_신월2동
804685,1500,1064.0,69.716667,0,2.1,4.7,1.18,0,20,은평구_불광제1동,은평구_대조동
804686,1500,1610.0,47.983333,0,2.1,4.7,1.18,0,21,은평구_대조동,은평구_응암제1동
804687,1500,1745.0,72.166667,0,2.1,4.7,1.18,1,22,노원구_하계1동,노원구_중계1동


# 🤓 2. 데이터 전처리

## 1️⃣ 2-1 범주형 변수 인코딩


### 2-1-1 원핫 인코딩(One-Hot encoding)

In [None]:
# One-hot encode the origin and destination district columns
onehot_data = pd.get_dummies(data, columns=['출발지구_출발지동', '목적지구_목적지동'])

# Collect the dummy columns that came back with a boolean dtype
boolean_columns = onehot_data.select_dtypes(include=['bool']).columns

# Cast only those boolean columns to integers; every other column keeps its dtype
onehot_data = onehot_data.astype({name: int for name in boolean_columns})

# Inspect the result
onehot_data

Unnamed: 0,요금,승차거리,대기시간,공휴일,평균기온(°C),일강수량(mm),안개 계속시간(hr),차고지,접수시간,출발지구_출발지동_강동구_강일동,...,목적지구_목적지동_중랑구_면목제5동,목적지구_목적지동_중랑구_면목제7동,목적지구_목적지동_중랑구_묵제1동,목적지구_목적지동_중랑구_묵제2동,목적지구_목적지동_중랑구_상봉제1동,목적지구_목적지동_중랑구_상봉제2동,목적지구_목적지동_중랑구_신내1동,목적지구_목적지동_중랑구_신내2동,목적지구_목적지동_중랑구_중화제1동,목적지구_목적지동_중랑구_중화제2동
0,1500,2967.0,32.950000,1,-4.2,0.0,0.00,0,4,0,...,0,0,0,0,0,0,0,0,0,0
1,1500,3808.0,123.600000,1,-4.2,0.0,0.00,0,2,0,...,0,0,0,0,0,0,0,0,0,0
2,1700,5390.0,122.900000,1,-4.2,0.0,0.00,1,3,0,...,0,0,0,0,0,0,0,0,0,0
3,1500,1106.0,30.366667,1,-4.2,0.0,0.00,1,6,0,...,0,0,0,0,0,0,0,0,0,0
4,1500,4702.0,147.833333,1,-4.2,0.0,0.00,0,4,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
804684,2000,6885.0,62.583333,0,2.1,4.7,1.18,1,20,0,...,0,0,0,0,0,0,0,0,0,0
804685,1500,1064.0,69.716667,0,2.1,4.7,1.18,0,20,0,...,0,0,0,0,0,0,0,0,0,0
804686,1500,1610.0,47.983333,0,2.1,4.7,1.18,0,21,0,...,0,0,0,0,0,0,0,0,0,0
804687,1500,1745.0,72.166667,0,2.1,4.7,1.18,1,22,0,...,0,0,0,0,0,0,0,0,0,0


### 2-1-2 레이블 인코딩(Label encoding)

- 트리 계열의 모델은 값의 크기 비교(임계값 분기)만 사용하므로 레이블 인코딩이 만드는 임의의 순서(ordinal) 정보에 크게 영향을 받지 않음 => 레이블 인코딩을 사용

In [None]:
from sklearn.preprocessing import LabelEncoder

# Work on a copy so the original DataFrame stays untouched
label_data = data.copy()

# Categorical columns to convert into integer codes
categorical_columns = ['출발지구_출발지동', '목적지구_목적지동']

# Keep one fitted encoder per column. Refitting a single shared encoder replaces
# its learned classes on every column, so the code -> label mapping of the
# earlier column could no longer be recovered (e.g. with inverse_transform).
label_encoders = {}
for col in categorical_columns:
    # `label_encoder` still ends up bound to the encoder of the last column,
    # which is the state the single shared encoder used to be left in.
    label_encoder = LabelEncoder()
    label_data[col] = label_encoder.fit_transform(label_data[col])
    label_encoders[col] = label_encoder

# Inspect the result
label_data

Unnamed: 0,요금,승차거리,대기시간,공휴일,평균기온(°C),일강수량(mm),안개 계속시간(hr),차고지,접수시간,출발지구_출발지동,목적지구_목적지동
0,1500,2967.0,32.950000,1,-4.2,0.0,0.00,0,4,53,123
1,1500,3808.0,123.600000,1,-4.2,0.0,0.00,0,2,68,193
2,1700,5390.0,122.900000,1,-4.2,0.0,0.00,1,3,49,227
3,1500,1106.0,30.366667,1,-4.2,0.0,0.00,1,6,55,123
4,1500,4702.0,147.833333,1,-4.2,0.0,0.00,0,4,58,172
...,...,...,...,...,...,...,...,...,...,...,...
804684,2000,6885.0,62.583333,0,2.1,4.7,1.18,1,20,27,270
804685,1500,1064.0,69.716667,0,2.1,4.7,1.18,0,20,76,318
804686,1500,1610.0,47.983333,0,2.1,4.7,1.18,0,21,75,325
804687,1500,1745.0,72.166667,0,2.1,4.7,1.18,1,22,55,119


## 2️⃣ 2-2 입력 변수(X)와 타겟 변수(y) 분리

### 2-2-1 인코딩 미적용 데이터로 X, y 분리

In [None]:
# Target is the '대기시간' column; every remaining column is an input feature
y = data['대기시간']
X = data.drop(columns=['대기시간'])

### 2-2-2 원핫 인코딩 적용 데이터로 X, y 분리

In [None]:
# Target is the '대기시간' column; the one-hot encoded remainder is the input matrix
y_oh = onehot_data['대기시간']
X_oh = onehot_data.drop(columns=['대기시간'])

### 2-2-3 레이블 인코딩 적용 데이터로 X, y 분리

In [None]:
# Target is the '대기시간' column; the label-encoded remainder is the input matrix
y_lb = label_data['대기시간']
X_lb = label_data.drop(columns=['대기시간'])

## 3️⃣ 2-3 훈련셋, 검증셋, 테스트셋 분리

- 훈련셋 60 : 검증셋 20 : 테스트셋 20

### 2-3-1 인코딩 미적용 데이터로 훈련셋, 검증셋, 테스트셋 분리

LightGBM, CatBoost에서 사용

In [None]:
from sklearn.model_selection import train_test_split

# First cut: hold out 20% of all rows as the test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)
# Second cut: 25% of the remaining 80% becomes validation -> 60/20/20 overall
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, random_state=42, test_size=0.25
)

### 2-3-2 원핫 인코딩 적용 데이터로 훈련셋, 검증셋, 테스트셋 분리

선형 회귀, 릿지 회귀, 라쏘 회귀, MLP에서 사용

In [None]:
# First cut: hold out 20% of all rows as the test set
X_train_oh, X_test_oh, y_train_oh, y_test_oh = train_test_split(
    X_oh, y_oh, random_state=42, test_size=0.2
)
# Second cut: 25% of the remaining 80% becomes validation -> 60/20/20 overall
X_train_oh, X_val_oh, y_train_oh, y_val_oh = train_test_split(
    X_train_oh, y_train_oh, random_state=42, test_size=0.25
)

### 2-3-3 레이블 인코딩 적용 데이터로 훈련셋, 검증셋, 테스트셋 분리

랜덤 포레스트, GBM, AdaBoost, XGBoost에서 사용

In [None]:
# First cut: hold out 20% of all rows as the test set
X_train_lb, X_test_lb, y_train_lb, y_test_lb = train_test_split(
    X_lb, y_lb, random_state=42, test_size=0.2
)
# Second cut: 25% of the remaining 80% becomes validation -> 60/20/20 overall
X_train_lb, X_val_lb, y_train_lb, y_val_lb = train_test_split(
    X_train_lb, y_train_lb, random_state=42, test_size=0.25
)

## 4️⃣ 2-4 스케일링

- 스케일링은 범주형 변수 처리 못 함 => 범주형 변수 인코딩 적용한 상태여야 함
- X에만 적용

### 2-4-1 원핫 인코딩 적용 데이터로 스케일링 3가지

선형 회귀, 릿지 회귀, 라쏘 회귀, MLP에서 사용

In [None]:
# Split the one-hot feature columns into continuous and categorical groups
continuous_features = ['요금', '승차거리', '평균기온(°C)', '일강수량(mm)', '안개 계속시간(hr)', '접수시간']
# Every remaining column ('공휴일', '차고지' and all one-hot dummy columns) is treated
# as categorical. Deriving the list from the frame, in its column order, keeps it in
# sync with the districts actually present in the data instead of relying on a
# hand-maintained list of several hundred dummy-column names.
categorical_features = [col for col in X_train_oh.columns if col not in continuous_features]

#### 2-4-1 (1) 표준화 (Standardization)

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then reuse it for validation and test
scaler = StandardScaler().fit(X_train_oh[continuous_features])
X_train_continuous_scaled, X_val_continuous_scaled, X_test_continuous_scaled = (
    scaler.transform(split[continuous_features])
    for split in (X_train_oh, X_val_oh, X_test_oh)
)

# Categorical columns pass through unscaled
X_train_categorical, X_val_categorical, X_test_categorical = (
    split[categorical_features] for split in (X_train_oh, X_val_oh, X_test_oh)
)

# Put the scaled continuous columns and the untouched categorical columns side by side;
# the categorical index is reset so both halves share a fresh 0..n-1 index
X_train_oh_scaled_1, X_val_oh_scaled_1, X_test_oh_scaled_1 = (
    pd.concat(
        [pd.DataFrame(scaled, columns=continuous_features), passthrough.reset_index(drop=True)],
        axis=1,
    )
    for scaled, passthrough in (
        (X_train_continuous_scaled, X_train_categorical),
        (X_val_continuous_scaled, X_val_categorical),
        (X_test_continuous_scaled, X_test_categorical),
    )
)

#### 2-4-1 (2) 최소-최대 스케일링 (Min-Max Scaling)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training split only, then reuse it for validation and test
scaler = MinMaxScaler().fit(X_train_oh[continuous_features])
X_train_continuous_scaled, X_val_continuous_scaled, X_test_continuous_scaled = (
    scaler.transform(split[continuous_features])
    for split in (X_train_oh, X_val_oh, X_test_oh)
)

# Categorical columns pass through unscaled
X_train_categorical, X_val_categorical, X_test_categorical = (
    split[categorical_features] for split in (X_train_oh, X_val_oh, X_test_oh)
)

# Put the scaled continuous columns and the untouched categorical columns side by side;
# the categorical index is reset so both halves share a fresh 0..n-1 index
X_train_oh_scaled_2, X_val_oh_scaled_2, X_test_oh_scaled_2 = (
    pd.concat(
        [pd.DataFrame(scaled, columns=continuous_features), passthrough.reset_index(drop=True)],
        axis=1,
    )
    for scaled, passthrough in (
        (X_train_continuous_scaled, X_train_categorical),
        (X_val_continuous_scaled, X_val_categorical),
        (X_test_continuous_scaled, X_test_categorical),
    )
)

####  2-4-1 (3) 로버스트 스케일링 (Robust Scaling)

In [None]:
from sklearn.preprocessing import RobustScaler

# Fit the scaler on the training split only, then reuse it for validation and test
scaler = RobustScaler().fit(X_train_oh[continuous_features])
X_train_continuous_scaled, X_val_continuous_scaled, X_test_continuous_scaled = (
    scaler.transform(split[continuous_features])
    for split in (X_train_oh, X_val_oh, X_test_oh)
)

# Categorical columns pass through unscaled
X_train_categorical, X_val_categorical, X_test_categorical = (
    split[categorical_features] for split in (X_train_oh, X_val_oh, X_test_oh)
)

# Put the scaled continuous columns and the untouched categorical columns side by side;
# the categorical index is reset so both halves share a fresh 0..n-1 index
X_train_oh_scaled_3, X_val_oh_scaled_3, X_test_oh_scaled_3 = (
    pd.concat(
        [pd.DataFrame(scaled, columns=continuous_features), passthrough.reset_index(drop=True)],
        axis=1,
    )
    for scaled, passthrough in (
        (X_train_continuous_scaled, X_train_categorical),
        (X_val_continuous_scaled, X_val_categorical),
        (X_test_continuous_scaled, X_test_categorical),
    )
)

### 2-4-2 레이블 인코딩 적용 데이터로 스케일링 3가지

랜덤 포레스트, GBM, AdaBoost, XGBoost에서 사용

In [None]:
# Column groups for the label-encoded data: continuous columns get scaled,
# categorical columns (flags and label-encoded districts) are passed through
continuous_features = [
    '요금',
    '승차거리',
    '평균기온(°C)',
    '일강수량(mm)',
    '안개 계속시간(hr)',
    '접수시간',
]
categorical_features = [
    '공휴일',
    '차고지',
    '출발지구_출발지동',
    '목적지구_목적지동',
]

#### 2-4-2 (1) 표준화 (Standardization)

In [None]:
# Fit the scaler on the training split only, then reuse it for validation and test
scaler = StandardScaler().fit(X_train_lb[continuous_features])
X_train_continuous_scaled, X_val_continuous_scaled, X_test_continuous_scaled = (
    scaler.transform(split[continuous_features])
    for split in (X_train_lb, X_val_lb, X_test_lb)
)

# Categorical columns pass through unscaled
X_train_categorical, X_val_categorical, X_test_categorical = (
    split[categorical_features] for split in (X_train_lb, X_val_lb, X_test_lb)
)

# Put the scaled continuous columns and the untouched categorical columns side by side;
# the categorical index is reset so both halves share a fresh 0..n-1 index
X_train_lb_scaled_1, X_val_lb_scaled_1, X_test_lb_scaled_1 = (
    pd.concat(
        [pd.DataFrame(scaled, columns=continuous_features), passthrough.reset_index(drop=True)],
        axis=1,
    )
    for scaled, passthrough in (
        (X_train_continuous_scaled, X_train_categorical),
        (X_val_continuous_scaled, X_val_categorical),
        (X_test_continuous_scaled, X_test_categorical),
    )
)

#### 2-4-2 (2) 최소-최대 스케일링 (Min-Max Scaling)

In [None]:
# Fit the scaler on the training split only, then reuse it for validation and test
scaler = MinMaxScaler().fit(X_train_lb[continuous_features])
X_train_continuous_scaled, X_val_continuous_scaled, X_test_continuous_scaled = (
    scaler.transform(split[continuous_features])
    for split in (X_train_lb, X_val_lb, X_test_lb)
)

# Categorical columns pass through unscaled
X_train_categorical, X_val_categorical, X_test_categorical = (
    split[categorical_features] for split in (X_train_lb, X_val_lb, X_test_lb)
)

# Put the scaled continuous columns and the untouched categorical columns side by side;
# the categorical index is reset so both halves share a fresh 0..n-1 index
X_train_lb_scaled_2, X_val_lb_scaled_2, X_test_lb_scaled_2 = (
    pd.concat(
        [pd.DataFrame(scaled, columns=continuous_features), passthrough.reset_index(drop=True)],
        axis=1,
    )
    for scaled, passthrough in (
        (X_train_continuous_scaled, X_train_categorical),
        (X_val_continuous_scaled, X_val_categorical),
        (X_test_continuous_scaled, X_test_categorical),
    )
)

####  2-4-2 (3) 로버스트 스케일링 (Robust Scaling)

In [None]:
# Fit the scaler on the training split only, then reuse it for validation and test
scaler = RobustScaler().fit(X_train_lb[continuous_features])
X_train_continuous_scaled, X_val_continuous_scaled, X_test_continuous_scaled = (
    scaler.transform(split[continuous_features])
    for split in (X_train_lb, X_val_lb, X_test_lb)
)

# Categorical columns pass through unscaled
X_train_categorical, X_val_categorical, X_test_categorical = (
    split[categorical_features] for split in (X_train_lb, X_val_lb, X_test_lb)
)

# Put the scaled continuous columns and the untouched categorical columns side by side;
# the categorical index is reset so both halves share a fresh 0..n-1 index
X_train_lb_scaled_3, X_val_lb_scaled_3, X_test_lb_scaled_3 = (
    pd.concat(
        [pd.DataFrame(scaled, columns=continuous_features), passthrough.reset_index(drop=True)],
        axis=1,
    )
    for scaled, passthrough in (
        (X_train_continuous_scaled, X_train_categorical),
        (X_val_continuous_scaled, X_val_categorical),
        (X_test_continuous_scaled, X_test_categorical),
    )
)

# 😎 3. 여러 회귀 모델 적용

- 선형 회귀, 릿지 회귀, 라쏘 회귀, MLP, 랜덤 포레스트, GBM, LightGBM, CatBoost, AdaBoost, XGBoost

## 🔴 3-1 선형 회귀

- 스케일링 필요 여부 : 필수
- 인코딩 : 원핫인코딩

### 🔴 선형 회귀 모델 학습

In [None]:
from sklearn.linear_model import LinearRegression

# One ordinary linear regression per scaling variant of the one-hot features.
# NOTE(review): the recorded validation metrics for the StandardScaler and
# MinMaxScaler fits are astronomically large while the RobustScaler fit looks
# sane; presumably the full set of one-hot dummy columns is collinear with the
# intercept and makes the unregularised solution unstable — confirm.
lr_model_1 = LinearRegression().fit(X_train_oh_scaled_1, y_train_oh)  # StandardScaler
lr_model_2 = LinearRegression().fit(X_train_oh_scaled_2, y_train_oh)  # MinMaxScaler
lr_model_3 = LinearRegression().fit(X_train_oh_scaled_3, y_train_oh)  # RobustScaler

# Echo the last fitted estimator as the cell output
lr_model_3

LinearRegression()

### 🔴 선형 회귀 예측 및 평가

In [None]:
# Scaling: StandardScaler

# Predict on the validation split
y_val_pred = lr_model_1.predict(X_val_oh_scaled_1)

# Score the validation predictions; RMSE is the square root of the MSE computed here
r2 = r2_score(y_true=y_val_oh, y_pred=y_val_pred)
mae = mean_absolute_error(y_true=y_val_oh, y_pred=y_val_pred)
mse = mean_squared_error(y_true=y_val_oh, y_pred=y_val_pred)
rmse = np.sqrt(mse)
print(f"Validation MSE: {mse:.4f}")
print(f"Validation RMSE: {rmse:.4f}")
print(f"Validation MAE: {mae:.4f}")
print(f"Validation R²: {r2:.4f}")

Validation MSE: 113302330558677952.0000
Validation RMSE: 336604115.4809
Validation MAE: 1279727.8743
Validation R²: -121236826158598.0000


In [None]:
# Scaling: MinMaxScaler

# Predict on the validation split
y_val_pred = lr_model_2.predict(X_val_oh_scaled_2)

# Score the validation predictions; RMSE is the square root of the MSE computed here
r2 = r2_score(y_true=y_val_oh, y_pred=y_val_pred)
mae = mean_absolute_error(y_true=y_val_oh, y_pred=y_val_pred)
mse = mean_squared_error(y_true=y_val_oh, y_pred=y_val_pred)
rmse = np.sqrt(mse)
print(f"Validation MSE: {mse:.4f}")
print(f"Validation RMSE: {rmse:.4f}")
print(f"Validation MAE: {mae:.4f}")
print(f"Validation R²: {r2:.4f}")

Validation MSE: 4944863554317101056.0000
Validation RMSE: 2223704916.1966
Validation MAE: 14618683.2153
Validation R²: -5291149441998993.0000


In [None]:
# Scaling: RobustScaler

# Predict on the validation split
y_val_pred = lr_model_3.predict(X_val_oh_scaled_3)

# Score the validation predictions; RMSE is the square root of the MSE computed here
r2 = r2_score(y_true=y_val_oh, y_pred=y_val_pred)
mae = mean_absolute_error(y_true=y_val_oh, y_pred=y_val_pred)
mse = mean_squared_error(y_true=y_val_oh, y_pred=y_val_pred)
rmse = np.sqrt(mse)
print(f"Validation MSE: {mse:.4f}")
print(f"Validation RMSE: {rmse:.4f}")
print(f"Validation MAE: {mae:.4f}")
print(f"Validation R²: {r2:.4f}")

Validation MSE: 845.4494
Validation RMSE: 29.0766
Validation MAE: 22.3267
Validation R²: 0.0953


## 🟠 3-2 릿지 회귀

- 스케일링 필요 여부 : 필수
- 인코딩 : 원핫인코딩

### 🟠 릿지 회귀 하이퍼파라미터 튜닝

최적의 alpha 값 찾기

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge

# The same alpha candidates are searched with 5-fold CV for every scaling variant
param_grid = {'alpha': [0.01, 0.1, 1, 10, 100]}
ridge_model_1, ridge_model_2, ridge_model_3 = Ridge(), Ridge(), Ridge()

# StandardScaler features
grid = GridSearchCV(ridge_model_1, param_grid, cv=5).fit(X_train_oh_scaled_1, y_train_oh)
print("최적의 파라미터(StandardScaler):", grid.best_params_)

# MinMaxScaler features
grid = GridSearchCV(ridge_model_2, param_grid, cv=5).fit(X_train_oh_scaled_2, y_train_oh)
print("최적의 파라미터(MinMaxScaler):", grid.best_params_)

# RobustScaler features
grid = GridSearchCV(ridge_model_3, param_grid, cv=5).fit(X_train_oh_scaled_3, y_train_oh)
print("최적의 파라미터(RobustScaler):", grid.best_params_)

최적의 파라미터(StandardScaler): {'alpha': 10}
최적의 파라미터(MinMaxScaler): {'alpha': 10}
최적의 파라미터(RobustScaler): {'alpha': 10}


### 🟠 릿지 회귀 모델 학습

In [None]:
# Train one ridge model per scaling variant, using alpha=10 as printed by the grid search
ridge_model_1 = Ridge(alpha=10).fit(X_train_oh_scaled_1, y_train_oh)  # StandardScaler
ridge_model_2 = Ridge(alpha=10).fit(X_train_oh_scaled_2, y_train_oh)  # MinMaxScaler
ridge_model_3 = Ridge(alpha=10).fit(X_train_oh_scaled_3, y_train_oh)  # RobustScaler

# Echo the last fitted estimator as the cell output
ridge_model_3

Ridge(alpha=10)

### 🟠 릿지 회귀 예측 및 평가

In [None]:
# Scaling: StandardScaler

# Predict on the validation split
y_val_pred = ridge_model_1.predict(X_val_oh_scaled_1)

# Score the validation predictions; RMSE is the square root of the MSE computed here
r2 = r2_score(y_true=y_val_oh, y_pred=y_val_pred)
mae = mean_absolute_error(y_true=y_val_oh, y_pred=y_val_pred)
mse = mean_squared_error(y_true=y_val_oh, y_pred=y_val_pred)
rmse = np.sqrt(mse)
print(f"Validation MSE: {mse:.4f}")
print(f"Validation RMSE: {rmse:.4f}")
print(f"Validation MAE: {mae:.4f}")
print(f"Validation R²: {r2:.4f}")

Validation MSE: 845.2493
Validation RMSE: 29.0732
Validation MAE: 22.3266
Validation R²: 0.0956


In [None]:
# Scaling: MinMaxScaler

# Predict on the validation split
y_val_pred = ridge_model_2.predict(X_val_oh_scaled_2)

# Score the validation predictions; RMSE is the square root of the MSE computed here
r2 = r2_score(y_true=y_val_oh, y_pred=y_val_pred)
mae = mean_absolute_error(y_true=y_val_oh, y_pred=y_val_pred)
mse = mean_squared_error(y_true=y_val_oh, y_pred=y_val_pred)
rmse = np.sqrt(mse)
print(f"Validation MSE: {mse:.4f}")
print(f"Validation RMSE: {rmse:.4f}")
print(f"Validation MAE: {mae:.4f}")
print(f"Validation R²: {r2:.4f}")

Validation MSE: 845.2617
Validation RMSE: 29.0734
Validation MAE: 22.3268
Validation R²: 0.0955


In [None]:
# Scaling: RobustScaler

# Predict on the validation split
y_val_pred = ridge_model_3.predict(X_val_oh_scaled_3)

# Score the validation predictions; RMSE is the square root of the MSE computed here
r2 = r2_score(y_true=y_val_oh, y_pred=y_val_pred)
mae = mean_absolute_error(y_true=y_val_oh, y_pred=y_val_pred)
mse = mean_squared_error(y_true=y_val_oh, y_pred=y_val_pred)
rmse = np.sqrt(mse)
print(f"Validation MSE: {mse:.4f}")
print(f"Validation RMSE: {rmse:.4f}")
print(f"Validation MAE: {mae:.4f}")
print(f"Validation R²: {r2:.4f}")

Validation MSE: 845.2493
Validation RMSE: 29.0732
Validation MAE: 22.3266
Validation R²: 0.0956


## 🟡 3-3 라쏘 회귀

- 스케일링 필요 여부 : 필수
- 인코딩 : 원핫인코딩

### 🟡 라쏘 회귀 하이퍼파라미터 튜닝

최적의 alpha 값 찾기

In [None]:
from sklearn.linear_model import Lasso

# The same alpha candidates are searched with 5-fold CV for every scaling variant
param_grid = {'alpha': [0.01, 0.1, 1, 10, 100]}
lasso_model_1, lasso_model_2, lasso_model_3 = Lasso(), Lasso(), Lasso()

# StandardScaler features
grid = GridSearchCV(lasso_model_1, param_grid, cv=5).fit(X_train_oh_scaled_1, y_train_oh)
print("최적의 파라미터(StandardScaler):", grid.best_params_)

# MinMaxScaler features
grid = GridSearchCV(lasso_model_2, param_grid, cv=5).fit(X_train_oh_scaled_2, y_train_oh)
print("최적의 파라미터(MinMaxScaler):", grid.best_params_)

# RobustScaler features
grid = GridSearchCV(lasso_model_3, param_grid, cv=5).fit(X_train_oh_scaled_3, y_train_oh)
print("최적의 파라미터(RobustScaler):", grid.best_params_)

최적의 파라미터(StandardScaler): {'alpha': 0.01}
최적의 파라미터(MinMaxScaler): {'alpha': 0.01}
최적의 파라미터(RobustScaler): {'alpha': 0.01}


### 🟡 라쏘 회귀 모델 학습

In [None]:
# Train one lasso model per scaling variant, using alpha=0.01 as printed by the grid search
lasso_model_1 = Lasso(alpha=0.01).fit(X_train_oh_scaled_1, y_train_oh)  # StandardScaler
lasso_model_2 = Lasso(alpha=0.01).fit(X_train_oh_scaled_2, y_train_oh)  # MinMaxScaler
lasso_model_3 = Lasso(alpha=0.01).fit(X_train_oh_scaled_3, y_train_oh)  # RobustScaler

# Echo the last fitted estimator as the cell output
lasso_model_3

Lasso(alpha=0.01)

### 🟡 라쏘 회귀 예측 및 평가

In [None]:
# Scaling: StandardScaler

# Predict on the validation split
y_val_pred = lasso_model_1.predict(X_val_oh_scaled_1)

# Score the validation predictions; RMSE is the square root of the MSE computed here
r2 = r2_score(y_true=y_val_oh, y_pred=y_val_pred)
mae = mean_absolute_error(y_true=y_val_oh, y_pred=y_val_pred)
mse = mean_squared_error(y_true=y_val_oh, y_pred=y_val_pred)
rmse = np.sqrt(mse)
print(f"Validation MSE: {mse:.4f}")
print(f"Validation RMSE: {rmse:.4f}")
print(f"Validation MAE: {mae:.4f}")
print(f"Validation R²: {r2:.4f}")

Validation MSE: 852.3737
Validation RMSE: 29.1954
Validation MAE: 22.4454
Validation R²: 0.0879


In [None]:
# Scaling: MinMaxScaler

# Predict on the validation split
y_val_pred = lasso_model_2.predict(X_val_oh_scaled_2)

# Score the validation predictions; RMSE is the square root of the MSE computed here
r2 = r2_score(y_true=y_val_oh, y_pred=y_val_pred)
mae = mean_absolute_error(y_true=y_val_oh, y_pred=y_val_pred)
mse = mean_squared_error(y_true=y_val_oh, y_pred=y_val_pred)
rmse = np.sqrt(mse)
print(f"Validation MSE: {mse:.4f}")
print(f"Validation RMSE: {rmse:.4f}")
print(f"Validation MAE: {mae:.4f}")
print(f"Validation R²: {r2:.4f}")

Validation MSE: 852.5407
Validation RMSE: 29.1983
Validation MAE: 22.4501
Validation R²: 0.0878


In [None]:
# Scaling: RobustScaler

# Predict on the validation split
y_val_pred = lasso_model_3.predict(X_val_oh_scaled_3)

# Score the validation predictions.
# MSE must be computed before RMSE: taking the square root first would reuse the
# `mse` left over from the previously executed evaluation cell and report that
# cell's RMSE instead of this model's.
mse = mean_squared_error(y_val_oh, y_val_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_val_oh, y_val_pred)
r2 = r2_score(y_val_oh, y_val_pred)
print(f"Validation MSE: {mse:.4f}")
print(f"Validation RMSE: {rmse:.4f}")
print(f"Validation MAE: {mae:.4f}")
print(f"Validation R²: {r2:.4f}")

Validation MSE: 852.3806
Validation RMSE: 29.1983
Validation MAE: 22.4456
Validation R²: 0.0879


## 🟢 3-4 MLP

- 스케일링 필요 여부 : 필수
- 인코딩 : 원핫인코딩

### 🟢 MLP 모델 학습

In [None]:
from sklearn.neural_network import MLPRegressor

# Train one MLP per scaled variant of the training features.
# All three networks share the same configuration.
_mlp_settings = dict(
    hidden_layer_sizes=(100, 50),  # two hidden layers: 100 and 50 units
    activation='relu',             # ReLU activation
    solver='adam',                 # Adam optimizer
    # NOTE(review): scikit-learn documents `learning_rate` as only used when
    # solver='sgd' — confirm whether this setting is intended with 'adam'.
    learning_rate='adaptive',
    max_iter=200,                  # upper bound on training iterations
    random_state=42,
    verbose=True,                  # print the loss at every iteration
)

# Scaling: StandardScaler
mlp1 = MLPRegressor(**_mlp_settings).fit(X_train_oh_scaled_1, y_train_oh)

# Scaling: MinMaxScaler
mlp2 = MLPRegressor(**_mlp_settings).fit(X_train_oh_scaled_2, y_train_oh)

# Scaling: RobustScaler
mlp3 = MLPRegressor(**_mlp_settings).fit(X_train_oh_scaled_3, y_train_oh)

# Keep the last fitted model as the cell's final expression so its repr is displayed.
mlp3

Iteration 1, loss = 388.99918960
Iteration 2, loss = 314.80247418
Iteration 3, loss = 295.08923033
Iteration 4, loss = 285.43145990
Iteration 5, loss = 277.97787232
Iteration 6, loss = 272.17869611
Iteration 7, loss = 266.63882756
Iteration 8, loss = 262.51022761
Iteration 9, loss = 258.63234132
Iteration 10, loss = 255.83705166
Iteration 11, loss = 253.14791886
Iteration 12, loss = 250.92960249
Iteration 13, loss = 249.28751646
Iteration 14, loss = 247.50755292
Iteration 15, loss = 246.11115912
Iteration 16, loss = 244.71971727
Iteration 17, loss = 243.64957065
Iteration 18, loss = 242.47118617
Iteration 19, loss = 241.49630962
Iteration 20, loss = 240.77770516
Iteration 21, loss = 239.52243414
Iteration 22, loss = 238.60201245
Iteration 23, loss = 237.72835022
Iteration 24, loss = 236.90539472
Iteration 25, loss = 236.17716652
Iteration 26, loss = 235.46897176
Iteration 27, loss = 234.90205613
Iteration 28, loss = 234.72900197
Iteration 29, loss = 233.46691340
Iteration 30, loss = 23

Iteration 40, loss = 244.83987924
Iteration 41, loss = 244.36982661
Iteration 42, loss = 243.67966288
Iteration 43, loss = 243.37217439
Iteration 44, loss = 243.17670423
Iteration 45, loss = 242.50006447
Iteration 46, loss = 241.88965250
Iteration 47, loss = 241.61548267
Iteration 48, loss = 240.92582602
Iteration 49, loss = 240.64985580
Iteration 50, loss = 240.57502961
Iteration 51, loss = 239.97178238
Iteration 52, loss = 239.61110141
Iteration 53, loss = 239.33819254
Iteration 54, loss = 239.06098557
Iteration 55, loss = 238.49406576
Iteration 56, loss = 237.97544636
Iteration 57, loss = 237.63416449
Iteration 58, loss = 237.14418292
Iteration 59, loss = 236.98108064
Iteration 60, loss = 236.54438728
Iteration 61, loss = 236.10202853
Iteration 62, loss = 235.90230839
Iteration 63, loss = 235.62794689
Iteration 64, loss = 235.17591579
Iteration 65, loss = 234.91442428
Iteration 66, loss = 234.79786878
Iteration 67, loss = 234.39588847
Iteration 68, loss = 234.15723718
Iteration 69, 

MLPRegressor(hidden_layer_sizes=(100, 50), learning_rate='adaptive',
             random_state=42, verbose=True)

### 🟢 MLP 예측 및 평가

In [None]:
# Scaling: StandardScaler

# Predict on the validation split.
y_val_pred = mlp1.predict(X_val_oh_scaled_1)

# Score the predictions; RMSE is taken as the square root of MSE.
mse = mean_squared_error(y_val_oh, y_val_pred)
mae = mean_absolute_error(y_val_oh, y_val_pred)
r2 = r2_score(y_val_oh, y_val_pred)
rmse = np.sqrt(mse)

# Report the four metrics, one per line.
print(
    f"Validation MSE: {mse:.4f}",
    f"Validation RMSE: {rmse:.4f}",
    f"Validation MAE: {mae:.4f}",
    f"Validation R²: {r2:.4f}",
    sep="\n",
)

Validation MSE: 482.0866
Validation RMSE: 21.9565
Validation MAE: 15.9669
Validation R²: 0.4842


In [None]:
# Scaling: MinMaxScaler

# Predict on the validation split.
y_val_pred = mlp2.predict(X_val_oh_scaled_2)

# Score the predictions; RMSE is taken as the square root of MSE.
mse = mean_squared_error(y_val_oh, y_val_pred)
mae = mean_absolute_error(y_val_oh, y_val_pred)
r2 = r2_score(y_val_oh, y_val_pred)
rmse = np.sqrt(mse)

# Report the four metrics, one per line.
print(
    f"Validation MSE: {mse:.4f}",
    f"Validation RMSE: {rmse:.4f}",
    f"Validation MAE: {mae:.4f}",
    f"Validation R²: {r2:.4f}",
    sep="\n",
)

Validation MSE: 482.6021
Validation RMSE: 21.9682
Validation MAE: 16.0010
Validation R²: 0.4836


In [None]:
# Scaling: RobustScaler

# Predict on the validation split.
y_val_pred = mlp3.predict(X_val_oh_scaled_3)

# Score the predictions; RMSE is taken as the square root of MSE.
mse = mean_squared_error(y_val_oh, y_val_pred)
mae = mean_absolute_error(y_val_oh, y_val_pred)
r2 = r2_score(y_val_oh, y_val_pred)
rmse = np.sqrt(mse)

# Report the four metrics, one per line.
print(
    f"Validation MSE: {mse:.4f}",
    f"Validation RMSE: {rmse:.4f}",
    f"Validation MAE: {mae:.4f}",
    f"Validation R²: {r2:.4f}",
    sep="\n",
)

Validation MSE: 2670.7607
Validation RMSE: 51.6794
Validation MAE: 33.1129
Validation R²: -1.8578


## 🟥 3-5 랜덤 포레스트

- 스케일링 필요 여부 : 필수X
- 인코딩 : 레이블인코딩

### 🟥 랜덤 포레스트 모델 학습

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Train one random forest per feature variant (unscaled plus three scalers).
# All four forests share the same configuration.
_rf_settings = dict(
    n_estimators=100,  # number of trees
    max_depth=None,    # no explicit cap on tree depth
    random_state=42,
)

# Scaling: none
rf_model_0 = RandomForestRegressor(**_rf_settings).fit(X_train_lb, y_train_lb)

# Scaling: StandardScaler
rf_model_1 = RandomForestRegressor(**_rf_settings).fit(X_train_lb_scaled_1, y_train_lb)

# Scaling: MinMaxScaler
rf_model_2 = RandomForestRegressor(**_rf_settings).fit(X_train_lb_scaled_2, y_train_lb)

# Scaling: RobustScaler
rf_model_3 = RandomForestRegressor(**_rf_settings).fit(X_train_lb_scaled_3, y_train_lb)

# Keep the last fitted model as the cell's final expression so its repr is displayed.
rf_model_3

RandomForestRegressor(random_state=42)

### 🟥 랜덤 포레스트 예측 및 평가

In [None]:
# Scaling: none

# Predict on the validation split.
y_val_pred = rf_model_0.predict(X_val_lb)

# Score the predictions; RMSE is taken as the square root of MSE.
mse = mean_squared_error(y_val_lb, y_val_pred)
mae = mean_absolute_error(y_val_lb, y_val_pred)
r2 = r2_score(y_val_lb, y_val_pred)
rmse = np.sqrt(mse)

# Report the four metrics, one per line.
print(
    f"Validation MSE: {mse:.4f}",
    f"Validation RMSE: {rmse:.4f}",
    f"Validation MAE: {mae:.4f}",
    f"Validation R²: {r2:.4f}",
    sep="\n",
)

Validation MSE: 435.5207
Validation RMSE: 20.8691
Validation MAE: 14.9897
Validation R²: 0.5340


In [None]:
# Scaling: StandardScaler

# Predict on the validation split.
y_val_pred = rf_model_1.predict(X_val_lb_scaled_1)

# Score the predictions; RMSE is taken as the square root of MSE.
mse = mean_squared_error(y_val_lb, y_val_pred)
mae = mean_absolute_error(y_val_lb, y_val_pred)
r2 = r2_score(y_val_lb, y_val_pred)
rmse = np.sqrt(mse)

# Report the four metrics, one per line.
print(
    f"Validation MSE: {mse:.4f}",
    f"Validation RMSE: {rmse:.4f}",
    f"Validation MAE: {mae:.4f}",
    f"Validation R²: {r2:.4f}",
    sep="\n",
)

Validation MSE: 435.5583
Validation RMSE: 20.8700
Validation MAE: 14.9894
Validation R²: 0.5339


In [None]:
# Scaling: MinMaxScaler

# Predict on the validation split.
y_val_pred = rf_model_2.predict(X_val_lb_scaled_2)

# Score the predictions; RMSE is taken as the square root of MSE.
mse = mean_squared_error(y_val_lb, y_val_pred)
mae = mean_absolute_error(y_val_lb, y_val_pred)
r2 = r2_score(y_val_lb, y_val_pred)
rmse = np.sqrt(mse)

# Report the four metrics, one per line.
print(
    f"Validation MSE: {mse:.4f}",
    f"Validation RMSE: {rmse:.4f}",
    f"Validation MAE: {mae:.4f}",
    f"Validation R²: {r2:.4f}",
    sep="\n",
)

Validation MSE: 435.5527
Validation RMSE: 20.8699
Validation MAE: 14.9893
Validation R²: 0.5339


In [None]:
# Scaling: RobustScaler

# Predict on the validation split.
y_val_pred = rf_model_3.predict(X_val_lb_scaled_3)

# Score the predictions; RMSE is taken as the square root of MSE.
mse = mean_squared_error(y_val_lb, y_val_pred)
mae = mean_absolute_error(y_val_lb, y_val_pred)
r2 = r2_score(y_val_lb, y_val_pred)
rmse = np.sqrt(mse)

# Report the four metrics, one per line.
print(
    f"Validation MSE: {mse:.4f}",
    f"Validation RMSE: {rmse:.4f}",
    f"Validation MAE: {mae:.4f}",
    f"Validation R²: {r2:.4f}",
    sep="\n",
)

Validation MSE: 435.5375
Validation RMSE: 20.8695
Validation MAE: 14.9895
Validation R²: 0.5340


## 🟧 3-6 GBM

- 스케일링 필요 여부 : 필수X
- 인코딩 : 레이블인코딩

### 🟧 GBM 모델 학습

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Train one gradient-boosting regressor per feature variant (unscaled plus three scalers).
# All four models share the same configuration.
_gbm_settings = dict(
    n_estimators=100,   # number of boosting stages (trees)
    learning_rate=0.1,  # shrinkage applied to each tree
    max_depth=3,        # maximum depth of each tree
    random_state=42,
)

# Scaling: none
gbm_model_0 = GradientBoostingRegressor(**_gbm_settings).fit(X_train_lb, y_train_lb)

# Scaling: StandardScaler
gbm_model_1 = GradientBoostingRegressor(**_gbm_settings).fit(X_train_lb_scaled_1, y_train_lb)

# Scaling: MinMaxScaler
gbm_model_2 = GradientBoostingRegressor(**_gbm_settings).fit(X_train_lb_scaled_2, y_train_lb)

# Scaling: RobustScaler
gbm_model_3 = GradientBoostingRegressor(**_gbm_settings).fit(X_train_lb_scaled_3, y_train_lb)

# Keep the last fitted model as the cell's final expression so its repr is displayed.
gbm_model_3

GradientBoostingRegressor(random_state=42)

### 🟧 GBM 예측 및 평가

In [None]:
# Scaling: none

# Predict on the validation split.
y_val_pred = gbm_model_0.predict(X_val_lb)

# Score the predictions; RMSE is taken as the square root of MSE.
mse = mean_squared_error(y_val_lb, y_val_pred)
mae = mean_absolute_error(y_val_lb, y_val_pred)
r2 = r2_score(y_val_lb, y_val_pred)
rmse = np.sqrt(mse)

# Report the four metrics, one per line.
print(
    f"Validation MSE: {mse:.4f}",
    f"Validation RMSE: {rmse:.4f}",
    f"Validation MAE: {mae:.4f}",
    f"Validation R²: {r2:.4f}",
    sep="\n",
)

Validation MSE: 656.0516
Validation RMSE: 25.6135
Validation MAE: 18.4102
Validation R²: 0.2980


In [None]:
# Scaling: StandardScaler

# Predict on the validation split.
y_val_pred = gbm_model_1.predict(X_val_lb_scaled_1)

# Score the predictions; RMSE is taken as the square root of MSE.
mse = mean_squared_error(y_val_lb, y_val_pred)
mae = mean_absolute_error(y_val_lb, y_val_pred)
r2 = r2_score(y_val_lb, y_val_pred)
rmse = np.sqrt(mse)

# Report the four metrics, one per line.
print(
    f"Validation MSE: {mse:.4f}",
    f"Validation RMSE: {rmse:.4f}",
    f"Validation MAE: {mae:.4f}",
    f"Validation R²: {r2:.4f}",
    sep="\n",
)

Validation MSE: 656.0516
Validation RMSE: 25.6135
Validation MAE: 18.4102
Validation R²: 0.2980


In [None]:
# Scaling: MinMaxScaler

# Predict on the validation split.
y_val_pred = gbm_model_2.predict(X_val_lb_scaled_2)

# Score the predictions; RMSE is taken as the square root of MSE.
mse = mean_squared_error(y_val_lb, y_val_pred)
mae = mean_absolute_error(y_val_lb, y_val_pred)
r2 = r2_score(y_val_lb, y_val_pred)
rmse = np.sqrt(mse)

# Report the four metrics, one per line.
print(
    f"Validation MSE: {mse:.4f}",
    f"Validation RMSE: {rmse:.4f}",
    f"Validation MAE: {mae:.4f}",
    f"Validation R²: {r2:.4f}",
    sep="\n",
)

Validation MSE: 656.0516
Validation RMSE: 25.6135
Validation MAE: 18.4102
Validation R²: 0.2980


In [None]:
# Scaling: RobustScaler

# Predict on the validation split.
y_val_pred = gbm_model_3.predict(X_val_lb_scaled_3)

# Score the predictions; RMSE is taken as the square root of MSE.
mse = mean_squared_error(y_val_lb, y_val_pred)
mae = mean_absolute_error(y_val_lb, y_val_pred)
r2 = r2_score(y_val_lb, y_val_pred)
rmse = np.sqrt(mse)

# Report the four metrics, one per line.
print(
    f"Validation MSE: {mse:.4f}",
    f"Validation RMSE: {rmse:.4f}",
    f"Validation MAE: {mae:.4f}",
    f"Validation R²: {r2:.4f}",
    sep="\n",
)

Validation MSE: 656.0516
Validation RMSE: 25.6135
Validation MAE: 18.4102
Validation R²: 0.2980


## 🟨 3-7 LightGBM

- 스케일링 필요 여부 : 필수X
- 인코딩 : X (-> 인코딩을 안함 = 스케일링 못함)

In [None]:
# !pip install lightgbm==3.3.2  # early_stopping_rounds 에러 해결을 위함

### 🟨 LightGBM 모델 학습

In [None]:
import lightgbm as lgb

# Columns LightGBM should treat as categorical features.
categorical_cols = ["출발지구_출발지동", "목적지구_목적지동"]

# Cast only those columns to pandas 'category' dtype, in every split.
for col in categorical_cols:
    X_train[col] = X_train[col].astype('category')
    X_val[col] = X_val[col].astype('category')
    X_test[col] = X_test[col].astype('category')

# Wrap the training and validation splits as LightGBM Dataset objects.
train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val)

# Hyperparameters.
params = {'objective': 'regression',      # regression task
          'metric': 'rmse',               # evaluation metric: root mean squared error
          'boosting_type': 'gbdt',        # gradient boosting decision trees
          'learning_rate': 0.1,           # learning rate
          'num_leaves': 31,               # number of leaves per tree
          'max_depth': -1,                # tree depth setting
          'min_data_in_leaf': 20,         # minimum number of samples in a leaf
          'verbose': -1}                  # reduce training log output

# Train the model.
# Early stopping and periodic logging are passed as callbacks: the
# `early_stopping_rounds=` / `verbose_eval=` keyword arguments of `lgb.train`
# were removed in LightGBM 4.0, while these callbacks are also available in
# the 3.3.x releases, so no version pin is needed for this cell.
lgb_model = lgb.train(params,                            # hyperparameters defined above
                      train_data,                        # training data
                      valid_sets=[train_data, val_data], # evaluate on both training and validation data
                      num_boost_round=1000,              # maximum number of boosting rounds
                      callbacks=[
                          lgb.early_stopping(stopping_rounds=50),  # stop after 50 rounds without validation improvement
                          lgb.log_evaluation(period=100),          # log metrics every 100 rounds
                      ])

Training until validation scores don't improve for 50 rounds
[100]	training's rmse: 22.0611	valid_1's rmse: 22.2628
[200]	training's rmse: 21.3291	valid_1's rmse: 21.6963
[300]	training's rmse: 20.7829	valid_1's rmse: 21.291
[400]	training's rmse: 20.3674	valid_1's rmse: 21.0037
[500]	training's rmse: 20.0468	valid_1's rmse: 20.792
[600]	training's rmse: 19.7864	valid_1's rmse: 20.6408
[700]	training's rmse: 19.5557	valid_1's rmse: 20.5128
[800]	training's rmse: 19.3445	valid_1's rmse: 20.4054
[900]	training's rmse: 19.1666	valid_1's rmse: 20.3186
[1000]	training's rmse: 19.0081	valid_1's rmse: 20.2498
Did not meet early stopping. Best iteration is:
[1000]	training's rmse: 19.0081	valid_1's rmse: 20.2498


### 🟨 LightGBM 예측 및 평가

In [None]:
# Predict on the validation split.
y_val_pred = lgb_model.predict(X_val)

# Score the predictions; RMSE is taken as the square root of MSE.
mse = mean_squared_error(y_val, y_val_pred)
mae = mean_absolute_error(y_val, y_val_pred)
r2 = r2_score(y_val, y_val_pred)
rmse = np.sqrt(mse)

# Report the four metrics, one per line.
print(
    f"Validation MSE: {mse:.4f}",
    f"Validation RMSE: {rmse:.4f}",
    f"Validation MAE: {mae:.4f}",
    f"Validation R²: {r2:.4f}",
    sep="\n",
)

Validation MSE: 410.0559
Validation RMSE: 20.2498
Validation MAE: 14.6488
Validation R²: 0.5612


## 🟩 3-8 CatBoost


- 스케일링 필요 여부 : 필수X
- 인코딩 : X (-> 인코딩을 안함 = 스케일링 못함)

### 🟩 CatBoost 모델 학습

In [None]:
from catboost import CatBoostRegressor

# Columns handed to CatBoost as categorical features.
categorical_features = ["출발지구_출발지동", "목적지구_목적지동"]

# Model configuration, collected in one place.
_catboost_settings = {
    'iterations': 500,                     # number of trees (tunable)
    'depth': 6,                            # tree depth
    'learning_rate': 0.1,                  # learning rate
    'cat_features': categorical_features,  # categorical columns
    'verbose': 100,                        # training-log setting
}
cat_model = CatBoostRegressor(**_catboost_settings)

# Fit on the training split only; no validation data is passed to fit().
cat_model.fit(X_train, y_train)

0:	learn: 29.8883306	total: 248ms	remaining: 2m 3s
100:	learn: 24.3278282	total: 7.4s	remaining: 29.3s
200:	learn: 23.7054716	total: 14.8s	remaining: 22s
300:	learn: 23.3213530	total: 22.2s	remaining: 14.6s
400:	learn: 23.0370699	total: 29.2s	remaining: 7.2s
499:	learn: 22.7997182	total: 36.3s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x290095c24c0>

### 🟩 CatBoost 예측 및 평가

In [None]:
# Predict on the validation split.
y_val_pred = cat_model.predict(X_val)

# Score the predictions; RMSE is taken as the square root of MSE.
mse = mean_squared_error(y_val, y_val_pred)
mae = mean_absolute_error(y_val, y_val_pred)
r2 = r2_score(y_val, y_val_pred)
rmse = np.sqrt(mse)

# Report the four metrics, one per line.
print(
    f"Validation MSE: {mse:.4f}",
    f"Validation RMSE: {rmse:.4f}",
    f"Validation MAE: {mae:.4f}",
    f"Validation R²: {r2:.4f}",
    sep="\n",
)

Validation MSE: 512.5251
Validation RMSE: 22.6390
Validation MAE: 16.2026
Validation R²: 0.4516


## 🟦 3-9 AdaBoost

- 스케일링 필요 여부 : 필수X
- 인코딩 : 레이블인코딩

### 🟦 AdaBoost 모델 학습

In [None]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor

import inspect

# scikit-learn renamed AdaBoost's `base_estimator` argument to `estimator`
# (deprecated in 1.2, removed in 1.4). Use whichever keyword the installed
# version accepts so this cell runs on both older and newer releases.
_weak_learner_kw = ('estimator'
                    if 'estimator' in inspect.signature(AdaBoostRegressor).parameters
                    else 'base_estimator')


def _build_adaboost():
    """Return an unfitted AdaBoostRegressor with a fresh depth-5 decision tree as weak learner."""
    return AdaBoostRegressor(n_estimators=200,    # number of boosting rounds
                             learning_rate=0.1,   # learning rate
                             random_state=42,
                             **{_weak_learner_kw: DecisionTreeRegressor(max_depth=5)})


# Scaling: none
adaboost_model_0 = _build_adaboost()
adaboost_model_0.fit(X_train_lb, y_train_lb)


# Scaling: StandardScaler
adaboost_model_1 = _build_adaboost()
adaboost_model_1.fit(X_train_lb_scaled_1, y_train_lb)


# Scaling: MinMaxScaler
adaboost_model_2 = _build_adaboost()
adaboost_model_2.fit(X_train_lb_scaled_2, y_train_lb)


# Scaling: RobustScaler
adaboost_model_3 = _build_adaboost()
adaboost_model_3.fit(X_train_lb_scaled_3, y_train_lb)

AdaBoostRegressor(base_estimator=DecisionTreeRegressor(max_depth=5),
                  learning_rate=0.1, n_estimators=200, random_state=42)

### 🟦 AdaBoost 예측 및 평가

In [None]:
# Scaling: none

# Predict on the validation split.
y_val_pred = adaboost_model_0.predict(X_val_lb)

# Score the predictions; RMSE is taken as the square root of MSE.
mse = mean_squared_error(y_val_lb, y_val_pred)
mae = mean_absolute_error(y_val_lb, y_val_pred)
r2 = r2_score(y_val_lb, y_val_pred)
rmse = np.sqrt(mse)

# Report the four metrics, one per line.
print(
    f"Validation MSE: {mse:.4f}",
    f"Validation RMSE: {rmse:.4f}",
    f"Validation MAE: {mae:.4f}",
    f"Validation R²: {r2:.4f}",
    sep="\n",
)

Validation MSE: 996.7602
Validation RMSE: 31.5715
Validation MAE: 27.3309
Validation R²: -0.0666


In [None]:
# Scaling: StandardScaler

# Predict on the validation split.
y_val_pred = adaboost_model_1.predict(X_val_lb_scaled_1)

# Score the predictions; RMSE is taken as the square root of MSE.
mse = mean_squared_error(y_val_lb, y_val_pred)
mae = mean_absolute_error(y_val_lb, y_val_pred)
r2 = r2_score(y_val_lb, y_val_pred)
rmse = np.sqrt(mse)

# Report the four metrics, one per line.
print(
    f"Validation MSE: {mse:.4f}",
    f"Validation RMSE: {rmse:.4f}",
    f"Validation MAE: {mae:.4f}",
    f"Validation R²: {r2:.4f}",
    sep="\n",
)

Validation MSE: 943.4320
Validation RMSE: 30.7153
Validation MAE: 26.3460
Validation R²: -0.0095


In [None]:
# Scaling: MinMaxScaler

# Predict on the validation split.
y_val_pred = adaboost_model_2.predict(X_val_lb_scaled_2)

# Score the predictions; RMSE is taken as the square root of MSE.
mse = mean_squared_error(y_val_lb, y_val_pred)
mae = mean_absolute_error(y_val_lb, y_val_pred)
r2 = r2_score(y_val_lb, y_val_pred)
rmse = np.sqrt(mse)

# Report the four metrics, one per line.
print(
    f"Validation MSE: {mse:.4f}",
    f"Validation RMSE: {rmse:.4f}",
    f"Validation MAE: {mae:.4f}",
    f"Validation R²: {r2:.4f}",
    sep="\n",
)

Validation MSE: 999.7534
Validation RMSE: 31.6189
Validation MAE: 27.3866
Validation R²: -0.0698


In [None]:
# Scaling: RobustScaler

# Predict on the validation split.
y_val_pred = adaboost_model_3.predict(X_val_lb_scaled_3)

# Score the predictions; RMSE is taken as the square root of MSE.
mse = mean_squared_error(y_val_lb, y_val_pred)
mae = mean_absolute_error(y_val_lb, y_val_pred)
r2 = r2_score(y_val_lb, y_val_pred)
rmse = np.sqrt(mse)

# Report the four metrics, one per line.
print(
    f"Validation MSE: {mse:.4f}",
    f"Validation RMSE: {rmse:.4f}",
    f"Validation MAE: {mae:.4f}",
    f"Validation R²: {r2:.4f}",
    sep="\n",
)

Validation MSE: 982.9318
Validation RMSE: 31.3517
Validation MAE: 27.0779
Validation R²: -0.0518


## 🟪 3-10 XGBoost

- 스케일링 필요 여부 : 필수X
- 인코딩 : 레이블인코딩

### 🟪 XGBoost 모델 학습

In [None]:
from xgboost import XGBRegressor

# Train one XGBoost regressor per feature variant (unscaled plus three scalers).
# All four models share the same configuration.
_xgb_settings = dict(
    random_state=42,
    n_estimators=100,   # number of trees
    learning_rate=0.1,  # learning rate
)

# Scaling: none
xgb_model_0 = XGBRegressor(**_xgb_settings).fit(X_train_lb, y_train_lb)

# Scaling: StandardScaler
xgb_model_1 = XGBRegressor(**_xgb_settings).fit(X_train_lb_scaled_1, y_train_lb)

# Scaling: MinMaxScaler
xgb_model_2 = XGBRegressor(**_xgb_settings).fit(X_train_lb_scaled_2, y_train_lb)

# Scaling: RobustScaler
xgb_model_3 = XGBRegressor(**_xgb_settings).fit(X_train_lb_scaled_3, y_train_lb)

# Keep the last fitted model as the cell's final expression so its repr is displayed.
xgb_model_3

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.1, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=100, n_jobs=None,
             num_parallel_tree=None, random_state=42, ...)

### 🟪 XGBoost 예측 및 평가

In [None]:
# Scaling: none

# Predict on the validation split.
y_val_pred = xgb_model_0.predict(X_val_lb)

# Score the predictions; RMSE is taken as the square root of MSE.
mse = mean_squared_error(y_val_lb, y_val_pred)
mae = mean_absolute_error(y_val_lb, y_val_pred)
r2 = r2_score(y_val_lb, y_val_pred)
rmse = np.sqrt(mse)

# Report the four metrics, one per line.
print(
    f"Validation MSE: {mse:.4f}",
    f"Validation RMSE: {rmse:.4f}",
    f"Validation MAE: {mae:.4f}",
    f"Validation R²: {r2:.4f}",
    sep="\n",
)

Validation MSE: 575.7629
Validation RMSE: 23.9951
Validation MAE: 17.1268
Validation R²: 0.3839


In [None]:
# Scaling: StandardScaler

# Predict on the validation split.
y_val_pred = xgb_model_1.predict(X_val_lb_scaled_1)

# Score the predictions; RMSE is taken as the square root of MSE.
mse = mean_squared_error(y_val_lb, y_val_pred)
mae = mean_absolute_error(y_val_lb, y_val_pred)
r2 = r2_score(y_val_lb, y_val_pred)
rmse = np.sqrt(mse)

# Report the four metrics, one per line.
print(
    f"Validation MSE: {mse:.4f}",
    f"Validation RMSE: {rmse:.4f}",
    f"Validation MAE: {mae:.4f}",
    f"Validation R²: {r2:.4f}",
    sep="\n",
)

Validation MSE: 575.7628
Validation RMSE: 23.9951
Validation MAE: 17.1268
Validation R²: 0.3839


In [None]:
# Scaling: MinMaxScaler

# Predict on the validation split.
y_val_pred = xgb_model_2.predict(X_val_lb_scaled_2)

# Score the predictions; RMSE is taken as the square root of MSE.
mse = mean_squared_error(y_val_lb, y_val_pred)
mae = mean_absolute_error(y_val_lb, y_val_pred)
r2 = r2_score(y_val_lb, y_val_pred)
rmse = np.sqrt(mse)

# Report the four metrics, one per line.
print(
    f"Validation MSE: {mse:.4f}",
    f"Validation RMSE: {rmse:.4f}",
    f"Validation MAE: {mae:.4f}",
    f"Validation R²: {r2:.4f}",
    sep="\n",
)

Validation MSE: 575.7628
Validation RMSE: 23.9951
Validation MAE: 17.1268
Validation R²: 0.3839


In [None]:
# Scaling: RobustScaler

# Predict on the validation split.
y_val_pred = xgb_model_3.predict(X_val_lb_scaled_3)

# Score the predictions; RMSE is taken as the square root of MSE.
mse = mean_squared_error(y_val_lb, y_val_pred)
mae = mean_absolute_error(y_val_lb, y_val_pred)
r2 = r2_score(y_val_lb, y_val_pred)
rmse = np.sqrt(mse)

# Report the four metrics, one per line.
print(
    f"Validation MSE: {mse:.4f}",
    f"Validation RMSE: {rmse:.4f}",
    f"Validation MAE: {mae:.4f}",
    f"Validation R²: {r2:.4f}",
    sep="\n",
)

Validation MSE: 575.7628
Validation RMSE: 23.9951
Validation MAE: 17.1268
Validation R²: 0.3839
