## 과정

### 전체 회귀 분석 과정
- 1.선형 회귀 모델 학습/예측/평가 및 회귀계수 시각화(릿지, 라쏘)
    - 1.1 회귀계수 시각화
    - 1.2 5폴드 교차 검증
    - 1.3 리지/라쏘 모델에 대해 alpha 하이퍼파라미터 튜닝 후 재학습/예측/평가
    - 1.4 튜닝된 모델 회귀 계수 시각화
- 2.회귀 트리 모델 학습/예측/평가 및 피처 중요도 시각화(XGBoost, LGBM)
- 3.회귀 모델의 예측 결과 혼합을 통한 최종 예측
    - 3.1 릿지와 라쏘 모델 예측 결과 혼합
    - 3.2 XGBoost와 LGBM 모델 예측결과 혼합
- 4.스태킹 앙상블 모델을 통한 회귀 예측

In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
%matplotlib inline
from matplotlib import font_manager, rc

# Register a TrueType font as matplotlib's default font family.
# NOTE(review): the path is Windows-specific; on other platforms this file will
# not exist. Presumably chosen so Korean labels render in plots — confirm.
path = "c:/Windows/Fonts/malgun.ttf"
font_name = font_manager.FontProperties(fname=path).get_name()
rc('font', family=font_name)

# Target 값 : price

In [2]:
# Load the preprocessed training table and the separate input used for
# inference later in the notebook, then preview the training table.
car_df_ohe= pd.read_csv('./data/최종 전처리2.csv')
car_test= pd.read_csv('./data/test_data.csv')
car_df_ohe.head()

Unnamed: 0,km,year,accident,price,wd,trim,brand_BMW,brand_기아,brand_랜드로버,brand_르노삼성,...,type_LPG,type_가솔린,type_디젤,type_바이퓨얼,type_전기,type_하이브리드,color_검정색,color_기타,color_회색,color_흰색
0,110575,11,3,6.47851,0,1,0,1,0,0,...,0,1,0,0,0,0,1,0,0,0
1,106819,8,3,6.846943,0,1,0,1,0,0,...,0,1,0,0,0,0,1,0,0,0
2,29151,3,3,7.544861,0,1,0,1,0,0,...,0,1,0,0,0,0,1,0,0,0
3,8766,1,3,8.06809,0,1,0,1,0,0,...,0,1,0,0,0,0,1,0,0,0
4,101196,10,3,6.710523,0,2,0,1,0,0,...,0,1,0,0,0,0,1,0,0,0


### 1. 선형 회귀 모델 학습/예측/평가 및 회귀계수 시각화(릿지, 라쏘)

LinearRegression, Ridge, Lasso를 이용해 선형 계열의 회귀 모델 만들기

**RMSE 평가 함수 생성**

In [3]:
def get_rmse(model):
    """Score an already-fitted model on the hold-out split and return its RMSE.

    The model predicts on the module-level ``X_test``; the predictions are
    compared against ``y_test``. RMSE and R2 are printed (rounded to three
    decimals) under the model's class name, and the unrounded RMSE is
    returned.
    """
    y_hat = model.predict(X_test)
    label = model.__class__.__name__

    # RMSE is the square root of the mean squared error.
    rmse = np.sqrt(mean_squared_error(y_test, y_hat))
    r2 = r2_score(y_test, y_hat)

    print('\n{0} 로그 변환된 RMSE: {1}'.format(label, np.round(rmse, 3)))
    print('{0} R2: {1}'.format(label, np.round(r2, 3)))
    return rmse

def get_rmses(models):
    """Return the RMSE of every model in ``models``, in the same order.

    Each model is scored with ``get_rmse``, which also prints its metrics.
    """
    return [get_rmse(fitted) for fitted in models]

In [4]:
## 데이터 분할
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Target vector and feature matrix
y_target = car_df_ohe['price']
X_features = car_df_ohe.drop('price',axis=1, inplace=False)

# Hold out 20% of the rows as the test split; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_features, y_target, test_size=0.2, random_state=156)

**모델 학습/예측/평가**

In [5]:
# Ordinary linear regression
lr_reg = LinearRegression() 
lr_reg.fit(X_train, y_train)

# Ridge regression (constructor defaults)
ridge_reg = Ridge()
ridge_reg.fit(X_train, y_train)

# Lasso regression (constructor defaults)
lasso_reg=Lasso()
lasso_reg.fit(X_train,y_train)


Lasso()

### 1.2 5폴드 교차 검증으로 평균 RMSE 측정
- 함수 생성

In [6]:
from sklearn.model_selection import cross_val_score

def get_avg_rmse_cv(models):
    """Print 5-fold cross-validated RMSE and R2 for each model.

    Cross-validation runs on the full, unsplit module-level data
    (``X_features`` / ``y_target``). For every model the per-fold RMSE and
    R2 values and their means are printed, rounded to three decimals.

    Args:
        models: iterable of estimators to evaluate.

    Returns:
        None. Results are only printed.
    """
    # Imported locally so the function does not depend on which names the
    # surrounding cell happened to import.
    from sklearn.model_selection import cross_validate

    for model in models:
        # Cross-validate on the whole dataset rather than the train split.
        # Both metrics come from one pass, so each fold is fitted only once
        # instead of once per metric.
        cv_scores = cross_validate(model, X_features, y_target,
                                   scoring=("neg_mean_squared_error", "r2"),
                                   cv=5)
        # The scorer reports negated MSE, so flip the sign before the root.
        rmse_list = np.sqrt(-cv_scores["test_neg_mean_squared_error"])
        r2_list = cv_scores["test_r2"]
        rmse_avg = np.mean(rmse_list)
        r2_avg = np.mean(r2_list)

        print('\n{0} CV RMSE 값 리스트: {1}'.format( model.__class__.__name__, np.round(rmse_list, 3)))
        print('{0} CV R2 값 리스트: {1}'.format( model.__class__.__name__, np.round(r2_list, 3)))
        print('{0} CV 평균 RMSE 값: {1}'.format( model.__class__.__name__, np.round(rmse_avg, 3)))
        print('{0} CV 평균 R2 값: {1}'.format( model.__class__.__name__, np.round(r2_avg, 3)))

### 1.3 리지/라쏘 모델에 대해 alpha 하이퍼파라미터 튜닝 후 재학습/예측/평가

In [7]:
from sklearn.model_selection import GridSearchCV
def get_best_params(model, params):
    """Grid-search ``params`` for ``model`` with 5-fold CV and report the best scores.

    The search runs on the full module-level data (``X_features`` /
    ``y_target``) and tracks two metrics at once: negated MSE and R2. The
    best mean RMSE and the best mean R2 are printed together with the
    parameter combination that achieved each. (The printed label says
    "alpha", but the whole parameter dict is shown.)

    Args:
        model: estimator to tune.
        params: parameter grid passed to ``GridSearchCV`` as ``param_grid``.

    Returns:
        The estimator refit on the full data with the parameters that gave
        the lowest mean squared error.
    """
    # One multi-metric search replaces two separate searches over the same
    # grid and folds; refit on MSE so the returned estimator is unchanged.
    grid_model = GridSearchCV(model, param_grid=params,
                              scoring=['neg_mean_squared_error', 'r2'],
                              refit='neg_mean_squared_error', cv=5)
    grid_model.fit(X_features, y_target)

    # best_score_ / best_params_ follow the refit metric (negated MSE).
    rmse = np.sqrt(-1 * grid_model.best_score_)

    # Pick the R2 winner the same way GridSearchCV picks its best index:
    # the first candidate with the best rank for that metric.
    results = grid_model.cv_results_
    best_r2_idx = results['rank_test_r2'].argmin()
    r2 = results['mean_test_r2'][best_r2_idx]
    r2_params = results['params'][best_r2_idx]

    print('\n{0} 5 CV 시 최적 평균 RMSE 값: {1}, 최적 alpha:{2}'.format(model.__class__.__name__,
                                        np.round(rmse, 4), grid_model.best_params_))
    print('{0} 5 CV 시 최적 평균 R2 값: {1}, 최적 alpha:{2}'.format(model.__class__.__name__,
                                        np.round(r2, 4), r2_params))
    return grid_model.best_estimator_  # model refit with the best (lowest-MSE) parameters


**분할된 트레인 테스트 데이터를 이용해 학습 후 평가**

In [8]:
# Refit the three linear models on the training split; Ridge and Lasso now use explicit alpha values.
# NOTE(review): alpha=20 / alpha=0.001 are presumably the grid-search optima — the search call is not shown here; confirm.
lr_reg = LinearRegression()
lr_reg.fit(X_train, y_train)

ridge_reg = Ridge(alpha=20)
ridge_reg.fit(X_train, y_train)

lasso_reg = Lasso(alpha=0.001)
lasso_reg.fit(X_train, y_train)

Lasso(alpha=0.001)

### 2. 회귀 트리 모델 학습/예측/평가 및 피처 중요도 시각화(XGBoost, LGBM)


- XGBoost와 LightGBM 학습/예측/평가 

In [9]:
# XGBoost and LightGBM can both take a long time to run, so the
# hyperparameters are fixed up front and only the
# average RMSE over the 5 folds is extracted.

from xgboost import XGBRegressor

# Single-candidate grid: get_best_params is used here only to obtain the CV scores.
xgb_params = {'n_estimators':[1000]}

xgb_reg = XGBRegressor(n_estimators=1000, learning_rate=0.05,
                      colsample_bytree=0.5, subsample=0.8)

best_xgb = get_best_params(xgb_reg, xgb_params)


XGBRegressor 5 CV 시 최적 평균 RMSE 값: 0.3904, 최적 alpha:{'n_estimators': 1000}
XGBRegressor 5 CV 시 최적 평균 R2 값: 0.7159, 최적 alpha:{'n_estimators': 1000}


In [10]:
# Now apply a LightGBM regression tree model.

from lightgbm import LGBMRegressor

# Single-candidate grid: get_best_params is used here only to obtain the CV scores.
lgbm_params = {'n_estimators':[1000]}
lgbm_reg = LGBMRegressor(n_estimators=1000, learning_rate=0.05, num_leaves=4, 
                         subsample=0.6, colsample_bytree=0.4, reg_lambda=10, n_jobs=-1)
best_lgbm = get_best_params(lgbm_reg, lgbm_params)


LGBMRegressor 5 CV 시 최적 평균 RMSE 값: 0.4047, 최적 alpha:{'n_estimators': 1000}
LGBMRegressor 5 CV 시 최적 평균 R2 값: 0.6896, 최적 alpha:{'n_estimators': 1000}


### 3.회귀 모델의 예측 결과 혼합을 통한 최종 예측

In [11]:
def get_rmse_pred(preds):
    """Print RMSE and R2 for each named prediction array.

    ``preds`` maps a display name to predictions for the hold-out split;
    each entry is scored against the module-level ``y_test`` and both
    metrics are printed rounded to three decimals. Intended for comparing
    blended predictions against the individual models.
    """
    for name, y_hat in preds.items():
        rmse = np.sqrt(mean_squared_error(y_test, y_hat))
        r2 = r2_score(y_test, y_hat)
        print('\n{0} 모델의 RMSE: {1}'.format(name, np.round(rmse, 3)))
        print('{0} 모델의 R2: {1}'.format(name, np.round(r2, 3)))

In [12]:
# Train the individual models
ridge_reg = Ridge(alpha=20)
ridge_reg.fit(X_train, y_train)
lasso_reg = Lasso(alpha=0.001)
lasso_reg.fit(X_train, y_train)

# Predict the hold-out split with each individual model
ridge_pred = ridge_reg.predict(X_test)
lasso_pred = lasso_reg.predict(X_test)

#### 3.2 XGBoost와 LGBM 모델 예측결과 혼합 (각 50%)

In [13]:
# Fit XGBoost on the training split only (same hyperparameters as the
# cross-validated run) and predict the hold-out split.
xgb_reg = XGBRegressor(n_estimators=1000, learning_rate=0.05, 
                       colsample_bytree=0.5, subsample=0.8)
xgb_reg.fit(X_train, y_train)
xgb_pred = xgb_reg.predict(X_test)

In [14]:
# Report RMSE and R2 of the XGBoost hold-out predictions.
pred = {'XGBM': xgb_pred}
get_rmse_pred(pred)


XGBM 모델의 RMSE: 0.13
XGBM 모델의 R2: 0.965


In [15]:
# Inspect the training feature matrix.
X_train

Unnamed: 0,km,year,accident,wd,trim,brand_BMW,brand_기아,brand_랜드로버,brand_르노삼성,brand_미니,...,type_LPG,type_가솔린,type_디젤,type_바이퓨얼,type_전기,type_하이브리드,color_검정색,color_기타,color_회색,color_흰색
28350,78848,6,3,1,1,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,1
5207,78432,3,3,0,3,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,1
26569,17440,2,3,0,2,0,1,0,0,0,...,0,1,0,0,0,0,0,0,1,0
60660,73539,7,3,1,2,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
18330,7279,1,3,0,2,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63039,36619,2,3,0,3,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,1
7653,10157,1,3,1,2,0,1,0,0,0,...,0,1,0,0,0,0,1,0,0,0
42402,18257,2,3,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
39628,17906,3,3,0,3,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1


In [16]:
import joblib

# Persist the fitted XGBoost model. The context manager closes the file
# handle once the dump completes instead of leaving it open.
with open('XGBoost 최종모델2.pkl', 'wb') as model_file:
    joblib.dump(xgb_reg, model_file)

In [22]:
# Reload the persisted model and predict for the input in car_test.
model = joblib.load('XGBoost 최종모델2.pkl')
price = model.predict(car_test)
# Apply expm1 to the prediction — presumably undoing a log1p transform
# applied to price during preprocessing (TODO confirm).
np.expm1(price)

array([1248.3774], dtype=float32)

In [21]:
# Raw model output, before expm1 is applied.
price

array([7.1304007], dtype=float32)

In [17]:
# First target values after applying expm1.
np.expm1(y_target).head()

0     650.0
1     940.0
2    1890.0
3    3190.0
4     820.0
Name: price, dtype: float64

In [18]:
# Inspect the inference input.
car_test

Unnamed: 0,km,year,accident,wd,trim,brand_BMW,brand_기아,brand_랜드로버,brand_르노삼성,brand_미니,...,type_LPG,type_가솔린,type_디젤,type_바이퓨얼,type_전기,type_하이브리드,color_검정색,color_기타,color_회색,color_흰색
0,100000,3,2,0,1,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,1


In [19]:
# NOTE(review): `test` is only assigned in a later cell, so running the
# notebook top to bottom raises NameError here (see the recorded output).
test

NameError: name 'test' is not defined

In [None]:
# Take the first and last rows of the feature matrix as small probe inputs.
test=X_features.head()
test2 =X_features.tail()

In [None]:
# Predict with the in-memory model (not the reloaded pickle) and apply expm1.
value = xgb_reg.predict(car_test)
np.expm1(value)

In [None]:
# Last target values after applying expm1.
np.expm1(y_target).tail()

In [None]:
# Predict once for the last rows of the feature matrix and reuse the result
# under both names instead of running the model twice on the same input.
xgb_pred2 = xgb_reg.predict(test2)
value2 = xgb_pred2
np.expm1(value2)

In [None]:
# First target values after applying expm1.
np.expm1(y_target).head()