In [27]:
import joblib
import numpy as np

housing_prepared = joblib.load("data/housing_prepared.pkl")
housing_labels = joblib.load("data/housing_labels.pkl")
housing = joblib.load("data/housing_raw.pkl") 
strat_train_set = joblib.load("data/strat_train_set.pkl")
strat_test_set = joblib.load("data/strat_test_set.pkl")

In [34]:
class DataFrameSelector(BaseEstimator, TransformerMixin):  # 재정의
    def __init__(self, attribute_names):  # 선택할 열 이름 목록을 초기화
        self.attribute_names = attribute_names
    def fit(self, X, y=None):  # fit 메서드 (여기선 학습 필요 없음)
        return self
    def transform(self, X):  # 지정된 열만 NumPy 배열 형태로 추출하여 반환
        return X[self.attribute_names] # .values 제거

In [28]:
from sklearn.base import BaseEstimator, TransformerMixin  # 재정의

# 열 인덱스 설정
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6  # 순서대로 total_rooms, total_bedrooms, population, households 열의 인덱스

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):  # 파생 특성 생성용 사용자 정의 변환기 클래스 정의
    def __init__(self, add_bedrooms_per_room=True):  # bedrooms_per_room 특성을 추가할지 여부를 설정하는 하이퍼파라미터
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):  # fit 메서드는 아무 작업 없이 self 반환 (필수 메서드)
        return self
    def transform(self, X, y=None):  # transform 메서드에서 새로운 파생 특성들을 계산하여 기존 데이터에 추가
        rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
        population_per_household = X[:, population_ix] / X[:, household_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]  # 세 개의 파생 특성 추가
        else:
            return np.c_[X, rooms_per_household, population_per_household]  # 두 개의 파생 특성만 추가

In [29]:
full_pipeline = joblib.load("models/full_pipeline.pkl")

In [2]:
from sklearn.model_selection import GridSearchCV  # 하이퍼파라미터 튜닝을 위한 GridSearchCV 임포트
from sklearn.ensemble import RandomForestRegressor  # 랜덤 포레스트 회귀 모델 임포트

param_grid = [  # 탐색할 하이퍼파라미터 조합 정의
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]

forest_reg = RandomForestRegressor()  # 모델 객체 생성

grid_search = GridSearchCV(forest_reg, param_grid, cv=5,  # 그리드 탐색 객체 생성
                           scoring='neg_mean_squared_error',
                           return_train_score=True)

grid_search.fit(housing_prepared, housing_labels)  # 전체 훈련 세트로 학습 및 교차 검증

0,1,2
,estimator,RandomForestRegressor()
,param_grid,"[{'max_features': [2, 4, ...], 'n_estimators': [3, 10, ...]}, {'bootstrap': [False], 'max_features': [2, 3, ...], 'n_estimators': [3, 10]}]"
,scoring,'neg_mean_squared_error'
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,True

0,1,2
,n_estimators,30
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,6
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [3]:
grid_search.best_params_  # 가장 좋은 하이퍼파라미터 조합 출력

{'max_features': 6, 'n_estimators': 30}

In [4]:
grid_search.best_estimator_  # 가장 성능이 좋은 모델 출력

0,1,2
,n_estimators,30
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,6
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [7]:
cvres = grid_search.cv_results_  # 교차 검증 결과 저장
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)  # RMSE와 하이퍼파라미터 조합 출력

63829.939059391814 {'max_features': 2, 'n_estimators': 3}
55467.79031291937 {'max_features': 2, 'n_estimators': 10}
52448.89836856031 {'max_features': 2, 'n_estimators': 30}
61247.57814816462 {'max_features': 4, 'n_estimators': 3}
52730.253668176825 {'max_features': 4, 'n_estimators': 10}
50243.7870321875 {'max_features': 4, 'n_estimators': 30}
58834.32843892199 {'max_features': 6, 'n_estimators': 3}
52355.98592222204 {'max_features': 6, 'n_estimators': 10}
50159.47922309745 {'max_features': 6, 'n_estimators': 30}
59480.36056451964 {'max_features': 8, 'n_estimators': 3}
52217.62089528725 {'max_features': 8, 'n_estimators': 10}
50405.00452899159 {'max_features': 8, 'n_estimators': 30}
62825.621836701466 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
53819.85638393671 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
59860.727582171916 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
52716.888670658176 {'bootstrap': False, 'max_features': 3, 'n_estimators'

In [8]:
feature_importances = grid_search.best_estimator_.feature_importances_  # 특성 중요도 추출

In [12]:
from sklearn.preprocessing import OneHotEncoder

housing_cat = housing[["ocean_proximity"]]
cat_encoder = OneHotEncoder(sparse_output=False)
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)

cat_one_hot_attribs = list(cat_encoder.get_feature_names_out())

In [14]:
# 수치형 열만 추출하고 열 이름 리스트 저장
housing_num = housing.drop("ocean_proximity", axis=1)
num_attribs = list(housing_num)

In [15]:
extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]  # 파생 변수 이름
cat_one_hot_attribs = list(cat_encoder.get_feature_names_out())  # 원-핫 인코딩된 범주형 변수 이름
attributes = num_attribs + extra_attribs + cat_one_hot_attribs  # 전체 특성 이름 합치기

In [16]:
sorted(zip(feature_importances, attributes), reverse=True)  # 특성 중요도 내림차순 정렬

[(np.float64(0.32685876556909504), 'median_income'),
 (np.float64(0.14048363610744594), 'ocean_proximity_INLAND'),
 (np.float64(0.11085632356611279), 'pop_per_hhold'),
 (np.float64(0.07950725288734087), 'longitude'),
 (np.float64(0.07232337985269148), 'latitude'),
 (np.float64(0.07217616020992847), 'bedrooms_per_room'),
 (np.float64(0.06334760549972474), 'rooms_per_hhold'),
 (np.float64(0.04226331455660008), 'housing_median_age'),
 (np.float64(0.0177507947835597), 'total_rooms'),
 (np.float64(0.016971532448643627), 'population'),
 (np.float64(0.016759713222679485), 'total_bedrooms'),
 (np.float64(0.01631477428391723), 'households'),
 (np.float64(0.0141568776985817), 'ocean_proximity_<1H OCEAN'),
 (np.float64(0.0058916345782066195), 'ocean_proximity_NEAR OCEAN'),
 (np.float64(0.004303106098504391), 'ocean_proximity_NEAR BAY'),
 (np.float64(3.51286369679032e-05), 'ocean_proximity_ISLAND')]

In [35]:
from sklearn.metrics import mean_squared_error  # RMSE 계산을 위한 모듈 임포트

final_model = grid_search.best_estimator_  # 최종 모델 지정

X_test = strat_test_set.drop("median_house_value", axis=1)  # 테스트 입력 특성
y_test = strat_test_set["median_house_value"].copy()  # 테스트 타깃 레이블

X_test_prepared = full_pipeline.transform(X_test)  # 전처리 파이프라인 적용
final_predictions = final_model.predict(X_test_prepared)  # 예측 수행

final_mse = mean_squared_error(y_test, final_predictions)  # MSE 계산
final_rmse = np.sqrt(final_mse)  # RMSE 계산
print(final_rmse)  # RMSE 출력

47713.363617950614




In [36]:
joblib.dump(grid_search, "models/grid_search.pkl")

['models/grid_search.pkl']