In [18]:
import joblib
housing_prepared = joblib.load("data/housing_prepared.pkl")
housing_labels = joblib.load("data/housing_labels.pkl")
housing = joblib.load("data/housing_raw.pkl")

In [19]:
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin): # DataFrameSelector 다시 정의
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values

In [20]:
import numpy as np

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6
        rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
        population_per_household = X[:, population_ix] / X[:, household_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

In [21]:
full_pipeline = joblib.load("models/full_pipeline.pkl")

In [22]:
from sklearn.linear_model import LinearRegression  # 선형 회귀 모델 임포트

lin_reg = LinearRegression()  # 모델 객체 생성
lin_reg.fit(housing_prepared, housing_labels)  # 훈련 데이터로 모델 학습

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [23]:
# 예측 테스트 (5개 샘플)

some_data = housing.iloc[:5]  # 원본 데이터에서 일부 샘플 추출
some_labels = housing_labels.iloc[:5]  # 실제 값 추출
some_data_prepared = full_pipeline.transform(some_data)  # 전처리 적용

print("Predictions:", lin_reg.predict(some_data_prepared))  # 예측값 출력
print("Labels:", list(some_labels))  # 실제값 출력

Predictions: [ 85657.90192014 305492.60737488 152056.46122456 186095.70946094
 244550.67966089]
Labels: [72100.0, 279600.0, 82700.0, 112500.0, 238300.0]


In [24]:
# RMSE 계산 (훈련 세트 기준)
from sklearn.metrics import mean_squared_error  # RMSE 계산용 함수

housing_predictions = lin_reg.predict(housing_prepared)  # 전체 예측
lin_mse = mean_squared_error(housing_labels, housing_predictions)  # MSE 계산
lin_rmse = np.sqrt(lin_mse)  # RMSE 계산

In [25]:
# 결정 트리 모델 훈련
from sklearn.tree import DecisionTreeRegressor  

tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared, housing_labels)  # 학습

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [26]:
# RMSE 계산 (훈련 세트 기준, 과적합 여부 확인)
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)  # 훈련 데이터에 대해 RMSE가 0일 수도 있음 → 과적합 신호

In [27]:
# 교차 검증 (결정 트리)
from sklearn.model_selection import cross_val_score  # 교차 검증 함수

scores = cross_val_score(tree_reg, housing_prepared, housing_labels,
                         scoring="neg_mean_squared_error", cv=10)  # 10겹 교차검증
rmse_scores = np.sqrt(-scores)  # 점수가 음수이므로 반전 후 제곱근 계산

In [28]:
# 점수 출력 함수
def display_scores(scores):  # 점수 리스트를 보기 좋게 출력
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

display_scores(rmse_scores)

Scores: [72539.77884453 68940.59216088 67776.03810024 69855.30579843
 69363.68254008 76568.77537941 71173.67188701 73626.71099826
 69611.19169972 71088.78120178]
Mean: 71054.45286103345
Standard deviation: 2466.6155109537704


In [29]:
# 교차 검증 (선형 회귀 비교)
lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels,
                             scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)

Scores: [71762.76364394 64114.99166359 67771.17124356 68635.19072082
 66846.14089488 72528.03725385 73997.08050233 68802.33629334
 66443.28836884 70139.79923956]
Mean: 69104.07998247063
Standard deviation: 2880.3282098180666


In [None]:
# 랜덤 포레스트 훈련 및 평가
from sklearn.ensemble import RandomForestRegressor  # 랜덤 포레스트 임포트

forest_reg = RandomForestRegressor()
forest_reg.fit(housing_prepared, housing_labels)  # 학습

forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels,
                                scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)

In [None]:
# 모델 저장 및 재사용
import joblib  # 대용량 모델 저장에 적합한 라이브러리

joblib.dump(forest_reg, "forest_model.pkl")  # 모델 저장
model_loaded = joblib.load("forest_model.pkl")  # 저장된 모델 불러오기


In [None]:
joblib.dump(lin_reg, "models/lin_reg.pkl")