In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Let Matplotlib render Korean text in figures.
import matplotlib.pyplot as plt
# Use the 'Malgun Gothic' font family for all plot text.
plt.rcParams['font.family'] ='Malgun Gothic'
# Draw the minus sign as an ASCII hyphen instead of the Unicode minus glyph.
plt.rcParams['axes.unicode_minus'] =False

# 모델링
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Ridge, Lasso, LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.svm import SVR
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score


In [2]:
# Load the apartment dataset from the Data_Preprocessing folder
# (path is relative to the notebook's working directory).
apt_df = pd.read_csv('Data_Preprocessing/apt_all_numeric_information.csv')

In [3]:
# List the column names of the loaded frame.
apt_df.columns

Index(['NO', 'latitude', 'longitude', 'brand_label_encoded',
       'exclusive_area_m2', 'apt_age', 'floor', 'closest_station_dist_km',
       'station_score', 'closest_bus_stop_dist_km', 'bus_stop_score',
       'closest_school_dist_km', 'school_score', 'closest_hospital_dist_km',
       'hospital_score', 'closest_park_dist_km', 'park_score',
       'ldong_prev_1_month', 'ldong_prev_3_months', 'ldong_prev_6_months',
       'K_progressive_president', 'A_progressive_president',
       'A_progressive_senate', 'A_progressive_house', 'USD_exchange_rate',
       'CNY_exchange_rate', 'JPY_exchange_rate', '한국은행_기준금리', '정부대출금금리',
       '무역금융지원_프로그램대출금리', '영세자영업자지원_프로그램대출금리', '신성장·일자리지원_프로그램대출금리',
       '설비투자지원_프로그램대출금리', '지방중소기업지원_프로그램대출금리', '자금조정_대출금리', '자금조정_예금금리',
       'adjusted_income', '월_평균_소득_금액', '식료품_지출_총금액_퍼센트', '의류_신발_지출_총금액_퍼센트',
       '생활용품_지출_총금액_퍼센트', '의료비_지출_총금액_퍼센트', '교통_지출_총금액_퍼센트', '교육_지출_총금액_퍼센트',
       '유흥_지출_총금액_퍼센트', '여가_문화_지출_총금액_퍼센트', '기타_지출_총금액_퍼센트', '음식_지출_총금액_

In [4]:
# Show per-column non-null counts and dtypes.
apt_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 251096 entries, 0 to 251095
Data columns (total 57 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   NO                        251096 non-null  object 
 1   latitude                  251096 non-null  float64
 2   longitude                 251096 non-null  float64
 3   brand_label_encoded       251096 non-null  float64
 4   exclusive_area_m2         251096 non-null  float64
 5   apt_age                   251096 non-null  float64
 6   floor                     251096 non-null  float64
 7   closest_station_dist_km   251096 non-null  float64
 8   station_score             251096 non-null  float64
 9   closest_bus_stop_dist_km  251096 non-null  float64
 10  bus_stop_score            251096 non-null  float64
 11  closest_school_dist_km    251096 non-null  float64
 12  school_score              251096 non-null  float64
 13  closest_hospital_dist_km  251096 non-null  f

### 1. 연도별 데이터 샘플링 및 전처리

### 2. 데이터 분리

In [33]:
# 1. Target first, then the predictors (row id, target and year are left out)
y = sampled_df['adjusted_price']
X = sampled_df.drop(['NO', 'adjusted_price', 'year'], axis=1)

# 2. Hold out 20% of the rows for evaluation; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")


Train shape: (12000, 21), Test shape: (3000, 21)


### 3. 선형 회귀 모델링

In [38]:
# 1. Fit an ordinary least-squares model on the training split
#    (fit() returns the estimator itself, so it can be chained)
model = LinearRegression().fit(X_train, y_train)

# 2. Predict the held-out rows
y_pred = model.predict(X_test)

# 3. Error metrics on the test split; RMSE is the square root of MSE
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse:.2f}")
print(f"R2 Score: {r2:.2f}")
print(f"Linear Regression RMSE: {rmse:.2f}")

Mean Squared Error: 2784471325.20
R2 Score: 0.58
Linear Regression RMSE: 52768.09


### 4. 회귀 계수 확인

In [35]:
# Pair every feature name with its fitted coefficient, largest first.
# NOTE(review): the model was fitted on unscaled features, so coefficient
# magnitudes reflect each feature's units and are not directly comparable.
coefficients = (
    pd.DataFrame({'Feature': X.columns, 'Coefficient': model.coef_})
    .sort_values('Coefficient', ascending=False)
)

print(coefficients)


                     Feature    Coefficient
12                 longitude  103038.620268
2       closest_park_dist_km   10444.307509
15       brand_label_encoded    9995.879024
1   closest_bus_stop_dist_km    5086.884408
18                       Q_3    2954.124709
13         exclusive_area_m2    1289.887675
17                       Q_2    1240.634246
14                     floor     982.624388
7                 park_score     740.436757
20                   apt_age     309.770457
5              station_score     305.983598
10               total_score     263.079164
9               school_score    -185.288084
8             hospital_score    -250.679135
6             bus_stop_score    -347.373972
19                       Q_4    -705.261640
16                       Q_1   -3489.497316
3   closest_hospital_dist_km   -5855.836515
4     closest_school_dist_km   -8218.372135
0    closest_station_dist_km  -14014.411617
11                  latitude -310762.508191


### 개선 시도

In [39]:
# Random forest on the same train/test split used for the linear model.
# No scaling step: the forest is trained on the raw features. A scaler fitted
# on the full X (train + test rows) would leak test-set statistics, and its
# output was never fed to the model, so that step is intentionally absent.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predict the held-out rows and compute the same metrics as the linear model
y_pred_rf = rf_model.predict(X_test)
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)

print(f"Random Forest - MSE: {mse_rf:.2f}, R2 Score: {r2_rf:.2f}")
print(f"Random Forest RMSE: {rmse_rf:.2f}")

Random Forest - MSE: 788283564.01, R2 Score: 0.88
Random Forest RMSE: 28076.39


### 다른 모델들

In [44]:
# 년도별 샘플링
def sample_by_year(df, sample_size=3000, random_state=42):
    """Draw up to ``sample_size`` random rows from each year present in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; must contain a 'year' column.
    sample_size : int, default 3000
        Maximum number of rows taken per year. A year with fewer rows
        contributes all of its rows.
    random_state : int, default 42
        Seed passed to ``DataFrame.sample`` for every year, so repeated calls
        on the same frame return the same rows.

    Returns
    -------
    pandas.DataFrame
        The per-year samples concatenated with a fresh 0..n-1 index. Years
        appear in the order ``df['year'].unique()`` reports them. When ``df``
        has no rows, an empty frame with the same columns is returned.

    Raises
    ------
    KeyError
        If ``df`` has no 'year' column.
    """
    samples = []
    for year in df['year'].unique():
        year_rows = df[df['year'] == year]
        # Cap at the rows actually available; sample() raises when n is
        # larger than the frame and replace=False.
        n_rows = min(sample_size, len(year_rows))
        samples.append(year_rows.sample(n=n_rows, random_state=random_state))

    if not samples:
        # pd.concat raises ValueError on an empty list; return an empty frame
        # that keeps the input's columns instead.
        return df.iloc[0:0].reset_index(drop=True)

    return pd.concat(samples, ignore_index=True)

# Draw the per-year sample from the selected frame
sampled_apt_df = sample_by_year(selected_apt_df)

# Row counts before and after sampling
print("원본 데이터 크기:", len(selected_apt_df))
print("샘플링된 데이터 크기:", len(sampled_apt_df))

# Number of sampled rows per year
print("\n년도별 샘플 수:")
print(sampled_apt_df['year'].value_counts())

# Target and predictors for the models trained below.
# NOTE(review): unlike the earlier split on sampled_df, 'year' is not dropped
# here and therefore stays in X as a feature - confirm this is intended.
y = sampled_apt_df['adjusted_price']
X = sampled_apt_df.drop(columns=['NO', 'adjusted_price'])

# train_test_split and model training then proceed as in the earlier cells

원본 데이터 크기: 251096
샘플링된 데이터 크기: 15000

년도별 샘플 수:
year
2019    3000
2020    3000
2021    3000
2022    3000
2023    3000
Name: count, dtype: int64


In [45]:
# 1. Re-draw the per-year sample (sample_by_year uses a fixed seed, so this
#    reproduces the same rows on every run)
sampled_apt_df = sample_by_year(selected_apt_df)

# 2. Predictors and target.
# NOTE(review): 'year' is kept as a feature here - confirm this is intended.
X = sampled_apt_df.drop(['NO', 'adjusted_price'], axis=1)
y = sampled_apt_df['adjusted_price']

# Hold out 20% of the rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Min-max scaling: the scaler is fitted on the training split only and then
# applied to the test split, so no test-set statistics reach training
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Candidate regressors, all with library-default hyperparameters
models = {
    'KNN': KNeighborsRegressor(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'DecisionTree': DecisionTreeRegressor(random_state=42),
    'RandomForest': RandomForestRegressor(random_state=42),
    'GradientBoosting': GradientBoostingRegressor(random_state=42),
    'SVM': SVR(),
}

# XGBoost is registered on a best-effort basis. Catch Exception rather than
# using a bare except so KeyboardInterrupt / SystemExit still propagate.
try:
    models['XGBoost'] = xgb.XGBRegressor(random_state=42)
except Exception:
    print("XGBoost 모델을 추가하지 못했습니다. xgboost 라이브러리를 설치해주세요.")

# Metrics per model name
results = {}

# Fit and evaluate every candidate on the scaled split.
# NOTE(review): this loop rebinds the notebook-level names model, y_pred, mse,
# rmse and r2; after this cell they refer to the last model in `models`.
for name, model in models.items():
    # Train
    model.fit(X_train_scaled, y_train)

    # Predict the held-out rows
    y_pred = model.predict(X_test_scaled)

    # Error metrics on the test split
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    # Store for the report below
    results[name] = {
        'MSE': mse,
        'RMSE': rmse,
        'R2 Score': r2
    }

# Report every model's metrics, two decimals each
for name, metrics in results.items():
    print(f"{name} 모델:")
    for metric, value in metrics.items():
        print(f"  {metric}: {value:.2f}")
    print()

KNN 모델:
  MSE: 1977557599.23
  RMSE: 44469.74
  R2 Score: 0.70

Ridge 모델:
  MSE: 2738217496.15
  RMSE: 52327.98
  R2 Score: 0.59

Lasso 모델:
  MSE: 2735159123.43
  RMSE: 52298.75
  R2 Score: 0.59

DecisionTree 모델:
  MSE: 1570751263.07
  RMSE: 39632.70
  R2 Score: 0.76

RandomForest 모델:
  MSE: 713418883.01
  RMSE: 26709.90
  R2 Score: 0.89

GradientBoosting 모델:
  MSE: 1096338843.31
  RMSE: 33111.01
  R2 Score: 0.84

SVM 모델:
  MSE: 7007868118.54
  RMSE: 83713.01
  R2 Score: -0.05

XGBoost 모델:
  MSE: 698529289.96
  RMSE: 26429.70
  R2 Score: 0.89

