# 1. Library & Seed Setting

In [61]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import os
import requests
import pickle
import lightgbm as lgb
import joblib
import re
import math

from hyperopt import fmin, tpe, hp, Trials
from hyperopt.pyll import scope
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import RobustScaler
from sklearn.cluster import KMeans
from tqdm import tqdm

plt.rcParams['font.family'] = 'NanumGothic'

In [2]:
def seed_setting(seed=1004) :
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

seed_setting()

In [3]:
bus_data = pd.read_csv("bus_feature.csv")
subway_data = pd.read_csv("subway_feature.csv")
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

  train_data = pd.read_csv("train.csv")


In [69]:
def Entire_Preprocessing(df) :
    # 문자열 컬럼만 찾아서 좌우 공백 제거
    df = df.apply(lambda col: col.str.strip() if col.dtype == "object" else col)

    # 전화번호, 팩스번호 k-홈페이지, 고용보험관리번호, k-등록일자, k-수정일자, 관리비 업로드, 단지소개기존clob 삭제
    df = df.drop(columns=['k-전화번호', 'k-팩스번호', 'k-홈페이지', '고용보험관리번호', 'k-등록일자', 'k-수정일자', '관리비 업로드', '단지소개기존clob'])

    # 본번, 부번, 시군구 삭제
    df = df.drop(columns=['본번', '부번', '시군구'])

    # 계약년월 분해
    df['계약(연)'] = df['계약년월'] // 100
    df['계약(월)'] = df['계약년월'] % 100
    df = df.drop(columns=['계약년월'])

    # 계약일 → 계약(일)
    df = df.rename(columns={"계약일" : "계약(일)"})

    # 불필요 특성 제거
    if 'target' in df.columns :
        columns_to_keep = [
        '전용면적(㎡)', '건축년도', '해제사유발생일', 'k-연면적', 'k-전용면적별세대현황(60㎡이하)',
        'k-전용면적별세대현황(60㎡~85㎡이하)', 'k-전체세대수', '주차대수', '계약(연)', 
        '좌표X', '좌표Y', '아파트명', '등기신청일자', 'k-복도유형', 'k-단지분류(아파트,주상복합등등)', '도로명', 'target'
        ]
    else :
        columns_to_keep = [
        '전용면적(㎡)', '건축년도', '해제사유발생일', 'k-연면적', 'k-전용면적별세대현황(60㎡이하)',
        'k-전용면적별세대현황(60㎡~85㎡이하)', 'k-전체세대수', '주차대수', '계약(연)', 
        '좌표X', '좌표Y', '아파트명', '등기신청일자', 'k-복도유형', 'k-단지분류(아파트,주상복합등등)', '도로명'
        ]

    df = df[columns_to_keep]

    # 해재사유발생일 전처리
    df['해제사유발생일'] = df['해제사유발생일'].notnull().astype(int)

    # 세대당_주차대수 특성 생성
    df['세대당_주차대수'] = df.apply(
    lambda row: row['주차대수'] / row['k-전체세대수'] if pd.notnull(row['주차대수']) and pd.notnull(row['k-전체세대수']) else np.nan,
    axis=1)

    # 등기신청일자 전처리
    df['등기신청일자'] = df['등기신청일자'].notnull().astype(int)

    # 특성 이름에서 k- 빼기
    df.columns = df.columns.str.replace('k-', '')

    # 군집화
    ## 카카오 API 호출 함수
    def get_coords_kakao(address, api_key):
        url = "https://dapi.kakao.com/v2/local/search/address.json"
        headers = {"Authorization": f"KakaoAK {api_key}"}
        params = {"query": address}
        response = requests.get(url, headers=headers, params=params)
        result = response.json()
        
        try:
            x = float(result['documents'][0]['x'])
            y = float(result['documents'][0]['y'])
            return x, y
        except IndexError:
            return None, None

    ## 도로명을 기반으로 좌표X와 좌표Y를 받아옴 (결측치에 한해서)
    def fill_missing_coords(row):
        if pd.isna(row['좌표X']) or pd.isna(row['좌표Y']):
            coords = roadname_to_coords.get(row['도로명'])
            if coords:
                return pd.Series(coords)
        return pd.Series([row['좌표X'], row['좌표Y']])
    
    roadname_to_coords = {}
    unique_roads = df.loc[df[['좌표X', '좌표Y']].isnull().any(axis=1), '도로명'].dropna().unique()

    api_key = '13b7b7a0b7a853100b56c56f19f6bc24'

    for road in tqdm(unique_roads) :
        x, y = get_coords_kakao(road, api_key)
        if x is not None and y is not None :
            roadname_to_coords[road] = (x, y)

    df[['좌표X', '좌표Y']] = df.apply(fill_missing_coords, axis=1)

    return df

In [5]:
df = Entire_Preprocessing(train_data)

100%|██████████| 8441/8441 [10:03<00:00, 13.99it/s]


In [6]:
df.head(5)

Unnamed: 0,전용면적(㎡),건축년도,해제사유발생일,연면적,전용면적별세대현황(60㎡이하),전용면적별세대현황(60㎡~85㎡이하),전체세대수,주차대수,계약(연),좌표X,좌표Y,아파트명,등기신청일자,복도유형,"단지분류(아파트,주상복합등등)",도로명,target,세대당_주차대수
0,79.97,1987,0,22637.0,20.0,250.0,270.0,262.0,2017,127.05721,37.476763,개포6차우성,1,계단식,아파트,언주로 3,124000,0.97037
1,79.97,1987,0,22637.0,20.0,250.0,270.0,262.0,2017,127.05721,37.476763,개포6차우성,1,계단식,아파트,언주로 3,123500,0.97037
2,54.98,1987,0,22637.0,20.0,250.0,270.0,262.0,2017,127.05721,37.476763,개포6차우성,1,계단식,아파트,언주로 3,91500,0.97037
3,79.97,1987,0,22637.0,20.0,250.0,270.0,262.0,2018,127.05721,37.476763,개포6차우성,1,계단식,아파트,언주로 3,130000,0.97037
4,79.97,1987,0,22637.0,20.0,250.0,270.0,262.0,2018,127.05721,37.476763,개포6차우성,1,계단식,아파트,언주로 3,117000,0.97037


In [7]:
df['좌표X'].isnull().sum()

22191

In [8]:
df = df.dropna(subset=['좌표X', '좌표Y'])

In [11]:
# 군집화
kmeans = joblib.load('kmeans_model.pkl')

df['cluster'] = kmeans.predict(df[['좌표X', '좌표Y']])
df = df.drop(columns=['좌표X', '좌표Y'])



In [12]:
df.head(3)

Unnamed: 0,전용면적(㎡),건축년도,해제사유발생일,연면적,전용면적별세대현황(60㎡이하),전용면적별세대현황(60㎡~85㎡이하),전체세대수,주차대수,계약(연),아파트명,등기신청일자,복도유형,"단지분류(아파트,주상복합등등)",도로명,target,세대당_주차대수,cluster
0,79.97,1987,0,22637.0,20.0,250.0,270.0,262.0,2017,개포6차우성,1,계단식,아파트,언주로 3,124000,0.97037,3
1,79.97,1987,0,22637.0,20.0,250.0,270.0,262.0,2017,개포6차우성,1,계단식,아파트,언주로 3,123500,0.97037,3
2,54.98,1987,0,22637.0,20.0,250.0,270.0,262.0,2017,개포6차우성,1,계단식,아파트,언주로 3,91500,0.97037,3


In [13]:
df.to_csv('preprocessed_data.csv', index=False)

# 2. Modeling

In [40]:
df = pd.read_csv('preprocessed_data.csv')

In [51]:
def clean_column_name(name):
    return re.sub(r'[^\uac00-\ud7a3a-zA-Z0-9_]', '_', name)

df.columns = [clean_column_name(col) for col in df.columns]

In [None]:
# object 타입 특성 변환
categorical_features = df.select_dtypes(include=['object']).columns

for col in categorical_features:
    df[col] = df[col].astype('category')

In [53]:
X = df.drop(columns=['target'])
y = np.log1p(df['target'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [54]:
X_train.head(1)

Unnamed: 0,전용면적___,건축년도,해제사유발생일,연면적,전용면적별세대현황_60_이하_,전용면적별세대현황_60__85_이하_,전체세대수,주차대수,계약_연_,아파트명,등기신청일자,복도유형,단지분류_아파트_주상복합등등_,도로명,세대당_주차대수,cluster
326068,74.2,1989,0,,,,,,2020,경남,1,,,섬밭로 265,,2


In [55]:
# 수치형 특성 스케일링(Data Leakage 방지 적용)
numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns

scaler = RobustScaler()
X_train[numeric_features] = scaler.fit_transform(X_train[numeric_features])
X_test[numeric_features] = scaler.transform(X_test[numeric_features])

In [56]:
model = LGBMRegressor()

param_space = {
    'n_estimators': scope.int(hp.quniform('n_estimators', 300, 3000, 10)),
    'learning_rate': hp.uniform('learning_rate', 0.001, 0.2),
    'num_leaves' : scope.int(hp.quniform('num_leaves', 2, 50, 1)),
    'max_depth': scope.int(hp.quniform('max_depth', 0, 40, 1)),
    'min_data_in_leaf' : scope.int(hp.quniform('min_data_in_leaf', 0, 50, 1)),
    'feature_fraction_bynode' : hp.uniform('feature_fraction_bynode', 0.001, 1.0),
    'bagging_fraction' : hp.uniform('bagging_fraction', 0.001, 1.0),
    'bagging_freq' : scope.int(hp.quniform('bagging_freq', 0, 30, 1)),
    'min_child_weight': hp.uniform('min_child_weight', 0, 10),
    'reg_alpha': hp.uniform('reg_alpha', 0, 1),
    'reg_lambda': hp.uniform('reg_lambda', 0, 1),
    'drop_rate' : hp.uniform('drop_rate', 0, 1)
}

In [59]:
def objective(params):
    lgb_model = lgb.LGBMRegressor(
        n_estimators=params['n_estimators'],
        learning_rate=params['learning_rate'],
        num_leaves=params['num_leaves'],
        max_depth=params['max_depth'],
        min_data_in_leaf=params['min_data_in_leaf'],
        feature_fraction_bynode=params['feature_fraction_bynode'],
        bagging_fraction=params['bagging_fraction'],
        bagging_freq=params['bagging_freq'],
        min_child_weight=params['min_child_weight'],
        reg_alpha=params['reg_alpha'],
        reg_lambda=params['reg_lambda'],
        drop_rate=params['drop_rate'],
    )
    
    lgb_model.fit(X_train, y_train)
    
    lgb_pred = lgb_model.predict(X_test)
    
    mse = mean_squared_error(y_test, lgb_pred)
    rmse = math.sqrt(mse)
    
    return rmse

In [62]:
trials = Trials()
best = fmin(fn=objective, space=param_space, algo=tpe.suggest, max_evals=50, trials=trials)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008014 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12619                    
[LightGBM] [Info] Number of data points in the train set: 877304, number of used features: 15
[LightGBM] [Info] Start training from score 10.744788 
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008929 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12619                                              
[LightGBM] [Info] Number of data points in the train set: 877304, number of used features: 15
[LightGBM] [Info] Start training from score 10.744788                           
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.

In [63]:
print(f"Best Hyperparameters: {best}")

Best Hyperparameters: {'bagging_fraction': 0.4427685182978106, 'bagging_freq': 0.0, 'drop_rate': 0.3733566819120723, 'feature_fraction_bynode': 0.4394248702328686, 'learning_rate': 0.18515107173944523, 'max_depth': 13.0, 'min_child_weight': 1.1962130427990936, 'min_data_in_leaf': 47.0, 'n_estimators': 2170.0, 'num_leaves': 45.0, 'reg_alpha': 0.5191578684886946, 'reg_lambda': 0.040577397414599314}


In [64]:
best_model = lgb.LGBMRegressor(
    n_estimators=int(best['n_estimators']),
    learning_rate=best['learning_rate'],
    num_leaves=int(best['num_leaves']),
    max_depth=int(best['max_depth']),
    min_data_in_leaf=int(best['min_data_in_leaf']),
    feature_fraction_bynode=best['feature_fraction_bynode'],
    bagging_fraction=best['bagging_fraction'],
    bagging_freq=int(best['bagging_freq']),
    min_child_weight=best['min_child_weight'],
    reg_alpha=best['reg_alpha'],
    reg_lambda=best['reg_lambda'],
    drop_rate=best['drop_rate'],
)

best_model.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011954 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12619
[LightGBM] [Info] Number of data points in the train set: 877304, number of used features: 15
[LightGBM] [Info] Start training from score 10.744788


In [66]:
y_pred = best_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = math.sqrt(mse)
rmse



0.09185924386043022

In [67]:
feature_importance = best_model.feature_importances_

# 특성 중요도를 데이터프레임으로 정리
importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': feature_importance
})

# 중요도가 높은 순으로 정렬
importance_df = importance_df.sort_values(by='Importance', ascending=False)
importance_df

Unnamed: 0,Feature,Importance
13,도로명,21654
8,계약_연_,20563
9,아파트명,20063
0,전용면적___,19912
1,건축년도,6475
15,cluster,3873
3,연면적,572
4,전용면적별세대현황_60_이하_,538
6,전체세대수,438
14,세대당_주차대수,429


# 3. Submission

In [70]:
test = Entire_Preprocessing(test_data)

  0%|          | 0/2052 [00:00<?, ?it/s]

100%|██████████| 2052/2052 [02:29<00:00, 13.71it/s]


In [None]:
test['좌표X'].isna()

Unnamed: 0,전용면적___,건축년도,해제사유발생일,연면적,전용면적별세대현황_60_이하_,전용면적별세대현황_60__85_이하_,전체세대수,주차대수,계약_연_,좌표X,좌표Y,아파트명,등기신청일자,복도유형,단지분류_아파트_주상복합등등_,도로명,세대당_주차대수
0,79.97,1987,0,22637.0,20.0,250.0,270.0,262.0,2023,127.05721,37.476763,개포6차우성,1,계단식,아파트,언주로 3,0.97037


In [None]:
coord_mask = test[['좌표X', '좌표Y']].notna().all(axis=1)

test['cluster'] = 'Nan'

test['cluster'] = kmeans.predict(test[['좌표X', '좌표Y']])



ValueError: Input X contains NaN.
KMeans does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [89]:
def test_preprocessing(df) :
    df.columns = [clean_column_name(col) for col in df.columns]
    categorical_features = df.select_dtypes(include=['object']).columns

    for col in categorical_features:
        df[col] = df[col].astype('category')
        
    df[numeric_features] = scaler.transform(df[numeric_features])
    return df

In [88]:
test = test_preprocessing(test)



ValueError: Input X contains NaN.
KMeans does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [79]:
test.head(1)

Unnamed: 0,전용면적(㎡),건축년도,해제사유발생일,연면적,전용면적별세대현황(60㎡이하),전용면적별세대현황(60㎡~85㎡이하),전체세대수,주차대수,계약(연),좌표X,좌표Y,아파트명,등기신청일자,복도유형,"단지분류(아파트,주상복합등등)",도로명,세대당_주차대수
0,79.97,1987,0,22637.0,20.0,250.0,270.0,262.0,2023,127.05721,37.476763,개포6차우성,1,계단식,아파트,언주로 3,0.97037


In [71]:
pred = best_model.predict(test)

ValueError: Number of features of the model must match the input. Model n_features_ is 16 and input n_features is 17