# 한글 폰트 설치

In [None]:
!sudo apt-get install -y fonts-nanum
!sudo fc-cache -fv
!rm ~/.cache/matplotlib -rf

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  fonts-nanum
0 upgraded, 1 newly installed, 0 to remove and 38 not upgraded.
Need to get 10.3 MB of archives.
After this operation, 34.1 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 fonts-nanum all 20200506-1 [10.3 MB]
Fetched 10.3 MB in 2s (5,517 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 78, <> line 1.)
debconf: falling back to frontend: Readline
debconf: unable to initialize frontend: Readline
debconf: (This frontend requires a controlling tty.)
debconf: falling back to frontend: Teletype
dpkg-preconfigure: unable to re-open stdin: 
Selecting previously unselected package fonts-nanum.
(Reading database ... 126675 files and dire

In [None]:
import matplotlib.pyplot as plt

# Use the NanumBarunGothic family for all text and keep the ASCII hyphen for
# negative tick labels (unicode_minus disabled).
plt.rcParams.update({
    'font.family': 'NanumBarunGothic',
    'axes.unicode_minus': False,
})

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 데이터 불러오기

In [None]:
# Read the merged dataset from the Excel workbook and preview the first rows.
# NOTE(review): the preview output contains an 'Unnamed: 0' column, which looks
# like a saved index — confirm whether it should be dropped on load.
df = pd.read_excel(io='/content/AllMerge_Semi_Final.xlsx')
df.head()

Unnamed: 0,TA_YM,ENCODED_MCT,YEAR,MONTH,MCT_UE_CLN_NEW_RAT,MCT_UE_CLN_REU_RAT,RC_M1_SHC_FLP_UE_CLN_RAT,MCT_OPE_MS_CN,RC_M1_SAA,RC_M1_TO_UE_CT,...,내국인지출액비율,외국인방문자수,외국인지출액비율,Y_win3_ge1,Y_win3_ge2,Y_win4_ge1,Y_win4_ge2,Y_win1_ge1,Y_win6_ge1,Y_win1_ge1_cum_na_keep
0,202301,000F03E44A,2023,1,100.0,0.0,100.0,5,6,5,...,8.0,0.1,8.05,,,,,,,
1,202302,000F03E44A,2023,2,0.0,0.0,100.0,5,6,6,...,8.85,0.1156,7.85,,,,,0.0,,0.0
2,202303,000F03E44A,2023,3,0.0,0.0,100.0,5,6,6,...,9.0,0.13975,8.3,,,,,0.0,,0.0
3,202304,000F03E44A,2023,4,50.0,0.0,0.0,5,6,5,...,9.85,0.14825,10.0,0.0,0.0,,,0.0,,0.0
4,202305,000F03E44A,2023,5,33.33,0.0,100.0,5,6,5,...,9.15,0.13155,8.45,0.0,0.0,0.0,0.0,0.0,,0.0


# 변수 변환

In [None]:
# Convert the YYYYMM period key to a datetime column so the time-series
# ordering is reflected in the dtype.
df['TA_YM'] = df['TA_YM'].astype(str).pipe(pd.to_datetime, format='%Y%m')

In [None]:
# Work on an independent (deep) copy so the loaded frame stays untouched.
df_encoded = df.copy(deep=True)

In [None]:
# One-hot encode the categorical columns.
# Only 'CATEGORY' is encoded here; '상권변환' was left disabled in this list.
# NOTE(review): later cells read '상권변환_*' indicator columns, so those are
# presumably already present in the loaded file — confirm.
categorical_cols = ['CATEGORY']

# Keep every dummy level (no reference level dropped) and store them as ints.
df_encoded = pd.get_dummies(
    data=df,
    columns=categorical_cols,
    drop_first=False,
    dtype=int,
)
df_encoded

In [None]:
# List the column labels of the encoded frame (includes the new dummy columns).
df_encoded.columns

# 이상치 처리 (IQR 경계값으로 대체)

In [None]:
import numpy as np

# First and third quartiles of '임대료' and the resulting inter-quartile range.
Q1, Q3 = df_encoded['임대료'].quantile([0.25, 0.75])
IQR = Q3 - Q1

# Tukey fences: 1.5 * IQR beyond each quartile.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

print(f"IQR 기반 '임대료'의 이상치 상한선: {upper_bound:.2f}")
print(f"IQR 기반 '임대료'의 이상치 하한선: {lower_bound:.2f}")

# Values outside the fences are replaced by the nearest fence (rows are kept).
df_encoded['임대료'] = df_encoded['임대료'].clip(lower=lower_bound, upper=upper_bound)

# y target 설정

In [None]:
# Build the 3-rows-ahead target: each row receives the Y_win3_ge1 value found
# three rows later for the same merchant (ENCODED_MCT).
#
# The shift is computed from the untouched `df` column rather than from
# `df_encoded` itself, so re-running this cell does not shift the target a
# second time. Rows are sorted by merchant and period first so the shift does
# not depend on the row order of the source file; the result is aligned back
# to `df_encoded` through the shared index.
# NOTE(review): a 3-row shift equals a 3-month shift only if every merchant has
# one row per consecutive month — confirm there are no gaps.
df_encoded['Y_win3_ge1'] = (
    df.sort_values(['ENCODED_MCT', 'TA_YM'])
      .groupby('ENCODED_MCT')['Y_win3_ge1']
      .shift(-3)
)

In [None]:
# optuna 설치
!pip install optuna

Collecting optuna
  Downloading optuna-4.5.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.5.0-py3-none-any.whl (400 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/400.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.9/400.9 kB[0m [31m28.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.9.0 optuna-4.5.0


In [None]:
import optuna
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score, KFold

def objective(trial):
    """Optuna objective: mean ROC-AUC of an LGBMClassifier over 5 shuffled folds.

    Parameters
    ----------
    trial : optuna trial object used to sample the hyperparameters below.

    Returns
    -------
    Mean of the per-fold ``roc_auc`` scores returned by ``cross_val_score``.

    NOTE(review): ``X_train_full`` and ``y_train_full`` are read from the
    notebook's global namespace and are only assigned in a later cell, so this
    function raises NameError on a fresh kernel until that cell has run.
    NOTE(review): the folds come from a plain shuffled KFold, which does not
    group by merchant or respect time order — confirm this is intended.
    """
    # Hyperparameters sampled per trial (sampling order kept as-is).
    sampled = {
        "n_estimators": trial.suggest_int("n_estimators", 300, 1500),
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.3, log=True),
        "max_depth": trial.suggest_int("max_depth", 4, 20),
        "num_leaves": trial.suggest_int("num_leaves", 20, 80),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
    }
    # Settings held constant across trials.
    fixed = {"random_state": 42, "verbosity": -1}

    classifier = LGBMClassifier(**sampled, **fixed)
    splitter = KFold(n_splits=5, shuffle=True, random_state=42)
    fold_aucs = cross_val_score(
        classifier, X_train_full, y_train_full, cv=splitter, scoring="roc_auc"
    )
    return fold_aucs.mean()

# Run the hyperparameter search, maximizing the objective's mean ROC-AUC.
# The sampler is seeded so repeated runs propose the same sequence of trials
# (the model and the CV splitter inside the objective already use seed 42).
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=40)

print("Best params:", study.best_params)

# LGBM threshold

## 상권별 train/test shape 확인

In [None]:
# One-hot region indicator columns; each is inspected separately below.
region_cols = ['상권변환_금호','상권변환_도선동', '상권변환_뚝섬', '상권변환_마장동',
               '상권변환_성수', '상권변환_왕십리', '상권변환_용답동']

# Row counts per region: rows with a target (train) vs. rows without (test).
train_shape = {}
test_shape = {}
for region in region_cols:
    print(f"\n\n=== {region} ===")
    df_region = df_encoded[df_encoded[region]==1].copy()

    # Rows with a missing Y_win3_ge1 are treated as the test set.
    test_mask = df_region['Y_win3_ge1'].isnull()
    test = df_region[test_mask].copy()
    train_full = df_region[~test_mask].copy()

    train_shape[region] = train_full.shape[0]
    test_shape[region] = test.shape[0]

    total = train_full.shape[0] + test.shape[0]
    if total == 0:
        # A region without any rows has no meaningful ratio; avoid dividing by zero.
        print(f"Train data shape: {train_full.shape}", "(n/a)")
        print(f"Test data shape: {test.shape}", "(n/a)")
        continue

    train_ratio = train_full.shape[0] / total
    test_ratio = test.shape[0] / total

    print(f"Train data shape: {train_full.shape}", f"({train_ratio*100:.2f}%)")
    print(f"Test data shape: {test.shape}", f"({test_ratio*100:.2f}%)")

# Totals across all regions
total_train = sum(train_shape.values())
total_test = sum(test_shape.values())
total_rows = total_train + total_test
train_ratio = total_train / total_rows if total_rows else float('nan')
test_ratio = total_test / total_rows if total_rows else float('nan')

# Percentages are formatted at print time; rounding the ratio first and then
# multiplying by 100 can print float artifacts such as 56.99999999999999.
print("\n=== 최종 shape 확인 ===")
print(f"총 Train rows: {total_train} ({train_ratio*100:.2f}%)")
print(f"총 Test rows: {total_test} ({test_ratio*100:.2f}%)")
print("총 Train+Test rows:", total_rows)
print("원래 데이터셋 rows:", df_encoded.shape[0])


## 모델링

In [None]:
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier, early_stopping, log_evaluation
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import random

# Korean font setup: resolve the family name from the NanumGothic TTF file and
# make it the default font; keep the ASCII hyphen for negative tick labels.
font_path = '/usr/share/fonts/truetype/nanum/NanumGothic.ttf'
fontprop = fm.FontProperties(fname=font_path)
plt.rcParams.update({
    'font.family': fontprop.get_name(),
    'axes.unicode_minus': False,
})

# One-hot region indicator columns; one model is trained per region.
region_cols = [
    '상권변환_금호',
    '상권변환_도선동',
    '상권변환_뚝섬',
    '상권변환_마장동',
    '상권변환_성수',
    '상권변환_왕십리',
    '상권변환_용답동',
]

# Columns carried through the modelling step, in model input order,
# followed by the two identifier columns that are excluded from `features`.
col = [
    'YEAR', 'MONTH',
    'MCT_UE_CLN_NEW_RAT', 'MCT_UE_CLN_REU_RAT', 'RC_M1_SHC_FLP_UE_CLN_RAT',
    'MCT_OPE_MS_CN', 'RC_M1_AV_NP_AT',
    'M12_SME_BZN_SAA_PCE_RT', 'M12_SME_BZN_ME_MCT_RAT',
    'AGE_102030_RAT',
    '생활물가지수', '현재생활형편(서울)', '소비지출전망CSI(서울)', '실업률(서울)',
    'product_price', '개업률', '폐업률', '유동인구수', '임대료',
    '단기체류외국인', '외국인방문자수',
    *region_cols,
    'CATEGORY_세계음식', 'CATEGORY_식료품/식자재', 'CATEGORY_주점/유흥',
    'CATEGORY_카페/디저트', 'CATEGORY_한식',
    'ENCODED_MCT', 'TA_YM',
]

# Columns that are passed through MinMaxScaler before training.
numerical_cols = [
    'MCT_UE_CLN_NEW_RAT', 'MCT_UE_CLN_REU_RAT', 'RC_M1_SHC_FLP_UE_CLN_RAT',
    'MCT_OPE_MS_CN',
    'M12_SME_BZN_SAA_PCE_RT', 'M12_SME_BZN_ME_MCT_RAT',
    'AGE_102030_RAT',
    '생활물가지수', '현재생활형편(서울)', '소비지출전망CSI(서울)', '실업률(서울)',
    'product_price', '개업률', '폐업률', '유동인구수', '임대료',
    '단기체류외국인', '외국인방문자수',
]

# Identifier columns are kept out of the model inputs, as is the target.
drop_cols = ['ENCODED_MCT', 'TA_YM']
features = [name for name in col if name not in (*drop_cols, 'Y_win3_ge1')]

# Accumulators filled by the per-region loop.
all_metrics, all_results, region_importances = {}, {}, {}
all_test_results = []  # per-region test prediction frames

# Hyperparameters shared by every cross-validation model and by the final
# per-region model (previously written out twice).
lgbm_params = dict(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=18,
    num_leaves=43,
    subsample=0.7,
    colsample_bytree=0.95,
    reg_lambda=0.34,
    reg_alpha=0.1,
    min_child_samples=59,
    class_weight='balanced',
    verbosity=-1,
)

# Columns copied from the test rows into each per-region prediction frame.
test_result_cols = ['ENCODED_MCT', *region_cols, 'YEAR', 'MONTH', 'MCT_ME_D']

n_external_folds = 10  # number of merchant-level folds
# Rolling CV with a gap: 9 training periods, 3 skipped periods, 1 validation period.
train_window, gap_window, val_window = 9, 3, 1

# 1. Per-region loop
for region in region_cols:
    print(f"\n\n=== {region} 모델 학습 시작 ===")
    df_region = df_encoded[df_encoded[region]==1].copy()

    # Rows with a missing Y_win3_ge1 are the rows to predict ("test").
    test_mask = df_region['Y_win3_ge1'].isnull()
    test = df_region[test_mask].copy()
    train_full = df_region[~test_mask].copy()

    # Shuffle merchant ids with a fixed seed and cut them into equal folds;
    # leftover ids (when the count is not divisible) go to the last fold.
    rng = np.random.default_rng(42)
    unique_ids = train_full['ENCODED_MCT'].unique()
    rng.shuffle(unique_ids)
    fold_size = len(unique_ids) // n_external_folds
    external_folds = [unique_ids[i*fold_size:(i+1)*fold_size] for i in range(n_external_folds)]
    external_folds[-1] = np.concatenate([external_folds[-1], unique_ids[n_external_folds*fold_size:]])

    aucs_external, f1s_external, accs_external = [], [], []
    fold_importances = []
    fold_thresholds = []  # mean best threshold of each external fold

    # 2. External CV (per region, split by merchant)
    for ext_fold_idx in range(n_external_folds):
        print(f"\n--- External Fold {ext_fold_idx+1} ---")

        valid_ids = external_folds[ext_fold_idx]
        train_ids = np.concatenate([external_folds[i] for i in range(n_external_folds) if i != ext_fold_idx])

        train = train_full[train_full['ENCODED_MCT'].isin(train_ids)].copy()
        valid = train_full[train_full['ENCODED_MCT'].isin(valid_ids)].copy()

        # Integer YYYYMM key used to order periods for the rolling split.
        train['YEAR_MONTH_INT'] = train['YEAR']*100 + train['MONTH']
        valid['YEAR_MONTH_INT'] = valid['YEAR']*100 + valid['MONTH']

        X_train, y_train = train[features].copy(), train['Y_win3_ge1']
        X_valid, y_valid = valid[features].copy(), valid['Y_win3_ge1']

        # Min-max scaling, fitted on the training merchants only.
        # NOTE(review): X_valid / y_valid are prepared here but never scored
        # below; every reported metric comes from the internal rolling split
        # of the training merchants — confirm this is intended.
        scaler = MinMaxScaler()
        X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
        X_valid[numerical_cols] = scaler.transform(X_valid[numerical_cols])

        train_df = X_train.copy()
        train_df['Y_win3_ge1'] = y_train
        train_df['YEAR_MONTH_INT'] = train['YEAR_MONTH_INT']
        periods = sorted(train_df['YEAR_MONTH_INT'].unique())
        n_periods = len(periods)

        internal_aucs, internal_f1s, internal_accs = [], [], []
        thresholds_internal = []

        start_idx = 0

        # 3. Internal CV (rolling window + gap), advancing one period per step
        while start_idx + train_window + gap_window + val_window <= n_periods:
            train_periods = periods[start_idx:start_idx+train_window]
            val_periods = periods[start_idx+train_window+gap_window:start_idx+train_window+gap_window+val_window]

            train_idx = train_df[train_df['YEAR_MONTH_INT'].isin(train_periods)].index
            val_idx = train_df[train_df['YEAR_MONTH_INT'].isin(val_periods)].index

            X_tr, y_tr = X_train.loc[train_idx], y_train.loc[train_idx]
            X_val, y_val = X_train.loc[val_idx], y_train.loc[val_idx]

            # NOTE(review): the same validation window drives early stopping,
            # threshold selection and the reported metrics, so those metrics
            # are likely optimistic — confirm this is acceptable.
            model = LGBMClassifier(**lgbm_params)
            model.fit(
                X_tr, y_tr,
                eval_set=[(X_val, y_val)],
                eval_metric='auc',
                callbacks=[early_stopping(stopping_rounds=50), log_evaluation(period=0)]
            )
            fold_importances.append(model.feature_importances_)

            y_pred_proba = model.predict_proba(X_val)[:,1]

            # Threshold tuning: pick the cut-off in [0.3, 0.6] with the best F1;
            # 0.5 is kept when no candidate yields a positive F1.
            best_f1, best_thresh = 0, 0.5
            for t in np.linspace(0.3, 0.6, 31):
                y_pred_t = (y_pred_proba > t).astype(int)
                f1 = f1_score(y_val, y_pred_t)
                if f1 > best_f1:
                    best_f1, best_thresh = f1, t
            thresholds_internal.append(best_thresh)

            y_pred_label = (y_pred_proba > best_thresh).astype(int)
            internal_aucs.append(roc_auc_score(y_val, y_pred_proba))
            internal_f1s.append(f1_score(y_val, y_pred_label))
            internal_accs.append(accuracy_score(y_val, y_pred_label))

            start_idx += 1

        # Each external fold contributes the mean of its internal splits.
        fold_thresholds.append(np.mean(thresholds_internal))
        aucs_external.append(np.mean(internal_aucs))
        f1s_external.append(np.mean(internal_f1s))
        accs_external.append(np.mean(internal_accs))

    region_best_threshold = np.mean(fold_thresholds)
    print(f"\n[{region}] 최적 threshold = {region_best_threshold:.3f}")

    all_metrics[region] = {
        "AUC": np.mean(aucs_external),
        "F1": np.mean(f1s_external),
        "Acc": np.mean(accs_external),
        "Threshold": region_best_threshold
    }

    print(f"{region} 성능: AUC={np.mean(aucs_external):.4f}, F1={np.mean(f1s_external):.4f}, Acc={np.mean(accs_external):.4f}")

    # Feature importance chart: mean over every model fitted for this region
    mean_importances = np.mean(fold_importances, axis=0)
    importance_df = pd.DataFrame({'feature': features, 'importance': mean_importances}).sort_values(by='importance', ascending=False)
    region_importances[region] = importance_df

    plt.figure(figsize=(8,6))
    plt.barh(importance_df['feature'][:10][::-1], importance_df['importance'][:10][::-1])
    plt.title(f"Top 10 Feature Importances - {region}")
    plt.xlabel("Importance")
    plt.ylabel("Feature")
    plt.show()

    # Predict the unlabeled (test) rows of this region
    if test.shape[0] > 0:
        X_train_full, y_train_full = train_full[features].copy(), train_full['Y_win3_ge1']
        X_test = test[features].copy()
        scaler = MinMaxScaler()
        X_train_full[numerical_cols] = scaler.fit_transform(X_train_full[numerical_cols])
        X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

        # Final model trained on all labeled rows of the region (no early stopping)
        final_model = LGBMClassifier(**lgbm_params)
        final_model.fit(X_train_full, y_train_full)

        # Label with the region's averaged CV threshold
        y_test_pred_proba = final_model.predict_proba(X_test)[:,1]
        y_test_pred_label = (y_test_pred_proba > region_best_threshold).astype(int)

        df_test_result = test[test_result_cols].copy()
        df_test_result['Y_pred_proba'] = y_test_pred_proba
        df_test_result['Y_pred_label'] = y_test_pred_label

        all_test_results.append(df_test_result)

# Combined test predictions of all regions, built once after the loop.
# An empty frame with the expected columns is used when no region had test
# rows, so `df_test_all` is always defined for the following cells.
if all_test_results:
    df_test_all = pd.concat(all_test_results, axis=0).reset_index(drop=True)
else:
    df_test_all = pd.DataFrame(columns=[*test_result_cols, 'Y_pred_proba', 'Y_pred_label'])

# Per-region summary of the cross-validated metrics and the chosen threshold.
print("\n=== 전체 상권별 성능 요약 ===")
for region_name, scores in all_metrics.items():
    summary_line = f"{region_name}: AUC={scores['AUC']:.4f}, F1={scores['F1']:.4f}, Acc={scores['Acc']:.4f}, Th={scores['Threshold']:.3f}"
    print(summary_line)

In [None]:
# Display the concatenated per-region test predictions.
df_test_all

## 결과 확인

In [None]:
# Work on an independent (deep) copy of the combined predictions.
# Note: this reuses the name of the per-region frame built inside the modelling loop.
df_test_result = df_test_all.copy(deep=True)

In [None]:
# One-hot region indicator columns, in lookup order.
region_cols = [
    '상권변환_금호', '상권변환_도선동', '상권변환_뚝섬', '상권변환_마장동',
    '상권변환_성수', '상권변환_왕십리', '상권변환_용답동',
]

def extract_region(row):
    """Return the region name of the first indicator column equal to 1.

    ``row`` is indexed by the names in ``region_cols`` (e.g. a DataFrame row).
    The returned name is the part of the column label after the first '_'.
    Returns ``None`` when no indicator equals 1.
    """
    matching = (label for label in region_cols if row[label] == 1)
    first_match = next(matching, None)
    if first_match is None:
        return None
    return first_match.split('_')[1]

# Collapse the one-hot region flags of each row into a single '상권' name column.
df_test_result['상권'] = df_test_result.apply(lambda row: extract_region(row), axis=1)

In [None]:
# Remove the one-hot region flags (now summarised by '상권') ...
df_test_result.drop(region_cols, axis=1, inplace=True)
# ... and the 'MCT_ME_D' column.
# NOTE(review): both drops are in place, so re-running this cell raises KeyError.
df_test_result.drop(['MCT_ME_D'], axis='columns', inplace=True)

In [None]:
# Four-level risk classification
def assign_risk_label(count):
    """Map a count of predicted-positive rows to one of four risk labels.

    3 or more -> '위험', exactly 2 -> '위기', exactly 1 -> '경고',
    anything else (including 0, negatives, fractions and NaN) -> '정상'.
    """
    if count >= 3:
        return '위험'
    # Exact matches only; every other value falls back to the default label.
    return {2: '위기', 1: '경고'}.get(count, '정상')

In [None]:
# Per merchant: number of rows predicted positive and the merchant's region.
# 'MCT_ME_D' is deliberately not aggregated: the column is dropped from
# df_test_result in an earlier cell and is not part of the final output, so
# listing it in the agg spec raised KeyError on a fresh top-to-bottom run.
result = (
    df_test_result
    .groupby('ENCODED_MCT', as_index=False)
    .agg({
        'Y_pred_label': 'sum',
        '상권': 'first',
    })
)

# Risk label derived from the positive-prediction count
result['risk_label'] = result['Y_pred_label'].apply(assign_risk_label)
result = result[['ENCODED_MCT','Y_pred_label', 'risk_label', '상권']]
result