# 뷰티 + 음식 sns 언급 빈도에 따른 지역 소비금액 성장/하락률

In [308]:
import numpy as np
import pandas as pd
import glob

In [309]:
# Load the K-food SNS attraction CSVs.
# glob returns matches in arbitrary order, so the paths are sorted to make the
# row order of the concatenated frame reproducible across runs and machines.
path_food = "./data/EC_SNS_KFOOD_ATTRACTION_DATA_*.csv"
files_food = sorted(glob.glob(path_food))
if not files_food:
    # Without this check pd.concat([]) fails with "No objects to concatenate",
    # which does not say which input is missing.
    raise FileNotFoundError(f"No CSV files match pattern: {path_food}")
df_food = [pd.read_csv(f, encoding='utf-8') for f in files_food]
sns_kfood = pd.concat(df_food, ignore_index=True)

# Load the K-beauty SNS attraction CSVs (same handling as above).
path_beauty = "./data/EC_SNS_KBEAUTY_ATTRACTION_DATA_*.csv"
files_beauty = sorted(glob.glob(path_beauty))
if not files_beauty:
    raise FileNotFoundError(f"No CSV files match pattern: {path_beauty}")
df_beauty = [pd.read_csv(f, encoding='utf-8') for f in files_beauty]
sns_beauty = pd.concat(df_beauty, ignore_index=True)

# Stack both sources into a single frame with a fresh 0..n-1 index.
full_df = pd.concat([sns_kfood, sns_beauty], ignore_index=True)

In [310]:
# Keep only the rows whose channel name is '채널전체', then replace every
# missing value in the frame with 0.
is_all_channels = full_df['CHNNEL_NM'] == '채널전체'
full_df = full_df[is_all_channels].copy().fillna(0)

# Drop columns whose name starts with "Unnamed".
unnamed_mask = full_df.columns.astype(str).str.startswith("Unnamed")
full_df = full_df.loc[:, ~unnamed_mask]

# Drop rows with an unusable district name: "0" (presumably a missing name that
# the fillna(0) above turned into 0) and "KR".
district_name = full_df["SIGNGU_NM"].astype(str)
full_df = full_df[~district_name.isin(["0", "KR"])]

# Columns used downstream.
use_cols = [
    'BASE_YM',
    'CTPRVN_NM',
    'SIGNGU_NM',
    'BASE_YEAR_ACCMLT_FQ_CO',   # cumulative frequency for the base year
    'TURSM_CSTMR_CO',           # number of tourists
    'TURSM_SPND_PRICE',         # tourism spending amount (total)
    'AVRG_SCORE_VALUE',         # average rating value
    'REVIEW_CO',                # number of reviews
]
df_sub = full_df[use_cols].copy()

# Type conversion: BASE_YM is parsed with the '%Y%m' format into a datetime.
df_sub['BASE_YM'] = pd.to_datetime(df_sub['BASE_YM'], format='%Y%m')
df_sub = df_sub.fillna(0)

# Force the metric columns to numeric; values that cannot be parsed become NaN.
num_cols = [
    'TURSM_CSTMR_CO',
    'TURSM_SPND_PRICE',
    'AVRG_SCORE_VALUE',
    'REVIEW_CO',
    'BASE_YEAR_ACCMLT_FQ_CO',
]
df_sub = df_sub.assign(
    **{name: pd.to_numeric(df_sub[name], errors='coerce') for name in num_cols}
)


In [311]:
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

# Collapse the data to one row per (month, province, district).
region_month_keys = ['BASE_YM', 'CTPRVN_NM', 'SIGNGU_NM']
aggregations = {
    'TURSM_SPND_PRICE': 'mean',          # tourism spending amount
    'TURSM_CSTMR_CO': 'mean',            # number of tourists
    'AVRG_SCORE_VALUE': 'mean',          # average rating value
    'BASE_YEAR_ACCMLT_FQ_CO': 'sum',     # SNS mention frequency for the base month
    'REVIEW_CO': 'sum',                  # number of reviews
}
final_df = df_sub.groupby(region_month_keys).agg(aggregations).reset_index()

# Order rows by province, district, then month, and renumber the index 0..n-1.
final_df = final_df.sort_values(
    ["CTPRVN_NM", "SIGNGU_NM", "BASE_YM"],
    ignore_index=True,
)

In [312]:
# final_df.to_csv('02101236.csv')

In [313]:
# Grouping keys (one group per province + district) and the raw metrics that
# the derived features are built from.
group_cols = ["CTPRVN_NM", "SIGNGU_NM"]
base_features = ['TURSM_SPND_PRICE', 'TURSM_CSTMR_CO', 'AVRG_SCORE_VALUE', 'BASE_YEAR_ACCMLT_FQ_CO', 'REVIEW_CO']


def _mean_of_previous_rows(series):
    """Return the mean of up to three rows preceding each row of *series*.

    The current row is excluded (shift by one) and at least one prior row is
    required, so the first row of a series yields NaN.
    """
    return series.shift(1).rolling(3, min_periods=1).mean()


# Lagged values: the value one row (_L1) and two rows (_L2) earlier within the
# same region. shift() moves by row position, so this relies on rows being
# ordered by BASE_YM within each region.
# NOTE(review): a row equals "one month" only if no month is missing for a
# region - confirm against the data.
for col in base_features:
    for lag in (1, 2):
        final_df[f"{col}_L{lag}"] = final_df.groupby(group_cols)[col].shift(lag)

# Rate of change versus the previous row. A previous value of 0 is turned into
# NaN first, so the rate becomes NaN instead of dividing by zero.
for col in base_features:
    prev_value = final_df[f"{col}_L1"]
    change = final_df[col].sub(prev_value)
    final_df[f"{col}_MOM_RATE"] = change.div(prev_value.replace(0, np.nan))

# Trend feature: rolling mean over the up-to-three preceding rows per region.
for col in base_features:
    final_df[f"{col}_ROLL3"] = final_df.groupby(group_cols)[col].transform(_mean_of_previous_rows)

# Normalise +/- infinity to NaN.
final_df = final_df.replace([np.inf, -np.inf], np.nan)

# Next row's spending amount within the same region (NaN on a region's last row).
# Note: despite the column name, this is built from TURSM_SPND_PRICE.
final_df["CSTMR_NEXT"] = final_df.groupby(group_cols)["TURSM_SPND_PRICE"].shift(-1)

# Label: 1 if the next spending amount is higher than the current one, else 0.
# Rows without a next value get NaN so they can be excluded from training.
has_next = final_df["CSTMR_NEXT"].notna()
spend_rises = final_df["CSTMR_NEXT"].gt(final_df["TURSM_SPND_PRICE"]).astype(int)
final_df["NEXT_GROWTH_LABEL"] = spend_rises.where(has_next)

In [314]:
# Preview the feature columns: everything except the row identifiers and the
# two target-related columns.
non_feature_cols = ['BASE_YM', 'CTPRVN_NM', 'SIGNGU_NM', 'CSTMR_NEXT', 'NEXT_GROWTH_LABEL']
fc = final_df.columns.drop(non_feature_cols)
fc.to_list()  # return value is not stored; fc itself stays an Index
final_df[fc]

Unnamed: 0,TURSM_SPND_PRICE,TURSM_CSTMR_CO,AVRG_SCORE_VALUE,BASE_YEAR_ACCMLT_FQ_CO,REVIEW_CO,TURSM_SPND_PRICE_L1,TURSM_SPND_PRICE_L2,TURSM_CSTMR_CO_L1,TURSM_CSTMR_CO_L2,AVRG_SCORE_VALUE_L1,...,TURSM_SPND_PRICE_MOM_RATE,TURSM_CSTMR_CO_MOM_RATE,AVRG_SCORE_VALUE_MOM_RATE,BASE_YEAR_ACCMLT_FQ_CO_MOM_RATE,REVIEW_CO_MOM_RATE,TURSM_SPND_PRICE_ROLL3,TURSM_CSTMR_CO_ROLL3,AVRG_SCORE_VALUE_ROLL3,BASE_YEAR_ACCMLT_FQ_CO_ROLL3,REVIEW_CO_ROLL3
0,4418666.0,1177865.0,4.2,8.0,100.0,,,,,,...,,,,,,,,,,
1,14905735.0,2303350.0,3.8,10.0,123.0,,,,,,...,,,,,,,,,,
2,15084889.0,2561433.0,3.8,25.0,123.0,14905735.0,,2303350.0,,3.8,...,0.012019,0.112047,0.0,1.500000,0.000000,1.490574e+07,2.303350e+06,3.8,10.000000,123.000000
3,14972365.0,2291044.0,3.8,25.0,127.0,15084889.0,14905735.0,2561433.0,2303350.0,3.8,...,-0.007459,-0.105562,0.0,0.000000,0.032520,1.499531e+07,2.432392e+06,3.8,17.500000,123.000000
4,19961621.0,3197509.0,3.8,25.0,128.0,14972365.0,15084889.0,2291044.0,2561433.0,3.8,...,0.333231,0.395656,0.0,0.000000,0.007874,1.498766e+07,2.385276e+06,3.8,20.000000,124.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
851,11545845.0,2439528.0,4.4,3.0,2702.0,,,,,,...,,,,,,,,,,
852,398731.0,2489592.0,4.4,4.0,2728.0,11545845.0,,2439528.0,,4.4,...,-0.965465,0.020522,0.0,0.333333,0.009623,1.154584e+07,2.439528e+06,4.4,3.000000,2702.000000
853,59893391.0,2657098.0,4.4,4.0,2750.0,398731.0,11545845.0,2489592.0,2439528.0,4.4,...,149.210019,0.067283,0.0,0.000000,0.008065,5.972288e+06,2.464560e+06,4.4,3.500000,2715.000000
854,59698360.0,2734144.0,4.4,4.0,2765.0,59893391.0,398731.0,2657098.0,2489592.0,4.4,...,-0.003256,0.028996,0.0,0.000000,0.005455,2.394599e+07,2.528739e+06,4.4,3.666667,2726.666667


In [315]:

df_all = final_df.copy()

# Every column except the row identifiers and the target-related columns is
# used as a model feature.
non_feature_cols = ['BASE_YM', 'CTPRVN_NM', 'SIGNGU_NM', 'CSTMR_NEXT', 'NEXT_GROWTH_LABEL']
feature_cols = list(final_df.columns.drop(non_feature_cols))

# Training rows: all features and the label present.
# Prediction rows: label missing (no following row for the region) but all
# features present.
label_missing = df_all["NEXT_GROWTH_LABEL"].isna()
train_df = df_all.dropna(subset=[*feature_cols, "NEXT_GROWTH_LABEL"]).copy()
pred_df = df_all.loc[label_missing].dropna(subset=feature_cols).copy()

# Stratified 80/20 split on the label.
# NOTE(review): rows are split at random rather than by BASE_YM, so test rows
# can precede training rows in time - confirm this is intended.
features = train_df[feature_cols]
target = train_df["NEXT_GROWTH_LABEL"]
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=0,
    stratify=target,
)

# Random forest used to estimate class probabilities.
rf = RandomForestClassifier(
    n_estimators=1200,
    min_samples_leaf=2,
    # Weight classes inversely to their frequency so an under-represented
    # label (e.g. growth = 1 being rare) is not ignored during training.
    class_weight="balanced",
    random_state=42,
    n_jobs=-1,  # parallelise across CPU cores
)
rf.fit(X_train, y_train)

# 0/1 predictions on the held-out split.
y_pred = rf.predict(X_test)

# Score the unlabeled rows. Column index 1 of predict_proba is taken as the
# probability of "up" - presumably label 1; this assumes both labels occur in
# y_train.
up_probability = rf.predict_proba(pred_df[feature_cols])[:, 1]
pred_df["proba_up"] = up_probability
pred_df["proba_down"] = 1 - pred_df["proba_up"]
pred_df["pred_label"] = (pred_df["proba_up"] >= 0.5).astype(int)

# To rank regions instead of listing them all, sort pred_df by "proba_up" in
# descending order (optionally keeping only the first 10 rows).

# One report per region; "up" is printed when proba_up is at least 0.5.
separator = "-"*40
for row in pred_df.itertuples(index=False):
    direction = '상승' if row.proba_up >= 0.5 else '하락'
    print(f"지역 : {row.CTPRVN_NM} {row.SIGNGU_NM}")
    print(f"예측 결과: 인기 {direction}")
    print(f"상승 확률: {row.proba_up:.2f}")
    print(f"하락 확률: {row.proba_down:.2f}")
    print(separator)


지역 : 강원특별자치도 강릉시
예측 결과: 인기 하락
상승 확률: 0.23
하락 확률: 0.77
----------------------------------------
지역 : 강원특별자치도 속초시
예측 결과: 인기 상승
상승 확률: 0.54
하락 확률: 0.46
----------------------------------------
지역 : 강원특별자치도 춘천시
예측 결과: 인기 상승
상승 확률: 0.59
하락 확률: 0.41
----------------------------------------
지역 : 강원특별자치도 홍천군
예측 결과: 인기 하락
상승 확률: 0.40
하락 확률: 0.60
----------------------------------------
지역 : 경기도 가평군
예측 결과: 인기 하락
상승 확률: 0.43
하락 확률: 0.57
----------------------------------------
지역 : 경기도 고양시
예측 결과: 인기 하락
상승 확률: 0.24
하락 확률: 0.76
----------------------------------------
지역 : 경기도 고양시 일산서구
예측 결과: 인기 상승
상승 확률: 0.58
하락 확률: 0.42
----------------------------------------
지역 : 경기도 과천시
예측 결과: 인기 하락
상승 확률: 0.35
하락 확률: 0.65
----------------------------------------
지역 : 경기도 구리시
예측 결과: 인기 상승
상승 확률: 0.78
하락 확률: 0.22
----------------------------------------
지역 : 경기도 성남시 분당구
예측 결과: 인기 상승
상승 확률: 0.51
하락 확률: 0.49
----------------------------------------
지역 : 경기도 수원시
예측 결과: 인기 하락
상승 확률: 0.24
하락 확률: 0.76
---------------

---

In [316]:
# Hold-out metrics comparing the 0/1 predictions in y_pred with y_test.
test_accuracy = accuracy_score(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred, digits=4)

print("Accuracy:", test_accuracy)
print(test_confusion)
print(test_report)

Accuracy: 0.7195121951219512
[[33  9]
 [14 26]]
              precision    recall  f1-score   support

         0.0     0.7021    0.7857    0.7416        42
         1.0     0.7429    0.6500    0.6933        40

    accuracy                         0.7195        82
   macro avg     0.7225    0.7179    0.7175        82
weighted avg     0.7220    0.7195    0.7180        82



In [317]:
# Feature importances of the fitted forest, largest first.
fi = pd.Series(rf.feature_importances_, index=feature_cols).sort_values(ascending=False)

# Two-column table ("feature", "importance"); show the first 15 rows.
fi_df = fi.rename_axis("feature").reset_index(name="importance")
fi_df.head(15)

Unnamed: 0,feature,importance
0,TURSM_SPND_PRICE_MOM_RATE,0.160195
1,BASE_YEAR_ACCMLT_FQ_CO_MOM_RATE,0.133977
2,TURSM_CSTMR_CO_MOM_RATE,0.119809
3,TURSM_SPND_PRICE_L2,0.049561
4,TURSM_SPND_PRICE_ROLL3,0.047081
5,TURSM_CSTMR_CO_L1,0.042106
6,TURSM_SPND_PRICE_L1,0.04066
7,TURSM_CSTMR_CO_L2,0.034506
8,REVIEW_CO_MOM_RATE,0.033844
9,TURSM_SPND_PRICE,0.033398
