# 중요한 Feature 확인

In [101]:
import os
import warnings
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import pandas as pd
import seaborn as sns
from dotenv import load_dotenv

In [102]:
# Silence UserWarning messages for the rest of the session
warnings.filterwarnings('ignore', category=UserWarning)

# Apply the seaborn "whitegrid" theme to all subsequent plots
sns.set_theme(style='whitegrid')

# Korean font setup for matplotlib:
# delete matplotlib's per-user cache directory and force a fontconfig cache
# rebuild. NOTE(review): presumably so a freshly installed NanumBarunGothic
# is discovered -- confirm. The exit codes of both commands are ignored.
os.system('rm -rf ~/.cache/matplotlib')
os.system('fc-cache -fv')

# Use NanumBarunGothic as the default font family, and draw minus signs with
# the ASCII hyphen instead of the Unicode minus.
# NOTE(review): presumably because the font lacks a U+2212 glyph -- confirm.
plt.rc('font', family='NanumBarunGothic')
plt.rcParams['axes.unicode_minus'] = False

## 데이터 확인

In [None]:
def _read_importance_csv(folder_path, year, quart):
    """Read one quarterly importance CSV and drop the 5-character name prefix."""
    file_path = f'../data/{folder_path}/feature_importance_{year}_{quart}.csv'
    frame = pd.read_csv(file_path, index_col=0)
    # NOTE(review): the first 5 characters of every name are discarded
    # unconditionally; presumably a fixed-width prefix -- confirm.
    frame['Feature_Name'] = [name[5:] for name in frame['Feature_Name'].tolist()]
    return frame


def file_load(folder_path, years=range(2019, 2024), quarters=range(1, 5),
              reference_period=(2021, 1)):
    """Collect per-quarter feature importances into one wide table.

    Reads ``../data/{folder_path}/feature_importance_{year}_{quart}.csv`` for
    every requested year/quarter combination and outer-joins the files on the
    (prefix-stripped) feature name.

    Parameters
    ----------
    folder_path : str
        Folder inside ``../data``, e.g. ``'feature_30_total'``.
    years : iterable of int, optional
        Years to load. Defaults to 2019 through 2023.
    quarters : iterable of int, optional
        Quarters to load for each year. Defaults to 1 through 4.
    reference_period : tuple of (int, int), optional
        ``(year, quarter)`` of the file whose feature names seed the table
        before any merging. Defaults to ``(2021, 1)``; that file must exist
        even when it is not among the requested periods.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``Feature_Name`` with one ``{year}{quart}_importance``
        column per file loaded, in loading order. A feature missing from a
        given file gets NaN in that file's column (outer join).

    Notes
    -----
    Errors raised by ``pandas.read_csv`` (for example a missing file)
    propagate unchanged.
    """
    # Seed the table with the reference file's feature names, ordered by
    # Feature_Index.
    ref_year, ref_quart = reference_period
    reference = _read_importance_csv(folder_path, ref_year, ref_quart)
    seed_names = reference.sort_values(by='Feature_Index')['Feature_Name'].tolist()
    total_feature_df = pd.DataFrame({'Feature_Name': seed_names})

    # Materialise once so a one-shot iterator still works for every year.
    quarters = list(quarters)

    for year in years:
        for quart in quarters:
            df = _read_importance_csv(folder_path, year, quart)
            df = df.rename(columns={'Importance': f'{year}{quart}_importance'})
            df = df.drop(columns=['Feature_Index'])
            total_feature_df = pd.merge(total_feature_df, df, on='Feature_Name', how='outer')

    return total_feature_df.set_index('Feature_Name')

- 전체 군집에 대한 데이터 확인

In [112]:
# Load the per-quarter importances stored under 'feature_10_total' and
# preview the first rows of the resulting wide table.
df_10_total = file_load('feature_10_total')
df_10_total.head()

Unnamed: 0_level_0,20191_importance,20192_importance,20193_importance,20194_importance,20201_importance,20202_importance,20203_importance,20204_importance,20211_importance,20212_importance,20213_importance,20214_importance,20221_importance,20222_importance,20223_importance,20224_importance,20231_importance,20232_importance,20233_importance,20234_importance
Feature_Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1층외임대료,0.0,0.0,0.0,0.0,0.0,0.000472,0.00679,0.006601,0.0,0.000899,0.012735,0.044199,0.010318,0.0,0.00088,0.003069,0.009138,0.0,0.000762,0.0
1층임대료,0.0,0.0,0.0,0.0,0.0,0.000943,0.008558,0.005693,0.0,0.000293,0.007725,0.037261,0.013977,6.6e-05,0.001467,0.003042,0.007935,0.0,0.0,0.000882
개업점포수,0.002696,0.007876,0.001512,0.000229,0.0,0.000472,0.010921,0.010231,0.0,0.000293,0.012641,0.040319,0.003415,0.0,0.004966,0.00177,0.002743,0.0,0.0,0.000881
금요일매출건수,0.0,0.006666,0.004229,0.000687,0.0,0.000472,0.0,0.017327,0.0,0.001264,0.014474,0.03513,0.003013,1.3e-05,0.000282,0.002481,0.007075,0.0,0.0,0.0
금요일매출금액,0.0,0.003636,0.001209,0.0,0.0,0.000236,0.0,0.011634,0.0,0.0,0.009164,0.032209,0.002421,0.0,0.004673,0.002808,0.00731,0.0,0.0,0.0


In [115]:
# Mean of each feature's values, taken row-wise over every column in the frame
df_10_total['row_mean'] = df_10_total.mean(axis=1)

# Rank features by that mean, largest first, keeping only name and mean
df_10_total_sort = (
    df_10_total
    .sort_values(by='row_mean', ascending=False)
    .loc[:, ['row_mean']]
    .reset_index()
)

# Five features with the highest mean
df_10_total_sort.head()

Unnamed: 0,Feature_Name,row_mean
0,여성유동인구수,0.008494
1,시간대2124유동인구수,0.008116
2,연령대60이상상주인구수,0.00795
3,여성연령대60이상상주인구수,0.007872
4,금요일유동인구수,0.007778


- 1~2 군집 변동에 대한 분석
- 데이터 10% 변동

In [118]:
# Load the per-quarter importances stored under 'feature_10_12'
df_10_12 = file_load('feature_10_12')

# Mean of each feature's values, taken row-wise over every column in the frame
df_10_12['row_mean'] = df_10_12.mean(axis=1)

# Rank features by that mean, largest first, keeping only name and mean
df_10_12_sort = (
    df_10_12
    .sort_values(by='row_mean', ascending=False)
    .loc[:, ['row_mean']]
    .reset_index()
)

# Five features with the highest mean
df_10_12_sort.head()

Unnamed: 0,Feature_Name,row_mean
0,총유동인구수,0.000457
1,남성연령대40상주인구수,0.000414
2,연령대40상주인구수,0.000397
3,시간대0611유동인구수,0.000384
4,남성연령대10상주인구수,0.00038


- 데이터 30% 변동

In [119]:
# Load the per-quarter importances stored under 'feature_30_12'
df_30_12 = file_load('feature_30_12')

# Mean of each feature's values, taken row-wise over every column in the frame
df_30_12['row_mean'] = df_30_12.mean(axis=1)

# Rank features by that mean, largest first, keeping only name and mean
df_30_12_sort = (
    df_30_12
    .sort_values(by='row_mean', ascending=False)
    .loc[:, ['row_mean']]
    .reset_index()
)

# Five features with the highest mean
df_30_12_sort.head()

Unnamed: 0,Feature_Name,row_mean
0,연령대60이상유동인구수,0.000472
1,여성연령대40상주인구수,0.000449
2,시간대0006유동인구수,0.000443
3,여성상주인구수,0.000431
4,연령대60이상상주인구수,0.000415
