In [None]:
# 기본
import pandas as pd
import numpy as np

from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

# 경고 뜨지 않게 설정
import warnings
warnings.filterwarnings('ignore')

## 기본 root path 설정 (local인지 colab인지 확인)

In [None]:
# Data root used when running on a local machine.
root_path = "../data/open"

# Data root used when running on Colab (uncomment to use it instead):
# root_path = '/content/drive/MyDrive/12조 파이널프로젝트/data'

In [None]:
# Alternative per-month input path, kept for reference:
#file_path = f"{root_path}/{split}/{folder}/201812_corr_drop_All.parquet"
file_path = f"{root_path}/train/corr_drop_All.parquet"
train_df = pd.read_parquet(file_path)

# Columns that never take part in the VIF screen: the two key columns below
# and every column whose name starts with "Segment_".
exclude_cols = ['ID', '기준년월']
segment_cols = [col for col in train_df.columns if col.startswith('Segment_')]
vif_exclude = exclude_cols + segment_cols

# VIF candidates are the numeric columns that are not excluded above.
numeric_cols = train_df.select_dtypes(include=['number']).columns.tolist()
vif_exclude_set = set(vif_exclude)  # O(1) membership tests
vif_features = [col for col in numeric_cols if col not in vif_exclude_set]

# Rows with any missing candidate value are dropped for the VIF computation
# only; train_df itself keeps all of its rows.
X_numeric = train_df[vif_features].dropna()
if X_numeric.shape[0] == 0:
    raise ValueError(
        "No complete rows remain after dropna(); cannot compute VIF."
    )

# Prepend the intercept column. has_constant='add' forces the column to be
# added even when a feature is itself constant; with the default ('skip') no
# column would be added in that case and the positional offset used below
# would point at the wrong features and run past the last column.
X_with_const = add_constant(X_numeric, has_constant='add')

# Compute one VIF per feature. Column 0 is the intercept, so feature k of
# X_numeric sits at position k + 1. The ndarray is built once and reused.
exog = X_with_const.values
vif_data = pd.DataFrame({
    "feature": X_numeric.columns,
    "VIF": [
        variance_inflation_factor(exog, idx)
        for idx in range(1, exog.shape[1])
    ],
})

# Drop every feature whose VIF exceeds 10 (a NaN VIF compares False and is
# therefore kept).
# NOTE(review): all features above the threshold are removed in a single pass,
# so every member of a mutually collinear group is dropped together; confirm
# this is intended rather than removing one feature at a time.
high_vif_cols = vif_data.loc[vif_data["VIF"] > 10, "feature"].tolist()
high_vif_set = set(high_vif_cols)
vif_filtered_cols = [col for col in vif_features if col not in high_vif_set]

# Final column selection from the original frame: key columns, the features
# that passed the VIF screen, then the Segment_ columns.
final_cols = exclude_cols + vif_filtered_cols + segment_cols
train_df_reduced = train_df[final_cols]

# Report the result.
print("제거된 컬럼 수:", len(high_vif_cols))
print("최종 컬럼 수:", len(train_df_reduced.columns))

# Save the reduced frame (alternative per-month output path kept for reference).
#train_df_reduced.to_parquet(f"{root_path}/train/201812_vif_drop_All.parquet", index=False)
train_df_reduced.to_parquet(f"{root_path}/train/vif_drop_All.parquet", index=False)