In [1]:
# 기본
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time

# 경고 뜨지 않게 설정
import warnings
warnings.filterwarnings('ignore')

# 그래프 설정
sns.set()

# 그래프 기본 설정
plt.rcParams['font.family'] = 'Malgun Gothic'
# plt.rcParams['font.family'] = 'AppleGothic'
plt.rcParams['figure.figsize'] = 12, 6
plt.rcParams['font.size'] = 14
plt.rcParams['axes.unicode_minus'] = False

# 데이터 전처리 알고리즘
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# 학습용과 검증용으로 나누는 함수
from sklearn.model_selection import train_test_split

# 교차 검증
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

# 평가함수
# 분류용
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

# 회귀용
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# 모델의 최적의 하이퍼 파라미터를 찾기 위한 도구
from sklearn.model_selection import GridSearchCV

# 머신러닝 알고리즘 - 분류
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier

# 머신러닝 알고리즘 - 회귀
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import VotingRegressor

# 학습 모델 저장을 위한 라이브러리
import pickle

from sklearn.metrics import confusion_matrix, classification_report

In [40]:
#data_type = "train"
# month = "07"
# category = "잔액정보"

# local
root_path = '../data/open/머신러닝'

# colab
# root_path = '/content/drive/MyDrive/12조 파이널프로젝트/data'

drive_folder = f'{root_path}'

In [42]:
# 통합 parquet
df1 = pd.read_parquet(f'{drive_folder}/xgb_top8_All.parquet')

## 머신러닝 (LGBM)

### 학습

In [25]:
# 1. Segment 라벨 만들기
segment_cols = ['Segment_A', 'Segment_B', 'Segment_C', 'Segment_D', 'Segment_E']
y = df1[segment_cols].idxmax(axis=1).map({
    'Segment_A': 0,
    'Segment_B': 1,
    'Segment_C': 2,
    'Segment_D': 3,
    'Segment_E': 4
})

In [26]:
# 2. X 구성
X = df1.drop(columns=segment_cols + ['ID', '기준년월'])

In [29]:
# 3. 데이터 분리
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [32]:
# 4. LGBM 모델 정의 및 학습
model = LGBMClassifier(
    objective='multiclass',
    num_class=5,
    metric='multi_logloss',   # 그대로 둬도 되지만,
    n_estimators=300,
    learning_rate=0.05,       # 기본보다 조금 더 부드러운 학습
    max_depth=7,              # 너무 깊지 않게 (복잡도 제한)
    subsample=0.8,            # 과적합 방지
    colsample_bytree=0.8,     # 피처 샘플링
    random_state=42
)
model.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.843938 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8663
[LightGBM] [Info] Number of data points in the train set: 1920000, number of used features: 64
[LightGBM] [Info] Start training from score -7.811109
[LightGBM] [Info] Start training from score -9.722904
[LightGBM] [Info] Start training from score -2.934402
[LightGBM] [Info] Start training from score -1.927457
[LightGBM] [Info] Start training from score -0.222076


In [34]:
# 5. 정확도 평가
val_preds = model.predict(X_val)
acc = accuracy_score(y_val, val_preds)
print(f"✅ Validation Accuracy: {acc:.4f}")

✅ Validation Accuracy: 0.8988


### 예측

In [123]:
df2 = pd.read_parquet(f'{drive_folder}/test_12월_통합.parquet')

In [125]:
# 1. 학습에 사용된 피처 리스트 가져오기
#    - LGBM 모델이 학습에 사용한 컬럼 이름 (순서 포함)
train_features = model.feature_name_

In [127]:
# 2. X_test 구성: 'ID', '기준년월'은 제외
X_test = df2.drop(columns=['ID', '기준년월'], errors='ignore')

In [129]:
# 3. 누락된 컬럼 처리 (train에는 있었지만 test에는 없는 컬럼 → 0으로 추가) 
# !!(코드 수정해야 할 가능성 있음)
missing_cols = [col for col in train_features if col not in X_test.columns]
for col in missing_cols:
    X_test[col] = 0  # 또는 np.nan (LGBM은 NaN 허용 가능)

In [131]:
# 4. 컬럼 순서 train과 동일하게 맞추기
X_test = X_test[train_features]

In [135]:
# 6. 예측 수행
test_preds = model.predict(X_test)

In [137]:
# 7. 숫자 라벨 → 문자 라벨 복원 (컬럼명도 'Segment'로 맞춤)
label_map = {0: 'A', 1: 'B', 2: 'C', 3: 'D', 4: 'E'}
df2['Segment'] = pd.Series(test_preds).map(label_map)

In [139]:
# 8. 결과 확인
print("✅ 예측 완료! 상위 5개:")
print(df2[['ID', 'Segment']].head())

✅ 예측 완료! 상위 5개:
           ID Segment
0  TEST_00000       E
1  TEST_00001       E
2  TEST_00002       E
3  TEST_00003       E
4  TEST_00004       E


In [141]:
# 9. 최종 결과만 저장
df2[['ID', 'Segment']].to_parquet(f'{drive_folder}/LGBM_segment_pred.parquet', index=False)
print("✅ 저장 완료: test_12월_LGBM_segment_pred.parquet (ID, Segment)")

✅ 저장 완료: test_12월_LGBM_segment_pred.csv (ID, Segment)


In [147]:
df2 = pd.read_parquet(f'{drive_folder}/LGBM_segment_pred.parquet')
df2

Unnamed: 0,ID,Segment
0,TEST_00000,E
1,TEST_00001,E
2,TEST_00002,E
3,TEST_00003,E
4,TEST_00004,E
...,...,...
99995,TEST_99995,E
99996,TEST_99996,E
99997,TEST_99997,E
99998,TEST_99998,C
