# In [None]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
import optuna
from datetime import datetime
from preprocess.data_loader import load_data
from preprocess.preprocessing import preprocess
from preprocess.feature_selection import select_features
from models.dnn import build_dnn
from models.traditional import train_traditional_models
from evaluations.evaluation import evaluate_model
from configs import config
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from pytorch_tabnet.tab_model import TabNetClassifier
import os


# In [None]:
# Optuna objective: DNN 하이퍼파라미터 자동탐색 함수
def objective_dnn(trial, X_train, y_train):
    """Optuna objective for the DNN: return the best validation accuracy.

    Samples a hidden-layer layout and a dropout rate from ``trial``, builds
    the network through ``build_dnn`` with the Adam optimizer, and fits it
    for 30 epochs with ``validation_split=0.2``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X_train: Training features; ``X_train.shape[1]`` is the input width.
        y_train: Training labels.

    Returns:
        The maximum ``val_accuracy`` recorded in the fit history.
    """
    # Search space: hidden-layer architecture first, then the dropout rate.
    hidden_layer_options = [
        (512, 256),
        (1024, 512, 256),
        (1024, 512, 256, 128),
    ]
    hidden_layers = trial.suggest_categorical('layers', hidden_layer_options)
    dropout_rate = trial.suggest_float('dropout', 0.1, 0.5)

    # Build and train a model with the sampled candidate parameters.
    n_features = X_train.shape[1]
    network = build_dnn(n_features, hidden_layers, dropout=dropout_rate, optimizer='Adam')

    fit_kwargs = dict(epochs=30, batch_size=32, validation_split=0.2, verbose=0)
    training_log = network.fit(X_train, y_train, **fit_kwargs)

    # The study maximizes the best validation accuracy seen over the epochs.
    val_accuracies = training_log.history['val_accuracy']
    return max(val_accuracies)


# Optuna objective: 랜덤포레스트 하이퍼파라미터 자동탐색 함수
def objective_rf(trial, X_train, y_train):
    """Optuna objective for a random forest: return mean 3-fold CV accuracy.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X_train: Training features.
        y_train: Training labels.

    Returns:
        The mean of the accuracy scores from ``cross_val_score`` with ``cv=3``.
    """
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import cross_val_score

    # Search space, sampled in a fixed order.
    n_estimators = trial.suggest_int('n_estimators', 100, 500)
    max_depth = trial.suggest_int('max_depth', 5, 30)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)

    forest = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        random_state=config.SEED,
    )

    # Score the candidate by its average accuracy across the three folds.
    fold_scores = cross_val_score(forest, X_train, y_train, cv=3, scoring='accuracy')
    return fold_scores.mean()


# Optuna objective: TabNet 하이퍼파라미터 자동탐색 함수
def objective_tabnet(trial, X_train, y_train, X_valid, y_valid):
    """Optuna objective for TabNet: return accuracy on the validation set.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X_train: Training features.
        y_train: Training labels.
        X_valid: Validation features, passed as ``eval_set`` during fitting
            and used for the scored predictions.
        y_valid: Validation labels.

    Returns:
        The ``'accuracy'`` entry of ``evaluate_model(y_valid, predictions)``.
    """
    # Search space for the main TabNet hyperparameters (fixed sampling order).
    n_d = trial.suggest_int('n_d', 8, 64)
    n_a = trial.suggest_int('n_a', 8, 64)
    n_steps = trial.suggest_int('n_steps', 3, 10)
    gamma = trial.suggest_float('gamma', 1.0, 2.0)
    lambda_sparse = trial.suggest_float('lambda_sparse', 1e-5, 1e-1, log=True)

    classifier = TabNetClassifier(
        n_d=n_d,
        n_a=n_a,
        n_steps=n_steps,
        gamma=gamma,
        lambda_sparse=lambda_sparse,
        seed=config.SEED,
    )
    classifier.fit(
        X_train,
        y_train,
        eval_set=[(X_valid, y_valid)],
        patience=10,
        max_epochs=100,
        batch_size=1024,
    )

    # The study maximizes validation accuracy.
    valid_predictions = classifier.predict(X_valid)
    valid_metrics = evaluate_model(y_valid, valid_predictions)
    return valid_metrics['accuracy']



# In [None]:

def main(search_time_minutes=5):
    """Tune DNN, RandomForest and TabNet with Optuna and report test metrics.

    Pipeline: load and preprocess the data, bucket the ``mh_PHQ_S`` score
    into three classes, split into train/test, impute, select features,
    scale, run one time-boxed Optuna study per model, refit each model with
    its best parameters, and write the test-set metrics plus the chosen
    hyperparameters to a timestamped Excel workbook under ``reports/``.

    The test split is used only for the final evaluation: the imputer,
    feature selector and scaler are fitted on training rows, and TabNet is
    tuned against a validation subset carved out of the training data.

    Args:
        search_time_minutes: Optuna time budget per model, in minutes.
    """
    # 1. Load the data and run the base preprocessing.
    df_original = load_data(config.DATA_PATH)
    df, numeric_cols, categorical_cols = preprocess(df_original)

    # 2. Group the target score into classes: <=4 -> 0, <=9 -> 1, else 2.
    df['mh_PHQ_S_grouped'] = df['mh_PHQ_S'].apply(lambda x: 0 if x <= 4 else 1 if x <= 9 else 2)
    X = df.drop(['mh_PHQ_S', 'mh_PHQ_S_grouped'], axis=1)
    y = df['mh_PHQ_S_grouped']

    # 3. Train/test split. Done before imputation so that no statistic
    #    computed from test rows can reach the training pipeline.
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=config.SEED
    )

    # 4. Mean imputation: fit on the training rows only, then apply the same
    #    training means to the test rows.
    imputer = SimpleImputer(strategy='mean')
    X_train = pd.DataFrame(
        imputer.fit_transform(X_train_raw),
        columns=X_train_raw.columns,
        index=X_train_raw.index,
    )
    X_test = pd.DataFrame(
        imputer.transform(X_test_raw),
        columns=X_test_raw.columns,
        index=X_test_raw.index,
    )

    # 5. Feature selection (fitted on the training split).
    X_train_selected, selected_features, selector = select_features(X_train, y_train, config.SELECTED_FEATURES)
    X_test_selected = selector.transform(X_test)

    # 6. Scaling (fitted on the training split).
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_selected)
    X_test_scaled = scaler.transform(X_test_selected)

    # 7. Search budget in seconds.
    timeout = search_time_minutes * 60

    # 8. One Optuna study per model, each limited to `timeout` seconds.
    studies = {}
    for model_name, objective in [('DNN', objective_dnn), ('RandomForest', objective_rf)]:
        study = optuna.create_study(direction='maximize')
        # The lambda is consumed inside this iteration, so capturing the
        # loop variable `objective` is safe here.
        study.optimize(lambda trial: objective(trial, X_train_scaled, y_train), timeout=timeout)
        studies[model_name] = study

    # TabNet inputs are cast to float32.
    X_train_tab = X_train_scaled.astype(np.float32)
    X_test_tab = X_test_scaled.astype(np.float32)
    y_train_tab = y_train.values

    # Tune TabNet against a validation subset of the training data. Using
    # the test set here would select hyperparameters on the very rows that
    # the final report is computed from.
    X_tune, X_valid, y_tune, y_valid = train_test_split(
        X_train_tab, y_train_tab, test_size=0.2, random_state=config.SEED
    )
    tabnet_study = optuna.create_study(direction='maximize')
    tabnet_study.optimize(
        lambda trial: objective_tabnet(trial, X_tune, y_tune, X_valid, y_valid),
        timeout=timeout
    )
    studies['TabNet'] = tabnet_study

    # 9. Refit each model with its best parameters and score it on the test set.
    model_records = []
    param_records = []

    for model_name, study in studies.items():
        best_params = study.best_params
        if model_name == 'DNN':
            # Same optimizer as in objective_dnn, so the final model matches
            # the configuration that was actually tuned.
            model = build_dnn(
                X_train_scaled.shape[1],
                best_params['layers'],
                dropout=best_params['dropout'],
                optimizer='Adam'
            )
            model.fit(X_train_scaled, y_train, epochs=50, batch_size=32, validation_split=0.2, verbose=0)
            y_pred = model.predict(X_test_scaled).argmax(axis=1)
        elif model_name == 'RandomForest':
            from sklearn.ensemble import RandomForestClassifier
            model = RandomForestClassifier(**best_params, random_state=config.SEED)
            model.fit(X_train_scaled, y_train)
            y_pred = model.predict(X_test_scaled)
        elif model_name == 'TabNet':
            model = TabNetClassifier(**best_params, seed=config.SEED)
            model.fit(X_train_tab, y_train_tab, max_epochs=100, batch_size=1024)
            y_pred = model.predict(X_test_tab)

        # Metrics dictionary returned by the project's evaluation helper.
        metrics = evaluate_model(y_test, y_pred)

        model_records.append({
            'Model': model_name,
            **metrics,
        })

        param_records.append({
            'Model': model_name,
            'Hyperparameters': best_params
        })

    # 10. Save the results as an Excel workbook in reports/ (created if missing).
    now = datetime.now().strftime("%Y%m%d_%H%M%S")
    os.makedirs("reports", exist_ok=True)
    excel_filename = f"reports/{now}_detailed_model_report.xlsx"
    with pd.ExcelWriter(excel_filename, engine='xlsxwriter') as writer:
        pd.DataFrame(model_records).to_excel(writer, sheet_name='Performance', index=False)
        pd.DataFrame(param_records).to_excel(writer, sheet_name='Hyperparameters', index=False)

    print(f"Detailed report saved to '{excel_filename}'")

if __name__ == "__main__":
    # search_time_minutes: per-model search time in minutes; adjust as needed
    main(search_time_minutes=5)