In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Random-forest hyperparameters recorded for each fold
rf_params_by_fold = {
    'Fold 1': dict(n_estimators=300, min_samples_split=2, min_samples_leaf=2,
                   max_features='log2', max_depth=40, class_weight='balanced'),
    'Fold 2': dict(n_estimators=100, min_samples_split=2, min_samples_leaf=1,
                   max_features='log2', max_depth=40, class_weight=None),
    'Fold 3': dict(n_estimators=100, min_samples_split=5, min_samples_leaf=2,
                   max_features='log2', max_depth=50, class_weight='balanced_subsample'),
    'Fold 4': dict(n_estimators=300, min_samples_split=5, min_samples_leaf=2,
                   max_features='log2', max_depth=40, class_weight=None),
    'Fold 5': dict(n_estimators=200, min_samples_split=5, min_samples_leaf=1,
                   max_features='log2', max_depth=30, class_weight='balanced_subsample'),
}

# Tabulate: the outer keys become columns first, so transpose to get
# one row per fold and one column per hyperparameter
rf_params_df = pd.DataFrame(rf_params_by_fold).transpose()
rf_params_df

Unnamed: 0,n_estimators,min_samples_split,min_samples_leaf,max_features,max_depth,class_weight
Fold 1,300,2,2,log2,40,balanced
Fold 2,100,2,1,log2,40,
Fold 3,100,5,2,log2,50,balanced_subsample
Fold 4,300,5,2,log2,40,
Fold 5,200,5,1,log2,30,balanced_subsample


In [5]:
# Final random-forest configuration, consolidated from the per-fold results
RF_FINAL_PARAMS = dict(
    # middle value of the per-fold choices (100, 100, 200, 300, 300)
    n_estimators=200,
    # most frequently selected (3/5 folds)
    min_samples_split=5,
    # most frequently selected (3/5 folds)
    min_samples_leaf=2,
    # selected consistently in every fold
    max_features='log2',
    # most frequently selected (3/5 folds)
    max_depth=40,
    # most frequently selected non-None value (2/5 folds; None was also 2/5)
    class_weight='balanced_subsample',
)

In [6]:
# Logistic-regression hyperparameters recorded for each fold
lr_params_by_fold = {
    'Fold 1': dict(solver='saga', penalty='elasticnet', max_iter=2000,
                   l1_ratio=0.5, class_weight=None, C=0.000774),
    'Fold 2': dict(solver='saga', penalty='l1', max_iter=2000,
                   class_weight=None, C=0.005995),
    'Fold 3': dict(solver='saga', penalty='elasticnet', max_iter=1000,
                   l1_ratio=0.9, class_weight=None, C=0.046416),
    'Fold 4': dict(solver='liblinear', penalty='l1',
                   class_weight=None, C=0.005995),
    'Fold 5': dict(solver='liblinear', penalty='l1',
                   class_weight=None, C=0.005995),
}

# Tabulate without l1_ratio, which is only present for some folds
lr_clean_params = {
    fold_name: {
        name: value
        for name, value in fold_params.items()
        if name != 'l1_ratio'
    }
    for fold_name, fold_params in lr_params_by_fold.items()
}

# Transpose so each fold is a row and each hyperparameter a column
lr_params_df = pd.DataFrame(lr_clean_params).transpose()
lr_params_df

Unnamed: 0,solver,penalty,max_iter,class_weight,C
Fold 1,saga,elasticnet,2000.0,,0.000774
Fold 2,saga,l1,2000.0,,0.005995
Fold 3,saga,elasticnet,1000.0,,0.046416
Fold 4,liblinear,l1,,,0.005995
Fold 5,liblinear,l1,,,0.005995


In [7]:
# Frequency analysis: how often each value of a hyperparameter appears
# across the folds in lr_params_df.
lr_param_freq = {}

for param in ['solver', 'penalty', 'max_iter', 'class_weight', 'C']:
    column = lr_params_df[param]
    if param == 'max_iter':
        # A missing max_iter means the fold did not record one; leave it out.
        column = column.dropna()
    else:
        # None is a genuine setting here (e.g. class_weight=None), but
        # value_counts() drops missing values by default, which would leave
        # the frequency table empty. Count it under an explicit label.
        column = column.map(lambda value: 'None' if pd.isna(value) else value)
    lr_param_freq[param] = column.value_counts().to_dict()

# Visualisation: one bar chart per hyperparameter
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

for ax, (param, freq) in zip(axes, lr_param_freq.items()):
    # Use string labels so matplotlib treats the x axis as categorical.
    # Numeric keys (C, max_iter) would otherwise be placed at their numeric
    # positions with a fixed bar width of 0.8, giving overlapping bars for C
    # and hairline bars for max_iter.
    labels = [str(value) for value in freq]
    counts = list(freq.values())
    ax.bar(labels, counts)
    ax.set_title(f'Frequency of {param}')
    ax.tick_params(axis='x', rotation=45)

# The grid has more panels than hyperparameters; hide the unused ones
for ax in axes[len(lr_param_freq):]:
    ax.set_visible(False)

fig.tight_layout()
fig.savefig("lr_hyperparameter_frequency.png")
plt.close(fig)

In [8]:
# Final logistic-regression configuration, consolidated from the per-fold results
LR_FINAL_PARAMS = dict(
    # most frequently selected (3/5 folds)
    C=0.005995,
    # most frequently selected (3/5 folds)
    penalty='l1',
    # used by 2 of the 3 folds that selected the l1 penalty
    solver='liblinear',
    # most frequent among the folds that recorded max_iter (2 of 3)
    max_iter=2000,
    # selected in every fold
    class_weight=None,
)