In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier, VotingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import accuracy_score, f1_score, log_loss
import joblib
import logging
from time import time

import warnings

# 경고를 무시
warnings.filterwarnings('ignore')

In [2]:
# 데이터 로드
train = pd.read_csv('dataset/train.csv',  index_col='ID')
test = pd.read_csv('dataset/test.csv', index_col='ID')

In [3]:
# 2. 타겟 값 'SUBCLASS' 분리
y = train['SUBCLASS']
X = train.drop(columns=['SUBCLASS'])

In [4]:
# 3. 범주형과 숫자형 열 구분 후 MultiLabelBinarizer를 통한 인코딩
# 숫자형과 범주형 분리
categorical_columns = X.select_dtypes(include=['object']).columns
numerical_columns = X.select_dtypes(exclude=['object']).columns
print(categorical_columns.shape)

(4384,)


In [6]:
# MultiLabelBinarizer 인코딩
mlb_dict = {}
X_encoded = X.copy()

In [7]:
for col in categorical_columns:
    mlb = MultiLabelBinarizer()
    X_encoded[col] = X_encoded[col].astype(str).str.split(' ')
    X_encoded = X_encoded.join(pd.DataFrame(mlb.fit_transform(X_encoded[col]),
                                            columns=[f'{col}_{cls}' for cls in mlb.classes_],
                                            index=X_encoded.index))
    X_encoded.drop(columns=[col], inplace=True)
    mlb_dict[col] = mlb

In [8]:
joblib.dump(mlb_dict, 'mlb_dict.joblib')

['mlb_dict.joblib']

In [8]:
len(mlb_dict) # 4384 -> 231179로 채널이 많이 늘어남

4384

In [7]:
X_encoded_pca.shape

(6201, 3168)

In [15]:
from sklearn.decomposition import PCA
# 4. PCA 적용 (설명 분산 비율 95% 설정)
pca = PCA(n_components=0.95, random_state=42)
X_encoded_pca = pca.fit_transform(X_encoded)

In [16]:
joblib.dump(pca, 'pca.joblib')

['pca.joblib']

In [19]:
joblib.dump(X_encoded_pca, 'X_encoded_pca.csv')

['X_encoded_pca.csv']

In [6]:
# 처리했던 것들을 불러오는 부분
mlb_dict = joblib.load('./joblib/mlb_dict.joblib')
pca=joblib.load('./joblib/pca.joblib')
X_encoded_pca=joblib.load('./joblib/X_encoded_pca.csv')

In [7]:
# 4. train_test_split 사용
X_train, X_val, y_train, y_val = train_test_split(X_encoded_pca, y, test_size=0.2, random_state=42, stratify=y)

In [8]:
# 1. 로깅 설정
logging.basicConfig(filename='model_performance.log', level=logging.INFO, 
                    format='%(asctime)s - %(levelname)s - %(message)s')

In [9]:
# 4. 성능 평가 함수
def evaluate_model(model, X_train, y_train, X_val, y_val, model_name):
    start_time = time()
    model.fit(X_train, y_train)
    joblib.dump(model, f'./trained_model/{model_name}.joblib')
    train_time = time() - start_time

    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred, average='weighted')

    log_message = f"{model_name}: Accuracy={accuracy:.4f}, F1-Score={f1:.4f}, Train Time={train_time:.2f}s"
    logging.info(log_message)
    print(log_message)

    return accuracy, f1

In [17]:
# 5. 모델 리스트 추가
models = {
    # "Random Forest": RandomForestClassifier(random_state=42),
    # "XGBoost": XGBClassifier(use_label_encoder=True, eval_metric='mlogloss', gpu_id=0, tree_method='gpu_hist'), // 실행안됨
    "LightGBM": LGBMClassifier(device='cpu'),
    "CatBoost": CatBoostClassifier(task_type="GPU", silent=True),
    "Gradient Boosting": GradientBoostingClassifier(),
    "Logistic Regression": LogisticRegression(max_iter=1000)
}

In [18]:
# 6. 각 모델에 대해 학습 및 성능 평가
results = {}
for model_name, model in models.items():
    accuracy, f1 = evaluate_model(model, X_train, y_train, X_val, y_val, model_name)
    results[model_name] = {'Accuracy': accuracy, 'F1-Score': f1}

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.406018 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 807840
[LightGBM] [Info] Number of data points in the train set: 4960, number of used features: 3168
[LightGBM] [Info] Start training from score -4.448718
[LightGBM] [Info] Start training from score -4.090320
[LightGBM] [Info] Start training from score -2.065030
[LightGBM] [Info] Start training from score -3.688879
[LightGBM] [Info] Start training from score -3.327377
[LightGBM] [Info] Start training from score -5.075174
[LightGBM] [Info] Start training from score -2.598364
[LightGBM] [Info] Start training from score -3.327377
[LightGBM] [Info] Start training from score -2.488138
[LightGBM] [Info] Start training from score -2.921912
[LightGBM] [Info] Start training from score -3.672879
[LightGBM] [Info] Start training from score -3.299675
[LightGBM] [Info] Start training from score -3.664974
[Light

In [None]:
# 7. 스태킹 (Stacking Classifier)
estimators = [
    ('rf', RandomForestClassifier(random_state=42)),
    ('xgb', XGBClassifier(eval_metric='mlogloss', gpu_id=0, tree_method='gpu_hist')),
    ('lgbm', LGBMClassifier(device='cpu')),
    ('catboost', CatBoostClassifier(task_type="GPU", silent=True))
]

stacking_model = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
accuracy, f1 = evaluate_model(stacking_model, X_train, y_train, X_val, y_val, "Stacking Classifier")
results['Stacking Classifier'] = {'Accuracy': accuracy, 'F1-Score': f1}

In [19]:
# 8. 앙상블 (Voting Classifier) - 다수결 방식
voting_model = VotingClassifier(estimators=estimators, voting='soft')
accuracy, f1 = evaluate_model(voting_model, X_train, y_train, X_val, y_val, "Voting Classifier")
results['Voting Classifier'] = {'Accuracy': accuracy, 'F1-Score': f1}

NameError: name 'estimators' is not defined

In [20]:
results

{'LightGBM': {'Accuracy': 0.30620467365028203, 'F1-Score': 0.2836375438316794},
 'CatBoost': {'Accuracy': 0.3311845286059629, 'F1-Score': 0.31329603145566004},
 'Gradient Boosting': {'Accuracy': 0.27074939564867045,
  'F1-Score': 0.26302509018130416},
 'Logistic Regression': {'Accuracy': 0.32232070910556004,
  'F1-Score': 0.30756259390309626}}

In [None]:
# 7. 성능 결과 로그에 저장
logging.info("All model performance comparisons completed.")
print("모든 모델 비교 완료. 성능 결과가 model_performance.log에 저장되었습니다.")

In [7]:
# # 5. 랜덤 포레스트 모델 학습 및 검증
# model = RandomForestClassifier(random_state=42)
# model.fit(X_train, y_train)

In [8]:
# # 검증
# y_pred = model.predict(X_val)
# print(f'Validation Accuracy: {accuracy_score(y_val, y_pred)}')

Validation Accuracy: 0.2884770346494762


In [9]:
# # 6. 모델 저장
# joblib.dump(model, 'random_forest_model.joblib')

['random_forest_model.joblib']

In [10]:
# 7. 테스트 데이터 인코딩 및 예측
test_encoded = test.copy()

# test 데이터에 동일한 인코딩 적용
for col in categorical_columns:
    mlb = mlb_dict[col]
    test_encoded[col] = test_encoded[col].astype(str).str.split(' ')
    test_encoded = test_encoded.join(pd.DataFrame(mlb.transform(test_encoded[col]),
                                                  columns=[f'{col}_{cls}' for cls in mlb.classes_],
                                                  index=test_encoded.index))
    test_encoded.drop(columns=[col], inplace=True)

In [11]:
# PCA 적용
test_encoded_pca = pca.transform(test_encoded)

In [12]:
joblib.dump(test_encoded_pca, 'test_encoded_pca.csv')

['test_encoded_pca.csv']

In [15]:
# 저장된 모델로 예측
model = joblib.load('random_forest_model.joblib')
predictions = model.predict(test_encoded_pca)

# 결과를 submission.csv로 저장
submission = pd.DataFrame({'ID': test.index, 'SUBCLASS': predictions})
submission.to_csv('submission.csv', index=False)