# Feature vector를 input data로 사용하는 XGBoost 모델

In [None]:
# Importing Libraries
import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt

# Sklearn Metrics
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    classification_report,
    roc_curve,
    auc
)

## 모델 학습

In [1]:
# Common base path shared by the file paths built from it.
# User-configurable: point this at your own model directory.
base_path = "./MODEL"

In [None]:
# Resolve every input/output location under the shared feature-vector directory.
feature_dir = f"{base_path}/FeatureVector"
train_feature_file = f"{feature_dir}/train_features.npy"
train_label_file = f"{feature_dir}/train_labels.npy"
dev_feature_file = f"{feature_dir}/dev_features.npy"
dev_label_file = f"{feature_dir}/dev_labels.npy"
test_feature_file = f"{feature_dir}/test_features.npy"
test_label_file = f"{feature_dir}/test_labels.npy"

# Destination of the per-sample prediction table.
output_csv = f"{feature_dir}/prediction_output.csv"

# Load the train and dev splits (features first, then labels, for each split).
train_features, train_labels = np.load(train_feature_file), np.load(train_label_file)
dev_features, dev_labels = np.load(dev_feature_file), np.load(dev_label_file)

# Merge train and dev into a single training set: features are stacked
# vertically, labels are concatenated with hstack.
X_train = np.vstack([train_features, dev_features])
y_train = np.hstack([train_labels, dev_labels])

# Load the held-out test split.
X_test, y_test = np.load(test_feature_file), np.load(test_label_file)

# Fit the XGBoost classifier on the merged training set.
xgb_params = {"use_label_encoder": False, "eval_metric": "logloss"}
model = xgb.XGBClassifier(**xgb_params)
model.fit(X_train, y_train)

# Predict hard labels and class probabilities for the test split.
predictions = model.predict(X_test)
class_probabilities = model.predict_proba(X_test)
# Column 1 of predict_proba is kept as the positive-class score
# (the probability of class 1, assuming 0/1 labels).
prediction_probabilities = class_probabilities[:, 1]

# Bundle the original label, predicted label and probability, then save as CSV.
result_columns = {
    "label": y_test,
    "predicted_label": predictions,
    "probability": prediction_probabilities,
}
output_data = pd.DataFrame(result_columns)
output_data.to_csv(output_csv, index=False)

print(f"Prediction results saved to {output_csv}")


## 모델 성능 평가

In [None]:
# Classification report

# Pull the ground-truth and predicted label columns out of the results table.
true_labels = output_data["label"]
predicted_labels = output_data["predicted_label"]

# Build the per-class precision / recall / F1 summary.
class_names = ["Class 0", "Class 1"]
report = classification_report(
    true_labels,
    predicted_labels,
    target_names=class_names,
)

# Print the heading followed by the report, one per line.
print("Classification Report:", report, sep="\n")

Classification Report:
              precision    recall  f1-score   support

     Class 0       0.96      0.92      0.94      8054
     Class 1       0.92      0.96      0.94      7963

    accuracy                           0.94     16017
   macro avg       0.94      0.94      0.94     16017
weighted avg       0.94      0.94      0.94     16017

In [None]:
# Compute the ROC curve from the test labels and the class-1 probabilities,
# then integrate it to obtain the AUC.
fpr, tpr, thresholds = roc_curve(y_test, prediction_probabilities)
roc_auc = auc(fpr, tpr)

# Draw the ROC curve together with the diagonal reference line.
fig, ax = plt.subplots()
ax.plot(
    fpr,
    tpr,
    color="darkorange",
    lw=2,
    label=f"ROC curve (AUC = {roc_auc:.2f})",
)
ax.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve")
ax.legend(loc="lower right")
plt.show()

print(f"AUC (Area Under the Curve): {roc_auc:.4f}")