# TF-IDF + XGBoost 개선 실험

**개선 사항:**
- 하이퍼파라미터 튜닝 추가
- 교차 검증 적용
- 피처 개수 조정
- 성능 비교 분석

# Import

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import roc_auc_score

from xgboost import XGBClassifier
import matplotlib.pyplot as plt

# Data Load & Split

In [None]:
# Read the train and test tables, in that order, with the BOM-tolerant
# utf-8-sig codec.
train, test = (
    pd.read_csv(csv_path, encoding='utf-8-sig')
    for csv_path in ('./train.csv', './test.csv')
)

# Sanity report: table shapes and the mean of the `generated` label column.
for label, frame in (('Train', train), ('Test', test)):
    print(f"{label} shape: {frame.shape}")
print(f"Generated 비율: {train['generated'].mean():.3f}")

In [None]:
# Model inputs are the two text columns; the target is the `generated` label.
feature_columns = ['title', 'full_text']
X = train[feature_columns]
y = train['generated']

# Hold out 20% for validation, stratified on the label, with a fixed seed so
# the split is reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# 개선된 TF-IDF Vectorization

In [None]:
# Column selectors: each one hands a single text column of the input frame
# to the TF-IDF step that follows it.
get_title = FunctionTransformer(lambda frame: frame['title'], validate=False)
get_text = FunctionTransformer(lambda frame: frame['full_text'], validate=False)

# Settings shared by both TF-IDF branches: uni- to tri-grams, dropping terms
# that occur in fewer than 2 documents or in more than 95% of them.
shared_tfidf_kwargs = {'ngram_range': (1, 3), 'min_df': 2, 'max_df': 0.95}

# Titles get a smaller vocabulary cap than the article body.
title_branch = Pipeline([
    ('selector', get_title),
    ('tfidf', TfidfVectorizer(max_features=5000, **shared_tfidf_kwargs)),
])
full_text_branch = Pipeline([
    ('selector', get_text),
    ('tfidf', TfidfVectorizer(max_features=15000, **shared_tfidf_kwargs)),
])

# The two TF-IDF matrices are concatenated column-wise by the union.
vectorizer = FeatureUnion([
    ('title', title_branch),
    ('full_text', full_text_branch),
])

# Fit on the training split only, then apply the fitted transform to the
# validation split.
X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec = vectorizer.transform(X_val)

print(f"피처 차원: {X_train_vec.shape[1]}")

# 베이스라인 모델 (비교용)

In [None]:
# Reference model for comparison: XGBoost with only the random seed set.
xgb_baseline = XGBClassifier(random_state=42)
xgb_baseline.fit(X_train_vec, y_train)

# Score the validation split using the probability column at index 1.
baseline_proba = xgb_baseline.predict_proba(X_val_vec)
val_probs_baseline = baseline_proba[:, 1]
auc_baseline = roc_auc_score(y_true=y_val, y_score=val_probs_baseline)
print(f"베이스라인 Validation AUC: {auc_baseline:.4f}")

# 하이퍼파라미터 튜닝된 모델

In [None]:
# Hand-picked hyperparameters for the tuned model.
improved_params = {
    'n_estimators': 200,      # more trees
    'max_depth': 6,           # tree depth
    'learning_rate': 0.1,     # learning rate
    'subsample': 0.8,         # row subsampling per tree
    'colsample_bytree': 0.8,  # feature subsampling per tree
    'reg_alpha': 0.1,         # L1 regularization
    'reg_lambda': 1.0,        # L2 regularization
    'random_state': 42,
    'n_jobs': -1,
}
xgb_improved = XGBClassifier(**improved_params)
xgb_improved.fit(X_train_vec, y_train)

# Evaluate on the same validation split as the baseline and report the gain.
improved_proba = xgb_improved.predict_proba(X_val_vec)
val_probs_improved = improved_proba[:, 1]
auc_improved = roc_auc_score(y_true=y_val, y_score=val_probs_improved)
auc_gain = auc_improved - auc_baseline
print(f"개선된 모델 Validation AUC: {auc_improved:.4f}")
print(f"성능 향상: {auc_gain:.4f}")

# 교차 검증으로 안정성 확인

In [None]:
# 5-fold stratified cross-validation, shuffled with a fixed seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Cross-validate the vectorizer and the classifier together on the raw text.
# cross_val_score clones the pipeline for every fold, so the TF-IDF
# vocabulary and IDF weights are learned from that fold's training part only.
# Fitting the vectorizer on all of X before splitting would let every
# held-out fold influence the features and inflate the reported AUC.
# The clone also leaves the already-fitted `vectorizer` object untouched.
cv_pipeline = Pipeline([
    ('vectorizer', vectorizer),
    ('model', xgb_improved),
])
cv_scores = cross_val_score(cv_pipeline, X, y,
                            cv=cv, scoring='roc_auc', n_jobs=-1)

print(f"교차 검증 AUC: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")
print(f"개별 점수: {cv_scores}")

# 성능 비교 시각화

In [None]:
# Bar chart comparing the validation AUC of the baseline and tuned models.
models = ['베이스라인', '개선된 모델']
scores = [auc_baseline, auc_improved]

plt.figure(figsize=(8, 6))
bars = plt.bar(models, scores, color=['lightblue', 'orange'], alpha=0.7)

# Zoom in on the top of the AUC range so small differences are visible.
# The floor is 0.85 by default but drops below the lowest score when needed;
# a fixed 0.85 floor would hide the bars (or invert the axis) for a model
# scoring under that value.
y_floor = min(0.85, min(scores) - 0.01)
plt.ylim(y_floor, max(scores) + 0.01)
plt.ylabel('Validation AUC')
plt.title('모델 성능 비교')
plt.grid(True, alpha=0.3)

# Print each score just above its bar.
for bar, score in zip(bars, scores):
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.001,
             f'{score:.4f}', ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()

# Inference (최고 성능 모델 사용)

In [None]:
# Final model: the tuned hyperparameters, retrained on every labelled row.
final_params = {
    'n_estimators': 200,
    'max_depth': 6,
    'learning_rate': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 0.1,
    'reg_lambda': 1.0,
    'random_state': 42,
    'n_jobs': -1,
}
final_model = XGBClassifier(**final_params)

# Refit the vectorizer on the full training set so the features used at test
# time come from all available text, then train on those features.
X_full_vec = vectorizer.fit_transform(X)
final_model.fit(X_full_vec, y)

# The test table names its body column `paragraph_text`; rename it so the
# 'full_text' selector inside the vectorizer can find it.
test = test.rename(columns={'paragraph_text': 'full_text'})
X_test = test[['title', 'full_text']]
X_test_vec = vectorizer.transform(X_test)

# Keep the probability column at index 1 as the submission score.
test_proba = final_model.predict_proba(X_test_vec)
probs = test_proba[:, 1]
lowest, highest = probs.min(), probs.max()
print(f"예측 완료. 예측값 범위: [{lowest:.3f}, {highest:.3f}]")

# Submission

In [None]:
# Fill the submission template with the predicted probabilities.
# NOTE(review): assumes the rows of sample_submission.csv are in the same
# order as test.csv — confirm against the competition data description.
sample_submission = pd.read_csv('./sample_submission.csv', encoding='utf-8-sig')
sample_submission['generated'] = probs

# Write the submission file without the index column.
sample_submission.to_csv('./improved_submission.csv', index=False)
print("개선된 제출 파일 저장 완료: improved_submission.csv")

# Summary of the experiment.
print("\n=== 성능 개선 요약 ===")
print(f"베이스라인 AUC: {auc_baseline:.4f}")
print(f"개선된 AUC: {auc_improved:.4f}")
# The '+' format flag always prints the sign, so a regression shows as
# "-0.0012" rather than the malformed "+-0.0012" a hard-coded '+' produces.
print(f"개선폭: {auc_improved - auc_baseline:+.4f}")
print(f"교차검증 AUC: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")