# Import

In [None]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import roc_auc_score

from xgboost import XGBClassifier

# Data Load & Split

In [None]:
# 🔗 Automatic download of the large data files
import gdown
import os

# Google Drive file IDs (extracted from the public share links)
TRAIN_FILE_ID = "1teA9GmYlIsutaDLWvCCsLeh7833t-TC_"
TEST_FILE_ID = "1bGC_YWtNUOroHARfmzCjrL8oPcvb7Tpw"
SAMPLE_FILE_ID = "1ebrHVj-CtM-7aEz4OqP-bCHlazle-PKM"

def download_from_drive(file_id, filename):
    """Fetch a shared Google Drive file unless it is already on disk.

    Args:
        file_id: Google Drive file ID. A falsy value means there is no
            download source for this file.
        filename: Local path the file is saved to.

    Returns:
        None. Progress and the outcome are reported on stdout.
    """
    # A file that is already present is never downloaded again.
    if os.path.exists(filename):
        print(f"✅ {filename} 이미 존재함")
        return

    # Without an ID there is nothing to fetch; report it instead of
    # silently doing nothing.
    if not file_id:
        print(f"⚠️ {filename} 없음: 파일 ID가 설정되지 않아 다운로드를 건너뜁니다.")
        return

    url = f'https://drive.google.com/uc?id={file_id}'
    print(f"📥 {filename} 다운로드 중...")
    gdown.download(url, filename, quiet=False)

    # Report success only when the file really landed on disk, so a failed
    # download is not announced as complete.
    if os.path.exists(filename):
        print(f"✅ {filename} 다운로드 완료!")
    else:
        print(f"❌ {filename} 다운로드 실패")

# Fetch every dataset file that is not yet available locally
for drive_id, local_name in (
    (TRAIN_FILE_ID, 'train.csv'),
    (TEST_FILE_ID, 'test.csv'),
    (SAMPLE_FILE_ID, 'sample_submission.csv'),
):
    download_from_drive(drive_id, local_name)

# Load the CSV files into DataFrames
train = pd.read_csv(filepath_or_buffer='./train.csv', encoding='utf-8-sig')
test = pd.read_csv(filepath_or_buffer='./test.csv', encoding='utf-8-sig')

In [3]:
# Features are the two text columns; the target is the `generated` column.
X = train.loc[:, ['title', 'full_text']]
y = train.loc[:, 'generated']

# Stratified hold-out split (20% validation) with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# TF-IDF Vectorization

In [4]:
# TF-IDF vectorization
from operator import itemgetter

# Column selectors: itemgetter(name)(frame) evaluates frame[name]. Unlike a
# lambda, an itemgetter can be pickled, so the fitted vectorizer can be
# persisted with pickle/joblib.
get_title = FunctionTransformer(itemgetter('title'), validate=False)
get_text = FunctionTransformer(itemgetter('full_text'), validate=False)

# Each text column gets its own TF-IDF vocabulary (unigrams + bigrams);
# FeatureUnion concatenates the two resulting feature matrices.
vectorizer = FeatureUnion([
    ('title', Pipeline([('selector', get_title),
                        ('tfidf', TfidfVectorizer(ngram_range=(1, 2), max_features=3000))])),
    ('full_text', Pipeline([('selector', get_text),
                            ('tfidf', TfidfVectorizer(ngram_range=(1, 2), max_features=10000))])),
])

# Fit on the training split only, then apply the same transform to validation.
X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec = vectorizer.transform(X_val)

# Train

In [None]:
# Define the model (fixed seed, all other settings left at their defaults)
# and fit it on the vectorized training split.
xgb = XGBClassifier(random_state=42)
xgb.fit(X_train_vec, y_train)

# Score the hold-out split using the second column of predict_proba.
val_probs = xgb.predict_proba(X_val_vec)[:, 1]
auc = roc_auc_score(y_true=y_val, y_score=val_probs)
print(f"Validation AUC: {auc:.4f}")

# Inference

In [6]:
# For the test set, rename 'paragraph_text' to 'full_text' so the columns
# match what the fitted vectorizer selects.
test = test.rename({'paragraph_text': 'full_text'}, axis='columns')
X_test = test.loc[:, ['title', 'full_text']]

# Reuse the already-fitted vectorizer and model; keep the second
# predict_proba column as the submission score.
X_test_vec = vectorizer.transform(X_test)
probs = xgb.predict_proba(X_test_vec)[:, 1]

# Submission

In [None]:
# Single source of truth for the output file name (written below, then
# offered for download and echoed in the status message).
submission_name = 'baseline_submission.csv'

# Fill the sample submission with the predicted scores and save it.
sample_submission = pd.read_csv('./sample_submission.csv', encoding='utf-8-sig')
sample_submission['generated'] = probs

sample_submission.to_csv(f'./{submission_name}', index=False)

# Download the file (when running in Colab). Only the import is guarded, so
# an ImportError raised while downloading is not mistaken for "not in Colab".
try:
    from google.colab import files
except ImportError:
    print("로컬 환경에서 실행 중입니다. 파일이 현재 디렉터리에 저장되었습니다.")
else:
    files.download(submission_name)
    print(f"✅ 파일 다운로드 시작: {submission_name}")