# Import

In [10]:
%pip install pandas scikit-learn


Collecting pandas
  Downloading pandas-2.3.0-cp313-cp313-win_amd64.whl.metadata (19 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.7.0-cp313-cp313-win_amd64.whl.metadata (14 kB)
Collecting numpy>=1.26.0 (from pandas)
  Downloading numpy-2.3.1-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting scipy>=1.8.0 (from scikit-learn)
  Downloading scipy-1.16.0-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.5.1-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading pandas-2.3.0-cp313-cp313-win_amd64.whl (11.0 MB)
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
   ---- ---------------

In [12]:
# Additional install for the first (setup) cell.
# NOTE(review): the version is unpinned here, while a later cell reinstalls
# xgboost==1.7.6 — confirm which version the baseline is meant to run on.
%pip install xgboost


Collecting xgboost
  Downloading xgboost-3.0.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.2-py3-none-win_amd64.whl (150.0 MB)
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
    --------------------------------------- 2.4/150.0 MB 16.6 MB/s eta 0:00:09
    --------------------------------------- 2.6/150.0 MB 7.2 MB/s eta 0:00:21
    --------------------------------------- 2.6/150.0 MB 7.2 MB/s eta 0:00:21
    --------------------------------------- 3.7/150.0 MB 4.4 MB/s eta 0:00:33
    --------------------------------------- 3.7/150.0 MB 4.4 MB/s eta 0:00:33
   - -------------------------------------- 4.2/150.0 MB 3.5 MB/s eta 0:00:42
   - -------------------------------------- 5.0/150.0 MB 3.5 MB/s eta 0:00:42
   - -------------------------------------- 5.5/150.0 MB 3.4 MB/s eta 0:00:43
   - -------------------------------------- 5.5/150.0 MB 3.4 MB/s eta 0:00:43
   - -------------------------------------- 6.6/150.0 MB 3.1 MB/s eta 0:00:46


In [2]:
# Baseline
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier


# Read the competition CSVs ('utf-8-sig' tolerates a leading BOM).
train = pd.read_csv('./train.csv', encoding='utf-8-sig')
test = pd.read_csv('./test.csv', encoding='utf-8-sig')

# The target is the 'generated' column; the inputs are the two text columns.
y = train['generated']
X = train[['title', 'full_text']]

# Stratified hold-out split with a fixed seed.
# The validation share was changed from 0.2 to 0.1.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.1, stratify=y, random_state=42
)


# Column selectors: each one pulls a single text column out of the input
# DataFrame so the TF-IDF step that follows it sees only that column.
get_title = FunctionTransformer(lambda x: x['title'], validate=False)
get_text = FunctionTransformer(lambda x: x['full_text'], validate=False)

# Two independent TF-IDF branches (title and body) whose outputs are
# concatenated by the FeatureUnion. Both the 'full_text' entry and the
# FeatureUnion call are now properly closed; the brackets were unbalanced.
vectorizer = FeatureUnion([
    ('title', Pipeline([
        ('selector', get_title),
        # max_features raised from 3000 to 45000.
        # NOTE(review): the earlier comment said 450000; the value in code
        # (45000) is kept — confirm which one was intended.
        ('tfidf', TfidfVectorizer(ngram_range=(1, 2), max_features=45000)),
    ])),
    ('full_text', Pipeline([
        ('selector', get_text),
        # max_features raised from 10000 to 20000.
        ('tfidf', TfidfVectorizer(ngram_range=(1, 3), max_features=20000)),
    ])),
])




# Fit the vectorizer on the training split only, then reuse the fitted
# vocabularies unchanged for the validation split.
X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec = vectorizer.transform(X_val)

# Default-parameter XGBoost baseline with a fixed seed (fit returns the estimator).
xgb = XGBClassifier(random_state=42).fit(X_train_vec, y_train)

# Score the hold-out split using column 1 of predict_proba.
val_probs = xgb.predict_proba(X_val_vec)[:, 1]
auc = roc_auc_score(y_true=y_val, y_score=val_probs)
print(f"Validation AUC: {auc:.4f}")



# 5. Inference (apply the fitted model to the test data)

# Unify column names: the test file's 'paragraph_text' is renamed to
# 'full_text', the column name the vectorizer selects.
test = test.rename(columns={'paragraph_text': 'full_text'})

# Select the same two input columns and run them through the already-fitted
# vectorizer (transform only — no fitting on test data).
X_test = test[['title', 'full_text']]
X_test_vec = vectorizer.transform(X_test)

# Predicted probabilities, taken from column 1 of predict_proba.
probs = xgb.predict_proba(X_test_vec)[:, 1]





# 6. Submission

# Load the submission template, set its 'generated' column to the test
# probabilities, and write the result without the index column.
sample_submission = pd.read_csv(
    './sample_submission.csv', encoding='utf-8-sig'
).assign(generated=probs)
sample_submission.to_csv('./baseline_submission.csv', index=False)


Validation AUC: 0.9203


In [None]:
!pip uninstall -y xgboost
!pip install xgboost==1.7.6
import xgboost as xgb
print(xgb.__version__)

Found existing installation: xgboost 1.7.6
Uninstalling xgboost-1.7.6:
  Successfully uninstalled xgboost-1.7.6
Collecting xgboost==1.7.6
  Using cached xgboost-1.7.6-py3-none-win_amd64.whl.metadata (1.9 kB)
Using cached xgboost-1.7.6-py3-none-win_amd64.whl (70.9 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.7.6
1.7.6


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier

# Data Load & Split

In [None]:
# Load the train and test CSVs (in that order) with the same encoding.
train, test = (
    pd.read_csv(csv_path, encoding='utf-8-sig')
    for csv_path in ('./train.csv', './test.csv')
)


In [None]:
# The target is the 'generated' column; the inputs are the two text columns.
y = train['generated']
X = train[['title', 'full_text']]

# 80/20 stratified split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# TF-IDF Vectorization

In [None]:
# TF-IDF vectorization
# Selectors that hand a single text column to the TF-IDF step after them.
get_title = FunctionTransformer(lambda frame: frame['title'], validate=False)
get_text = FunctionTransformer(lambda frame: frame['full_text'], validate=False)

# Settings shared by both TF-IDF branches; only max_features differs.
# NOTE(review): stop_words='english' filters English tokens only — confirm
# that this matches the language of the corpus.
_TFIDF_SHARED = dict(ngram_range=(1, 3), min_df=5, max_df=0.9, stop_words='english')


def _tfidf_branch(selector, max_features):
    """Return a Pipeline of `selector` followed by a TF-IDF vectorizer.

    The vectorizer uses the shared settings above and keeps at most
    `max_features` terms.
    """
    return Pipeline([
        ('sel', selector),
        ('tfidf', TfidfVectorizer(max_features=max_features, **_TFIDF_SHARED)),
    ])


# Title and body are vectorized separately and concatenated.
feats = FeatureUnion([
    ('title', _tfidf_branch(get_title, 5000)),
    ('full_text', _tfidf_branch(get_text, 20000)),
])

# The combined TF-IDF features are reduced to 300 components.
vectorizer = Pipeline([
    ('feats', feats),
    ('svd', TruncatedSVD(n_components=300, random_state=42)),
])

# Feature transformation: fit on the training split, reuse for validation.
X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec = vectorizer.transform(X_val)

# Train

In [None]:
# 3) Define and train the XGBoost model
# NOTE: tree_method='gpu_hist' / predictor='gpu_predictor' are the xgboost 1.x
# GPU settings; they presumably require a CUDA-capable GPU and a GPU-enabled
# xgboost build — confirm for the target machine.
xgb = XGBClassifier(
    tree_method='gpu_hist',
    predictor='gpu_predictor',
    use_label_encoder=False,
    eval_metric='auc',
    # Early stopping is configured on the estimator. Passing it to fit() is
    # deprecated since xgboost 1.6 and is dropped from fit() in later releases.
    early_stopping_rounds=50,
    random_state=42,
    learning_rate=0.05,
    max_depth=7,
    subsample=0.7,
    colsample_bytree=0.7,
    n_estimators=2000,  # upper bound; early stopping may end training sooner
)

# The validation AUC is evaluated on eval_set and logged every 20 rounds.
xgb.fit(
    X_train_vec, y_train,
    eval_set=[(X_val_vec, y_val)],
    verbose=20,
)

# NOTE(review): this AUC is computed on the same split that drives early
# stopping, so it is an optimistic estimate of generalization.
val_probs = xgb.predict_proba(X_val_vec)[:, 1]
print("Validation AUC:", roc_auc_score(y_val, val_probs))



[0]	validation_0-auc:0.73189
[20]	validation_0-auc:0.79552
[40]	validation_0-auc:0.80613
[60]	validation_0-auc:0.81331
[80]	validation_0-auc:0.82094
[100]	validation_0-auc:0.82852
[120]	validation_0-auc:0.83084
[140]	validation_0-auc:0.83307
[160]	validation_0-auc:0.83343
[180]	validation_0-auc:0.83468
[200]	validation_0-auc:0.83651
[220]	validation_0-auc:0.83857
[240]	validation_0-auc:0.83795
[260]	validation_0-auc:0.83848
[280]	validation_0-auc:0.83883
[300]	validation_0-auc:0.83816
[320]	validation_0-auc:0.83895
[325]	validation_0-auc:0.83900
Validation AUC: 0.8393475135348245


# Inference

In [None]:
# 4) Inference & Submission
# Rename the test text column to 'full_text', the name the vectorizer selects,
# then transform with the already-fitted vectorizer (no refitting).
test = test.rename(columns={'paragraph_text': 'full_text'})
X_test_vec = vectorizer.transform(test[['title', 'full_text']])

# Predicted probabilities, taken from column 1 of predict_proba.
probs = xgb.predict_proba(X_test_vec)[:, 1]

# Put the probabilities into the submission template and save it without the index.
sub = pd.read_csv(
    './sample_submission.csv', encoding='utf-8-sig'
).assign(generated=probs)
sub.to_csv('./baseline_submission_improved.csv', index=False)

# Submission