In [1]:
# 📚 라이브러리 불러오기
import pandas as pd
import re
import time
from konlpy.tag import Okt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

# Single shared Okt analyzer instance, reused by every tokenize_morphs call.
okt = Okt()


def tokenize_morphs(text):
    """Split ``text`` into morphemes with Okt and return them space-joined.

    Args:
        text: Value to tokenize; it is converted with ``str()`` first, so
            non-string inputs are tokenized by their string representation.

    Returns:
        str: The morphemes returned by ``okt.morphs`` joined by single spaces.
    """
    morphemes = okt.morphs(str(text))
    return " ".join(morphemes)

def extract_features(text):
    """Compute simple surface statistics for one piece of text.

    Args:
        text: Value to analyse; it is converted with ``str()`` first.

    Returns:
        pd.Series: Indexed by feature name, in this order:
            length_chars, length_words, num_commas, num_periods,
            avg_word_len, num_uppercase, num_digits, num_punctuations.
    """
    raw = str(text)
    words = raw.split()  # whitespace-separated tokens, computed once
    word_count = len(words)

    # Mean token length; an integer 0 is used when there are no tokens.
    if words:
        mean_word_len = sum(map(len, words)) / word_count
    else:
        mean_word_len = 0

    uppercase_total = sum(1 for ch in raw if ch.isupper())
    digit_total = sum(1 for ch in raw if ch.isdigit())
    # Anything that is neither a word character nor whitespace counts as punctuation.
    punctuation_total = len(re.findall(r'[^\w\s]', raw))

    return pd.Series({
        "length_chars": len(raw),
        "length_words": word_count,
        "num_commas": raw.count(","),
        "num_periods": raw.count("."),
        "avg_word_len": mean_word_len,
        "num_uppercase": uppercase_total,
        "num_digits": digit_total,
        "num_punctuations": punctuation_total,
    })


In [2]:
# Load the training data
df = pd.read_csv("open/train.csv")

print("형태소 분석 중...")
df["full_text_morph"] = df["full_text"].apply(tokenize_morphs)
df_features = df["full_text"].apply(extract_features)

print("TF-IDF 벡터화 중...")
# NOTE(review): the vectorizer is fitted on every training row before the
# train/validation split below, so validation rows influence the vocabulary
# and IDF weights; the reported validation score may be slightly optimistic.
vectorizer = TfidfVectorizer(max_features=2000)
X_tfidf = vectorizer.fit_transform(df["full_text_morph"])

# Dense TF-IDF columns (integer column labels) followed by the hand-crafted
# features; both sides use a fresh 0..n-1 index so rows line up positionally.
X_all = pd.concat([pd.DataFrame(X_tfidf.toarray()), df_features.reset_index(drop=True)], axis=1)
# Same 0..n-1 index as X_all so labels can be looked up by X_all's index.
y_all = df["generated"].reset_index(drop=True)

# Downsample to an AI:human ratio of 1:2
df_ai = X_all[(y_all == 1).to_numpy()]
human_pool = X_all[(y_all == 0).to_numpy()]
if df_ai.empty or human_pool.empty:
    raise ValueError(
        "Both classes are required: found "
        f"{len(df_ai)} rows with generated == 1 and {len(human_pool)} rows with generated == 0"
    )
# Cap the request at the number of available human rows; sampling without
# replacement raises if asked for more rows than exist.
n_human = min(len(df_ai) * 2, len(human_pool))
df_human = human_pool.sample(n=n_human, random_state=42)
X_balanced = pd.concat([df_ai, df_human])
# Take labels from the real target column by row index instead of rebuilding
# them by hand, so features and labels cannot drift apart if the concat order changes.
y_balanced = y_all.loc[X_balanced.index]
assert len(X_balanced) == len(y_balanced)

X_train, X_val, y_train, y_val = train_test_split(
    X_balanced,
    y_balanced,
    test_size=0.2,
    random_state=42,
    stratify=y_balanced,
)


형태소 분석 중...
TF-IDF 벡터화 중...


In [3]:
print("모델 학습 중...")

# Number of training rows per label; their ratio (label 0 over label 1) is
# passed to XGBoost as scale_pos_weight.
negative_count = (y_train == 0).sum()
positive_count = (y_train == 1).sum()

xgb_params = {
    "scale_pos_weight": negative_count / positive_count,
    "n_estimators": 300,
    "learning_rate": 0.05,
    "max_depth": 6,
    "random_state": 42,
    "tree_method": "hist",
}
model = XGBClassifier(**xgb_params)
model.fit(X_train, y_train)

# Keep only column index 1 of the predicted probabilities for scoring.
val_proba = model.predict_proba(X_val)
y_pred_proba = val_proba[:, 1]
roc_score = roc_auc_score(y_val, y_pred_proba)
print("✅ Validation ROC AUC:", round(roc_score, 5))


모델 학습 중...
✅ Validation ROC AUC: 0.93042


In [4]:
# Load the test set and rename its text column to the name used for training.
test_df = pd.read_csv("open/test.csv").rename(columns={"paragraph_text": "full_text"})

print("테스트셋 형태소 분석 중...")
test_df["full_text_morph"] = test_df["full_text"].apply(tokenize_morphs)
test_features = test_df["full_text"].apply(extract_features)
# Reuse the already-fitted vectorizer (transform only, no refit).
test_tfidf = vectorizer.transform(test_df["full_text_morph"])
test_tfidf_dense = pd.DataFrame(test_tfidf.toarray())
# Same column layout as the training matrix: TF-IDF columns, then surface features.
test_all = pd.concat(
    [test_tfidf_dense, test_features.reset_index(drop=True)],
    axis=1,
)

print("테스트셋 예측 중...")
test_probs = model.predict_proba(test_all)[:, 1]

# Write the probabilities into the "generated" column of the sample submission.
submission = pd.read_csv("open/sample_submission.csv")
submission = submission.assign(generated=test_probs)
submission.to_csv("submission_best_tfidf_tuned.csv", index=False)
print("🎉 제출 완료: submission_best_tfidf_tuned.csv")


테스트셋 형태소 분석 중...
테스트셋 예측 중...
🎉 제출 완료: submission_best_tfidf_tuned.csv
