# <center>LLM - Detect AI Generated Text</center>

This competition challenges participants to develop a machine learning model that can accurately detect **whether an essay was written by a student or an LLM**. The competition dataset comprises a mix of student-written essays and essays generated by a variety of LLMs.

Team Members: 毛柏毅, 朱誼學, 許木羽, 張立誠

In [13]:
# %pip install transformers
# %pip install peft
# %pip install bitsandbytes
# %pip install accelerate
# %pip install omegaconf
# %pip install lightgbm

## Configuration

In [14]:
import transformers as T
from datasets import Dataset
import torch
# from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
import torch.nn as nn
import torch.nn.functional as F
import kagglehub
import numpy as np

import os
import gc
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from IPython.display import display, HTML

In [15]:
from typing import Literal

# Kaggle sets KAGGLE_KERNEL_RUN_TYPE inside its kernels; when it is absent we
# treat the run as a local one.
HOST: Literal['Localhost', 'Interactive', 'Batch'] = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', 'Localhost')
# os.getenv returns a string or None, so coerce to a real bool to match the
# annotation. Any non-empty value counts as "rerun", exactly as the truthiness
# checks on this flag behaved before.
IS_RERUN: bool = bool(os.getenv('KAGGLE_IS_COMPETITION_RERUN'))

print(f'HOST: {HOST}, IS_RERUN: {IS_RERUN}')

In [16]:
# Preferred CUDA ordinal. It is only usable when the host exposes at least
# four GPUs; otherwise fall back to the first GPU so the device stays valid.
_PREFERRED_CUDA_INDEX = 3

if torch.cuda.is_available():
    _cuda_index = (_PREFERRED_CUDA_INDEX
                   if torch.cuda.device_count() > _PREFERRED_CUDA_INDEX
                   else 0)
    device = torch.device(f"cuda:{_cuda_index}")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

## Data

In [17]:
def get_kaggle_csv(dataset: str, name: str, is_comp: bool = False) -> pd.DataFrame:
    """Read one CSV file of a Kaggle dataset or competition into a DataFrame.

    Args:
        dataset: Dataset or competition handle. During a competition rerun it
            is used as the directory name under ``/kaggle/input``.
        name: File name inside the dataset; must end with ``.csv``.
        is_comp: When not in a rerun, download through the competition API
            instead of the dataset API.

    Returns:
        The parsed CSV.

    Raises:
        AssertionError: If ``name`` does not end with ``.csv``.
    """
    assert name.endswith('.csv')

    # Competition reruns read straight from the mounted input directory.
    if IS_RERUN:
        return pd.read_csv(f'/kaggle/input/{dataset}/{name}')

    # Otherwise fetch through kagglehub, which returns the download directory.
    fetch = kagglehub.competition_download if is_comp else kagglehub.dataset_download
    local_dir = Path(fetch(dataset))
    return pd.read_csv(local_dir / name)

In [18]:
if IS_RERUN:
    df_train = get_kaggle_csv('daigt-datamix', 'train_essays.csv')
    df_test = get_kaggle_csv('llm-detect-ai-generated-text', 'test_essays.csv', is_comp=True)
else:
    # Outside a competition rerun, hold out part of the data mix as a local
    # test set instead of loading the competition test file.
    SPLIT_SEED = 42    # fixed so the hold-out split is reproducible
    N_TEST = 1000      # rows held out for local evaluation
    N_TRAIN = 10000    # cap on rows used for local training

    shuffled = (
        get_kaggle_csv('dogeon188/daigt-datamix', 'train_essays.csv')
        .sample(frac=1, random_state=SPLIT_SEED)
        .reset_index(drop=True)
    )
    # Take the test rows from the tail and the training rows from what is
    # left, so the two sets never overlap even on a small dataset.
    # .copy() makes both frames independent of `shuffled`, so adding columns
    # later does not raise SettingWithCopyWarning.
    df_test = shuffled.iloc[-N_TEST:].copy()
    df_train = shuffled.iloc[:-N_TEST].iloc[:N_TRAIN].copy()
    del shuffled

## Model

### Preprocess Data

In [None]:
from transformers import PreTrainedTokenizerFast
from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, trainers

VOCAB_SIZE = 30000
LOWERCASE = True

# Byte-level BPE tokenizer, trained from scratch further down in this cell.
raw_tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
# NFC normalization is always applied; lowercasing is optional. The
# parentheses matter: without them the conditional covers the whole sum and
# LOWERCASE = False would drop NFC as well.
raw_tokenizer.normalizer = normalizers.Sequence(
    [normalizers.NFC()] + ([normalizers.Lowercase()] if LOWERCASE else []))
raw_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.BpeTrainer(
    vocab_size=VOCAB_SIZE, special_tokens=special_tokens)

# Only the test essays are wrapped as a Dataset; the training essays are
# currently not fed to the tokenizer trainer.
ds_test = Dataset.from_pandas(df_test[['text']])


def train_corp_iter():
    """Yield the essay texts of ``ds_test`` in consecutive batches of up to 1000."""
    batch_size = 1000
    n_rows = len(ds_test)
    start = 0
    while start < n_rows:
        stop = start + batch_size
        yield ds_test[start:stop]["text"]
        start = stop


# Learn the BPE merges from the batches produced by train_corp_iter().
raw_tokenizer.train_from_iterator(train_corp_iter(), trainer=trainer)

# Wrap the trained tokenizer in the transformers fast-tokenizer interface and
# register the special tokens it was trained with.
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=raw_tokenizer,
    unk_token="[UNK]", pad_token="[PAD]", cls_token="[CLS]",
    sep_token="[SEP]", mask_token="[MASK]",
)

# Tokenize every essay: test set first, then training set (one progress bar each).
tokenized_texts_test = list(map(tokenizer.tokenize, tqdm(df_test['text'])))
tokenized_texts_train = list(map(tokenizer.tokenize, tqdm(df_train['text'])))






100%|██████████| 3/3 [00:00<00:00, 4016.25it/s]
100%|██████████| 99836/99836 [01:19<00:00, 1248.30it/s]


### TFIDF

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer


def dummy(x):
    """Return ``x`` unchanged.

    Passed to TfidfVectorizer as both tokenizer and preprocessor because the
    documents it receives are already lists of tokens.
    """
    return x


# Settings shared by both vectorizers below: word-level 3- to 5-grams over the
# pre-tokenized essays, with tokenizer and preprocessor replaced by `dummy`.
tfidf_options = dict(
    ngram_range=(3, 5), lowercase=False, sublinear_tf=True, analyzer='word',
    tokenizer=dummy, preprocessor=dummy,
    token_pattern=None, strip_accents='unicode')

# Pass 1: fit on the test essays only, to collect the n-gram vocabulary.
vectorizer = TfidfVectorizer(**tfidf_options)
vectorizer.fit(tqdm(tokenized_texts_test))
vocab = vectorizer.vocabulary_

# Pass 2: restrict the features to that vocabulary, fit the IDF weights on the
# training essays, then transform both sets.
vectorizer = TfidfVectorizer(vocabulary=vocab, **tfidf_options)
tf_train = vectorizer.fit_transform(tqdm(tokenized_texts_train))
tf_test = vectorizer.transform(tqdm(tokenized_texts_test))

# Free the large intermediates; only the sparse matrices are needed from here on.
del vocab, vectorizer, tokenized_texts_train, tokenized_texts_test, tfidf_options
gc.collect()

100%|██████████| 3/3 [00:00<00:00, 1041.63it/s]
100%|██████████| 99836/99836 [00:39<00:00, 2514.95it/s]
100%|██████████| 3/3 [00:00<00:00, 11115.65it/s]


10

In [21]:
# Training labels come from the `source` column; later cells look up the
# 'human' class among the fitted classifiers' classes.
y_train = df_train.loc[:, 'source']


In [22]:
from lightgbm import LGBMClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB

# SGD shuffles the training data between epochs; pin its seed so repeated
# runs of this cell produce the same model.
MODEL_SEED = 42

clf = MultinomialNB(alpha=0.02)

sgd_model = SGDClassifier(
    max_iter=8000, tol=1e-4, loss="modified_huber", random_state=MODEL_SEED)

# A full-size booster is trained only for the competition rerun; otherwise a
# single iteration keeps the cell fast.
LGB_N_ITER = 100 if IS_RERUN else 1
lgb = LGBMClassifier(
    n_estimators=LGB_N_ITER,
    num_leaves=51,
    objective='binary',
    metric='roc_auc',
    learning_rate=0.05,
    colsample_bytree=0.7,
    colsample_bynode=0.6,
    lambda_l1=8,
    lambda_l2=5,
    num_threads=4,
    min_data_in_leaf=10,
    max_depth=20,
    max_bin=900,
    verbose=-1,
)

clf.fit(tf_train, y_train)
print("NB Done!")

sgd_model.fit(tf_train, y_train)
print("SGD Done!")

# The callback advances the bar once per invocation; the context manager
# closes the bar even if fitting raises.
with tqdm(total=LGB_N_ITER) as pbar:
    lgb.fit(tf_train, y_train, callbacks=[lambda env: pbar.update(1)])
print("LGBM Done!")

# Column index of the 'human' class. Previously recomputed after every fit and
# overwritten each time; only this last value (from the LightGBM model) survived.
human_idx = np.where(lgb.classes_ == 'human')[0][0]

NB Done!
SGD Done!


100%|██████████| 25/25 [1:04:04<00:00, 153.79s/it]

LGBM Done!





## Evaluation

In [23]:
if not IS_RERUN:
    from sklearn.metrics import log_loss, roc_auc_score

    # The per-class probabilities are blended column by column, which is only
    # meaningful if all three models order their classes identically.
    assert np.array_equal(clf.classes_, sgd_model.classes_)
    assert np.array_equal(clf.classes_, lgb.classes_)

    # Per-class probabilities from each model, blended with fixed weights.
    p1 = clf.predict_proba(tf_test)
    p2 = sgd_model.predict_proba(tf_test)
    p3 = lgb.predict_proba(tf_test)
    final_preds = p1*0.1 + p2*0.45 + p3*0.45

    # Pass the label set explicitly: the hold-out may not contain every
    # training class, and log_loss would otherwise infer the labels from y_true.
    mc_score = log_loss(df_test['source'], final_preds, labels=clf.classes_)

    # Binary score: P(generated) = 1 - blended P(human). The parentheses are
    # required; `1 - a + b + c` subtracts only the first term.
    p1 = p1[:, np.where(clf.classes_ == 'human')[0][0]]
    p2 = p2[:, np.where(sgd_model.classes_ == 'human')[0][0]]
    p3 = p3[:, np.where(lgb.classes_ == 'human')[0][0]]
    final_preds = 1 - (p1*0.1 + p2*0.45 + p3*0.45)

    # Derive the binary ground truth from `source` rather than reading
    # df_test['generated'], which the submission cell overwrites with
    # predictions. NOTE(review): assumes every non-'human' source is generated.
    is_generated = (df_test['source'] != 'human').astype(int)
    auc_score = roc_auc_score(is_generated, final_preds)

    print(f"Multiclass log loss: {mc_score:.4f}")
    print(f"ROC AUC: {auc_score:.4f}")

## Submission

In [24]:
# Probability of the 'human' class from each model.
p1 = clf.predict_proba(tf_test)[:, np.where(clf.classes_ == 'human')[0][0]]
p2 = sgd_model.predict_proba(tf_test)[:, np.where(sgd_model.classes_ == 'human')[0][0]]
p3 = lgb.predict_proba(tf_test)[:, np.where(lgb.classes_ == 'human')[0][0]]

# The submitted score is P(generated) = 1 - weighted P(human).
final_preds = 1 - (p1*0.1 + p2*0.45 + p3*0.45)

# Build the submission as a new frame instead of writing a column into
# df_test: that avoids SettingWithCopyWarning when df_test is a slice and
# leaves df_test untouched for the evaluation cell.
id_column = 'id' if IS_RERUN else 'prompt_id'
submission = df_test[[id_column]].assign(generated=final_preds)
submission.to_csv('submission.csv', index=False)