In [1]:
import transformers as T
from datasets import Dataset
import torch
# from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
import torch.nn as nn
import torch.nn.functional as F
import kagglehub
import numpy as np

import os
import gc
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from IPython.display import display, HTML

In [2]:
from typing import Literal

HOST: Literal['Localhost', 'Interactive', 'Batch'] = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', 'Localhost')
IS_RERUN: bool = os.getenv('KAGGLE_IS_COMPETITION_RERUN')

print(f'HOST: {HOST}, IS_RERUN: {IS_RERUN}')

HOST: Batch, IS_RERUN: None


In [3]:
device = torch.device(
    ("cuda:3" if torch.cuda.is_available()
     else "mps" if torch.backends.mps.is_available()
     else "cpu"))

In [4]:
def get_kaggle_csv(dataset: str, name: str, is_comp: bool = False) -> pd.DataFrame:
    assert name.endswith('.csv')
    if IS_RERUN:
        return pd.read_csv(f'/kaggle/input/{dataset}/{name}')
    if is_comp:
        path = kagglehub.competition_download(dataset)
    else:
        path = kagglehub.dataset_download(dataset)
    return pd.read_csv(Path(path) / name)

In [5]:
if IS_RERUN:
    df_train = get_kaggle_csv('daigt-datamix', 'train_essays.csv')
    df_test = get_kaggle_csv('llm-detect-ai-generated-text', 'test_essays.csv', is_comp=True)
else:
    df_train = get_kaggle_csv('dogeon188/daigt-datamix', 'train_essays.csv')
    # split df_train into train and test
    df_train = df_train.sample(frac=1).reset_index(drop=True)
    df_test = df_train.iloc[-1000:]
    df_train = df_train.iloc[:10000]

In [6]:
from transformers import PreTrainedTokenizerFast
from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, trainers

VOCAB_SIZE = 30000
LOWERCASE = False

raw_tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
raw_tokenizer.normalizer = normalizers.Sequence(
    [normalizers.NFC()] + [normalizers.Lowercase()] if LOWERCASE else [])
raw_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.BpeTrainer(
    vocab_size=VOCAB_SIZE, special_tokens=special_tokens)

ds_test = Dataset.from_pandas(df_test[['text']])
# ds_train = Dataset.from_pandas(df_train[['text']])


def train_corp_iter():
    for i in range(0, len(ds_test), 1000):
        yield ds_test[i: i + 1000]["text"]
    # for i in range(0, len(ds_train), 1000):
    #     yield ds_train[i: i + 1000]["text"]


raw_tokenizer.train_from_iterator(
    train_corp_iter(),
    trainer=trainer,
    # length=len(ds_test) + len(ds_train)
)

tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=raw_tokenizer,
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)

tokenized_texts_test = [tokenizer.tokenize(text)
                        for text in tqdm(df_test['text'])]
tokenized_texts_train = [tokenizer.tokenize(text)
                         for text in tqdm(df_train['text'])]










100%|██████████| 1000/1000 [00:01<00:00, 647.33it/s]
100%|██████████| 10000/10000 [00:16<00:00, 593.58it/s]


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer


def dummy(x): return x


vectorizer = TfidfVectorizer(
    ngram_range=(3, 5), lowercase=False, sublinear_tf=True, analyzer='word',
    tokenizer=dummy, preprocessor=dummy,
    token_pattern=None, strip_accents='unicode')

vectorizer.fit(tqdm(tokenized_texts_test))

# Getting vocab
vocab = vectorizer.vocabulary_

vectorizer = TfidfVectorizer(
    ngram_range=(3, 5), lowercase=False, sublinear_tf=True, vocabulary=vocab,
    analyzer='word', tokenizer=dummy, preprocessor=dummy,
    token_pattern=None, strip_accents='unicode'
)

tf_train = vectorizer.fit_transform(tqdm(tokenized_texts_train))
tf_test = vectorizer.transform(tqdm(tokenized_texts_test))


del vocab, vectorizer, tokenized_texts_train, tokenized_texts_test
gc.collect()

100%|██████████| 1000/1000 [00:01<00:00, 610.39it/s]
100%|██████████| 10000/10000 [00:13<00:00, 726.17it/s]
100%|██████████| 1000/1000 [00:01<00:00, 721.54it/s]


0

In [8]:
y_train = df_train['generated']

In [9]:
from lightgbm import LGBMClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB(alpha=0.02)

sgd_model = SGDClassifier(max_iter=8000, tol=1e-4, loss="modified_huber")

LGB_N_ITER = 100 if IS_RERUN else 1
lgb = LGBMClassifier(
    n_estimators=LGB_N_ITER,
    num_leaves=51,
    objective='binary',
    metric='roc_auc',
    learning_rate=0.05,
    colsample_bytree=0.7,
    colsample_bynode=0.6,
    lambda_l1=8,
    lambda_l2=5,
    num_threads=4,
    min_data_in_leaf=10,
    max_depth=20,
    max_bin=900,
    verbose=-1,
)

clf.fit(tf_train, y_train)
p1 = clf.predict_proba(tf_test)[:, 1]
print("NB Done!")

sgd_model.fit(tf_train, y_train)
p2 = sgd_model.predict_proba(tf_test)[:, 1]
print("SGD Done!")

pbar = tqdm(total=LGB_N_ITER)
lgb.fit(tf_train, y_train, callbacks=[lambda _: pbar.update(1)])
p3 = lgb.predict_proba(tf_test)[:, 1]
print("LGBM Done!")
pbar.close()

final_preds = p1 * 0.1 + p2 * 0.45 + p3 * 0.45

NB Done!
SGD Done!


100%|██████████| 1/1 [00:31<00:00, 31.46s/it]

LGBM Done!





In [10]:
if not IS_RERUN:
    from sklearn.metrics import roc_auc_score

    auc_score = roc_auc_score(df_test['generated'], final_preds)
    
    print(f"ROC AUC: {auc_score:.4f}")

ROC AUC: 0.8984


In [11]:
df_test['generated'] = final_preds
submission = df_test[['id' if IS_RERUN else 'prompt_id', 'generated']]
submission.to_csv('submission.csv', index=False)