In [6]:
import pandas as pd
import numpy as np

In [8]:
# List the files available in the local data directory.
!ls ./data

sample_submission.csv  test_essays.csv	train_essays.csv  train_prompts.csv


In [9]:
# Load the three competition tables: train essays, test essays and the prompts.
train_df, test_df, prompt_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/train_essays.csv',
        './data/test_essays.csv',
        './data/train_prompts.csv',
    )
)

In [12]:
# Show the essays whose prompt id is not 0.
# (Swap the column for 'generated' to look at the AI-written essays instead.)
train_df.loc[train_df['prompt_id'].ne(0)]

Unnamed: 0,id,prompt_id,text,generated
5,00da8c32,1,The electrol college system is an unfair syste...,0
6,011dc2bc,1,"Dear state senator, It is the utmost respect t...",0
8,01c6e176,1,"""It's official: The electoral college is unfai...",0
9,0202ddf9,1,The Electoral College has been kept for centur...,0
10,020a5d6d,1,"Dear senator, Retain the Electoral College. Th...",0
...,...,...,...,...
1360,fc66f374,1,The Electoral College was originally establish...,0
1361,fcb87d59,1,"Dear senator, I think that the presidential el...",0
1364,fcd93e2d,1,The electoral college is a group of electors t...,0
1366,fcfe84cb,1,An electoral College compromises between elect...,0


In [None]:
!pip install -q keras-core --upgrade
!pip install -q keras-nlp --upgrade
!pip install --upgrade -q wandb git+https://github.com/soumik12345/wandb-addons

In [None]:
import os
# Select the Keras backend; this is set before the Keras imports below.
os.environ["KERAS_BACKEND"] = "torch"  # "jax" or "tensorflow" or "torch" 
# os.environ["WANDB_SILENT"] = "false" # for wandb

import keras_nlp
import keras_core as keras
import keras_core.backend as K


import torch
# import jax
# TensorFlow is still imported: tf.data builds the input pipelines and
# tf.distribute is used for device detection further down.
import tensorflow as tf
# from tensorflow import keras
# import tensorflow.keras.backend as K

import numpy as np 
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib as mpl

# Colormap used for the bar charts below. `matplotlib.cm.get_cmap` was removed
# in Matplotlib 3.9; `pyplot.get_cmap` is available on old and new versions.
cmap = plt.get_cmap('coolwarm')

# Report the versions of the main libraries, one per line.
# (JAX is not imported in this configuration, so it is not listed.)
print(
    f"TensorFlow: {tf.__version__}",
    f"Keras: {keras.__version__}",
    f"KerasNLP: {keras_nlp.__version__}",
    sep="\n",
)

# Configuration
class CFG:
    """Class-level settings shared by every cell of the notebook."""

    # --- logging / experiment tracking ---
    verbose = 0

    wandb = False
    competition = "llm-detect-ai-generated-text"
    _wandb_kernel = "awsaf49"
    comment = "DebertaV3-MaxSeq_200-ext_s-torch"

    # --- backbone and tokenisation ---
    preset = "deberta_v3_base_en"
    sequence_length = 200

    # Placeholder; overwritten with the detected device once get_device() runs.
    device = "TPU"

    seed = 42

    # --- cross-validation ---
    num_folds = 5
    selected_folds = [0, 1, 2]

    # --- training loop / input pipeline ---
    epochs = 3
    batch_size = 3
    drop_remainder = True
    cache = True

    scheduler = "cosine"

    # --- label vocabulary (the index in class_names is the integer label) ---
    class_names = ["real", "fake"]
    num_classes = len(class_names)
    class_labels = list(range(num_classes))
    label2name = dict(enumerate(class_names))
    name2label = dict(zip(class_names, class_labels))

# Seed Keras' global random state with the configured seed.
keras.utils.set_random_seed(CFG.seed)

def get_device():
    """Detect the available accelerator and build a matching strategy.

    A TPU is tried first. If connecting to it fails, the visible logical GPUs
    are used; with no GPU the default `tf.distribute` strategy is returned.

    Returns:
        tuple: ``(strategy, device)`` where ``strategy`` is a
        ``tf.distribute`` strategy object and ``device`` is ``'TPU'``,
        ``'GPU'`` or ``'CPU'``.
    """
    try:
        tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()
        strategy = tf.distribute.TPUStrategy(tpu)
        print('> Running on TPU', tpu.master(), end=' | ')
        print('Num of TPUs: ', strategy.num_replicas_in_sync)
        # Use the literal rather than CFG.device: CFG.device is overwritten
        # with this function's result, so it may be stale when the cell is re-run.
        device = 'TPU'

    except Exception:
        # `Exception` (not a bare except) so Ctrl-C / SystemExit still propagate.
        gpus = tf.config.list_logical_devices('GPU')
        ngpu = len(gpus)

        if ngpu:
            strategy = tf.distribute.MirroredStrategy(gpus)
            print("> Running on GPU", end=' | ')
            print("Num of GPUs: ", ngpu)
            device = 'GPU'

        else:
            print("> Running on CPU")
            strategy = tf.distribute.get_strategy()
            device = 'CPU'

    return strategy, device

# Detect the hardware once and record the device name and replica count on CFG;
# the replica count scales the global batch size in later cells.
strategy, CFG.device = get_device()
CFG.replicas = strategy.num_replicas_in_sync

BASE_PATH = './data'

# Competition essays: `label` mirrors the `generated` column and `name` is its
# readable form, looked up through the label -> name mapping.
df = pd.read_csv(f'{BASE_PATH}/train_essays.csv').assign(
    label=lambda frame: frame['generated'].copy(),
    name=lambda frame: frame['generated'].map(CFG.label2name),
)

print(f"# Train Data: {len(df):,}")
print("# Sample:")
display(df.head(2))

# Bar chart with the number of essays per class.
fig, ax = plt.subplots(figsize=(8, 4))
df['name'].value_counts().plot.bar(
    ax=ax,
    color=[cmap(0.0), cmap(0.25), cmap(0.65), cmap(0.9), cmap(1.0)],
)
ax.set_xlabel("Class")
ax.set_ylabel("Count")
ax.set_title("Class distribution for Train Data")
plt.show()


# External corpora. `ext_df1` is used as-is (its `source` and `label` columns
# are read below); `ext_df2` keeps only id/text/model, renames `model` to
# `source` and is labelled 1 for every row.
ext_df1 = pd.read_csv(f'{BASE_PATH}/daigt-proper-train-dataset/train_drcat_02.csv')
ext_df2 = (
    pd.read_csv(f'{BASE_PATH}/argugpt/argugpt.csv')[['id', 'text', 'model']]
    .rename(columns={'model': 'source'})  # chained: no in-place edit of a column slice
    .assign(label=1)
)

# Down-sample the 'persuade_corpus' rows to at most 10,000. The explicit
# random_state makes the cell give the same sample on every re-run, and the
# min() guard avoids a ValueError when fewer rows are available.
persuade_df = ext_df1[ext_df1.source == 'persuade_corpus']
ext_df = pd.concat([
    persuade_df.sample(n=min(10000, len(persuade_df)), random_state=CFG.seed),
    ext_df1[ext_df1.source != 'persuade_corpus'],
    ext_df2,
])

# ext_real_df = ext_df[['id', 'text']].copy()
# ext_real_df['label']  = 0

# ext_fake_df = ext_df[['id', 'source_text']].copy()
# ext_fake_df.rename(columns={"source_text":"text"}, inplace=True)
# ext_fake_df['label']  = 1

# ext_df = pd.concat([ext_real_df, ext_fake_df], axis=0)
# Readable class name for every external row.
ext_df['name'] = ext_df.label.map(CFG.label2name)

print("# External Data: {:,}".format(len(ext_df)))
print("# Sample:")
# A bare expression in the middle of a cell is not rendered, so display explicitly.
display(ext_df.head(2))

# Bar chart with the number of external essays per class.
fig, ax = plt.subplots(figsize=(8, 4))
ext_df['name'].value_counts().plot.bar(ax=ax, color=[cmap(0.0), cmap(0.65)])
ax.set_xlabel("Class")
ax.set_ylabel("Count")
ax.set_title("Answer distribution for External Data")
plt.show()

# From here on `df` is the external corpus only: the competition essays loaded
# into `df` earlier are replaced, not appended (see the commented-out concat).
df = ext_df.copy().reset_index(drop=True) # pd.concat([ext_df, df], axis=0)
df.head()

In [None]:

from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=CFG.num_folds, shuffle=True, random_state=CFG.seed)

# skf.split yields positional indices; with a fresh 0..n-1 index they are also
# valid labels for the .loc assignment below.
df = df.reset_index(drop=True)

# Stratify on label and source together so each fold keeps the same mix of both.
df['stratify'] = df.label.astype(str) + df.source.astype(str)

df["fold"] = -1

for fold, (train_idx, val_idx) in enumerate(skf.split(df, df['stratify'])):
    df.loc[val_idx, 'fold'] = fold

# Every row must have been given a validation fold.
assert (df["fold"] >= 0).all(), "some rows were not assigned to a fold"

# A bare expression in the middle of a cell is not rendered, so display explicitly.
display(df.groupby(["fold", "name", "source"]).size())

# Preprocessor for the configured preset, set to CFG.sequence_length.
preprocessor = keras_nlp.models.DebertaV3Preprocessor.from_preset(
    preset=CFG.preset, sequence_length=CFG.sequence_length
)

# Sanity check: run the first essay through it and print each output's shape.
inp = preprocessor(df["text"].iloc[0])
for k, v in inp.items():
    print(f"{k} : {v.shape}")

def preprocess_fn(text, label=None):
    """Run `text` through the module-level `preprocessor`.

    Returns the preprocessed text alone when `label` is None, otherwise a
    `(preprocessed_text, label)` pair.
    """
    features = preprocessor(text)
    if label is None:
        return features
    return (features, label)

def build_dataset(texts, labels=None, batch_size=32,
                  cache=False, drop_remainder=True,
                  repeat=False, shuffle=1024):
    """Build a batched `tf.data` pipeline over texts (and optional labels).

    Args:
        texts: Sequence of raw texts.
        labels: Optional sequence of labels aligned with `texts`.
        batch_size: Number of examples per batch.
        cache: If truthy, cache the raw slices before preprocessing.
        drop_remainder: Passed to `Dataset.batch`.
        repeat: If truthy, repeat the dataset indefinitely.
        shuffle: Shuffle buffer size. Falsy disables shuffling; `True` selects
            the default buffer of 1024 elements.

    Returns:
        The prefetched `tf.data.Dataset`, yielding what `preprocess_fn` returns.
    """
    AUTO = tf.data.AUTOTUNE
    slices = (texts,) if labels is None else (texts, labels)
    ds = tf.data.Dataset.from_tensor_slices(slices)
    if cache:
        ds = ds.cache()
    ds = ds.map(preprocess_fn, num_parallel_calls=AUTO)
    if repeat:
        ds = ds.repeat()
    opt = tf.data.Options()

    if shuffle:
        # Callers pass `shuffle=True`; forwarding that as the buffer size would
        # mean a buffer of 1 element (True == 1), i.e. no shuffling at all.
        buffer_size = 1024 if shuffle is True else int(shuffle)
        ds = ds.shuffle(buffer_size, seed=CFG.seed)
        opt.experimental_deterministic = False

    ds = ds.with_options(opt)
    ds = ds.batch(batch_size, drop_remainder=drop_remainder)
    ds = ds.prefetch(AUTO)

    return ds

def get_datasets(fold):
    """Build the training and validation pipelines for one fold.

    Rows whose `fold` differs from the given one form the training split; rows
    whose `fold` equals it form the validation split. Both frames are
    re-ordered with `sample(frac=1)` before the datasets are built.

    Returns:
        ((train_ds, train_df), (valid_ds, valid_df))
    """
    global_batch = CFG.batch_size * CFG.replicas

    # Training split: repeated and shuffled.
    train_df = df.loc[df.fold != fold].sample(frac=1)
    train_ds = build_dataset(
        train_df.text.tolist(),
        train_df.label.tolist(),
        batch_size=global_batch,
        cache=CFG.cache,
        shuffle=True,
        drop_remainder=True,
        repeat=True,
    )

    # Validation split: a single pass, in order. The batch size is capped at
    # the number of validation rows.
    valid_df = df.loc[df.fold == fold].sample(frac=1)
    valid_ds = build_dataset(
        valid_df.text.tolist(),
        valid_df.label.tolist(),
        batch_size=min(global_batch, len(valid_df)),
        cache=CFG.cache,
        shuffle=False,
        drop_remainder=True,
        repeat=False,
    )

    return (train_ds, train_df), (valid_ds, valid_df)

In [None]:
import wandb

# Log in with the API key stored in Kaggle secrets when it is available;
# otherwise (not on Kaggle, secret missing, login rejected) fall back to an
# anonymous W&B session.
try:
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    api_key = user_secrets.get_secret("WANDB")
    wandb.login(key=api_key)
    anonymous = None
except Exception:
    # `Exception` (not a bare except) so Ctrl-C / SystemExit still propagate.
    anonymous = 'must'
    wandb.login(anonymous=anonymous, relogin=True)

from wandb.keras import WandbMetricsLogger
from wandb.keras import WandbModelCheckpoint

def wandb_init(fold):
    """Start a W&B run for one fold and return it.

    The run config is every attribute of `CFG` whose name has no double
    underscore, plus the fold number.
    """
    config = {
        key: value
        for key, value in vars(CFG).items()
        if '__' not in key
    }
    config["fold"] = int(fold)

    run_name = f"fold-{fold}|max_seq-{CFG.sequence_length}|model-{CFG.preset}"
    return wandb.init(
        project="llm-fake-text",
        name=run_name,
        config=config,
        group=CFG.comment,
        save_code=True,
    )

def log_wandb():
    """Log the best validation metrics of the fold that just finished to W&B.

    Reads the module-level `best_auc`, `best_loss` and `best_epoch` values set
    by the training loop.
    """
    # The original read `best_acc`, which is never defined anywhere; the
    # training loop stores the value as `best_auc`.
    wandb.log({'best_auc': best_auc, 'best_loss': best_loss, 'best_epoch': best_epoch})

def get_wb_callbacks(fold):
    """Return the W&B callbacks for one fold: [metrics logger, checkpoint].

    The checkpoint callback targets `fold{fold}.keras`, monitors `val_auc`
    (mode 'max') and saves only the best full model.
    """
    checkpoint_cb = WandbModelCheckpoint(
        f'fold{fold}.keras',
        monitor='val_auc',
        save_best_only=True,
        save_weights_only=False,
        mode='max',
    )
    metrics_cb = WandbMetricsLogger()
    return [metrics_cb, checkpoint_cb]

In [None]:

import math

def get_lr_callback(batch_size=8, mode='cos', epochs=10, plot=False):
    """Create a per-epoch learning-rate scheduler callback.

    The rate ramps linearly from `lr_start` to `lr_max` (which scales with
    `batch_size`) during the first `lr_ramp_ep` epochs, optionally holds for
    `lr_sus_ep` epochs, and then decays according to `mode`.

    Args:
        batch_size: Global batch size; the peak rate is `0.5e-6 * batch_size`.
        mode: Decay shape after the ramp: 'exp', 'step' or 'cos'.
        epochs: Horizon used by the cosine decay and by the preview plot.
        plot: If True, plot the schedule for `epochs` epochs.

    Returns:
        A `keras.callbacks.LearningRateScheduler` wrapping the schedule.

    Raises:
        ValueError: If `mode` is not one of the supported decay shapes.
    """
    # Fail here instead of with an UnboundLocalError once training has passed
    # the ramp-up epochs.
    if mode not in ('exp', 'step', 'cos'):
        raise ValueError(f"Unsupported mode {mode!r}; expected 'exp', 'step' or 'cos'")

    lr_start, lr_max, lr_min = 0.6e-6, 0.5e-6 * batch_size, 0.3e-6
    lr_ramp_ep, lr_sus_ep, lr_decay = 1, 0, 0.75

    def lrfn(epoch):
        """Learning rate for the given zero-based epoch."""
        if epoch < lr_ramp_ep:
            return (lr_max - lr_start) / lr_ramp_ep * epoch + lr_start
        if epoch < lr_ramp_ep + lr_sus_ep:
            return lr_max

        decay_epoch_index = epoch - lr_ramp_ep - lr_sus_ep
        if mode == 'exp':
            return (lr_max - lr_min) * lr_decay**decay_epoch_index + lr_min
        if mode == 'step':
            return lr_max * lr_decay**(decay_epoch_index // 2)

        # mode == 'cos': half-cosine from lr_max towards lr_min. The "+ 3"
        # lengthens the horizon, so the last epoch stays above lr_min.
        decay_total_epochs = epochs - lr_ramp_ep - lr_sus_ep + 3
        phase = math.pi * decay_epoch_index / decay_total_epochs
        return (lr_max - lr_min) * 0.5 * (1 + math.cos(phase)) + lr_min

    if plot:
        plt.figure(figsize=(10, 5))
        plt.plot(np.arange(epochs), [lrfn(epoch) for epoch in np.arange(epochs)], marker='o')
        plt.xlabel('epoch'); plt.ylabel('lr')
        plt.title('LR Scheduler')
        plt.show()

    return keras.callbacks.LearningRateScheduler(lrfn, verbose=False)

# Preview the schedule for the global batch size. This uses the function's
# default `epochs=10`, not CFG.epochs.
_=get_lr_callback(CFG.batch_size*CFG.replicas, plot=True)

def get_callbacks(fold):
    """Assemble the training callbacks for one fold.

    Always returns the learning-rate scheduler followed by a checkpoint that
    keeps the best `val_auc` model in `fold{fold}.keras`; the W&B callbacks
    are appended when `CFG.wandb` is enabled.
    """
    # NOTE(review): the scheduler is built with its default `epochs` and `mode`,
    # not CFG.epochs / CFG.scheduler - confirm that this is intended.
    lr_cb = get_lr_callback(CFG.batch_size * CFG.replicas)
    ckpt_cb = keras.callbacks.ModelCheckpoint(
        f'fold{fold}.keras',
        monitor='val_auc',
        save_best_only=True,
        save_weights_only=False,
        mode='max',
    )

    callbacks = [lr_cb, ckpt_cb]
    if CFG.wandb:
        callbacks += get_wb_callbacks(fold)
    return callbacks

In [None]:
def build_model():
    """Build and compile the binary classifier.

    A `DebertaV3Classifier` loaded from `CFG.preset` with one output unit is
    wrapped so that its output goes through a sigmoid. The model is compiled
    with AdamW, label-smoothed binary cross-entropy and an AUC metric named
    "auc".
    """
    backbone = keras_nlp.models.DebertaV3Classifier.from_preset(
        CFG.preset, preprocessor=None, num_classes=1
    )

    # Reuse the classifier's own inputs and put a sigmoid on its single output.
    model_inputs = backbone.input
    raw_scores = backbone(model_inputs)
    probabilities = keras.layers.Activation("sigmoid")(raw_scores)
    model = keras.Model(model_inputs, probabilities)

    model.compile(
        optimizer=keras.optimizers.AdamW(5e-6),
        loss=keras.losses.BinaryCrossentropy(label_smoothing=0.02),
        metrics=[keras.metrics.AUC(name="auc")],
        jit_compile=True,
    )
    return model

# Build one model up front only to inspect it; the training loop below builds
# a fresh model for every fold.
model = build_model()

model.summary()

keras.utils.plot_model(model, show_shapes=True)

In [None]:
# Training
for fold in CFG.selected_folds:
    # Optional experiment tracking for this fold.
    if CFG.wandb:
        run = wandb_init(fold)

    # Data pipelines and callbacks for this fold.
    (train_ds, train_df), (valid_ds, valid_df) = get_datasets(fold)
    callbacks = get_callbacks(fold)

    print('#' * 50)
    print(f'\tFold: {fold + 1} | Model: {CFG.preset}\n\tBatch Size: {CFG.batch_size * CFG.replicas} | Scheduler: {CFG.scheduler}')
    print(f'\tNum Train: {len(train_df)} | Num Valid: {len(valid_df)}')
    print('#' * 50)

    # Start from a clean backend state and build a fresh model for this fold.
    K.clear_session()
    with strategy.scope():
        model = build_model()

    # The training dataset repeats forever, so the epoch length is given
    # explicitly as the number of full global batches in the training frame.
    history = model.fit(
        train_ds,
        validation_data=valid_ds,
        epochs=CFG.epochs,
        steps_per_epoch=int(len(train_df) / CFG.batch_size / CFG.replicas),
        callbacks=callbacks,
    )

    # Pick the epoch with the highest validation AUC and read its metrics.
    fold_metrics = history.history
    best_epoch = np.argmax(fold_metrics['val_auc'])
    best_auc = fold_metrics['val_auc'][best_epoch]
    best_loss = fold_metrics['val_loss'][best_epoch]

    print(f'\n{"=" * 17} FOLD {fold} RESULTS {"=" * 17}')
    print(f'>>>> BEST Loss  : {best_loss:.3f}\n>>>> BEST AUC   : {best_auc:.3f}\n>>>> BEST Epoch : {best_epoch}')
    print('=' * 50)

    if CFG.wandb:
        log_wandb()
        wandb.run.finish()

    print("\n\n")

In [None]:
# Predictions
# NOTE: `model`, `valid_ds` and `valid_df` are the objects left over from the
# last fold of the training loop (the model is in its final-epoch state, not
# necessarily the best checkpoint).
# `valid_ds` is already batched, so no `batch_size` is passed to predict().
predictions = model.predict(valid_ds, verbose=1)

# reshape(-1) keeps a 1-D array even if there is only a single prediction.
pred_answers = (predictions > 0.5).astype(int).reshape(-1)
# The validation dataset drops its last partial batch, so there can be fewer
# predictions than rows; keep only the labels that have a prediction.
true_answers = valid_df.label.values[:len(pred_answers)]

print("# Predictions\n")
for i in range(min(5, len(pred_answers))):
    row = valid_df.iloc[i]
    text  = row.text
    pred_answer = CFG.label2name[int(pred_answers[i])]
    true_answer = CFG.label2name[int(true_answers[i])]
    print(f"Txt {i+1}:\n{text[:100]} .... {text[-100:]}\n")
    print(f"True: {true_answer}\n")
    print(f"Pred: {pred_answer}\n")
    print("-"*90, "\n")