In [1]:
# Load packages
import tensorflow as tf
import keras_nlp
import wandb
import numpy as np
import pandas as pd
import os
import kagglehub
from sklearn.model_selection import train_test_split
from tensorflow import keras


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Initialize Weights & Biases
wandb.init(project='llm_human_classifier')

# Download and load dataset
path = kagglehub.dataset_download("sunilthite/llm-detect-ai-generated-text-dataset")
df = pd.read_csv(os.path.join(path, "Training_Essay_Data.csv"))

# Standardize label column name
df = df.rename(columns={'generated': 'label'})

print(f"Dataset shape: {df.shape}")
print(f"Column names: {df.columns.tolist()}")
print(f"Label distribution: {df['label'].value_counts()}")

# Train/Val/Test split: 70/15/15 with stratification
train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42, stratify=df['label'])
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42, stratify=temp_df['label'])

# Extract input text and labels for each split
X_train, y_train = train_df["text"].tolist(), train_df["label"].tolist()
X_val, y_val = val_df["text"].tolist(), val_df["label"].tolist()
X_test, y_test = test_df["text"].tolist(), test_df["label"].tolist()


wandb: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
wandb: Currently logged in as: thompsonbrennan66 (thompsonbrennan66-united-states-air-force) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin


Dataset shape: (29145, 2)
Column names: ['text', 'label']
Label distribution: label
0    17508
1    11637
Name: count, dtype: int64


In [3]:
from keras_nlp.models import DebertaV3Tokenizer

preprocessor = DebertaV3Tokenizer.from_preset("hf://microsoft/deberta-v3-base")


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


ValueError: KerasHub has no converter for huggingface/transformers models with model type `'deberta-v2'`.

In [None]:
# Tokenizer and Model Configuration
preprocessor = keras_nlp.models.DebertaV3Tokenizer.from_preset("deberta_v3_base")
max_length = 512

train_encodings = preprocessor(train_texts.to_list(), max_length=max_length)
test_encodings = preprocessor(test_texts.to_list(), max_length=max_length)

train_labels = np.array(train_labels)
test_labels = np.array(test_labels)


In [None]:
# Model Architecture
inputs = keras.Input(shape=(), dtype=tf.string)
encoded_inputs = preprocessor(inputs)
backbone = keras_nlp.models.DebertaV3Backbone.from_preset("deberta_v3_base")

x = backbone(encoded_inputs)["pooled_output"]
x = keras.layers.Dropout(0.2)(x)
x = keras.layers.Dense(1, activation='sigmoid')(x)

model = keras.Model(inputs, x)


In [None]:
# Compile Model
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=2e-5),
    loss="binary_crossentropy",
    metrics=["accuracy"]
)


In [None]:
# Callbacks
callbacks = [
    keras.callbacks.ModelCheckpoint("best_model.keras", save_best_only=True),
    keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
]


In [None]:
# Training
history = model.fit(
    train_texts, train_labels,
    validation_data=(test_texts, test_labels),
    epochs=5,
    batch_size=16,
    callbacks=callbacks
)


In [None]:
# Logging to Weights & Biases
wandb.log({"final_accuracy": history.history["val_accuracy"][-1]})

# Save Model
model.save("deberta_llm_classifier.keras")
