In [1]:
!pip install transformers datasets evaluate scikit-learn torch peft



In [2]:
!pip install tf-keras

Collecting tf-keras
  Using cached tf_keras-2.19.0-py3-none-any.whl.metadata (1.8 kB)
Collecting tensorflow<2.20,>=2.19 (from tf-keras)
  Downloading tensorflow-2.19.0-cp312-cp312-win_amd64.whl.metadata (4.1 kB)
Collecting tensorboard~=2.19.0 (from tensorflow<2.20,>=2.19->tf-keras)
  Downloading tensorboard-2.19.0-py3-none-any.whl.metadata (1.8 kB)
Collecting ml-dtypes<1.0.0,>=0.5.1 (from tensorflow<2.20,>=2.19->tf-keras)
  Downloading ml_dtypes-0.5.1-cp312-cp312-win_amd64.whl.metadata (22 kB)
Downloading tf_keras-2.19.0-py3-none-any.whl (1.7 MB)
   ---------------------------------------- 0.0/1.7 MB ? eta -:--:--
   ----- ---------------------------------- 0.2/1.7 MB 7.0 MB/s eta 0:00:01
   ----------- ---------------------------- 0.5/1.7 MB 6.2 MB/s eta 0:00:01
   ------------------ --------------------- 0.8/1.7 MB 6.5 MB/s eta 0:00:01
   ------------------------ --------------- 1.1/1.7 MB 6.2 MB/s eta 0:00:01
   ----------------------------- ---------- 1.3/1.7 MB 5.8 MB/s eta 0:00:0

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-intel 2.18.0 requires ml-dtypes<0.5.0,>=0.4.0, but you have ml-dtypes 0.5.1 which is incompatible.
tensorflow-intel 2.18.0 requires tensorboard<2.19,>=2.18, but you have tensorboard 2.19.0 which is incompatible.


In [3]:
import kagglehub
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm


### The cell below was added

In [4]:
import os
os.environ["TRANSFORMERS_NO_TF"] = "1"

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer
)
import evaluate
import numpy as np





In [5]:
# Download the Kaggle dataset with kagglehub. `path` receives the return value
# of dataset_download, which the following cells treat as the local directory
# holding the dataset files (it is passed to os.listdir and os.path.join).
path = kagglehub.dataset_download("sunilthite/llm-detect-ai-generated-text-dataset")

In [6]:
# Report the download location, then the file names found inside it.
print("Downloaded to:", path)
downloaded_files = os.listdir(path)
print(downloaded_files)

Downloaded to: C:\Users\Brennan Thompson\.cache\kagglehub\datasets\sunilthite\llm-detect-ai-generated-text-dataset\versions\1
['Training_Essay_Data.csv']


In [7]:
# Read the CSV found in the download directory and preview the first rows.
csv_path = os.path.join(path, "Training_Essay_Data.csv")
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,text,generated
0,Car-free cities have become a subject of incre...,1
1,"Car Free Cities Car-free cities, a concept ga...",1
2,A Sustainable Urban Future Car-free cities ...,1
3,Pioneering Sustainable Urban Living In an e...,1
4,The Path to Sustainable Urban Living In an ...,1


In [8]:
# Use "label" as the target column name, then summarise the frame.
df = df.rename(columns={"generated": "label"})
label_counts = df["label"].value_counts()
print(f"Dataset shape: {df.shape}")
print(f"Column names: {df.columns.tolist()}")
print(f"Label distribution: {label_counts}")

Dataset shape: (29145, 2)
Column names: ['text', 'label']
Label distribution: label
0    17508
1    11637
Name: count, dtype: int64


In [9]:
# Hold out 30% of the rows, then halve that hold-out into validation and test.
# Both splits are stratified on the label and use a fixed seed.
train_df, temp_df = train_test_split(
    df,
    test_size=0.3,
    stratify=df["label"],
    random_state=42,
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df["label"],
    random_state=42,
)

In [10]:
# One line per split with its row count.
for split_name, split_df in (
    ("Train", train_df),
    ("Validation", val_df),
    ("Test", test_df),
):
    print(f"{split_name} set: {len(split_df)} samples")

Train set: 20401 samples
Validation set: 4372 samples
Test set: 4372 samples


In [11]:
# Class counts per split, printed in the order train, validation, test.
for split_df in (train_df, val_df, test_df):
    print(f"Label distribution: {split_df['label'].value_counts()}")

Label distribution: label
0    12255
1     8146
Name: count, dtype: int64
Label distribution: label
0    2626
1    1746
Name: count, dtype: int64
Label distribution: label
0    2627
1    1745
Name: count, dtype: int64


In [12]:
# Convert to Hugging Face datasets.
# The frames come out of train_test_split with a shuffled, non-default index;
# preserve_index=False keeps that index from being stored as an extra
# "__index_level_0__" column next to "text" and "label".
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
val_dataset = Dataset.from_pandas(val_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

### Everything below here was added 

In [13]:
# Tokenizing and Processing

In [14]:
# Load the pretrained tokenizer for the "bert-base-uncased" checkpoint; the
# same checkpoint name is used when the classification model is loaded.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [15]:
# Batch tokenization helper used with Dataset.map
def preprocess_function(examples):
    """Tokenize the "text" field of a batch.

    Every sequence is truncated and padded to a fixed length of 128 tokens.
    Returns whatever the module-level ``tokenizer`` returns for the batch.
    """
    texts = examples["text"]
    encoded = tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=128,
    )
    return encoded

In [16]:
# Tokenize the train, validation and test datasets (in that order), in batches.
tokenized_train, tokenized_val, tokenized_test = [
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
]


Map: 100%|██████████| 20401/20401 [00:14<00:00, 1453.50 examples/s]
Map: 100%|██████████| 4372/4372 [00:02<00:00, 1461.48 examples/s]
Map: 100%|██████████| 4372/4372 [00:03<00:00, 1433.49 examples/s]


In [17]:
# The raw "text" column is no longer needed once the token columns exist.
tokenized_train, tokenized_val, tokenized_test = (
    tokenized_train.remove_columns(["text"]),
    tokenized_val.remove_columns(["text"]),
    tokenized_test.remove_columns(["text"]),
)

In [18]:
# Rename the target column from "label" to "labels" in every split.
tokenized_train = tokenized_train.rename_column(
    original_column_name="label", new_column_name="labels"
)
tokenized_val = tokenized_val.rename_column(
    original_column_name="label", new_column_name="labels"
)
tokenized_test = tokenized_test.rename_column(
    original_column_name="label", new_column_name="labels"
)

In [19]:
# Switch every split to the "torch" output format (set_format works in place).
for tokenized_split in (tokenized_train, tokenized_val, tokenized_test):
    tokenized_split.set_format("torch")

### Metrics

In [20]:
# Load the two metric objects once at module level; compute_metrics calls
# their .compute() method on every evaluation pass.
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for the Trainer.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class is the
            argmax over the last axis of ``logits``.

    Returns:
        dict: ``{"accuracy": <scalar>, "f1": <scalar>}``. Each metric's
        ``compute`` call returns a dict keyed by the metric name, so the scalar
        is pulled out here. Returning those dicts nested would leave
        ``eval_accuracy`` as a dict, which can neither be compared for
        ``metric_for_best_model`` nor formatted with ``:.4f``.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    accuracy_result = accuracy.compute(predictions=preds, references=labels)
    f1_result = f1.compute(predictions=preds, references=labels, average="weighted")
    return {
        "accuracy": accuracy_result["accuracy"],
        "f1": f1_result["f1"],
    }

Downloading builder script: 100%|██████████| 4.20k/4.20k [00:00<00:00, 2.08MB/s]
Downloading builder script: 100%|██████████| 6.79k/6.79k [00:00<?, ?B/s]


# Model + Trainer Setup

In [21]:
# Load the "bert-base-uncased" checkpoint as a sequence classifier with two
# output labels, and build a DataCollatorWithPadding around the tokenizer.
# NOTE(review): the saved run of this cell ended in an OSError claiming the
# checkpoint has no pytorch_model.bin. Presumably an incomplete download or a
# stale local cache rather than a problem with these two lines — confirm by
# re-running with a clean Hugging Face cache before training.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


OSError: bert-base-uncased does not appear to have a file named pytorch_model.bin but there is a file for TensorFlow weights. Use `from_tf=True` to load this model from those weights.

In [None]:
# Fine-tuning configuration. Evaluation and checkpointing both run once per
# epoch, and load_best_model_at_end selects the checkpoint by the "accuracy"
# metric.
training_args = TrainingArguments(
    output_dir="./results_bert_kagglehub",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    # `evaluation_strategy` was renamed to `eval_strategy` (transformers 4.41);
    # current releases reject the old keyword with a TypeError.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    logging_dir="./logs_bert_kagglehub",
    report_to="none"
)

In [None]:
# Wire the model, training arguments, tokenized train/validation splits,
# collator and metric function into a Trainer.
# NOTE(review): recent transformers releases deprecate the `tokenizer=`
# argument in favour of `processing_class=` — confirm against the installed
# version before upgrading.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


# Train + Evaluate

In [None]:
# Fine-tune the model.
trainer.train()

# Score the held-out test split and print each metric to four decimals.
test_metrics = trainer.evaluate(tokenized_test)
print("\nTest set metrics:")
for metric_name in test_metrics:
    print(f"{metric_name}: {test_metrics[metric_name]:.4f}")

# Raw code (the notebook cells from In [4] and In [14] onward, combined into one script):
import os
os.environ["TRANSFORMERS_NO_TF"] = "1"

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer
)
import evaluate
import numpy as np

# ----------------------------
# TOKENIZATION + PREPROCESSING
# ----------------------------

# Load the pretrained tokenizer for the "bert-base-uncased" checkpoint; the
# same checkpoint name is used when the classification model is loaded.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Batch tokenization helper used with Dataset.map
def preprocess_function(examples):
    """Tokenize the "text" field of a batch.

    Every sequence is truncated and padded to a fixed length of 128 tokens.
    Returns whatever the module-level ``tokenizer`` returns for the batch.
    """
    texts = examples["text"]
    encoded = tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=128,
    )
    return encoded

def _prepare_split(split):
    """Tokenize one dataset split and shape it for training.

    Steps: batched tokenization, removal of the raw "text" column, renaming
    "label" to "labels", and switching the output format to "torch".
    """
    prepared = split.map(preprocess_function, batched=True)
    prepared = prepared.remove_columns(["text"])
    prepared = prepared.rename_column("label", "labels")
    prepared.set_format("torch")
    return prepared


# Same pipeline for all three splits, processed in the order train, val, test.
tokenized_train = _prepare_split(train_dataset)
tokenized_val = _prepare_split(val_dataset)
tokenized_test = _prepare_split(test_dataset)

# ----------------------------
# METRICS
# ----------------------------

# Load the two metric objects once at module level; compute_metrics calls
# their .compute() method on every evaluation pass.
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for the Trainer.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class is the
            argmax over the last axis of ``logits``.

    Returns:
        dict: ``{"accuracy": <scalar>, "f1": <scalar>}``. Each metric's
        ``compute`` call returns a dict keyed by the metric name, so the scalar
        is pulled out here. Returning those dicts nested would leave
        ``eval_accuracy`` as a dict, which can neither be compared for
        ``metric_for_best_model`` nor formatted with ``:.4f``.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    accuracy_result = accuracy.compute(predictions=preds, references=labels)
    f1_result = f1.compute(predictions=preds, references=labels, average="weighted")
    return {
        "accuracy": accuracy_result["accuracy"],
        "f1": f1_result["f1"],
    }

# ----------------------------
# MODEL + TRAINER SETUP
# ----------------------------

# Load the "bert-base-uncased" checkpoint as a sequence classifier with two
# output labels, and build a DataCollatorWithPadding around the tokenizer.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Fine-tuning configuration. Evaluation and checkpointing both run once per
# epoch, and load_best_model_at_end selects the checkpoint by the "accuracy"
# metric.
training_args = TrainingArguments(
    output_dir="./results_bert_kagglehub",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    # `evaluation_strategy` was renamed to `eval_strategy` (transformers 4.41);
    # current releases reject the old keyword with a TypeError.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    logging_dir="./logs_bert_kagglehub",
    report_to="none"
)

# Wire the model, arguments, tokenized train/validation splits, collator and
# metric function into a Trainer.
# NOTE(review): recent transformers releases deprecate the `tokenizer=`
# argument in favour of `processing_class=` — confirm against the installed
# version before upgrading.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# ----------------------------
# TRAIN & EVALUATE
# ----------------------------

# Fine-tune the model.
trainer.train()

# Score the held-out test split and print each metric to four decimals.
test_metrics = trainer.evaluate(tokenized_test)
print("\nTest set metrics:")
for metric_name in test_metrics:
    print(f"{metric_name}: {test_metrics[metric_name]:.4f}")
