## This notebook develops a classification pipeline for detecting LLM-generated text.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
from tqdm import tqdm
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
from torch.optim import AdamW
from transformers import get_scheduler
from sklearn.model_selection import train_test_split
import datasets
from huggingface_hub import login
from sklearn import metrics
import ast
import nltk
# Fetch the NLTK 'punkt' package; nltk.sent_tokenize is used later in this
# notebook to count sentences.
nltk.download('punkt')

# Authenticate with the Hugging Face Hub before the datasets are loaded.
# NOTE(review): presumably needed because the datasets are private — confirm.
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

[nltk_data] Downloading package punkt to /home/anra7539/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Importing datasets and preprocessing

In [2]:
# Pull both corpora from the Hugging Face Hub.
preliminary_data = datasets.load_dataset("AnkushRaut216/llm_generated_text")
argugpt = datasets.load_dataset("AnkushRaut216/argugpt_data")

# From here on, work with the 'train' splits as pandas DataFrames.
preliminary_data_pd = preliminary_data['train'].to_pandas()

# ArguGPT rows: de-duplicate on the essay text, then give every remaining row
# the label generated = 1.
argugpt_pd = (
    argugpt['train']
    .to_pandas()
    .drop_duplicates(subset=['text'])
    .assign(generated=1)
)

### Defining the classifier

In [5]:
# Checkpoint that is fine-tuned into the detector in the training cells below.
model_name = "distilbert/distilbert-base-uncased-finetuned-sst-2-english"

device = "cuda"

# Load the weights and place them on the target device exactly once. The model
# was previously placed twice (a `device_map` argument and a `.to(device)`
# call); a single `.to(device)` gives the same placement.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    cache_dir='/scratch/alpine/anra7539',
).to(device)

# Truncation is requested on every tokenizer(...) call in this notebook, so it
# is not repeated at load time.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Same checkpoint id as the model; reuse the variable instead of a second literal.
config = AutoConfig.from_pretrained(model_name)

In [6]:
# Stack both sources into one frame, keeping only the essay text and its label.
shared_columns = ['text', 'generated']
train_essays = pd.concat(
    [preliminary_data_pd[shared_columns], argugpt_pd[shared_columns]],
    ignore_index=True,
)

In [7]:
# Hold out 10% of all essays as the test set, then split the remainder 75/25
# into train and validation. Both splits are stratified on the label and use
# the same seed.
train_val, test = train_test_split(
    train_essays,
    test_size=0.1,
    stratify=train_essays['generated'],
    random_state=2024,
)
train, val = train_test_split(
    train_val,
    test_size=0.25,
    stratify=train_val['generated'],
    random_state=2024,
)

# Give every split a fresh 0..n-1 index.
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)
val = val.reset_index(drop=True)

In [8]:
# login()
# datasets.Dataset.from_pandas(test).push_to_hub("AnkushRaut216/test_essays", private = True)

In [9]:
BATCH_SIZE = 32


def _encode_split(frame):
    """Tokenize one split and bundle it for batching.

    Args:
        frame: DataFrame with a 'text' column (essays) and a 'generated'
            column (integer labels).

    Returns:
        Tuple (tokenized_inputs, labels, dataset): the tokenizer output as
        PyTorch tensors, the label tensor, and a TensorDataset yielding
        (input_ids, attention_mask, label).
    """
    encoded = tokenizer(list(frame['text']), padding=True, truncation=True,
                        return_tensors="pt")
    labels = torch.tensor(list(frame['generated']))
    dataset = TensorDataset(encoded["input_ids"], encoded["attention_mask"], labels)
    return encoded, labels, dataset


tokenized_inputs_train, labels_train, dataset_train = _encode_split(train)
tokenized_inputs_val, labels_val, dataset_val = _encode_split(val)

# Training batches are reshuffled every epoch.
dataloader = DataLoader(dataset_train, batch_size=BATCH_SIZE, shuffle=True)

# Validation batches stay in a fixed order. The training loop accumulates
# per-batch losses, so shuffling would change which samples fall into the
# (possibly smaller) last batch and add noise to the early-stopping signal.
val_dataloader = DataLoader(dataset_val, batch_size=BATCH_SIZE, shuffle=False)

In [53]:
# Training budget and early-stopping bookkeeping.
num_epochs = 10
early_stopping_rounds = 3
no_improvement_count = 0
best_validation_loss = float('inf')

# One scheduler step per optimizer step: batches per epoch times epochs.
num_training_steps = len(dataloader) * num_epochs

optimizer = AdamW(model.parameters(), lr=1e-5)

# "linear" schedule over the whole run, with no warmup steps.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

progress_bar = tqdm(range(num_training_steps))

  0%|          | 0/370 [00:00<?, ?it/s]

In [54]:
# Fine-tune with early stopping: after each epoch the model is evaluated on the
# validation split, the best checkpoint so far is written to disk, and training
# stops once `early_stopping_rounds` evaluations have passed without a new best.
for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    for input_ids, attention_mask, label in dataloader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        label = label.to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=label)
        outputs.loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # ---- validation pass ----
    model.eval()
    weighted_loss_sum = 0.0
    n_val_examples = 0
    with torch.no_grad():
        for input_ids, attention_mask, label in val_dataloader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            label = label.to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=label)

            # The model's loss is averaged over the batch; weight it by the
            # batch size so that a smaller last batch is not over-represented.
            batch_size = label.size(0)
            weighted_loss_sum += outputs.loss.item() * batch_size
            n_val_examples += batch_size

    # Mean loss per validation example (guarded against an empty loader).
    val_loss = weighted_loss_sum / max(n_val_examples, 1)

    if val_loss < best_validation_loss:
        best_validation_loss = val_loss
        no_improvement_count = 0

        # New best: persist model and tokenizer together so the checkpoint
        # directory can be reloaded on its own.
        model.save_pretrained('/scratch/alpine/anra7539/llm_detector_v7/best_model')
        tokenizer.save_pretrained('/scratch/alpine/anra7539/llm_detector_v7/best_model')
    else:
        no_improvement_count += 1

    if no_improvement_count >= early_stopping_rounds:
        print(f'Early stopping after {epoch+1} epochs with no improvement.')
        break

# Release the progress bar whether training finished or stopped early.
progress_bar.close()
            


 70%|███████   | 259/370 [01:41<00:38,  2.92it/s]

Early stopping after 7 epochs with no improvement.


In [10]:
# Reload the checkpoint that the training loop saved whenever the validation
# loss improved.
load_path = "/scratch/alpine/anra7539/llm_detector_v7/best_model"

# Place the model once with `.to(device)`; combining it with `device_map`
# placed the model twice.
model = AutoModelForSequenceClassification.from_pretrained(load_path).to(device)
model.eval()  # inference only from here on

# Truncation is requested on every tokenizer(...) call, so it is not repeated here.
tokenizer = AutoTokenizer.from_pretrained(load_path)

In [11]:
def _score_texts(texts):
    """Run the classifier on each text individually.

    Args:
        texts: iterable of strings.

    Returns:
        List with one tensor per text: the element-wise sigmoid of the model's
        logits for that text, left on the model's device.
    """
    scores = []
    with torch.no_grad():
        for text in texts:
            input_ids = tokenizer(text,
                                  padding=True,
                                  truncation=True,
                                  return_tensors='pt')["input_ids"].to(model.device)
            outputs = model(input_ids)
            # NOTE(review): the logits are squashed with an element-wise sigmoid.
            # If the classification head was trained with cross-entropy, softmax
            # over the classes would be the matching probability — confirm.
            # Kept as-is so that scores stay comparable with the threshold and
            # the other scoring cells in this notebook.
            scores.append(torch.nn.functional.sigmoid(outputs.logits))
    return scores


predictions_train = _score_texts(train.text)
predictions_val = _score_texts(val.text)
predictions_test = _score_texts(test.text)

In [12]:
# From each score tensor keep the entry at row 0, column 1 (the class-1 score)
# as a NumPy scalar.
preds_train = [scores.cpu().numpy()[0, 1] for scores in predictions_train]
preds_val = [scores.cpu().numpy()[0, 1] for scores in predictions_val]
preds_test = [scores.cpu().numpy()[0, 1] for scores in predictions_test]

In [13]:
# ROC curve and AUC of the class-1 scores on the training split.
# The thresholds array is bound to `_`, which the threshold-selection cell reads.
y_train = train['generated']
fpr, tpr, _ = metrics.roc_curve(y_train, preds_train)
auc = metrics.roc_auc_score(y_train, preds_train)

In [14]:
# Select the decision threshold on the training data: average every ROC
# threshold whose operating point has tpr above MIN_TPR and fpr below MAX_FPR.
MIN_TPR = 0.8
MAX_FPR = 0.2

# Recompute the curve here so the thresholds get an explicit name instead of
# being read from `_`, which IPython also uses for the last cell output.
fpr, tpr, roc_thresholds = metrics.roc_curve(train['generated'], preds_train)

threshold_df = pd.DataFrame({"threshold": roc_thresholds, "fpr": fpr, "tpr": tpr})
eligible = threshold_df[(threshold_df.tpr > MIN_TPR) & (threshold_df.fpr < MAX_FPR)]

# Averaging an empty selection would give NaN, and every `score >= NaN`
# comparison is False, so all essays would silently be predicted as class 0.
if eligible.empty:
    raise ValueError(
        f"No ROC threshold reaches tpr > {MIN_TPR} with fpr < {MAX_FPR} on the training data."
    )

threshold = np.mean(eligible.threshold)

In [15]:
# Turn scores into 0/1 predictions: 1 when the score reaches the threshold.
# The score lists are converted to arrays first; comparing a plain Python list
# with the threshold only works while the threshold is a NumPy scalar.
predictions_train_dis = np.where(np.asarray(preds_train) >= threshold, 1, 0)
predictions_val_dis = np.where(np.asarray(preds_val) >= threshold, 1, 0)
predictions_test_dis = np.where(np.asarray(preds_test) >= threshold, 1, 0)

In [16]:
# Accuracy of the thresholded predictions on the held-out test split.
test_accuracy = metrics.accuracy_score(test.generated, predictions_test_dis)
print(f"Test accuracy = {test_accuracy}")

Test accuracy = 0.9884393063583815


In [17]:
# Per-class precision / recall / F1 on the held-out test split.
test_report = metrics.classification_report(test.generated, predictions_test_dis)
print(test_report)

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       138
           1       1.00      0.94      0.97        35

    accuracy                           0.99       173
   macro avg       0.99      0.97      0.98       173
weighted avg       0.99      0.99      0.99       173



### Testing watermarked essays

In [18]:
# First set of watermarked essays, read from a CSV next to the notebook.
watermarked_csv = 'all_watermarked_essays.csv'
watermarked_essays = pd.read_csv(watermarked_csv)

In [20]:
# Each 'text' cell holds the string form of a Python sequence of strings:
# parse it with ast.literal_eval and join the pieces with single spaces.
watermarked_essays['text'] = watermarked_essays['text'].map(
    lambda serialized: " ".join(ast.literal_eval(serialized))
)

In [21]:
# Attach the original essay and its label from the test split, row by row.
# NOTE(review): this assumes the CSV rows are in the same order as `test` — confirm.
# Fail loudly on a row-count mismatch instead of letting index alignment fill
# the new columns with NaN.
if len(watermarked_essays) != len(test):
    raise ValueError(
        f"watermarked_essays has {len(watermarked_essays)} rows but test has "
        f"{len(test)}; the two cannot be aligned row by row."
    )

watermarked_essays['original_text'] = test['text'].to_numpy()
watermarked_essays['generated'] = test['generated'].to_numpy()

In [22]:
# Rows labelled generated == 1 keep their watermarked text; every other row
# falls back to the original essay.
is_generated = watermarked_essays['generated'] == 1
watermarked_essays['text'] = watermarked_essays['text'].where(
    is_generated, watermarked_essays['original_text']
)

In [23]:
# Score every essay on its own; each entry is the sigmoid of that essay's logits.
predictions_on_watermarked = []

with torch.no_grad():
    for essay in watermarked_essays.text:
        encoded = tokenizer(essay, padding=True, truncation=True, return_tensors='pt')
        essay_ids = encoded["input_ids"].to("cuda")

        logits = model(essay_ids).logits
        predictions_on_watermarked.append(torch.nn.functional.sigmoid(logits))

In [24]:
# Keep the entry at row 0, column 1 (the class-1 score) of every score tensor.
preds_watermarked = [scores.cpu().numpy()[0, 1] for scores in predictions_on_watermarked]

In [25]:
# 1 when the score reaches the threshold, else 0. The list is converted to an
# array first; comparing a plain Python list with the threshold only works
# while the threshold is a NumPy scalar.
predictions_watermarked_dis = np.where(np.asarray(preds_watermarked) >= threshold, 1, 0)

In [26]:
# Accuracy when AI-generated test essays are replaced by their watermarked versions.
watermarked_accuracy = metrics.accuracy_score(watermarked_essays.generated,
                                              predictions_watermarked_dis)
print(f"Accuracy on data with watermarked_text = {watermarked_accuracy}")

Accuracy on data with watermarked_text = 0.9248554913294798


In [27]:
# Per-class precision / recall / F1 on the watermarked variant of the test split.
watermarked_report = metrics.classification_report(watermarked_essays.generated,
                                                   predictions_watermarked_dis)
print(watermarked_report)

              precision    recall  f1-score   support

           0       0.91      1.00      0.96       138
           1       1.00      0.63      0.77        35

    accuracy                           0.92       173
   macro avg       0.96      0.81      0.86       173
weighted avg       0.93      0.92      0.92       173



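Recall for human-written essays (class 0) is unchanged at 1.00, whereas recall for AI-generated essays (class 1) drops from 0.94 to 0.63 once they are watermarked.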

In [28]:
def count_sentences(text):
    """Return how many sentences NLTK's sentence tokenizer finds in `text`."""
    return len(nltk.sent_tokenize(text))

# Compare essay length (in sentences) between the watermarked essays and the
# original AI-generated essays of the test split.
watermarked_ai_texts = watermarked_essays[watermarked_essays.generated == 1].text
original_ai_texts = test[test.generated == 1].text

mean_sentences_watermarked = np.mean([count_sentences(essay) for essay in watermarked_ai_texts])
mean_sentences_original = np.mean([count_sentences(essay) for essay in original_ai_texts])

print(f"Average number of sentences in watermarked essays = {mean_sentences_watermarked}")
print(f"Average number of sentences in original AI-generated essays = {mean_sentences_original}")

Average number of sentences in watermarked essays = 70.62857142857143
Average number of sentences in original AI-generated essays = 17.771428571428572


In [29]:
# Spread of the sentence counts for the same two groups of essays.
watermarked_ai_texts = watermarked_essays[watermarked_essays.generated == 1].text
original_ai_texts = test[test.generated == 1].text

std_sentences_watermarked = np.std([count_sentences(essay) for essay in watermarked_ai_texts])
std_sentences_original = np.std([count_sentences(essay) for essay in original_ai_texts])

print(f"Standard deviation of number of sentences in watermarked essays = {std_sentences_watermarked}")
print(f"Standard deviation of number of sentences in original AI-generated essays = {std_sentences_original}")

Standard deviation of number of sentences in watermarked essays = 89.88725818308994
Standard deviation of number of sentences in original AI-generated essays = 14.57803771683519


### Testing watermarked essays v2

In [30]:
# Second set of watermarked essays (v2); replaces the frame loaded for the first set.
watermarked_v2_csv = 'all_watermarked_essays_v2.csv'
watermarked_essays = pd.read_csv(watermarked_v2_csv)

In [31]:
# Parse each serialized sequence of strings with ast.literal_eval and join the
# pieces with single spaces.
watermarked_essays['text'] = watermarked_essays['text'].map(
    lambda serialized: " ".join(ast.literal_eval(serialized))
)

In [32]:
# Attach the original essay and its label from the test split, row by row.
# NOTE(review): this assumes the CSV rows are in the same order as `test` — confirm.
# Fail loudly on a row-count mismatch instead of letting index alignment fill
# the new columns with NaN.
if len(watermarked_essays) != len(test):
    raise ValueError(
        f"watermarked_essays has {len(watermarked_essays)} rows but test has "
        f"{len(test)}; the two cannot be aligned row by row."
    )

watermarked_essays['original_text'] = test['text'].to_numpy()
watermarked_essays['generated'] = test['generated'].to_numpy()

In [33]:
# Rows labelled generated == 1 keep their watermarked text; every other row
# falls back to the original essay.
is_generated = watermarked_essays['generated'] == 1
watermarked_essays['text'] = watermarked_essays['text'].where(
    is_generated, watermarked_essays['original_text']
)

In [34]:
# Score every v2 essay on its own; each entry is the sigmoid of that essay's logits.
predictions_on_watermarked = []

with torch.no_grad():
    for essay in watermarked_essays.text:
        encoded = tokenizer(essay, padding=True, truncation=True, return_tensors='pt')
        essay_ids = encoded["input_ids"].to("cuda")

        logits = model(essay_ids).logits
        predictions_on_watermarked.append(torch.nn.functional.sigmoid(logits))

In [35]:
# Keep the entry at row 0, column 1 (the class-1 score) of every score tensor.
preds_watermarked = [scores.cpu().numpy()[0, 1] for scores in predictions_on_watermarked]

In [36]:
# 1 when the score reaches the threshold, else 0. The list is converted to an
# array first; comparing a plain Python list with the threshold only works
# while the threshold is a NumPy scalar.
predictions_watermarked_dis = np.where(np.asarray(preds_watermarked) >= threshold, 1, 0)

In [37]:
# Accuracy when AI-generated test essays are replaced by their v2 watermarked versions.
watermarked_v2_accuracy = metrics.accuracy_score(watermarked_essays.generated,
                                                 predictions_watermarked_dis)
print(f"Accuracy on data with watermarked_text = {watermarked_v2_accuracy}")

Accuracy on data with watermarked_text = 0.9595375722543352


In [38]:
# Per-class precision / recall / F1 on the v2 watermarked variant of the test split.
watermarked_v2_report = metrics.classification_report(watermarked_essays.generated,
                                                      predictions_watermarked_dis)
print(watermarked_v2_report)

              precision    recall  f1-score   support

           0       0.95      1.00      0.98       138
           1       1.00      0.80      0.89        35

    accuracy                           0.96       173
   macro avg       0.98      0.90      0.93       173
weighted avg       0.96      0.96      0.96       173



In [39]:
# Compare essay length (in sentences) between the v2 watermarked essays and the
# original AI-generated essays of the test split.
watermarked_ai_texts = watermarked_essays[watermarked_essays.generated == 1].text
original_ai_texts = test[test.generated == 1].text

mean_sentences_watermarked = np.mean([count_sentences(essay) for essay in watermarked_ai_texts])
mean_sentences_original = np.mean([count_sentences(essay) for essay in original_ai_texts])

print(f"Average number of sentences in watermarked essays = {mean_sentences_watermarked}")
print(f"Average number of sentences in original AI-generated essays = {mean_sentences_original}")

Average number of sentences in watermarked essays = 23.02857142857143
Average number of sentences in original AI-generated essays = 17.771428571428572


In [40]:
# Spread of the sentence counts for the same two groups of essays.
watermarked_ai_texts = watermarked_essays[watermarked_essays.generated == 1].text
original_ai_texts = test[test.generated == 1].text

std_sentences_watermarked = np.std([count_sentences(essay) for essay in watermarked_ai_texts])
std_sentences_original = np.std([count_sentences(essay) for essay in original_ai_texts])

print(f"Standard deviation of number of sentences in watermarked essays = {std_sentences_watermarked}")
print(f"Standard deviation of number of sentences in original AI-generated essays = {std_sentences_original}")

Standard deviation of number of sentences in watermarked essays = 13.824379529937502
Standard deviation of number of sentences in original AI-generated essays = 14.57803771683519
