In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, log_loss

import torch
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [2]:
local_dir = Path(os.getcwd())
kaggle_dir = Path('kaggle/input/')

In [3]:
notebook_dir = local_dir

### Load data

In [4]:
data_dir = notebook_dir / 'llm-detect-ai-generated-text'
external_data_dir = notebook_dir / 'external-data'

In [5]:
# Competition data
train_essays = pd.read_csv(data_dir / 'train_essays.csv')
train_essays['dataset'] = 'competition'
train_prompts = pd.read_csv(data_dir / 'train_prompts.csv')
test_essays = pd.read_csv(data_dir / 'test_essays.csv')

In [6]:
# AI generated data
train_essays_ai = pd.read_csv(data_dir / 'train_essays_ai.csv')
train_essays_ai['dataset'] = 'ai-generated'

In [7]:
training_data = train_essays.copy()
training_data = pd.concat([training_data, train_essays_ai[['id', 'prompt_id', 'text', 'generated', 'dataset', 'model']]], axis=0)
training_data.rename(columns={'model': 'source'}, inplace=True)
training_data['source'].fillna('competition', inplace=True)

### External Data

In [8]:
# Persuade data
persuade_data = pd.read_csv(external_data_dir / 'Persuade' / 'persuade_2.csv')
persuade_data = persuade_data[['prompt_name', 'text']]
persuade_data.rename(columns={'prompt_name': 'prompt_id'}, inplace=True)
persuade_data['generated'] = 0
persuade_data['dataset'] = 'persuade'
persuade_data['source'] = persuade_data['prompt_id']

In [9]:
# School Work data
school_work_data = pd.read_csv(external_data_dir / 'SchoolWork' / 'school_work.csv')
school_work_data.drop(columns=['id'], inplace=True)

In [10]:
# H3 data
h3_data = pd.read_json(external_data_dir / 'H3' / 'H3.jsonl', lines=True)
h3_data = h3_data[['human_answers', 'chatgpt_answers', 'source']]
h3_data.rename(columns={'human_answers': 'human', 'chatgpt_answers': 'ai_generated'}, inplace=True)
h3_data['dataset'] = 'H3'

In [11]:
# M4 data
m4_files = os.listdir(external_data_dir / 'M4')
m4_data = []
for file in m4_files:
    if file.startswith('arxiv'):
        file_data = pd.read_json(external_data_dir / 'M4' / file, lines=True)
        file_data['file'] = file
        file_data['source'] = 'arxiv'
        m4_data.append(file_data)
    elif file.startswith('wikipedia'):
        file_data = pd.read_json(external_data_dir / 'M4' / file, lines=True)
        file_data['file'] = file
        file_data['source'] = 'wikipedia'
        m4_data.append(file_data)
        
m4_data = pd.concat(m4_data)
m4_data.rename(columns={'human_text': 'human', 'machine_text': 'ai_generated'}, inplace=True)

m4_data = m4_data[['human', 'ai_generated', 'source']]
m4_data.dropna(inplace=True)
m4_data['dataset'] = 'M4'

In [12]:
# Sherbold data
sherbold_data = pd.read_csv(external_data_dir / 'sherbold-chatgpt-student-essay-study' / 'data' / 'essays-without-markers.csv', sep=';', encoding='UTF-8')
sherbold_data.rename(columns={'Student': 'human', 'ChatGPT-4': 'ai_generated'}, inplace=True)
sherbold_data = sherbold_data[['human', 'ai_generated']]
sherbold_data['source'] = 'sherbold'
sherbold_data['dataset'] = 'sherbold'

In [13]:
def convert_list_to_string(df):
    new_df = df.copy()
    for col in new_df.columns:
        new_df[col] = new_df[col].apply(lambda x: ' '.join(x) if isinstance(x, list) else x)
        
    return new_df

def format_external_data(data):
    human = data[['human', 'source', 'dataset']].copy()
    human.rename(columns={'human': 'text'}, inplace=True)
    human['generated'] = 0
    
    ai = data[['ai_generated', 'source', 'dataset']].copy()
    ai.rename(columns={'ai_generated': 'text'}, inplace=True)
    ai['generated'] = 1
    
    joined = pd.concat([human, ai])
    joined['prompt_id'] = -1
    joined = joined[['prompt_id', 'text', 'generated', 'dataset']]
    joined = convert_list_to_string(joined)
    
    joined = joined[joined['text'] != '']
    joined.dropna(inplace=True)
    joined.reset_index(inplace=True, drop=True)
    
    return convert_list_to_string(joined)

In [14]:
training_data = pd.concat([training_data, persuade_data, school_work_data], axis=0)

for dataset in [h3_data, m4_data, sherbold_data]:
    training_data = pd.concat([training_data, format_external_data(dataset)], axis=0)
training_data.reset_index(drop=True, inplace=True)

## Train DeBERTa

In [23]:
MODEL_ARCHITECTURE = 'microsoft/deberta-v3-xsmall'
INPUT_LENGTH = 1024
EPOCHS = 5
BATCH_SIZE = 8
LEARNING_RATE = 1e-5

In [16]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL_ARCHITECTURE)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ARCHITECTURE)
device = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-xsmall and are newly initialized: ['classifier.bias', 'pooler.dense.bias', 'pooler.dense.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
class TorchDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [18]:
persuade_train_prompts = ['Car-free cities', 'Driverless cars', 'The Face on Mars', 'Exploring Venus', 'Does the electoral college work?', 'Facial action coding system', '"A Cowboy Who Rode the Waves"']

In [19]:
# Define original data to be in train and validation sets
original_training_data = training_data[
    (training_data['dataset'] == 'competition') | 
    ((training_data['dataset'] == 'ai-generated') & (training_data['prompt_id'].isin(['0', '1'])))
]

original_validation_data = training_data[
    (training_data['dataset'] == 'ai-generated') & 
    (training_data['prompt_id'].isin(persuade_train_prompts))
]

# Split persuade data based on prompts anticipated to be in competition test set
persuade_data = training_data[training_data['dataset'] == 'persuade']
persuade_train_prompts = ['Car-free cities', 'Driverless cars', 'The Face on Mars', 'Exploring Venus', 'Does the electoral college work?', 'Facial action coding system', '"A Cowboy Who Rode the Waves"']
persuade_training_data = persuade_data[persuade_data['prompt_id'].isin(persuade_train_prompts)]
persuade_validation_data = persuade_data[~persuade_data['prompt_id'].isin(persuade_train_prompts)]
persuade_training_data = persuade_training_data.sample(frac=0.1, random_state=42)
persuade_validation_data = persuade_validation_data.sample(frac=0.1, random_state=42)

# Split out additional training data
additional_training_data = training_data[
    (training_data['dataset'] != 'competition') & 
    (training_data['dataset'] != 'ai-generated') &
    (training_data['dataset'] != 'persuade')
]

In [20]:
# Split original training data into training and validation data
model_training_data, model_validation_data = train_test_split(original_training_data, test_size=0.25, random_state=42)

# Add persuade data to training and validation data
model_training_data = pd.concat([model_training_data, persuade_training_data], axis=0)
model_validation_data = pd.concat([model_validation_data, original_validation_data, persuade_validation_data], axis=0)
model_training_data.reset_index(drop=True, inplace=True)
model_validation_data.reset_index(drop=True, inplace=True)

In [24]:
# Add Linear layer to model and send to device
model.classifier = torch.nn.Linear(model.classifier.in_features, 2)
model.to(device)

# Create validation dataloader
validation_torch = TorchDataset(model_validation_data['text'], model_validation_data['generated'], tokenizer, INPUT_LENGTH)
validation_dataloader = DataLoader(validation_torch, batch_size=BATCH_SIZE, shuffle=False)

optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
for epoch in range(EPOCHS):  # Set a suitable number of epochs
    model.train()
    
    # Add additional training data to training data
    n_samples = int(len(model_training_data) / 2)
    additional_training_data_sample = additional_training_data.sample(n=n_samples)
    train_data = pd.concat([model_training_data, additional_training_data_sample], axis=0, ignore_index=True)
    train_torch = TorchDataset(train_data['text'], train_data['generated'], tokenizer, INPUT_LENGTH)
    train_dataloader = DataLoader(train_torch, batch_size=BATCH_SIZE, shuffle=True)
    
    # Progress bar
    total_width = 25
    print('Epoch {}/{} [{}] {:.2f}% - Training Loss: {}'.format(epoch, EPOCHS, ' '*total_width, 0, 0), end='\r')
    
    # Training phase
    total_loss = 0
    batch_num = 0
    batches_num = len(train_dataloader)
    
    for batch in train_dataloader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

        # Update batch count and calculate average loss
        batch_num += 1
        avg_loss = total_loss / batch_num

        # Progress bar
        percent_completed = batch_num / batches_num
        bars = int(percent_completed * total_width)
        spaces = 25 - bars
        print('Epoch {}/{} [{}{}] {:.2f}% - Training Loss: {:.4f}'.format(epoch, EPOCHS, '='*int(bars), ' '*int(spaces), percent_completed*100, avg_loss), end='\r')


    print('{}'.format(' '*int(500)), end='\r') # clear progress bar
    print(f'Epoch {epoch}/{EPOCHS} - Training Loss: {avg_loss:.4f}')
    
    # Validation phase
    model.eval()
    
    all_labels = []
    all_predictions = []
    for batch in validation_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
        
        logits = outputs.logits
        predictions = torch.nn.functional.softmax(logits, dim=-1)[:, 1]  # Get probabilities for the positive class
        all_labels.extend(labels.cpu().numpy())
        all_predictions.extend(predictions.cpu().numpy())

    # Compute Log Loss and ROC AUC
    log_loss_score = log_loss(all_labels, all_predictions)
    roc_auc = roc_auc_score(all_labels, all_predictions)
    print(f'Validation Log Loss: {log_loss_score:.4f} - ROC AUC: {roc_auc:.4f}')
    
    

Epoch 0/5 [                         ] 3.00% - Training Loss: 0.6923

KeyboardInterrupt: 

In [130]:
# Validation phase
model.eval()

all_labels = []
all_predictions = []
for batch in validation_dataloader:
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    
    logits = outputs.logits
    predictions = torch.nn.functional.softmax(logits, dim=-1)[:, 1]  # Get probabilities for the positive class
    all_labels.extend(labels.cpu().numpy())
    all_predictions.extend(predictions.cpu().numpy())

# Compute Log Loss and ROC AUC
log_loss_score = log_loss(all_labels, all_predictions)
roc_auc = roc_auc_score(all_labels, all_predictions)
print(f'Validation ROC AUC: {roc_auc}')

RuntimeError: Placeholder storage has not been allocated on MPS device!

In [106]:
val_loss

tensor([[ 3.5985, -3.1306],
        [-3.7707,  3.3330],
        [-3.7994,  3.3930],
        [-3.8172,  3.3911],
        [ 2.2318, -1.9290],
        [ 3.9097, -3.3366],
        [-3.7991,  3.3547],
        [-3.8044,  3.3574]], device='mps:0')

In [19]:
torch.save(model.state_dict(), 'models/deberta_baseline.bin')

In [20]:
test_model = torch.load('models/deberta_baseline.bin')

In [23]:
model.save_pretrained('models/deberta_baseline')

In [52]:
def predict(text, tokenizer, model, max_length):
    # Ensure the model is in evaluation mode and on CPU
    model.eval()
    model.to(device)
    # Tokenize the input text
    inputs = tokenizer(text, return_tensors="pt", max_length=max_length, truncation=True, padding="max_length")
    
    # Move tensors to the same device as the model
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Make prediction
    with torch.no_grad():
        outputs = model(**inputs)

    # Convert logits to probabilities
    probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
    
    return probabilities

In [53]:
def predict_on_cpu(text, tokenizer, model, max_length):
    # Ensure the model is in evaluation mode and on CPU
    model.eval()
    model.to('cpu')

    # Tokenize the input text
    inputs = tokenizer(text, return_tensors="pt", max_length=max_length, truncation=True, padding="max_length")

    # Move tensors to CPU
    inputs = {k: v.cpu() for k, v in inputs.items()}

    # Make prediction
    with torch.no_grad():
        outputs = model(**inputs)

    # Convert logits to probabilities
    probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
    
    return probabilities


In [34]:
model_validation_data[model_validation_data['generated'] == 0]['text'].iloc[0]

'I agree that changing to election by popular vote for the president of the United States would be a better choice than to continue Electoral College. You may ask yourself why. Well, it\'s not fair to have to pick a random person to do something you should be the one doing. It\'s like saying you pick your neighbor to clean your house. Your house is still being cleaned, but not by you, instead, a random person you don\'t even know. Thus, continuing Electoral College is the same because in this case, the elector you pick and don\'t even know is going to make choices that at the end of the day you might not even agree on and you can\'t do nothing about it at that point.\n\nDuring electoral College voters get confused about the electors and vote for the wrong candidate. This means, there is too many names on the list that you most likely have never even heard in your life, therefore leading you to voting for the wrong candidate. Some electors aren\'t even faithful, instead of going for the

In [58]:
original_training_data[(original_training_data['dataset'] == 'competition') & (original_training_data['generated'] == 1)]

Unnamed: 0,id,prompt_id,text,generated,dataset,source
704,82131f68,1,"This essay will analyze, discuss and prove one...",1,competition,competition
740,86fe4f18,1,I strongly believe that the Electoral College ...,1,competition,competition
1262,eafb8a56,0,"Limiting car use causes pollution, increases c...",1,competition,competition


In [62]:
model = AutoModelForSequenceClassification.from_pretrained('models/test_model')

In [64]:
model_validation_data[model_validation_data['source'] == 'competition']['text'].iloc[0]

'I agree that changing to election by popular vote for the president of the United States would be a better choice than to continue Electoral College. You may ask yourself why. Well, it\'s not fair to have to pick a random person to do something you should be the one doing. It\'s like saying you pick your neighbor to clean your house. Your house is still being cleaned, but not by you, instead, a random person you don\'t even know. Thus, continuing Electoral College is the same because in this case, the elector you pick and don\'t even know is going to make choices that at the end of the day you might not even agree on and you can\'t do nothing about it at that point.\n\nDuring electoral College voters get confused about the electors and vote for the wrong candidate. This means, there is too many names on the list that you most likely have never even heard in your life, therefore leading you to voting for the wrong candidate. Some electors aren\'t even faithful, instead of going for the

In [67]:
model = AutoModelForSequenceClassification.from_pretrained('models/deberta_baseline')

In [73]:
# save tokenizer
tokenizer.save_pretrained('models/deberta_tokenizer')

('models/deberta_tokenizer/tokenizer_config.json',
 'models/deberta_tokenizer/special_tokens_map.json',
 'models/deberta_tokenizer/spm.model',
 'models/deberta_tokenizer/added_tokens.json',
 'models/deberta_tokenizer/tokenizer.json')

In [69]:

model_validation_data[model_validation_data['source'] == 'competition']['text'].iloc[5]

'The Electoral College should be abolished because citizens can\'t vote directly on which candidate they want. The electoral college is unfair because a candidate that wins the vote of the people, they can\'t become president if they don\'t have enough electoral votes. For example when Al Gore was elected, Gore won the popular vote but could not become president because of insufficient electoral college votes.\n\nIt is not fair to the people that their vote cannot count unless it goes through the Electoral College. The Electoral College bases on what the overall state vote is, instead of the individual voter, thus making it unfair to each voter if they do not get an equal say in who they want as president. Said in source two, "Under the Electoral College system, voters vote not for the president, but for a slate of electors, who in turn elect the president." The citizens should be able to choose who governs their country. Direct elections are much easier than having an Electoral Colleg

In [68]:
for i in range(10):

    text_to_predict = model_validation_data[model_validation_data['source'] == 'competition']['text'].iloc[i]
    true_value = model_validation_data[model_validation_data['source'] == 'competition']['generated'].iloc[i]
    
    probabilities = predict_on_cpu(text_to_predict, tokenizer, model, INPUT_LENGTH)

    # Assuming you're predicting binary class, get probability for the positive class
    probability_positive_class = probabilities[0, 1].item()
    print("Probability of being generated text:", probability_positive_class, "True value:", true_value)


Probability of being generated text: 0.0034894279669970274 True value: 0
Probability of being generated text: 0.018041841685771942 True value: 0
Probability of being generated text: 0.005819542799144983 True value: 0
Probability of being generated text: 0.09260311722755432 True value: 0
Probability of being generated text: 0.005961323156952858 True value: 0
Probability of being generated text: 0.0067536779679358006 True value: 0
Probability of being generated text: 0.0049910759553313255 True value: 0
Probability of being generated text: 0.032570913434028625 True value: 0
Probability of being generated text: 0.003801137674599886 True value: 0
Probability of being generated text: 0.12539352476596832 True value: 0


In [47]:
model = AutoModelForSequenceClassification.from_pretrained('models/deberta_baseline')
model.to(device)

DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 384, padding_idx=0)
      (LayerNorm): LayerNorm((384,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=384, out_features=384, bias=True)
              (key_proj): Linear(in_features=384, out_features=384, bias=True)
              (value_proj): Linear(in_features=384, out_features=384, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=384, out_features=384, bias=True)
              (LayerNorm): LayerNorm((384,), eps=1e-07, elementwise_affine

In [72]:
text_to_predict = """From an artist's perspective, streaming platforms provide unique opportunities. They offer a level playing field where both established and emerging artists can't reach global audiences without the traditional barriers of the music industry. This democratization has led to the discovery of new talents and diversified the music scene. While there are valid concerns regarding the revenue share and compensation for artists, the exposure and audience engagement these platforms provide are unparalleled. Streaming services also continually evolve their business models to address these concerns, indicating a positive trajectory for artist compensation."""


probabilities = predict(text_to_predict, tokenizer, model, INPUT_LENGTH)
print(i, end='\r')

# Assuming you're predicting binary class, get probability for the positive class
probability_positive_class = probabilities[0, 1].item()
print("Probability of being generated text:", probability_positive_class)

Probability of being generated text: 0.9987456798553467


In [74]:
school_work = pd.read_csv(notebook_dir / 'external-data' / 'SchoolWork' / 'school_work.csv')

In [76]:
for p in ['models/deberta_baseline', 'models/test_model']:
    model = AutoModelForSequenceClassification.from_pretrained(p)
    probs = []
    for i, row in school_work.iterrows():
        text_to_predict = row['text']
        probabilities = predict_on_cpu(text_to_predict, tokenizer, model, INPUT_LENGTH)
        school_work.loc[i, 'probability'] = probabilities[0, 1].item()
        predict_proba = probabilities[0, 1].item()
        probs.append(predict_proba)
    print(probs)
    print(p, np.mean(probs))

[0.03435050696134567, 0.6802866458892822, 0.31053900718688965, 0.9831629991531372, 0.038362450897693634, 0.9600358605384827, 0.8826023936271667, 0.9009636044502258, 0.8143059015274048, 0.9965040683746338, 0.7934333086013794, 0.9547764658927917, 0.9610163569450378, 0.9479411244392395, 0.9045681953430176, 0.6042554974555969, 0.6808803081512451, 0.9443067312240601, 0.05708974972367287, 0.061988312751054764, 0.9008781313896179, 0.02312665991485119, 0.04241969808936119, 0.19519157707691193, 0.9974948167800903, 0.9882182478904724, 0.015881212428212166, 0.16377094388008118]
models/deberta_baseline 0.6013696705922484
[0.0016236385563388467, 0.07646387815475464, 0.06092625856399536, 0.9573656320571899, 0.0012614544248208404, 0.17942999303340912, 0.9507336020469666, 0.36756181716918945, 0.8159099817276001, 0.9998658895492554, 0.013205821625888348, 0.9808155298233032, 0.9998730421066284, 0.9977096319198608, 0.5718616247177124, 0.4564564526081085, 0.0038780204486101866, 0.04579993709921837, 0.0472