In [1]:
import pandas as pd
import numpy as np

import torch
import torch.nn.functional as F
from torch import nn
from torch.utils.data import Dataset,DataLoader
from transformers import AutoTokenizer, AutoModel, AdamW, get_scheduler
from sklearn.metrics import roc_auc_score

In [2]:
def seed_everything(seed=42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Integer seed applied to every generator.

    Seeding only torch leaves NumPy-backed randomness (for example
    ``DataFrame.sample``) and the ``random`` module unseeded, so those
    results would change from run to run.
    """
    import random  # local import: only needed here

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one; a no-op without CUDA.
    torch.cuda.manual_seed_all(seed)


seed_everything(42)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [3]:
# Load the competition CSV files from the Kaggle input directory.
input_path = '/kaggle/input/llm-detect-ai-generated-text'
train_essays = pd.read_csv(f'{input_path}/train_essays.csv')
test_essays = pd.read_csv(f'{input_path}/test_essays.csv')

# Show the (rows, columns) of each split, one per line, then preview five random training rows.
print(train_essays.shape, test_essays.shape, sep='\n')
train_essays.sample(5)

(1378, 4)
(3, 3)


Unnamed: 0,id,prompt_id,text,generated
803,921654a9,1,"Dear Florida state senator, I highly suggest w...",0
245,30da029c,0,"From riding horses with wagons to, driving car...",0
57,0b89ae3e,0,"They come in red, blue, green, black, and whit...",0
869,9cf53db9,1,Every four years there is an election. The ele...,0
175,223bbf18,0,When limiting car usage the first thing that m...,0


In [4]:
# Load the external data-mix train/validation splits (parquet files).
datamix_input_path = '/kaggle/input/ai-mix-v26'
datamix_train = pd.read_parquet(f'{datamix_input_path}/train_essays.parquet')
datamix_valid = pd.read_parquet(f'{datamix_input_path}/valid_essays.parquet')

# Show the (rows, columns) of each split, one per line, then preview five random training rows.
print(datamix_train.shape, datamix_valid.shape, sep='\n')
datamix_train.sample(5)

(165767, 4)
(1679, 4)


Unnamed: 0,id,prompt_id,text,generated
40679,e_2y1lr2ru,6,Studying Venus is a worthy pursuit despite the...,0
150862,e_wau89tqz,12,Asking multiple people for advice is good beca...,0
144713,e_eoj8qbus,7,Many will say that distance learning is a real...,1
79090,e_268at02t,2,The face on Mars is just 3D illusion. It is no...,1
20210,e_mqc5z204,1,"Dear senator,\n\nI highly suggest you to recon...",1


In [5]:
# Display the first validation row to inspect the fields of a single record.
datamix_valid.iloc[0]

id                                                  e_5padvwnu
prompt_id                                                    0
text         _,_ _and it has to do with the fact that if yo...
generated                                                    1
Name: 0, dtype: object

In [6]:
class EssayDataSet(Dataset):
    """Map-style dataset backed by a parquet file loaded into a DataFrame.

    Indexing returns one row of the frame (by position), so consumers can
    read fields by column name.
    """

    def __init__(self, data_path):
        """Read the parquet file at ``data_path`` into memory."""
        frame = pd.read_parquet(data_path)
        self.data = frame

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return self.data.shape[0]

    def __getitem__(self, idx):
        """Return the row at integer position ``idx``."""
        row = self.data.iloc[idx]
        return row

# Build the training and validation datasets from the data-mix parquet files.
train_data = EssayDataSet(f'{datamix_input_path}/train_essays.parquet')
valid_data = EssayDataSet(f'{datamix_input_path}/valid_essays.parquet')

In [7]:
# Pretrained checkpoint identifier; the tokenizer is loaded from this checkpoint.
checkpoint = 'microsoft/deberta-v3-small'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def collate_fn(batch_samples):
    """Collate dataset rows into a tokenized batch and a label tensor.

    Args:
        batch_samples: Sequence of samples, each indexable by ``'text'`` and
            ``'generated'``.

    Returns:
        A tuple ``(X, y)``: ``X`` is the tokenizer output as PyTorch tensors
        (padded to the longest text in the batch, truncated to 512 tokens)
        and ``y`` is a tensor of the ``'generated'`` labels.
    """
    texts = []
    labels = []
    for sample in batch_samples:
        texts.append(sample['text'])
        labels.append(sample['generated'])

    encoded = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )
    return encoded, torch.tensor(labels)
    

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



In [8]:
# Save the tokenizer files into the local 'bert-tokenizer' directory.
# NOTE(review): the directory name says "bert", but the tokenizer was loaded from the
# DeBERTa checkpoint; the name is kept as-is in case other code loads it by this path.
tokenizer.save_pretrained('bert-tokenizer')

('bert-tokenizer/tokenizer_config.json',
 'bert-tokenizer/special_tokens_map.json',
 'bert-tokenizer/spm.model',
 'bert-tokenizer/added_tokens.json',
 'bert-tokenizer/tokenizer.json')

In [9]:
# Batches of 16 samples; only the training loader shuffles. collate_fn tokenizes each batch on the fly.
train_dataloader = DataLoader(train_data, batch_size=16, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(valid_data, batch_size=16, shuffle=False, collate_fn=collate_fn)

In [10]:
#next(iter(train_dataloader))[0]['input_ids'].shape

In [11]:
class MeanPooling(nn.Module):
    """Average token embeddings over the sequence axis, ignoring masked positions."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Return the mask-weighted mean of ``last_hidden_state`` along dim 1.

        The mask gets a trailing axis and is broadcast to the shape of
        ``last_hidden_state``; positions where it is 0 contribute nothing.
        The denominator is clamped to 1e-9 so a fully masked row yields
        zeros instead of dividing by zero.
        """
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        masked_sum = (last_hidden_state * mask).sum(dim=1)
        token_counts = mask.sum(dim=1).clamp(min=1e-9)
        return masked_sum / token_counts

class ClassifierModel(nn.Module):
    """Pretrained encoder + mean pooling + dropout + a single-logit linear head.

    ``forward`` returns one raw logit per input sequence; no sigmoid is
    applied here.
    """

    def __init__(self, checkpoint):
        """Load the pretrained encoder and build the classification head.

        Args:
            checkpoint: Name or path passed to ``AutoModel.from_pretrained``.
        """
        super().__init__()
        self.bert_model = AutoModel.from_pretrained(checkpoint)
        self.dropout = nn.Dropout(0.1)
        # Size the head from the encoder config rather than hard-coding 768,
        # so checkpoints with a different hidden width work as well.
        self.classifier = nn.Linear(self.bert_model.config.hidden_size, 1)
        self.pool = MeanPooling()

    def encode(self, input_ids, attention_mask, token_type_ids=None):
        """Run the encoder and mean-pool its last hidden state.

        Args:
            input_ids: Token id tensor from the tokenizer.
            attention_mask: Mask tensor from the tokenizer; also used as the
                pooling weights.
            token_type_ids: Optional segment ids. Forwarded to the encoder
                only when provided, because not every tokenizer emits them.

        Returns:
            The pooled embedding produced by ``self.pool``.
        """
        encoder_kwargs = {
            'attention_mask': attention_mask,
            'output_hidden_states': False,
        }
        if token_type_ids is not None:
            encoder_kwargs['token_type_ids'] = token_type_ids

        outputs = self.bert_model(input_ids, **encoder_kwargs)
        return self.pool(outputs.last_hidden_state, attention_mask)

    def forward(self, x):
        """Map a tokenizer output mapping ``x`` to logits.

        ``x`` is unpacked as keyword arguments into ``encode``, so its keys
        must match that method's parameter names.
        """
        pooled = self.encode(**x)
        return self.classifier(self.dropout(pooled))

# Instantiate the classifier from the checkpoint and move it to the selected device.
model = ClassifierModel(checkpoint).to(device)

pytorch_model.bin:   0%|          | 0.00/286M [00:00<?, ?B/s]

In [12]:
# The model emits a single raw logit per sample, so use the sigmoid + BCE loss.
loss_fn = nn.BCEWithLogitsLoss()
# Use PyTorch's AdamW instead of the deprecated transformers.AdamW.
# eps and weight_decay are pinned to the values the transformers implementation
# used by default, so the optimisation behaviour stays the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0001, eps=1e-6, weight_decay=0.0)



In [13]:
def train_loop(train_dataloader, model, loss_fn, optimizer):
    """Train ``model`` for one pass over ``train_dataloader``.

    Args:
        train_dataloader: Iterable of ``(X, y)`` batches; ``X`` must support
            ``.to(device)`` and ``y`` is a tensor of labels.
        model: Module called as ``model(X)``; it is put in training mode.
        loss_fn: Callable taking ``(pred, target)`` and returning a scalar loss.
        optimizer: Optimizer stepped once per batch.

    Returns:
        The mean of the per-batch losses over the epoch.
    """
    total_loss = 0
    size = len(train_dataloader)
    model.train()

    for X, y in train_dataloader:
        # Labels are cast to float because the loss compares them with real-valued predictions.
        X, y = X.to(device), y.to(device).float()
        pred = model(X)
        # unsqueeze(1) gives the labels a trailing axis to line up with the model output.
        loss = loss_fn(pred, y.unsqueeze(1))
        total_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    return total_loss / size

'\ndef test_loop(valid_dataloader, model, loss_fn):\n    loss = 0\n    correct = 0\n    dataloader_size = len(valid_dataloader)\n    model.eval()\n    \n    y_true = []\n    y_pred = []\n    with torch.no_grad():\n        for (X,y) in valid_dataloader:\n            X,y = X.to(device), y.to(device)\n            pred = model(X)\n            loss += loss_fn(pred, y).item()\n            \n            ## todo: update y_true\n            y_true.extend(y.cpu().numpy().flatten())\n            y_pred.extend(pred.argmax(1).cpu().numpy().flatten())\n            ## todo: update y_pred\n    \n    auc = roc_auc_score(y_true, y_pred)\n    loss /= dataloader_size\n    return loss, auc\n'

In [14]:
import torch  
from sklearn.metrics import roc_auc_score  

def test_loop(valid_dataloader, model, loss_fn):
    """Evaluate ``model`` on ``valid_dataloader``.

    Args:
        valid_dataloader: Iterable of ``(X, y)`` batches.
        model: Module called as ``model(X)``; it is put in eval mode.
        loss_fn: Callable taking ``(pred, target)`` and returning a scalar loss.

    Returns:
        A tuple ``(loss, auc)``: the mean per-batch loss and the ROC AUC
        computed over all samples.

    The raw model outputs are used directly as ranking scores for the AUC;
    no sigmoid is applied to them in this function.
    """
    model.eval()
    num_batches = len(valid_dataloader)
    running_loss = 0
    labels_all, scores_all = [], []

    with torch.no_grad():
        for inputs, labels in valid_dataloader:
            inputs = inputs.to(device)
            labels = labels.to(device).float()
            outputs = model(inputs)

            running_loss += loss_fn(outputs, labels.unsqueeze(1)).item()

            # Move to CPU before converting to NumPy, then collect flat per-sample values.
            labels_all.extend(labels.cpu().numpy().ravel())
            scores_all.extend(outputs.cpu().numpy().ravel())

    auc = roc_auc_score(labels_all, scores_all)
    mean_loss = running_loss / num_batches

    return mean_loss, auc

In [15]:
epoch_num = 3
best_auc = 0
for epoch in range(epoch_num):
    train_loss = train_loop(train_dataloader, model, loss_fn, optimizer)
    valid_loss, valid_auc = test_loop(valid_dataloader, model, loss_fn)

    print(f"epoch: {epoch+1}, train loss: {train_loss}, valid_loss: {valid_loss}, valid_auc: {valid_auc}")

    # Checkpoint only when validation AUC improves. best_auc must be updated here,
    # otherwise the comparison is always against 0 and every epoch gets saved.
    if valid_auc > best_auc:
        best_auc = valid_auc
        # NOTE: this pickles the whole module object, so loading it later requires
        # the ClassifierModel and MeanPooling class definitions to be available.
        torch.save(model, f'epoch{epoch+1}_valid_loss_{valid_loss}_auc_{valid_auc}_model.bin')

    # Release cached GPU memory between epochs.
    torch.cuda.empty_cache()

epoch: 1, train loss: 0.2029484167142845, valid_loss: 0.12448973938201864, valid_auc: 0.9861787233335548
epoch: 2, train loss: 0.10623879123502172, valid_loss: 0.11764985185027832, valid_auc: 0.9871818302651691
epoch: 3, train loss: 0.07443940020417966, valid_loss: 0.22690615657290134, valid_auc: 0.9789139861766465
