In [1]:
import pandas as pd
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset,DataLoader
from transformers import AutoModel, AutoTokenizer

In [2]:
# Location of the competition inputs and the two CSV files read from it.
input_path = '/kaggle/input/llm-detect-ai-generated-text'
test_essays_csv = f'{input_path}/test_essays.csv'
sample_submission_csv = f'{input_path}/sample_submission.csv'

# Raw test essays; note that `test_data` is rebound to a Dataset wrapper in a later cell.
test_data = pd.read_csv(test_essays_csv)
# Template frame that the predictions are written into at the end of the notebook.
submission = pd.read_csv(sample_submission_csv)

In [3]:
class MeanPooling(nn.Module):
    """Average token embeddings along dim 1, counting only unmasked positions."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Return the attention-mask-weighted mean of ``last_hidden_state``.

        Args:
            last_hidden_state: Token embeddings; reduced over dim 1.
                (Assumed to be (batch, seq, hidden) -- the code only requires
                that the mask can be expanded to this tensor's size.)
            attention_mask: Mask with one fewer trailing dim than
                ``last_hidden_state``; non-zero entries mark tokens to include.

        Returns:
            Tensor with dim 1 reduced away, holding the masked mean.
        """
        # Broadcast the mask across the trailing (feature) dimension as floats.
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        masked_total = (last_hidden_state * mask).sum(1)
        # Clamp the token count so a fully masked row divides by 1e-9 instead of 0.
        token_count = torch.clamp(mask.sum(1), min=1e-9)
        return masked_total / token_count

class ClassifierModel(nn.Module):
    """Pretrained encoder + mean pooling + dropout + single-output linear head.

    Args:
        checkpoint: Name or path handed to ``AutoModel.from_pretrained``.
    """

    def __init__(self, checkpoint):
        super().__init__()
        self.bert_model = AutoModel.from_pretrained(checkpoint)
        self.dropout = nn.Dropout(0.1)
        # Size the head from the encoder's config rather than hard-coding 768,
        # so checkpoints with a different hidden width also work.
        self.classifier = nn.Linear(self.bert_model.config.hidden_size, 1)
        self.pool = MeanPooling()

    def encode(self, input_ids, attention_mask, token_type_ids=None):
        """Run the encoder and mean-pool its last hidden state.

        Args:
            input_ids: Token ids passed positionally to the encoder.
            attention_mask: Mask passed to the encoder and reused for pooling.
            token_type_ids: Optional segment ids. Only forwarded to the encoder
                when provided, because not every tokenizer emits them.

        Returns:
            The pooled embeddings produced by ``MeanPooling``.
        """
        encoder_kwargs = {
            'attention_mask': attention_mask,
            'output_hidden_states': False,
        }
        if token_type_ids is not None:
            encoder_kwargs['token_type_ids'] = token_type_ids

        outputs = self.bert_model(input_ids, **encoder_kwargs)

        encoder_layer = outputs.last_hidden_state
        embeddings = self.pool(encoder_layer, attention_mask)

        return embeddings

    def forward(self, x):
        """Map a tokenized batch to one score per sample.

        Args:
            x: Mapping that is unpacked as keyword arguments into ``encode``
                (``input_ids``, ``attention_mask`` and optionally
                ``token_type_ids``).

        Returns:
            Output of the linear head (last dim of size 1). No sigmoid is
            applied here.
        """
        bert_output = self.encode(**x)
        x = self.dropout(bert_output)
        x = self.classifier(x)
        return x

In [4]:
# Prefer the GPU when one is visible to PyTorch; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [5]:
# Directory holding the saved tokenizer and the trained checkpoint.
model_path = '/kaggle/input/detect-ai-generated-text-deberta-model'
tokenizer = AutoTokenizer.from_pretrained(f'{model_path}/bert-tokenizer')

# The checkpoint is a whole pickled model object (it is used directly as `model`),
# not a state_dict, so full unpickling is required. Request it explicitly:
# newer PyTorch releases default to weights_only=True, which rejects this file,
# and intermediate releases emit a FutureWarning when the argument is omitted.
# SECURITY: full unpickling can execute arbitrary code -- only load checkpoints
# from a trusted source.
model = torch.load(
    f'{model_path}/epoch2_valid_loss_0.11764985185027832_auc_0.9871818302651691_model.bin',
    map_location=torch.device(device),
    weights_only=False,
)

  model = torch.load(f'{model_path}/epoch2_valid_loss_0.11764985185027832_auc_0.9871818302651691_model.bin',


In [6]:
class EssayDataSet(Dataset):
    """Dataset backed by a CSV file; each item is one row of that file."""

    def __init__(self, data_path):
        """Read the CSV at ``data_path`` into ``self.data``."""
        frame = pd.read_csv(data_path)
        self.data = frame

    def __len__(self):
        """Number of rows in the loaded CSV."""
        row_count = self.data.shape[0]
        return row_count

    def __getitem__(self, idx):
        """Return the row at positional index ``idx`` (via ``iloc``)."""
        row = self.data.iloc[idx]
        return row

# Build the dataset from the shared `input_path` directory instead of repeating the
# literal path inside an f-string that had no placeholders.
# NOTE: this rebinds `test_data` from the raw DataFrame to the Dataset wrapper.
test_data = EssayDataSet(f'{input_path}/test_essays.csv')


def collate_fn(batch_samples):
    """Tokenize the ``'text'`` field of every sample into a single batch.

    Args:
        batch_samples: Iterable of samples, each indexable by ``'text'``.

    Returns:
        Whatever the module-level ``tokenizer`` returns when called with
        padding and truncation enabled, ``max_length=512`` and
        ``return_tensors="pt"``.
    """
    essay_texts = []
    for sample in batch_samples:
        essay_texts.append(sample['text'])

    return tokenizer(
        essay_texts,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )

# Iterate the test set in file order (no shuffling), 16 essays per batch,
# tokenizing each batch through `collate_fn`.
test_dataloader = DataLoader(
    dataset=test_data,
    batch_size=16,
    shuffle=False,
    collate_fn=collate_fn,
)

In [7]:
# Switch to evaluation mode (disables dropout) before scoring.
model.eval()

# Use the device the model's weights actually live on, so the batches below
# always end up on the same device as the model.
model_device = next(model.parameters()).device

y_pred = []
with torch.no_grad():
    for X in test_dataloader:
        # `collate_fn` builds the batch without any device placement; without this
        # move a model loaded onto the GPU fails with a device-mismatch error.
        X = X.to(model_device)
        pred = model(X)
        # Sigmoid turns the single model output into a probability; bring it back
        # to the CPU and flatten to one value per essay.
        y_pred.extend(F.sigmoid(pred).cpu().numpy().flatten())

In [8]:
# Attach the predicted probabilities (one per row, in order) as the 'generated'
# column, write the submission file without the index, and display the frame.
submission = submission.assign(generated=y_pred)
submission.to_csv('submission.csv', index=False)
submission

Unnamed: 0,id,generated
0,0000aaaa,0.999731
1,1111bbbb,0.999712
2,2222cccc,0.999741
