In [1]:
from datasets import load_dataset, Dataset, concatenate_datasets

In [2]:
def _load_csv(path):
    """Load one CSV file as a DatasetDict (the rows land in its 'train' split)."""
    return load_dataset('csv', data_files = path)

# Competition inputs: labelled essays, unlabelled test essays, and the prompts.
train_essays = _load_csv('/kaggle/input/llm-detect-ai-generated-text/train_essays.csv')
test_essays = _load_csv('/kaggle/input/llm-detect-ai-generated-text/test_essays.csv')
train_prompts = _load_csv('/kaggle/input/llm-detect-ai-generated-text/train_prompts.csv')

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [3]:
# Display the three loaded DatasetDicts to inspect their columns and row counts.
train_essays, test_essays, train_prompts

(DatasetDict({
     train: Dataset({
         features: ['id', 'prompt_id', 'text', 'generated'],
         num_rows: 1378
     })
 }),
 DatasetDict({
     train: Dataset({
         features: ['id', 'prompt_id', 'text'],
         num_rows: 3
     })
 }),
 DatasetDict({
     train: Dataset({
         features: ['prompt_id', 'prompt_name', 'instructions', 'source_text'],
         num_rows: 2
     })
 }))

In [4]:
# Hold out 20% of the labelled essays for validation. The seed is fixed so the
# same rows end up in train/validation after every kernel restart.
train_test_split = train_essays['train'].train_test_split(test_size = 0.2, seed = 42)

In [5]:
# Rename the target column 'generated' to 'label' on both halves of the split
# ('train' -> train, 'test' -> val).
train, val = (
    train_test_split[split_name].rename_column('generated' , 'label')
    for split_name in ('train', 'test')
)
# The unlabelled competition test set; load_dataset stores it under 'train'.
test = test_essays['train']

In [6]:
# Display the train / validation / test datasets to check columns and sizes.
train , val, test

(Dataset({
     features: ['id', 'prompt_id', 'text', 'label'],
     num_rows: 1102
 }),
 Dataset({
     features: ['id', 'prompt_id', 'text', 'label'],
     num_rows: 276
 }),
 Dataset({
     features: ['id', 'prompt_id', 'text'],
     num_rows: 3
 }))

In [7]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

# Local checkpoint of the generator model and its tokenizer.
model_ckpt = '/kaggle/input/mistral/pytorch/7b-instruct-v0.1-hf/1'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

# 4-bit NF4 quantization with double quantization; compute runs in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type = 'nf4',
    bnb_4bit_use_double_quant = True,
    bnb_4bit_compute_dtype = torch.bfloat16,
)

def get_new_model():
    """Load a fresh causal-LM from ``model_ckpt`` using ``bnb_config``."""
    load_kwargs = dict(quantization_config = bnb_config, low_cpu_mem_usage = True)
    return AutoModelForCausalLM.from_pretrained(model_ckpt, **load_kwargs)

In [8]:
# Build one '<prompt> ...' string per instruction, dropping the instruction's
# last seven whitespace-separated words.
prompts = []
for instruction in train_prompts['train']['instructions']:
    instruction_words = instruction.split()
    prompts.append('<prompt> ' + ' '.join(instruction_words[:-7]))

In [9]:
# Keep only the first ten whitespace-separated words of every training essay.
texts = list(map(lambda essay: ' '.join(essay.split()[:10]), train['text']))

In [10]:
def make_prompts(prompts, train_data, n_words = 10):
    """Build '<prompt> ... <response> ...' generation inputs.

    For every row of ``train_data`` the prompt selected by the row's
    ``prompt_id`` is joined with the first ``n_words`` whitespace-separated
    words of the row's own ``text``.

    The snippets are derived from ``train_data`` itself rather than from a
    module-level list, so the function pairs prompts and texts correctly for
    any dataset passed in.

    Args:
        prompts: sequence of prompt strings, indexed by ``prompt_id``.
        train_data: mapping/dataset exposing 'prompt_id' and 'text' columns.
        n_words: number of leading words of each text to keep (default 10).

    Returns:
        list[str]: one input string per row of ``train_data``.
    """
    return [
        prompts[prompt_id] + ' <response> ' + ' '.join(text.split()[:n_words])
        for prompt_id, text in zip(train_data['prompt_id'], train_data['text'])
    ]
      
    
# Wrap the generation inputs in a single-column Dataset so they can be mapped.
prompt_texts = make_prompts(prompts, train)
input_texts = Dataset.from_dict({'text' : prompt_texts})

In [11]:
# Reuse EOS as the padding token and pad on the left, so each prompt ends at
# the right edge of its row.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'left'

def preprocess(batch):
    """Tokenize ``batch['text']`` to a fixed length of 128 tokens.

    NOTE(review): truncation is enabled; if a prompt exceeds 128 tokens the
    trailing part (where '<response>' sits) is presumably what gets cut —
    confirm the prompts fit.
    """
    tokenizer_kwargs = dict(
        padding = 'max_length',
        truncation = True,
        return_tensors = 'pt',
        max_length = 128,
    )
    return tokenizer(batch['text'], **tokenizer_kwargs)

inputs = input_texts.map(preprocess, batched = True, batch_size = 16)

Map:   0%|          | 0/1102 [00:00<?, ? examples/s]

In [12]:
import gc

# Run the garbage collector first, then ask PyTorch to release its cached
# CUDA memory, before the generator model is loaded.
gc.collect()
torch.cuda.empty_cache()

In [13]:
# Instantiate the quantized generator model via get_new_model().
model = get_new_model()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [14]:
import gc

# Collect garbage and release cached CUDA memory again after model loading.
gc.collect()
torch.cuda.empty_cache()

In [None]:
%%time

generated_texts = []
batch_size = 32
# Cap on the number of batches to generate. The loop previously stopped
# unconditionally after the first batch; keep that as the default but make it
# explicit. Set to None to generate for every input.
max_batches = 1

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
for batch_idx, start in enumerate(range(0, len(inputs), batch_size)):
    if max_batches is not None and batch_idx >= max_batches:
        break

    end = min(start + batch_size, len(inputs))
    batch = inputs[start:end]

    input_ids = torch.tensor(batch['input_ids']).to(device)
    attention_mask = torch.tensor(batch['attention_mask']).to(device)

    with torch.no_grad():
        outputs = model.generate(
            input_ids = input_ids,
            attention_mask = attention_mask,
            max_new_tokens = 1000,
            top_p = 0.9,
            top_k = 50,
            temperature = 0.7,
            repetition_penalty = 1.2,
            do_sample = True,
            # Passed explicitly so generate() does not have to fall back to
            # EOS on its own and warn on every call.
            pad_token_id = tokenizer.eos_token_id,
        )
    # `skip_special_tokens` was misspelled before, so the flag never took
    # effect and special tokens were left in the decoded text.
    generated_texts.extend(
        [tokenizer.decode(output, skip_special_tokens = True) for output in outputs]
    )

    print(f'batch {batch_idx + 1} generation completed')

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [None]:
def _extract_response(text, pad_token, marker = '<response>'):
    """Return the stripped segment that follows ``marker``, or None if absent.

    Occurrences of ``pad_token`` are removed first. As before, only the segment
    between the first and (if any) second occurrence of ``marker`` is kept.
    """
    parts = text.replace(pad_token, '').split(marker)
    if len(parts) < 2:
        # Marker missing: indexing [1] here used to raise IndexError.
        return None
    return parts[1].strip()

# Keep only generations from which a response could be extracted.
cleaned_texts = [
    response
    for response in (_extract_response(text, tokenizer.pad_token) for text in generated_texts)
    if response is not None
]

cleaned_texts

In [None]:
# Word count (whitespace-separated) of every cleaned generation.
lens = list(map(lambda generated: len(generated.split()), cleaned_texts))

lens

In [None]:
# Every generated essay gets label 1.
generated_labels = [1] * len(cleaned_texts)
gen_data = Dataset.from_dict({'text' : cleaned_texts, 'label' : generated_labels})

# Drop the identifier columns so both datasets share the same columns, then
# append the generated essays to the training split.
train_ = train.remove_columns(['id','prompt_id'])
train_data = concatenate_datasets([train_, gen_data])

In [None]:
import pandas as pd

# Persist the combined training data as CSV (no index column).
train_frame = pd.DataFrame(train_data)
train_frame.to_csv('train_data.csv', index = False)

In [None]:
# Release PyTorch's cached CUDA memory before the classifier stage.
torch.cuda.empty_cache()

In [None]:
import pandas as pd

# Reload the saved training CSV and convert it back into a Dataset.
saved_frame = pd.read_csv('/kaggle/working/train_data.csv')
train_data = Dataset.from_pandas(saved_frame)

In [None]:
# The trailing comma after the last imported name was a SyntaxError.
# BitsAndBytesConfig is imported here too, so this cell does not depend on the
# generator cell's import.
from transformers import BitsAndBytesConfig, Gemma2ForSequenceClassification, GemmaTokenizerFast
import torch

# Local checkpoint of the classifier backbone and its tokenizer.
gemma_model_ckpt = '/kaggle/input/gemma-2/transformers/gemma-2-2b/1'
gemma_tokenizer = GemmaTokenizerFast.from_pretrained(gemma_model_ckpt)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 4-bit NF4 quantization with double quantization; compute runs in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_use_double_quant = True,
    bnb_4bit_quant_type = 'nf4',
    bnb_4bit_compute_dtype = torch.bfloat16
)

def get_classifier():
    """Load a two-label sequence classifier from ``gemma_model_ckpt``.

    Returns:
        A ``Gemma2ForSequenceClassification`` loaded with ``bnb_config``.
    """
    return Gemma2ForSequenceClassification.from_pretrained(
        gemma_model_ckpt,
        num_labels = 2,
        quantization_config = bnb_config,
        low_cpu_mem_usage = True,
    )

In [None]:
def tokenize_data(batch):
    """Tokenize ``batch['text']`` with the Gemma tokenizer.

    Sequences are truncated to at most 512 tokens and padded to the longest
    sequence in the batch.
    """
    encode_kwargs = dict(padding = True, truncation = True, max_length = 512)
    return gemma_tokenizer(batch['text'], **encode_kwargs)

# Tokenize all three splits with the same settings.
train_ds, val_ds, test_ds = (
    split.map(tokenize_data, batched = True)
    for split in (train_data, val, test)
)

In [None]:
# Display the tokenized datasets to check the added columns and row counts.
train_ds, val_ds, test_ds

In [None]:
import os

# Set WANDB_DISABLED for this process — presumably to stop the Trainer from
# logging to Weights & Biases.
# NOTE(review): newer transformers releases may prefer report_to='none' in
# TrainingArguments — confirm against the installed version.
os.environ['WANDB_DISABLED'] = 'true'

In [None]:
from transformers import TrainingArguments, Trainer

In [None]:
from sklearn.metrics import roc_auc_score, f1_score

def compute_metrics(pred):
    """Compute the F1 score for an evaluation prediction object.

    Args:
        pred: object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the arg-max over the last axis is the
            predicted class).

    Returns:
        dict with a single key, 'f1'.
    """
    predicted_classes = pred.predictions.argmax(-1)
    return {'f1' : f1_score(pred.label_ids, predicted_classes)}

In [None]:
per_device_bs = 2
output_dir = '/kaggle/working/gemma2'

# Training configuration: evaluate and checkpoint once per epoch, and reload
# the best checkpoint when training finishes.
training_kwargs = dict(
    output_dir = output_dir,
    overwrite_output_dir = True,
    num_train_epochs = 5,
    learning_rate = 1e-5,
    per_device_train_batch_size = per_device_bs,
    per_device_eval_batch_size = per_device_bs,
    eval_strategy = 'epoch',
    save_strategy = 'epoch',
    load_best_model_at_end = True,
    logging_steps = 100,
)
args = TrainingArguments(**training_kwargs)

In [None]:
import gc

# Collect garbage and release cached CUDA memory before loading the classifier.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Instantiate the quantized two-label classifier via get_classifier().
classifier = get_classifier()

In [None]:
# NOTE(review): `classifier` is loaded fully 4-bit quantized; Trainer may
# refuse to fine-tune a purely quantized model without trainable adapters —
# confirm against the installed transformers version.
trainer = Trainer(
    model = classifier,
    args = args,
    train_dataset = train_ds,
    eval_dataset = val_ds,
    compute_metrics = compute_metrics,
    # `bert_tokenizer` is not defined anywhere in this notebook; the data was
    # tokenized with `gemma_tokenizer`, so that is the tokenizer to pass here.
    tokenizer = gemma_tokenizer
)

trainer.train()

In [None]:
# Run the trained classifier on the tokenized test set and show the result.
preds = trainer.predict(test_ds)

preds

In [None]:
# Turn the two-class logits into probabilities. The classifier has two
# mutually exclusive outputs, so normalise across the class axis with softmax
# rather than applying an independent sigmoid to each logit.
x = torch.softmax(torch.tensor(preds.predictions), dim = -1)

x

In [None]:
import pandas as pd

# Build the submission file.
# - `test_` was never defined; the test dataset is `test`, which carries the
#   essay ids.
# - Label 1 means "generated" (the 'generated' column was renamed to 'label'
#   and synthetic essays were given label 1), so the score to submit is
#   column 1, not column 0.
sub = pd.DataFrame({
    'id' : test['id'],
    'generated' : x[:, 1].numpy()
})

sub.to_csv('submission.csv', index = False)