**I interrupted the code execution because I decided to use only the prompt 1, seed 1 configuration.**

In [3]:
import random
import os
import json
import numpy as np
import torch
from torch.utils.data import DataLoader
from datasets import load_dataset
from sklearn.metrics import accuracy_score, classification_report
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AutoConfig, DataCollatorWithPadding, AdamW

In [4]:
def set_seed(seed_value):
    """Seed Python's ``random``, NumPy and PyTorch with the same value.

    The value is also written, as a string, to the ``PYTHONHASHSEED``
    environment variable.

    Args:
        seed_value: Seed handed to every random number generator.
    """
    # Same order as before: Python, then NumPy, then PyTorch.
    for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
        seed_rng(seed_value)

    os.environ['PYTHONHASHSEED'] = str(seed_value)

In [5]:
def tokenize_function(examples, tokenizer, max_length):
    """Tokenize the ``"text"`` field of ``examples`` with truncation enabled.

    Args:
        examples: Mapping with a ``"text"`` entry (a batch when used with
            ``Dataset.map(batched=True)``).
        tokenizer: Callable tokenizer applied to the text.
        max_length: Forwarded to the tokenizer as ``max_length``.

    Returns:
        Whatever the tokenizer returns for the given text.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, max_length=max_length)
    return encoded

In [6]:
def evaluate(dataloader, model):
    """Return the classification accuracy of ``model`` over ``dataloader``.

    The model is switched to eval mode and inference runs without gradient
    tracking. Predictions are the argmax of the logits over the last axis.

    Args:
        dataloader: Iterable of batches. Each batch is a mapping holding a
            ``"labels"`` entry plus the keyword inputs passed to ``model``.
        model: Module whose call returns an object exposing ``.logits``.

    Returns:
        Accuracy as computed by ``sklearn.metrics.accuracy_score``.
    """
    # Inputs must live on the same device as the model's weights; on a
    # CPU-only setup the ``.to(device)`` calls below are no-ops.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    ground_truth = []
    preds = []

    model.eval()
    with torch.no_grad():
        for batch in dataloader:
            batch = {
                key: (value.to(device) if hasattr(value, "to") else value)
                for key, value in batch.items()
            }
            # Labels are removed so the model only computes logits (no loss).
            labels = batch.pop("labels")

            logits = model(**batch).logits
            pred = torch.argmax(logits, dim=-1)

            ground_truth.extend(labels.cpu().tolist())
            preds.extend(pred.cpu().tolist())

    return accuracy_score(ground_truth, preds)

In [7]:
def train(train_dataloader, val_dataloader, model, optimizer, epochs, save_dir, tokenizer=None):
    """Fine-tune ``model`` and checkpoint it whenever validation accuracy improves.

    After every epoch the model is scored on ``val_dataloader`` with
    ``evaluate``. When the score beats the best one seen so far, the model
    (and the tokenizer, if one is available) is saved to ``save_dir``.

    Args:
        train_dataloader: Iterable of training batches. Each batch is passed
            to ``model`` as keyword arguments and must make it return a loss.
        val_dataloader: Batches used for the per-epoch validation score.
        model: Model to optimize; must provide ``save_pretrained``.
        optimizer: Optimizer already bound to the model's parameters.
        epochs: Number of passes over ``train_dataloader``.
        save_dir: Directory receiving the best checkpoint.
        tokenizer: Optional tokenizer saved next to the model. When omitted,
            a module-level ``tokenizer`` is used if one exists.

    Returns:
        The best validation accuracy observed (``0`` if no epoch scored
        above zero, in which case nothing was saved).
    """
    if tokenizer is None:
        # Backward compatibility: earlier callers relied on a global
        # ``tokenizer`` instead of passing it in.
        tokenizer = globals().get("tokenizer")

    # Keep inputs on the same device as the weights (no-op on CPU).
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    max_acc = 0
    for epoch in range(epochs):
        model.train()
        for batch in train_dataloader:
            batch = {
                key: (value.to(device) if hasattr(value, "to") else value)
                for key, value in batch.items()
            }
            # Only the loss is needed, so hidden states are not requested.
            outputs = model(**batch)
            loss = outputs.loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        acc = evaluate(val_dataloader, model)
        if acc > max_acc:
            print(f'Validation Accuracy: from {max_acc} to {acc}')
            max_acc = acc
            model.save_pretrained(save_dir)
            if tokenizer is not None:
                tokenizer.save_pretrained(save_dir)

    return max_acc

In [8]:
# Hyperparameters shared by every prompt/seed run below.
model_checkpoint = "roberta-base"
max_length = 512  # token cap used when tokenizing
batch_size = 8
epochs = 5
lr = 5e-5

# Seed all random number generators once, before any model or loader is built.
set_seed(1234)

In [9]:
# Fine-tune and test one RoBERTa classifier per (prompt, seed) data split.

# The tokenizer and the padding collator do not depend on the prompt/seed pair,
# so they are built once and reused by every run.
tokenizer = RobertaTokenizer.from_pretrained(
    model_checkpoint,
    padding=True,
    truncation=True,
    model_max_length=max_length
)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# NOTE(review): the RNGs are seeded once before this loop (set_seed(1234)), so
# a run's head initialization and shuffling depend on the runs executed before
# it. Reseed inside the loop if each configuration must be reproducible alone.
for prompt in [1, 2]:
    for seed in [1, 2]:
        train_path = f'prompt{prompt}_seed{seed}_train.csv'
        val_path = f'prompt{prompt}_seed{seed}_val.csv'
        test_path = f'prompt{prompt}_seed{seed}_test.csv'
        save_dir = f"roberta-base_prompt{prompt}_seed{seed}"

        # A fresh config per run, so every model gets its own config object.
        config = AutoConfig.from_pretrained(
            model_checkpoint,
            label2id={'human': 0, 'chatgpt': 1},
            id2label={0: 'human', 1: 'chatgpt'}
        )

        raw_datasets = load_dataset("csv", data_files={"train": train_path, "val": val_path, "test": test_path})

        # Replace the raw text column with the tokenizer's output.
        tokenized_datasets = raw_datasets.map(
            lambda x: tokenize_function(x, tokenizer, max_length),
            batched=True,
            remove_columns=['text']
        )

        model = RobertaForSequenceClassification.from_pretrained(
            model_checkpoint, config=config
        )

        # torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
        # weight_decay are pinned to the values transformers.AdamW used by
        # default (torch's own defaults are 1e-8 and 1e-2).
        optimizer = torch.optim.AdamW(
            model.parameters(), lr=lr, eps=1e-6, weight_decay=0.0
        )

        train_dataloader = DataLoader(
            tokenized_datasets["train"],
            batch_size=batch_size,
            collate_fn=data_collator,
            shuffle=True
        )

        val_dataloader = DataLoader(
            tokenized_datasets["val"],
            batch_size=batch_size,
            collate_fn=data_collator
        )

        test_dataloader = DataLoader(
            tokenized_datasets["test"],
            batch_size=batch_size,
            collate_fn=data_collator
        )

        train(train_dataloader, val_dataloader, model, optimizer, epochs, save_dir)

        # Reload the checkpoint with the best validation accuracy; the
        # in-memory model holds the weights of the last epoch instead.
        model = RobertaForSequenceClassification.from_pretrained(
            save_dir, config=config
        )

        # One inference pass over the test split feeds both the accuracy and
        # the classification report.
        ground_truth, preds = [], []
        model.eval()
        with torch.no_grad():
            for batch in test_dataloader:
                labels = batch.pop("labels")
                logits = model(**batch).logits
                pred = torch.argmax(logits, dim=-1)

                ground_truth.extend(labels.tolist())
                preds.extend(pred.tolist())

        test_acc = accuracy_score(ground_truth, preds)
        print(f'Test Accuracy: {test_acc}')

        print("Unique labels in ground truth:", np.unique(ground_truth))
        print("Unique labels in predictions:", np.unique(preds))

        print(classification_report(ground_truth, preds, digits=3))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating val split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/3080 [00:00<?, ? examples/s]

Map:   0%|          | 0/440 [00:00<?, ? examples/s]

Map:   0%|          | 0/880 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Validation Accuracy: from 0 to 0.9477272727272728
Validation Accuracy: from 0.9477272727272728 to 0.9818181818181818
Validation Accuracy: from 0.9818181818181818 to 0.9863636363636363
Test Accuracy: 0.9920454545454546
Unique labels in ground truth: [0 1]
Unique labels in predictions: [0 1]
              precision    recall  f1-score   support

           0      0.989     0.995     0.992       440
           1      0.995     0.989     0.992       440

    accuracy                          0.992       880
   macro avg      0.992     0.992     0.992       880
weighted avg      0.992     0.992     0.992       880



Generating train split: 0 examples [00:00, ? examples/s]

Generating val split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/3080 [00:00<?, ? examples/s]

Map:   0%|          | 0/440 [00:00<?, ? examples/s]

Map:   0%|          | 0/880 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Validation Accuracy: from 0 to 0.9068181818181819
Validation Accuracy: from 0.9068181818181819 to 0.9863636363636363
Validation Accuracy: from 0.9863636363636363 to 0.9977272727272727
Test Accuracy: 0.9954545454545455
Unique labels in ground truth: [0 1]
Unique labels in predictions: [0 1]
              precision    recall  f1-score   support

           0      1.000     0.991     0.995       440
           1      0.991     1.000     0.995       440

    accuracy                          0.995       880
   macro avg      0.995     0.995     0.995       880
weighted avg      0.995     0.995     0.995       880



Generating train split: 0 examples [00:00, ? examples/s]

Generating val split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/3080 [00:00<?, ? examples/s]

Map:   0%|          | 0/440 [00:00<?, ? examples/s]

Map:   0%|          | 0/880 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


KeyboardInterrupt: 