In [2]:
!pip install -q transformers datasets evaluate accelerate protobuf

In [3]:
pip install -U datasets

Note: you may need to restart the kernel to use updated packages.


In [4]:
import pandas as pd
from datasets import Dataset
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import DataCollatorWithPadding
import evaluate
import numpy as np
import os

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [6]:
ds = load_dataset("Hello-SimpleAI/HC3", "all")

In [7]:
data = []

for item in ds["train"]:
    # Label 0 for human answers
    for h_ans in item["human_answers"]:
        data.append({
            "text": h_ans,
            "label": 0
        })

    # Label 1 for AI (ChatGPT) answers
    for ai_ans in item["chatgpt_answers"]:
        data.append({
            "text": ai_ans,
            "label": 1
        })

df = pd.DataFrame(data)

In [8]:
df['text'] = df['text'].str.replace('\n', ' ', regex=True)
print(df.head())

                                                text  label
0  Basically there are many categories of " Best ...      0
1  If you 're hearing about it , it 's because it...      0
2  One reason is lots of catagories . However , h...      0
3  There are many different best seller lists tha...      1
4  salt is good for not dying in car crashes and ...      0


In [9]:
# Shuffle the dataset
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Split into 90% train, 10% test
split_idx = int(len(df) * 0.9)
df_train = df.iloc[:split_idx]
df_test = df.iloc[split_idx:]

# Reduce training set to 1/20th
df_train = df_train.sample(frac=1/20, random_state=42).reset_index(drop=True)
train_dataset = Dataset.from_pandas(df_train)
test_dataset = Dataset.from_pandas(df_test)

In [10]:
model_name = "microsoft/deberta-v3-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess(example):
    return tokenizer(example["text"], truncation=True)



In [11]:
train_dataset = train_dataset.map(preprocess, batched=True)
test_dataset = test_dataset.map(preprocess, batched=True)

Map:   0%|          | 0/3845 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map: 100%|██████████| 3845/3845 [00:00<00:00, 7334.68 examples/s]
Map: 100%|██████████| 8545/8545 [00:01<00:00, 6465.55 examples/s]


In [12]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels = 2)
model.gradient_checkpointing_enable()

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred 
    preds = np.argmax(logits, axis =-1)
    return {
        "accuracy": accuracy.compute(predictions=preds, references = labels)["accuracy"],
        "f1":f1.compute(predictions=preds, references=labels)["f1"]
    }

In [14]:
import transformers

In [15]:
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    num_train_epochs=4,
    weight_decay=0.01,
    warmup_ratio=0.1,
    load_best_model_at_end=True,
    save_total_limit=1,
    fp16=True,
    eval_accumulation_steps=4,
)


In [16]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics,
)

trainer.train()

# save best model and tokenizer, alr done but just in case
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.055678,0.985606,0.977778
2,No log,0.0449,0.989116,0.983167
3,0.129800,0.069501,0.987127,0.980152
4,0.129800,0.073848,0.986542,0.979268




NameError: name 'output_dir' is not defined

In [17]:
trainer.save_model('./results/deberta')
tokenizer.save_pretrained('./results/deberta')

('./results/deberta/tokenizer_config.json',
 './results/deberta/special_tokens_map.json',
 './results/deberta/spm.model',
 './results/deberta/added_tokens.json',
 './results/deberta/tokenizer.json')

In [None]:
trainer.evaluate()

