In [None]:
!pip uninstall -y pyarrow datasets
!pip install --no-use-pep517 pyarrow
!pip install datasets
!pip install torch transformers
!pip install transformers[torch]
!pip install accelerate -U

In [None]:
from datasets import load_dataset

# Load the "all" configuration of the Hello-SimpleAI/HC3 dataset.
dataset = load_dataset(path="Hello-SimpleAI/HC3", name="all")

In [None]:
# Print the loaded dataset object to inspect what it contains.
print(dataset)

In [None]:
# Pull the two answer columns out of the train split, one variable per column.
human_answers, chatgpt_answers = (
    dataset['train'][column] for column in ('human_answers', 'chatgpt_answers')
)

In [None]:
# Show the first five entries of each column, with the container type of the
# human answers in between; one item per output line.
print(human_answers[:5], type(human_answers), chatgpt_answers[:5], sep="\n")

In [None]:
from transformers import DistilBertTokenizer
from datasets import Dataset, DatasetDict

# Initialize the tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Each row of the answer columns is indexed with [0] below, i.e. it is treated
# as a list of answers; only the first answer of every non-empty row is kept.
# Rows with an empty list are skipped rather than replaced by a placeholder
# string: a shared placeholder would be added under BOTH labels and give the
# classifier identical texts with contradictory targets.
human_answers = [answers[0] for answers in dataset['train']['human_answers'] if answers]
chatgpt_answers = [answers[0] for answers in dataset['train']['chatgpt_answers'] if answers]

# Build a flat text-classification dataset: label 0 = human answer,
# label 1 = ChatGPT answer. Label counts follow the filtered list lengths,
# so texts and labels always stay aligned.
data_dict = {
    "texts": human_answers + chatgpt_answers,
    "labels": [0] * len(human_answers) + [1] * len(chatgpt_answers)
}
hf_dataset = Dataset.from_dict(data_dict)

# Define the preprocessing function to tokenize the data
def preprocess_function(examples):
    """Tokenize a batch of examples for sequence classification.

    Args:
        examples: Mapping with a 'texts' entry (the raw strings to tokenize)
            and a 'labels' entry (passed through unchanged).

    Returns:
        A dict with exactly three keys: 'input_ids' and 'attention_mask' taken
        from the module-level ``tokenizer`` output, and 'labels' copied from
        the input.
    """
    # Every text is truncated and padded to a fixed length of 512 tokens.
    encoded = tokenizer(
        examples['texts'],
        padding="max_length",
        truncation=True,
        max_length=512,
    )

    # Keep only the tokenizer fields the model consumes, then attach the labels.
    features = {field: encoded[field] for field in ('input_ids', 'attention_mask')}
    features['labels'] = examples['labels']
    return features

# Tokenize the whole dataset; batched=True hands preprocess_function a batch of rows per call
tokenized_datasets = hf_dataset.map(function=preprocess_function, batched=True)

# Sanity check: print a selection holding only the first two processed rows
print(tokenized_datasets.select(indices=range(2)))

In [None]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, TrainingArguments, Trainer

import inspect

# Split the dataset: hold out 10% for evaluation, with a fixed seed so the split is repeatable
train_test_split = tokenized_datasets.train_test_split(test_size=0.1, seed=42)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

# Load the model with a two-class classification head (labels 0 and 1)
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

# The step-based evaluation option was renamed from `evaluation_strategy` to
# `eval_strategy` in newer transformers releases. The install cell does not
# pin a version, so use whichever keyword the installed TrainingArguments accepts.
_training_args_params = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_args_params else "evaluation_strategy"
)

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    # Without an explicit eval_steps, step-based evaluation falls back to
    # logging_steps (10), i.e. a full pass over the eval split every 10
    # training steps. Evaluate every 500 steps instead.
    eval_steps=500,
    **{_eval_strategy_key: "steps"},
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset
)

# Start training
trainer.train()