In [1]:
!pip uninstall -y pyarrow datasets
!pip install --no-use-pep517 pyarrow
!pip install datasets
!pip install torch transformers
!pip install transformers[torch]
!pip install accelerate -U
!pip install scikit-learn

Found existing installation: pyarrow 16.0.0
Uninstalling pyarrow-16.0.0:
  Successfully uninstalled pyarrow-16.0.0
Found existing installation: datasets 2.19.0
Uninstalling datasets-2.19.0:
  Successfully uninstalled datasets-2.19.0
Collecting pyarrow
  Using cached pyarrow-16.0.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.0 kB)
Using cached pyarrow-16.0.0-cp311-cp311-manylinux_2_28_x86_64.whl (40.8 MB)
Installing collected packages: pyarrow
Successfully installed pyarrow-16.0.0
Collecting datasets
  Using cached datasets-2.19.0-py3-none-any.whl.metadata (19 kB)
Using cached datasets-2.19.0-py3-none-any.whl (542 kB)
Installing collected packages: datasets
Successfully installed datasets-2.19.0


In [2]:
from datasets import load_dataset

# Fetch the HC3 corpus (human vs. ChatGPT answers), "all" configuration.
dataset = load_dataset(path="Hello-SimpleAI/HC3", name="all")

In [3]:
# Show the splits, column names and row counts of the loaded DatasetDict.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'human_answers', 'chatgpt_answers', 'source'],
        num_rows: 24322
    })
})


In [4]:
# Pull the two answer columns out of the train split; each entry is itself a
# list of answer strings for one question.
human_answers, chatgpt_answers = (
    dataset['train'][column] for column in ('human_answers', 'chatgpt_answers')
)

In [5]:
# Peek at the first five entries of each column and at the container type,
# one item per output line.
print(human_answers[:5], type(human_answers), chatgpt_answers[:5], sep="\n")

[['Basically there are many categories of " Best Seller " . Replace " Best Seller " by something like " Oscars " and every " best seller " book is basically an " oscar - winning " book . May not have won the " Best film " , but even if you won the best director or best script , you \'re still an " oscar - winning " film . Same thing for best sellers . Also , IIRC the rankings change every week or something like that . Some you might not be best seller one week , but you may be the next week . I guess even if you do n\'t stay there for long , you still achieved the status . Hence , # 1 best seller .', "If you 're hearing about it , it 's because it was a very good or very well - publicized book ( or both ) , and almost every good or well - publicized book will be # 1 on the NY Times bestseller list for at least a little bit . Kindof like how almost every big or good movies are # 1 at the box office on their opening weekend .", "One reason is lots of catagories . However , how the NY Tim

In [6]:
from transformers import DistilBertTokenizer
from datasets import Dataset, DatasetDict

# Initialize the tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Optional cap on how many answers are kept per class. None keeps everything;
# set a small number (e.g. 500) for a quick smoke run.
MAX_ANSWERS_PER_CLASS = None

# Every row stores a *list* of answers per source. Flatten those lists so that
# each individual answer becomes its own example (all answers are kept, not just
# the first one). Rows with an empty list simply contribute nothing.
human_answers = [
    answer
    for answers in dataset['train']['human_answers']
    for answer in answers
][:MAX_ANSWERS_PER_CLASS]
chatgpt_answers = [
    answer
    for answers in dataset['train']['chatgpt_answers']
    for answer in answers
][:MAX_ANSWERS_PER_CLASS]

# Build a single Hugging Face Dataset: human answers first (label 0), then
# ChatGPT answers (label 1).
data_dict = {
    "texts": human_answers + chatgpt_answers,
    "labels": [0] * len(human_answers) + [1] * len(chatgpt_answers)
}
hf_dataset = Dataset.from_dict(data_dict)

# Define the preprocessing function to tokenize the data
def preprocess_function(examples):
    """Tokenize one batch of rows.

    Reads ``examples['texts']`` and ``examples['labels']``. Texts are padded
    and truncated to 512 tokens; labels are passed through unchanged.

    Returns a dict with ``input_ids``, ``attention_mask`` and ``labels``.
    """
    encoded = tokenizer(
        examples['texts'],
        max_length=512,
        truncation=True,
        padding="max_length",
    )

    # Keep only the two tokenizer outputs that are used, then attach the labels.
    batch = {key: encoded[key] for key in ('input_ids', 'attention_mask')}
    batch['labels'] = examples['labels']
    return batch

# Tokenize the whole dataset batch by batch with preprocess_function.
tokenized_datasets = hf_dataset.map(function=preprocess_function, batched=True)

# Sanity check: print the schema of a two-row slice.
print(tokenized_datasets.select(range(2)))

Map:   0%|          | 0/85449 [00:00<?, ? examples/s]

Dataset({
    features: ['texts', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 2
})


In [7]:
# Shell out to nvidia-smi to show the GPU, driver version and current memory use.
!nvidia-smi

Tue Apr 23 16:59:11 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03             Driver Version: 535.129.03   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:AF:00.0 Off |                    0 |
| N/A   48C    P8              16W /  70W |      2MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [24]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score
import torch
import torch.cuda

import inspect

# Report the accelerator situation. current_device() initialises CUDA and
# raises on a host without a GPU, so it is only queried when CUDA is available.
print(torch.cuda.is_available())
print(torch.cuda.device_count())
if torch.cuda.is_available():
    print(torch.cuda.current_device())

# Split the dataset: 80% train, then the remaining 20% is halved into
# validation and test. Fixed seeds keep the split reproducible.
train_test_split = tokenized_datasets.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
test_val_dataset = train_test_split['test']

test_val_split = test_val_dataset.train_test_split(test_size=0.5, seed=42)
eval_dataset = test_val_split['train']
test_dataset = test_val_split['test']

# Load the model
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
print(model.device)

# Check for GPU availability and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(model.device)

# Newer transformers releases call this argument `eval_strategy`; older ones
# only know `evaluation_strategy`. Use whichever the installed version accepts.
_eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments).parameters
    else "evaluation_strategy"
)

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    eval_steps=100,
    **{_eval_strategy_kwarg: "steps"},
)

# Define the metrics function
def compute_metrics(pred):
    """Score a prediction object with accuracy and binary F1.

    ``pred`` must expose ``label_ids`` (true labels) and ``predictions``; the
    predicted class is the argmax over the last axis of ``predictions``.

    Returns a dict with the keys ``accuracy`` and ``f1_score``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return dict(
        accuracy=accuracy_score(y_true, y_pred),
        f1_score=f1_score(y_true, y_pred, average='binary'),
    )

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics
)

# Start training. If the run is stopped by hand (kernel interrupt), write out
# the weights reached so far before letting the interrupt propagate, so the
# partial progress is not lost.
try:
    trainer.train()
except KeyboardInterrupt:
    torch.save(model.state_dict(), 'model.pth')
    raise

# Save the trained model
torch.save(model.state_dict(), 'model.pth')

True
1
0


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


cpu
cuda:0


Step,Training Loss,Validation Loss


KeyboardInterrupt: 

In [22]:
# Write the current weights of `model` to disk as a plain state dict.
torch.save(obj=model.state_dict(), f='model.pth')

In [25]:
# Load the saved model state dictionary.
# map_location="cpu" lets a checkpoint that was written from the GPU load on a
# host without CUDA; weights_only=True limits unpickling to tensors and plain
# containers instead of arbitrary objects.
model_path = 'model.pth'
loaded_state_dict = torch.load(model_path, map_location="cpu", weights_only=True)
# Create a new instance of the model
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

# Load the state dictionary into the model
model.load_state_dict(loaded_state_dict)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<All keys matched successfully>

In [27]:
# Set the model to evaluation mode
model.eval()

# Move the model to the appropriate device (GPU if available, else CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# `trainer` keeps a reference to the model object it was constructed with, so
# trainer.predict() would not use the `model` prepared above. Wrap the current
# `model` in its own Trainer so the reported metrics belong to these weights.
eval_trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
)

# Evaluate the model on the test dataset
predictions = eval_trainer.predict(test_dataset)
test_metrics = compute_metrics(predictions)
print("Test Accuracy:", test_metrics['accuracy'])
print("Test F1 Score:", test_metrics['f1_score'])

Test Accuracy: 0.9225277940315975
Test F1 Score: 0.8708040593286495


In [23]:
import torch
from torch.nn.functional import softmax

# Check for GPU availability and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  # Move the model to the appropriate device

# Function to make a prediction on a single sentence
def predict(sentence):
    """Classify one sentence with the fine-tuned model.

    Args:
        sentence: The text to classify (a single string).

    Returns:
        A dict with ``"class"`` (0 or 1, following the training labels where
        human answers are 0 and ChatGPT answers are 1) and ``"probabilities"``
        (the softmax scores for class 0 and class 1, in that order).
    """
    # Inference mode: disables dropout so repeated calls give the same answer.
    model.eval()

    # Tokenize the sentence so it matches the format expected by the model
    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=512)
    # The tokenizer returns CPU tensors; move them to wherever the model lives.
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

    # Predict
    with torch.no_grad():  # Disable gradient calculation to speed up the process and reduce memory usage
        logits = model(**inputs).logits

    # Softmax over the class axis; row 0 is the single input sentence.
    probabilities = softmax(logits, dim=1)[0]

    prediction = probabilities.argmax().item()  # Get the index of the highest probability
    return {"class": prediction, "probabilities": probabilities.tolist()}

# Example usage: classify one sample answer and report the result.
user_sentence = "Yes, it is possible to be subject to a cash withdrawal even if you do not use an ATM. There are several ways that this could happen:Debit card transactions: If you make a purchase using your debit card, the merchant may automatically withdraw the amount of the purchase from your checking account. This is essentially the same as making a cash withdrawal.Bank fees: Some banks charge fees for maintaining an account or for using certain services. These fees may be automatically withdrawn from your account on a regular basis.Automatic payments: If you have set up automatic payments for bills or other expenses, the amount of the payment will be withdrawn from your account when it is due.Check payments: If you write a check to pay for something, the recipient may deposit the check and withdraw the funds from your account.Electronic transfers: You may also be subject to a cash withdrawal if you authorize an electronic transfer of funds from your account to another account.In summary, there are many ways that you could be subject to a cash withdrawal even if you do not use an ATM. It is important to carefully track your account balance and be aware of any automatic transactions or payments that may be taking place."
result = predict(user_sentence)
print(f"Predicted Class: {result['class']}")
print(f"Probabilities: {result['probabilities']}")

Predicted Class: 1
Probabilities: [0.0012614225270226598, 0.9987385869026184]
