In [25]:
!pip uninstall -y pyarrow datasets
!pip install --no-use-pep517 pyarrow
!pip install datasets
!pip install torch transformers
!pip install transformers[torch]
!pip install accelerate -U
!pip install scikit-learn
!pip install google-api-python-client
!pip install google

Found existing installation: pyarrow 16.0.0
Uninstalling pyarrow-16.0.0:
  Successfully uninstalled pyarrow-16.0.0
Found existing installation: datasets 2.19.0
Uninstalling datasets-2.19.0:
  Successfully uninstalled datasets-2.19.0
Collecting pyarrow
  Using cached pyarrow-16.0.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.0 kB)
Using cached pyarrow-16.0.0-cp311-cp311-manylinux_2_28_x86_64.whl (40.8 MB)
Installing collected packages: pyarrow
Successfully installed pyarrow-16.0.0
Collecting datasets
  Using cached datasets-2.19.0-py3-none-any.whl.metadata (19 kB)
Using cached datasets-2.19.0-py3-none-any.whl (542 kB)
Installing collected packages: datasets
Successfully installed datasets-2.19.0
Collecting google-api-python-client
  Downloading google_api_python_client-2.127.0-py2.py3-none-any.whl.metadata (6.7 kB)
Collecting httplib2<1.dev0,>=0.19.0 (from google-api-python-client)
  Downloading httplib2-0.22.0-py3-none-any.whl.metadata (2.6 kB)
Collecting google-auth-httplib2<1.0

In [4]:
import os
import zipfile

import pandas as pd
import s3fs
import torch
import torch.cuda
from datasets import Dataset, DatasetDict, load_dataset
from sklearn.metrics import accuracy_score, classification_report, f1_score
from torch.nn.functional import softmax
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, TrainingArguments, Trainer

## Load the challenge data (HC3 dataset)

In [5]:
# Fetch the "all" configuration of the HC3 corpus from the Hugging Face Hub.
dataset = load_dataset("Hello-SimpleAI/HC3", name="all")

# Each row stores a *list* of answers per source; flatten both columns into
# plain lists of strings (rows with an empty list simply contribute nothing).
human_answers = []
for answer_group in dataset['train']['human_answers']:
    human_answers.extend(answer_group)

chatgpt_answers = []
for answer_group in dataset['train']['chatgpt_answers']:
    chatgpt_answers.extend(answer_group)

## Model

## Create a new model from the pretrained DistilBERT checkpoint (with a freshly initialized classification head)

In [6]:
# Pretrained DistilBERT encoder plus a 2-class sequence-classification head.
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=2,
)
# Show where the weights currently live (before any .to(device) call).
print(model.device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


cpu


## Load the saved model

In [36]:
# Restore previously fine-tuned weights from disk.
model_path = 'model_distilbert_dataset1.pth'
# map_location='cpu' lets a checkpoint that was saved from a GPU run be loaded
# on a machine without CUDA; the model is moved to the target device later.
# weights_only=True restricts unpickling to tensors and plain containers, which
# is all a state dict needs.
loaded_state_dict = torch.load(model_path, map_location='cpu', weights_only=True)

# Build the same architecture the checkpoint was saved from (2 output labels).
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

# Overwrite the freshly initialized parameters with the saved ones; the cell
# output reports whether every key matched.
model.load_state_dict(loaded_state_dict)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<All keys matched successfully>

# Training and testing

In [7]:
# Tokenizer that matches the 'distilbert-base-uncased' checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Build one labelled corpus: human answers first (label 0), then ChatGPT
# answers (label 1), with the label list aligned position-by-position.
data_dict = {
    "texts": [*human_answers, *chatgpt_answers],
    "labels": [0 for _ in human_answers] + [1 for _ in chatgpt_answers],
}
hf_dataset = Dataset.from_dict(data_dict)

# Batch-level preprocessing used with Dataset.map(batched=True).
def preprocess_function(examples):
    """Tokenize a batch of texts and attach their labels.

    Args:
        examples: mapping with a 'texts' entry (the raw strings) and a
            'labels' entry (their class ids).

    Returns:
        dict with 'input_ids', 'attention_mask' and 'labels'.
    """
    # Every text is truncated and padded to exactly 512 tokens.
    encoded = tokenizer(
        examples['texts'],
        max_length=512,
        truncation=True,
        padding="max_length",
    )

    batch = {key: encoded[key] for key in ('input_ids', 'attention_mask')}
    batch['labels'] = examples['labels']
    return batch

# Tokenize the whole corpus in batches.
tokenized_datasets = hf_dataset.map(function=preprocess_function, batched=True)

# Sanity check: show the schema of a two-row slice.
preview = tokenized_datasets.select(range(2))
print(preview)

Map:   0%|          | 0/85449 [00:00<?, ? examples/s]

Dataset({
    features: ['texts', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 2
})


In [8]:
# Report the CUDA setup. current_device() raises when no CUDA runtime is usable,
# so it is only queried when CUDA is available; the rest of the cell already
# falls back to the CPU in that case.
cuda_available = torch.cuda.is_available()
print(cuda_available)
print(torch.cuda.device_count())
if cuda_available:
    print(torch.cuda.current_device())

# 80 / 10 / 10 split: hold out 20% of the data, then cut that hold-out in half
# to obtain the validation and test sets. Both splits use a fixed seed.
train_test_split = tokenized_datasets.train_test_split(test_size=0.2, seed=42)
train_dataset, test_val_dataset = train_test_split['train'], train_test_split['test']

test_val_split = test_val_dataset.train_test_split(test_size=0.5, seed=42)
eval_dataset, test_dataset = test_val_split['train'], test_val_split['test']

# Train on the GPU when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)
# Confirm where the model ended up.
print(model.device)

# Hyper-parameters for the fine-tuning run.
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir='./results',
    logging_dir='./logs',
    # Optimisation schedule: a single pass over the training split.
    num_train_epochs=1,
    warmup_steps=500,
    weight_decay=0.01,
    # Batch sizes (per device).
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    # Log the training loss every 10 steps and evaluate every 100 steps.
    # NOTE(review): newer transformers releases rename `evaluation_strategy`
    # to `eval_strategy` -- confirm against the installed version.
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=100,
)

# Metric callback handed to the Trainer.
def compute_metrics(pred):
    """Score one evaluation pass.

    Args:
        pred: object exposing `label_ids` (true classes) and `predictions`
            (per-class scores; the arg-max over the last axis is the
            predicted class).

    Returns:
        dict with 'accuracy' and the binary 'f1_score'.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1_score': f1_score(y_true, y_pred, average='binary'),
    }

# Wire the model, hyper-parameters, data splits and metric callback together.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Run fine-tuning; validation metrics are reported at each evaluation step.
trainer.train()

True
1
0
cuda:0


Step,Training Loss,Validation Loss,Accuracy,F1 Score
100,0.3212,0.282145,0.935518,0.896681
200,0.1027,0.124904,0.962434,0.937341
300,0.0516,0.278796,0.931071,0.899915
400,0.2294,0.107889,0.960913,0.940696
500,0.0795,0.068088,0.981393,0.970484
600,0.0721,0.279493,0.948391,0.923504
700,0.201,0.069761,0.983265,0.973095
800,0.0993,0.055836,0.983733,0.973808
900,0.2046,0.24844,0.950029,0.914274
1000,0.1839,0.061227,0.98701,0.979471


KeyboardInterrupt: 

In [9]:
# Persist only the model weights (state dict), not the whole model object.
torch.save(obj=model.state_dict(), f='model_distilbert_dataset1.pth')

In [10]:
# Final evaluation on the held-out test split.

# Put the model in evaluation mode on the GPU when available, else the CPU.
model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Run inference over the test split; the result carries the model scores
# (`.predictions`) and the true labels (`.label_ids`).
predictions = trainer.predict(test_dataset)


def compute_test_report(pred):
    """Build a per-class classification report for a prediction result.

    Named differently from the `compute_metrics` callback given to the
    Trainer so that the callback is not shadowed by a function with a
    different return contract.

    Args:
        pred: object exposing `label_ids` and `predictions` (per-class
            scores; the arg-max over the last axis is the predicted class).

    Returns:
        The dict produced by `classification_report(..., output_dict=True)`.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return classification_report(y_true, y_pred, output_dict=True)


test_metrics = compute_test_report(predictions)

# Print every metric; entries that are themselves dicts (per-class and averaged
# rows) are expanded into one line per sub-metric.
print("Test Metrics:")
for metric, value in test_metrics.items():
    if isinstance(value, dict):
        for sub_metric, sub_value in value.items():
            print(f"{metric.capitalize()} {sub_metric.capitalize()}: {sub_value:.4f}")
    else:
        print(f"{metric.capitalize()}: {value:.4f}")

Test Metrics:
0 Precision: 0.9979
0 Recall: 0.9880
0 F1-score: 0.9929
0 Support: 5833.0000
1 Precision: 0.9747
1 Recall: 0.9956
1 F1-score: 0.9850
1 Support: 2712.0000
Accuracy: 0.9904
Macro avg Precision: 0.9863
Macro avg Recall: 0.9918
Macro avg F1-score: 0.9890
Macro avg Support: 8545.0000
Weighted avg Precision: 0.9906
Weighted avg Recall: 0.9904
Weighted avg F1-score: 0.9904
Weighted avg Support: 8545.0000


In [None]:
# Test on an individual sentence

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  # Move the model to the selected device


def predict(sentence):
    """Classify a single sentence with the fine-tuned model.

    Args:
        sentence: the text to classify.

    Returns:
        dict with
          - "class": index of the most probable class (0 = human,
            1 = ChatGPT, following the labels used to build the dataset),
          - "probabilities": list with the soft-max probability of each class.
    """
    # Make sure dropout is disabled, whatever mode the model was left in.
    model.eval()

    # Tokenize the sentence so it matches the format expected by the model.
    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=512)
    # The tokenizer returns CPU tensors; they must sit on the same device as
    # the model or the forward pass fails.
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

    # No gradients are needed for inference (faster, less memory).
    with torch.no_grad():
        logits = model(**inputs).logits

    # Convert logits to probabilities over the classes.
    probabilities = softmax(logits, dim=1)

    # Only one sentence was passed in, so read the first (and only) row.
    sentence_probabilities = probabilities[0]
    prediction = sentence_probabilities.argmax().item()
    return {"class": prediction, "probabilities": sentence_probabilities.tolist()}

# Demo: classify one sample answer. In the training labels, class 0 stands for
# human-written text and class 1 for ChatGPT-written text.
user_sentence = "Yes, it is possible to be subject to a cash withdrawal even if you do not use an ATM. There are several ways that this could happen:Debit card transactions: If you make a purchase using your debit card, the merchant may automatically withdraw the amount of the purchase from your checking account. This is essentially the same as making a cash withdrawal.Bank fees: Some banks charge fees for maintaining an account or for using certain services. These fees may be automatically withdrawn from your account on a regular basis.Automatic payments: If you have set up automatic payments for bills or other expenses, the amount of the payment will be withdrawn from your account when it is due.Check payments: If you write a check to pay for something, the recipient may deposit the check and withdraw the funds from your account.Electronic transfers: You may also be subject to a cash withdrawal if you authorize an electronic transfer of funds from your account to another account.In summary, there are many ways that you could be subject to a cash withdrawal even if you do not use an ATM. It is important to carefully track your account balance and be aware of any automatic transactions or payments that may be taking place."
result = predict(user_sentence)
predicted_class, class_probabilities = result["class"], result["probabilities"]
print("Predicted Class:", predicted_class)
print("Probabilities:", class_probabilities)

Predicted Class: 0
Probabilities: [0.9999910593032837, 8.935196092352271e-06]
