<a href="https://colab.research.google.com/github/Di9mar/ada4b/blob/main/text%20classification%20initial%20run.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Data source: the GPT-wiki-intro dataset hosted on the Hugging Face Hub
# https://huggingface.co/datasets/aadityaubhat/GPT-wiki-intro/viewer/default/train

In [2]:
# Upgrade pip
!pip install --upgrade pip

# Install required packages
!pip install datasets transformers torch scikit-learn accelerate

# If you specifically need the 'torch' extras from transformers
!pip install transformers[torch] --upgrade

[0m

In [3]:
from datasets import load_dataset
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
import numpy as np
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer, DistilBertConfig, DistilBertTokenizer
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score
import os

In [4]:
from google.colab import drive

# Attach Google Drive so that paths under /content/drive are available to later cells
drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# Locations on the mounted Drive used by the rest of the notebook
base_path = "/content/drive/My Drive/ColabData/MyModel"
checkpoint_path = f"{base_path}/checkpoints"      # Trainer checkpoints
trained_model_path = f"{base_path}/trained_model"  # final model + tokenizer
logs_path = f"{base_path}/logs"                    # Trainer logging directory
csv_path = f"{base_path}/wiki_data.csv"            # cached copy of the dataset

# Make sure every directory exists (no error if it is already there)
for directory in (base_path, checkpoint_path, trained_model_path, logs_path):
    os.makedirs(directory, exist_ok=True)

In [6]:
# Fetch the dataset from huggingface.co
dataset = load_dataset("aadityaubhat/GPT-wiki-intro")

# Convert each split to pandas and stack them into one DataFrame
df = pd.concat([split_data.to_pandas() for split_data in dataset.values()])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [7]:
# Persist the DataFrame to Google Drive as CSV (the index is not written)
df.to_csv(path_or_buf=csv_path, index=False)

# Read the CSV back so `df` reflects exactly what is stored on Drive
df = pd.read_csv(filepath_or_buffer=csv_path)

In [8]:
# Use only a fraction of the data for faster training iterations
# NOTE: the fraction is applied per class, so 0.02 keeps 2% of the rows for the
# human texts AND 2% for the AI texts (the combined set has 2 * fraction * len(df) items).
fraction = 0.02
SEED = 42  # Fixed seed so the train/validation split is identical on every run

# Calculate the number of samples to include for each class
num_human_samples = int(len(df) * fraction)
num_ai_samples = num_human_samples  # same count for both classes keeps the data balanced

# Create a balanced dataset with an equal number of human and AI-generated samples.
# Slicing before .tolist() avoids materialising the full column as a Python list.
balanced_texts = (
    df['wiki_intro'].iloc[:num_human_samples].tolist()
    + df['generated_intro'].iloc[:num_ai_samples].tolist()
)
balanced_labels = [0] * num_human_samples + [1] * num_ai_samples  # 0 = human, 1 = AI

# Split the balanced dataset into training and validation sets.
# `random_state` makes the split reproducible; `stratify` keeps the 50/50 class
# ratio in both the training and the validation portion.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    balanced_texts,
    balanced_labels,
    test_size=0.1,
    random_state=SEED,
    stratify=balanced_labels,
)

# DistilBERT is small and fast, which keeps training iterations quick
model_name = "distilbert-base-uncased"

# Two output labels: one per class of the binary classification task
config = DistilBertConfig.from_pretrained(model_name, num_labels=2)

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    config=config,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [9]:
# Define dataset class
class TextDataset(torch.utils.data.Dataset):
    """Dataset that tokenizes all texts up front and returns one tensor dict per index.

    Args:
        texts: Sequence of strings; passed to ``tokenizer`` in a single call.
        labels: Sequence of labels, one per text, indexable by integer position.
        tokenizer: Callable invoked as ``tokenizer(texts, truncation=True, padding=True)``
            that returns a mapping from field name to per-example sequences.
        max_length: Optional length cap forwarded to the tokenizer. When ``None``
            (the default) the argument is not passed at all, so the tokenizer's
            own limit applies.

    Raises:
        ValueError: If ``texts`` and ``labels`` do not have the same length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=None):
        # Fail early: __len__ is based on labels while the encodings are based on
        # texts, so a mismatch would otherwise surface as an index error mid-training
        # or as silently ignored examples.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} texts and {len(labels)} labels"
            )

        tokenizer_kwargs = {"truncation": True, "padding": True}
        if max_length is not None:
            tokenizer_kwargs["max_length"] = max_length

        self.encodings = tokenizer(texts, **tokenizer_kwargs)
        self.labels = labels

    def __getitem__(self, idx):
        """Return the encoded fields for example ``idx`` plus its label under ``'labels'``."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)

# Tokenized training examples
train_dataset = TextDataset(
    texts=train_texts,
    labels=train_labels,
    tokenizer=tokenizer,
)

# Tokenized validation examples
val_dataset = TextDataset(
    texts=val_texts,
    labels=val_labels,
    tokenizer=tokenizer,
)

In [10]:
# Define training arguments
# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`
# and reject the old keyword, while older releases only know the old one.
# The install cell does not pin a version, so pick whichever name the installed
# TrainingArguments dataclass actually declares.
eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir=checkpoint_path,
    num_train_epochs=1,  # A single pass over the training subset
    per_device_train_batch_size=16,  # Adjust based on your GPU memory
    gradient_accumulation_steps=2,  # Effective batch size per device = 16 * 2
    logging_dir=logs_path,
    logging_steps=50,
    save_strategy="epoch",  # Kept equal to the evaluation strategy below
    load_best_model_at_end=True,
    **{eval_strategy_kwarg: "epoch"},  # Evaluate at the end of every epoch
)

In [None]:
def compute_metrics(eval_pred):
    """Compute classification accuracy for the Trainer's evaluation loop.

    Args:
        eval_pred: The (predictions, label_ids) pair supplied by the Trainer;
            predictions are the model's logits with one column per class.

    Returns:
        dict: ``{"accuracy": <fraction of correctly classified examples>}``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)  # class with the highest logit
    return {"accuracy": accuracy_score(labels, predictions)}


# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # Without this, evaluation reports only the loss and no accuracy.
    compute_metrics=compute_metrics,
)

# Train the model for one epoch
trainer.train()

# Save the model and tokenizer at the end of training
trainer.save_model(trained_model_path)
tokenizer.save_pretrained(trained_model_path)

Epoch,Training Loss,Validation Loss


In [None]:
# Run evaluation with the trainer and keep the returned metrics
results = trainer.evaluate()

# Report completion followed by the metrics, one print call per line
status_lines = [
    ("Training completed successfully.",),
    ("Evaluation results:", results),
]
for parts in status_lines:
    print(*parts)