In [1]:
# Source:
# https://huggingface.co/datasets/aadityaubhat/GPT-wiki-intro/viewer/default/train

In [2]:
# Upgrade pip
!pip install --upgrade pip

# Install required packages
!pip install datasets transformers torch scikit-learn accelerate

# If you specifically need the 'torch' extras from transformers
!pip install transformers[torch] --upgrade

Collecting pip
  Downloading pip-23.3.2-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 23.1.2
    Uninstalling pip-23.1.2:
      Successfully uninstalled pip-23.1.2
Successfully installed pip-23.3.2
Collecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl.metadata (20 kB)
Collecting accelerate
  Downloading accelerate-0.26.1-py3-none-any.whl.metadata (18 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl.metadata (9.9 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading accelerate-0.26.

In [4]:
from datasets import load_dataset
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
import numpy as np
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer, DistilBertConfig, DistilBertTokenizer
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score
import os

In [5]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# The paths defined later in this notebook live under this mount point.
# `google.colab` is only importable when running on Colab.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
# Root folder on the mounted Drive; every other location is derived from it
base_path = "/content/drive/My Drive/1-1i01FVxECPT9vpfu6Ye5nEpz3qw1jnO/MyModel"

# Sub-folders for Trainer checkpoints, the final model and training logs
checkpoint_path = f"{base_path}/checkpoints"
trained_model_path = f"{base_path}/trained_model"
logs_path = f"{base_path}/logs"

# Location of the CSV copy of the dataset
csv_path = f"{base_path}/wiki_data.csv"

# Make sure every folder exists (no error if it is already there)
for directory in (base_path, checkpoint_path, trained_model_path, logs_path):
    os.makedirs(directory, exist_ok=True)

In [7]:
# Fetch the dataset from the Hugging Face Hub
dataset = load_dataset("aadityaubhat/GPT-wiki-intro")

# Convert every available split to pandas, then stack them into one DataFrame
split_frames = []
for split in dataset.keys():
    split_frames.append(dataset[split].to_pandas())
df = pd.concat(split_frames)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [8]:
# Save the DataFrame as a CSV file in the specified directory on Google Drive.
# index=False leaves the DataFrame index out of the file.
df.to_csv(csv_path, index=False)

# Load the data into a Pandas DataFrame.
# From here on `df` is the frame parsed back from disk, not the one built
# directly from the dataset splits.
# NOTE(review): the round trip lets pandas re-infer dtypes and missing values
# from text — confirm that is intended.
df = pd.read_csv(csv_path)

In [9]:
# Hold out 10% of the rows as test data (fixed seed for a repeatable split)
train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)

# Cut the training frame into three consecutive slices. The first two hold
# exactly split_size rows; the last one also receives any remainder rows.
split_size = len(train_df) // 3
first_cut = split_size
second_cut = 2 * split_size
df1 = train_df.iloc[:first_cut]
df2 = train_df.iloc[first_cut:second_cut]
df3 = train_df.iloc[second_cut:]

In [10]:
# Fraction of each training subset that is held out for validation.
VAL_FRACTION = 0.1


def build_texts_and_labels(frame):
    """Flatten a paired DataFrame into parallel lists of texts and labels.

    Every row yields two examples: its 'wiki_intro' text with label 0 and its
    'generated_intro' text with label 1. All 'wiki_intro' texts come first,
    followed by all 'generated_intro' texts.

    Args:
        frame: DataFrame with 'wiki_intro' and 'generated_intro' columns.

    Returns:
        A (texts, labels) tuple of two lists with equal length (2 * len(frame)).
    """
    texts = frame['wiki_intro'].tolist() + frame['generated_intro'].tolist()
    labels = [0] * len(frame) + [1] * len(frame)
    return texts, labels


def hold_out_validation(subset, fraction):
    """Split one training subset positionally into (train_part, val_part).

    The first `fraction` of the rows becomes the validation part and the rest
    stays for training, so no row ends up in both.
    """
    n_val = int(len(subset) * fraction)
    return subset.iloc[n_val:], subset.iloc[:n_val]


# Carve a validation set out of the training subsets. The later cells use
# val_texts / val_labels, which were previously never defined. The slices are
# positional, so the result does not depend on the DataFrame index, and the
# original df1/df2/df3 are left untouched (re-running this cell is safe).
train_part_df1, val_part_df1 = hold_out_validation(df1, VAL_FRACTION)
train_part_df2, val_part_df2 = hold_out_validation(df2, VAL_FRACTION)
train_part_df3, val_part_df3 = hold_out_validation(df3, VAL_FRACTION)
val_df = pd.concat([val_part_df1, val_part_df2, val_part_df3])

# Prepare the texts and labels for the test data
test_texts, test_labels = build_texts_and_labels(test_df)

# Validation data (held out from the training subsets above)
val_texts, val_labels = build_texts_and_labels(val_df)

# For df1
train_texts_df1, train_labels_df1 = build_texts_and_labels(train_part_df1)

# For df2
train_texts_df2, train_labels_df2 = build_texts_and_labels(train_part_df2)

# For df3
train_texts_df3, train_labels_df3 = build_texts_and_labels(train_part_df3)



In [11]:
# DistilBERT is used as a small, fast base model for quick training
model_name = "distilbert-base-uncased"

# Tokenizer matching the pretrained checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Pretrained configuration with a two-class classification head
config = DistilBertConfig.from_pretrained(model_name, num_labels=2)

# Pretrained encoder weights plus a freshly initialised classification head
model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Dataset wrapper that feeds tokenized texts and labels to the Trainer
class TextDataset(torch.utils.data.Dataset):
    """Map-style dataset over texts that are tokenized once, up front.

    The tokenizer is called a single time on the full list of texts with
    truncation and padding enabled; individual items are converted to tensors
    on access.
    """

    def __init__(self, texts, labels, tokenizer):
        """Tokenize `texts` and keep `labels` alongside the encodings."""
        self.encodings = tokenizer(texts, truncation=True, padding=True)
        self.labels = labels

    def __len__(self):
        """Number of examples, taken from the label list."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one example as a dict of tensors, including a 'labels' entry."""
        item = {}
        for key, val in self.encodings.items():
            item[key] = torch.tensor(val[idx])
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# Tokenize each of the three training subsets into its own dataset
train_dataset_df1, train_dataset_df2, train_dataset_df3 = (
    TextDataset(texts, labels, tokenizer)
    for texts, labels in (
        (train_texts_df1, train_labels_df1),
        (train_texts_df2, train_labels_df2),
        (train_texts_df3, train_labels_df3),
    )
)

# Validation dataset. NOTE: val_texts and val_labels are not created in this
# cell; they must already have been defined by an earlier cell.
val_dataset = TextDataset(texts=val_texts, labels=val_labels, tokenizer=tokenizer)

# Test dataset
test_dataset = TextDataset(texts=test_texts, labels=test_labels, tokenizer=tokenizer)

In [None]:
from transformers import TrainingArguments

# Training configuration passed to every Trainer built in this notebook
training_args = TrainingArguments(
    # Output locations: checkpoints and logs
    output_dir=checkpoint_path,
    logging_dir=logs_path,
    logging_steps=50,
    # A single epoch per Trainer run; repeated passes are driven by a manual loop
    num_train_epochs=1,
    # 16 samples per device, accumulated over 2 steps before each optimizer
    # update; lower the batch size if the GPU runs out of memory
    per_device_train_batch_size=16,
    gradient_accumulation_steps=2,
    # Evaluate and checkpoint once per epoch, then restore the best checkpoint
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [None]:
# Factory that builds a Trainer for one particular training dataset
def initialize_trainer_for_subset(train_dataset, eval_dataset, model, training_args):
    """Create a Trainer for `model` with the given datasets and arguments.

    Args:
        train_dataset: Dataset the Trainer will train on.
        eval_dataset: Dataset the Trainer will evaluate on.
        model: Model instance handed to the Trainer.
        training_args: TrainingArguments controlling the run.

    Returns:
        A new Trainer instance.
    """
    trainer_kwargs = {
        "model": model,
        "args": training_args,
        "train_dataset": train_dataset,
        "eval_dataset": eval_dataset,
    }
    return Trainer(**trainer_kwargs)

# How many full passes to make over all training subsets
num_manual_epochs = 3

# The subsets are visited in this fixed order during every manual epoch
training_subsets = [train_dataset_df1, train_dataset_df2, train_dataset_df3]

# Outer loop: manual epochs
for epoch in range(num_manual_epochs):
    print(f"Starting manual epoch {epoch + 1}/{num_manual_epochs}")

    # Inner loop: one Trainer run per training subset
    for subset_index, train_dataset in enumerate(training_subsets):
        print(f"Training on subset {subset_index + 1}")

        # A new Trainer is built for each subset; the same `model` object is
        # passed in every time
        trainer = initialize_trainer_for_subset(
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            model=model,
            training_args=training_args,
        )

        # One Trainer run (a single epoch, per training_args) on this subset
        trainer.train()

        # Optionally, evaluate the model after each subset
        # results = trainer.evaluate()

    # After completing all subsets for the manual epoch, you can save the model
    # print(f"Saving model after manual epoch {epoch + 1}")
    # trainer.save_model(f"{trained_model_path}/manual_epoch_{epoch + 1}")
    # tokenizer.save_pretrained(f"{trained_model_path}/manual_epoch_{epoch + 1}")

In [None]:
# Evaluation / Testing

def compute_accuracy(eval_pred):
    """Compute classification accuracy from the Trainer's evaluation output.

    Args:
        eval_pred: The (logits, labels) pair the Trainer passes to its
            `compute_metrics` callback.

    Returns:
        A dict with a single "accuracy" entry; the Trainer reports it under
        the "eval_accuracy" key.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": accuracy_score(labels, predictions)}


# Create the test dataset
test_dataset = TextDataset(test_texts, test_labels, tokenizer)

# Function to initialize the Trainer for evaluation
def initialize_trainer_for_evaluation(model, training_args, eval_dataset, compute_metrics=None):
    """Create a Trainer that is only used to evaluate `model`.

    Args:
        model: Model instance to evaluate.
        training_args: TrainingArguments handed to the Trainer.
        eval_dataset: Dataset to evaluate on.
        compute_metrics: Optional callback turning predictions into a dict of
            metrics. When omitted, the Trainer is built exactly as before and
            no custom metrics are reported.

    Returns:
        A new Trainer instance without a training dataset.
    """
    return Trainer(
        model=model,
        args=training_args,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )

# Initialize the Trainer for evaluation with the test dataset, reporting
# accuracy in addition to the loss
eval_trainer = initialize_trainer_for_evaluation(
    model, training_args, test_dataset, compute_metrics=compute_accuracy
)

# Evaluate the model on the test set
results = eval_trainer.evaluate()

print("Training completed successfully.")
print("Evaluation results:", results)