<a href="https://colab.research.google.com/github/Di9mar/ada4b/blob/main/Copy_of_Wiki_Classification_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Source:
# https://huggingface.co/datasets/aadityaubhat/GPT-wiki-intro/viewer/default/train

In [2]:
# Upgrade pip
!pip install --upgrade pip

# Install required packages
!pip install datasets transformers torch scikit-learn accelerate

# If you specifically need the 'torch' extras from transformers
!pip install transformers[torch] --upgrade

Successfully installed transformers-4.36.2
[0m

In [3]:
from datasets import load_dataset
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer, DistilBertConfig, TrainerCallback
from torch.utils.data import Dataset
import os
import gc
from datetime import datetime
import subprocess

In [4]:
# Mount Google Drive at /content/drive. Every path configured later in the
# notebook (cached dataset CSVs, checkpoints, trained models, logs) lives
# under this mount point, so this cell must run before any of them.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# All artefacts of this notebook live under a single Google Drive folder
base_path = "/content/drive/My Drive/ColabData"
checkpoint_path = f"{base_path}/checkpoints"        # Trainer output_dir
trained_model_path = f"{base_path}/trained_model"   # models saved after each manual epoch
logs_path = f"{base_path}/logs"                     # Trainer logging_dir
csv_path = f"{base_path}/wiki_data.csv"             # cached copy of the full dataset
# Four training subset files: subset_1.csv ... subset_4.csv.
# (The validation split is stored separately as validation_set.csv.)
subset_paths = [f"{base_path}/subset_{number}.csv" for number in range(1, 5)]

# Make sure every output directory exists before anything is written to it
for directory in (base_path, checkpoint_path, trained_model_path, logs_path):
    os.makedirs(directory, exist_ok=True)

# Reuse the CSV cached on Google Drive when it is there; otherwise download
# the dataset from huggingface.co once and cache it for later sessions.
if os.path.exists(csv_path):
    df = pd.read_csv(csv_path)
else:
    dataset = load_dataset("aadityaubhat/GPT-wiki-intro")

    # Stack every available split into a single DataFrame
    split_frames = [dataset[split_name].to_pandas() for split_name in dataset.keys()]
    df = pd.concat(split_frames)

    # Persist without the index column so the cached file round-trips cleanly
    df.to_csv(csv_path, index=False)

# Split the data into training (90%) and validation (10%) sets.
# The fixed random_state keeps the split identical across sessions.
df_subset, val_dataset = train_test_split(df, test_size=0.1, random_state=42)
val_dataset.to_csv(f"{base_path}/validation_set.csv", index=False)  # Save validation set

# Check if subsets already exist, if not create and save them
if not all(os.path.exists(path) for path in subset_paths):
    # Derive the number of subsets from the configured paths instead of
    # hard-coding it, so the two can never drift apart.
    num_subsets = len(subset_paths)
    split_size = len(df_subset) // num_subsets
    for i, subset_path in enumerate(subset_paths):
        start = i * split_size
        # The last subset also takes the remainder rows, so no training
        # example is silently dropped when the length is not divisible.
        stop = (i + 1) * split_size if i < num_subsets - 1 else len(df_subset)
        df_subset.iloc[start:stop].to_csv(subset_path, index=False)

# Free memory: the full frame is no longer needed whether or not the subsets
# had to be (re)created, so release it unconditionally.
del df
gc.collect()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/2.63k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/127M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [6]:
# Use a small, fast model for quick training (DistilBERT)
# Resume from a model saved directly in `trained_model_path` when one exists.
# Checking for config.json (rather than "directory is not empty") matters:
# the training loop stores its per-epoch models in sub-directories of
# `trained_model_path`, which makes the directory non-empty without making it
# a loadable checkpoint.
if os.path.isfile(os.path.join(trained_model_path, "config.json")):
    print(f"Loading model checkpoint from {trained_model_path}")
    model = AutoModelForSequenceClassification.from_pretrained(trained_model_path)
    tokenizer = AutoTokenizer.from_pretrained(trained_model_path)
    # Later cells pass `config=config`; define it on this branch too so they
    # do not fail with a NameError after resuming.
    config = model.config
else:
    # Initialize the model if a checkpoint doesn't exist
    model_name = "distilbert-base-uncased"
    config = DistilBertConfig.from_pretrained(model_name)
    config.num_labels = 2  # binary task: label 0 = wiki_intro, label 1 = generated_intro
    model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)
    tokenizer = AutoTokenizer.from_pretrained(model_name)


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'pre_classifier.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [7]:
# Define dataset class
class TextDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes a single text lazily on every lookup.

    Args:
        texts: Indexable collection of raw strings.
        labels: Indexable collection of labels, parallel to ``texts``.
        tokenizer: Callable invoked as
            ``tokenizer(text, truncation=True, padding='max_length',
            max_length=..., return_tensors='pt')`` that returns a mapping of
            tensors.
        max_length: Length every sample is truncated/padded to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        # The dataset size is defined by the number of labels
        return len(self.labels)

    def get_labels(self):
        """Return the label collection exactly as it was passed in."""
        return self.labels

    def __getitem__(self, idx):
        # Tokenization happens here, per item, instead of up front
        tokenized = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )
        sample = {}
        for field, tensor in tokenized.items():
            # Drop the leading dimension (dim 0) from each returned tensor
            sample[field] = tensor.squeeze(0)
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

def calculate_f1_score(predictions, true_labels):
    """Return the weighted-average F1 score of ``predictions`` against ``true_labels``.

    Note the argument order: predictions come first here, while the underlying
    ``f1_score`` call expects the true labels first.
    """
    # 'weighted' averages the per-class F1 by class support
    return f1_score(true_labels, predictions, average='weighted')

# Define custom callback class
class CustomCallback(TrainerCallback):
    """Trainer callback that logs the weighted F1 score after every evaluation.

    Args:
        trainer: The ``Trainer`` this callback is attached to. It is used to
            run predictions and to emit the resulting metric.
    """

    def __init__(self, trainer):
        self.trainer = trainer

    def on_evaluate(self, args, state, control, model=None, tokenizer=None, eval_dataset=None, **kwargs):
        """Compute ``eval_f1`` on the evaluation dataset and log it.

        ``model`` and ``tokenizer`` are optional so the hook keeps working if
        the Trainer does not supply them as keyword arguments.
        """
        # NOTE(review): the Trainer does not appear to pass `eval_dataset` to
        # callbacks, so fall back to the dataset the trainer was built with.
        # Confirm against the installed transformers version.
        dataset = eval_dataset if eval_dataset is not None else self.trainer.eval_dataset
        if dataset is None:
            # Nothing to score against; skip instead of crashing the run.
            return

        # Use the trainer held by this callback, not a notebook-level global.
        # This runs a second full inference pass over the evaluation set.
        predictions = self.trainer.predict(dataset).predictions.argmax(axis=1)
        true_labels = dataset.get_labels()
        # Named eval_f1 so the imported `f1_score` function is not shadowed.
        eval_f1 = calculate_f1_score(predictions, true_labels)
        # TrainerCallback has no `log_metrics`; log through the trainer instead.
        self.trainer.log({"eval_f1": float(eval_f1)})

In [8]:
# Training configuration shared by every Trainer created in this notebook
training_args = TrainingArguments(
    # Where Trainer checkpoints and logs are written
    output_dir=checkpoint_path,
    logging_dir=logs_path,
    logging_steps=50,
    # One pass per Trainer.train() call; the notebook loops over epochs manually
    num_train_epochs=1,
    # Batch of 16 per device, accumulated over 2 steps; tune to the available GPU memory
    per_device_train_batch_size=16,
    gradient_accumulation_steps=2,
    # Evaluate and save once per epoch, then restore the best checkpoint
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [9]:
# Training
print("Preparing training data.")
def prepare_training_data(df_subset):
    """Build a TextDataset from the two text columns of ``df_subset``.

    Texts from the 'wiki_intro' column are labelled 0 and texts from the
    'generated_intro' column are labelled 1; the wiki texts come first.
    Uses the notebook-level ``tokenizer``.
    """
    wiki_texts = df_subset['wiki_intro'].tolist()
    generated_texts = df_subset['generated_intro'].tolist()
    all_texts = wiki_texts + generated_texts
    all_labels = [0] * len(wiki_texts) + [1] * len(generated_texts)
    return TextDataset(all_texts, all_labels, tokenizer)

# Evaluation
print("Preparing validation data for final evaluation.")
def prepare_validation_data(val_dataset):
    """Build a TextDataset from the two text columns of the validation frame.

    Texts from the 'wiki_intro' column are labelled 0 and texts from the
    'generated_intro' column are labelled 1; the wiki texts come first.
    Uses the notebook-level ``tokenizer``.
    """
    wiki_texts = val_dataset['wiki_intro'].tolist()
    generated_texts = val_dataset['generated_intro'].tolist()
    combined_texts = wiki_texts + generated_texts
    combined_labels = [0] * len(wiki_texts) + [1] * len(generated_texts)
    return TextDataset(combined_texts, combined_labels, tokenizer)

Preparing training data.
Preparing validation data for final evaluation.


In [10]:
# Define a function to check GPU memory usage
def get_gpu_memory_usage(gpu_index=0):
    """Return the memory currently used on one GPU, as reported by nvidia-smi.

    Args:
        gpu_index: Position of the GPU in nvidia-smi's listing (default 0).

    Returns:
        int: The ``memory.used`` value for that GPU. With units suppressed
        nvidia-smi reports this in MiB, so it is an absolute amount and not a
        fraction of the total.

    Raises:
        FileNotFoundError / subprocess.CalledProcessError: if nvidia-smi is
            missing or fails.
        IndexError: if ``gpu_index`` is not a listed GPU.
    """
    result = subprocess.check_output(
        ["nvidia-smi", "--query-gpu=memory.used", "--format=csv,noheader,nounits"]
    )
    # nvidia-smi prints one line per GPU; parsing the whole output as a single
    # integer breaks as soon as more than one GPU is present.
    per_gpu_usage = [int(line) for line in result.decode().splitlines() if line.strip()]
    return per_gpu_usage[gpu_index]

In [None]:
# Define the number of manual epochs and initialize a list to store checkpoint paths
num_manual_epochs = 4  # Set the number of manual epochs
all_checkpoint_paths = []
gpu_memory_threshold = 0.9  # Fraction of total GPU memory above which the batch size is halved


def _latest_saved_checkpoint(epoch_index):
    """Return the newest loadable model directory saved for `epoch_index`, or None."""
    # Must match the naming scheme used when saving at the end of each epoch.
    prefix = f"manual_epoch_{epoch_index}_subset_"
    # Names differ only by their fixed-width timestamp suffix, so a plain sort
    # puts the most recent save last. Directories without config.json are
    # treated as incomplete and ignored.
    candidates = sorted(
        entry for entry in os.listdir(trained_model_path)
        if entry.startswith(prefix)
        and os.path.isfile(os.path.join(trained_model_path, entry, "config.json"))
    )
    return f"{trained_model_path}/{candidates[-1]}" if candidates else None


def _compute_eval_metrics(eval_prediction):
    """Convert logits to class ids and report the weighted F1 score.

    The Trainer prefixes metric names with `eval_`, so this shows up as `eval_f1`.
    """
    predicted_classes = eval_prediction.predictions.argmax(axis=1)
    return {"f1": float(calculate_f1_score(predicted_classes, eval_prediction.label_ids))}


# Determine the starting epoch from the models already saved on Drive.
# Epochs count as done only while they form an unbroken run from epoch 0.
starting_epoch = 0
for finished_epoch in range(num_manual_epochs):
    finished_checkpoint = _latest_saved_checkpoint(finished_epoch)
    if finished_checkpoint is None:
        break
    all_checkpoint_paths.append(finished_checkpoint)
    starting_epoch = finished_epoch + 1

# Continue from the most recent saved model, if any. The saved directory
# carries its own config.json, so no separate `config` object is needed.
if all_checkpoint_paths:
    resume_directory = all_checkpoint_paths[-1]
    print(f"Checkpoint found at {resume_directory}. Loading model from this checkpoint.")
    model = AutoModelForSequenceClassification.from_pretrained(resume_directory)
    tokenizer = AutoTokenizer.from_pretrained(resume_directory)
else:
    print(f"No checkpoint found for epoch {starting_epoch + 1}. Starting with the base model.")

# `training_args` requests evaluation every epoch and `load_best_model_at_end`,
# both of which need an evaluation dataset on the Trainer.
training_eval_dataset = prepare_validation_data(pd.read_csv(f"{base_path}/validation_set.csv"))

# Looping over the manual epochs starting from the determined epoch
for epoch in range(starting_epoch, num_manual_epochs):
    print(f"Starting manual epoch {epoch + 1}/{num_manual_epochs}")
    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")  # Current time as a unique string

    # Each manual epoch trains on one subset, cycling through them in order
    subset_index = epoch % len(subset_paths)
    subset_path = subset_paths[subset_index]
    print(f"Loading and preparing data for subset {subset_index + 1}")
    current_df = pd.read_csv(subset_path)
    current_train_dataset = prepare_training_data(current_df)

    # Initialize the Trainer with the current training dataset
    print("Initializing Trainer with the current dataset.")
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=current_train_dataset,
        eval_dataset=training_eval_dataset,
        tokenizer=tokenizer,
        # F1 is computed from the evaluation pass itself; no extra callback
        # (and no second prediction pass) is required.
        compute_metrics=_compute_eval_metrics,
    )

    # Train the model for one epoch on the current dataset
    print("Starting training...")
    trainer.train()

    # Optionally, evaluate the model after the epoch
    print("Evaluating model after training on current subset.")
    results = trainer.evaluate()
    print(f"Evaluation results: {results}")

    # Always save once the epoch has finished, so an interrupted session can
    # resume from here.
    checkpoint_directory = f"{trained_model_path}/manual_epoch_{epoch}_subset_{subset_index + 1}_{timestamp}"
    print(f"Saving model and tokenizer after manual epoch {epoch + 1}, subset {subset_index + 1}")
    trainer.save_model(checkpoint_directory)
    tokenizer.save_pretrained(checkpoint_directory)

    # Add the checkpoint directory to the list
    all_checkpoint_paths.append(checkpoint_directory)

    # Check GPU memory usage as a fraction of the device total.
    # get_gpu_memory_usage() returns an absolute amount (MiB), so it has to be
    # normalised before it can be compared with the 0..1 threshold.
    if torch.cuda.is_available():
        total_memory_mib = torch.cuda.get_device_properties(0).total_memory / (1024 ** 2)
        gpu_memory_fraction = get_gpu_memory_usage() / total_memory_mib
        # If memory usage exceeds the threshold, halve the batch size for the
        # following epochs; the epoch that just finished is not repeated.
        if gpu_memory_fraction > gpu_memory_threshold and training_args.per_device_train_batch_size > 1:
            training_args.per_device_train_batch_size = max(1, training_args.per_device_train_batch_size // 2)
            print(f"GPU memory usage at {gpu_memory_fraction:.0%}; reducing batch size to {training_args.per_device_train_batch_size}")

    # Free up memory
    print("Freeing up memory.")
    del current_df, current_train_dataset
    gc.collect()
    torch.cuda.empty_cache()

Starting manual epoch 1/4
Loading and preparing data for subset 1
No checkpoint found for epoch 1. Starting with the base model.
Initializing Trainer with the current dataset.
Starting training...


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


In [None]:
# Evaluation section
# Read the validation split saved earlier and wrap it as a labelled TextDataset
val_dataset = prepare_validation_data(pd.read_csv(f"{base_path}/validation_set.csv"))

# Function to initialize the Trainer for evaluation
def initialize_trainer_for_evaluation(model, training_args, eval_dataset):
    """Create a Trainer that has only an evaluation dataset attached.

    Args:
        model: Model to evaluate.
        training_args: TrainingArguments passed to the Trainer.
        eval_dataset: Dataset the Trainer evaluates on.

    Returns:
        Trainer: configured with no training dataset.
    """
    evaluation_trainer = Trainer(
        eval_dataset=eval_dataset,
        args=training_args,
        model=model,
    )
    return evaluation_trainer

# Load the final model for evaluation after all epochs
final_model_path = all_checkpoint_paths[-1] if all_checkpoint_paths else None

if final_model_path:
    print("Loading final model for evaluation from:", final_model_path)
    # The saved directory contains its own config.json, so no separate
    # `config` object is passed. That name is not defined on every path
    # through the notebook and would raise a NameError here.
    model = AutoModelForSequenceClassification.from_pretrained(final_model_path)
    tokenizer = AutoTokenizer.from_pretrained(final_model_path)

    # Initialize the Trainer for final evaluation with the validation dataset
    eval_trainer = initialize_trainer_for_evaluation(model, training_args, val_dataset)

    # Final evaluation of the model.
    # predict() is used so the raw predictions are available for the F1 score;
    # metric_key_prefix="eval" keeps the metric names prefixed with `eval_`.
    print("Evaluating the final model.")
    prediction_output = eval_trainer.predict(val_dataset, metric_key_prefix="eval")
    final_results = dict(prediction_output.metrics)
    predicted_classes = prediction_output.predictions.argmax(axis=1)
    final_results["eval_f1"] = float(calculate_f1_score(predicted_classes, prediction_output.label_ids))
    print("Final evaluation results:", final_results)
else:
    print("No model checkpoint found for evaluation.")
