In [None]:
# Source:
# https://huggingface.co/datasets/aadityaubhat/GPT-wiki-intro/viewer/default/train

In [None]:
# Upgrade pip
!pip install --upgrade pip

# Install required packages
!pip install datasets transformers torch scikit-learn accelerate

# If you specifically need the 'torch' extras from transformers
!pip install transformers[torch] --upgrade

In [2]:
from datasets import load_dataset
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer, DistilBertConfig, TrainerCallback
from torch.utils.data import Dataset
import os
import gc
from datetime import datetime
import subprocess
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, roc_auc_score, f1_score
import numpy as np
import json
import shutil

In [3]:
# Import the 'drive' module from the 'google.colab' library
# This module allows you to mount your Google Drive in the Colab environment.
# Make sure you have the necessary authorization to access your Drive.
# If not already installed, you may need to install the 'google-colab' package.
from google.colab import drive

# Mount Google Drive to '/content/drive'
# This will make your Google Drive files accessible from within the Colab environment.
# You'll be prompted to authenticate and grant necessary permissions.
drive.mount('/content/drive')


Mounted at /content/drive


In [5]:
# Define paths
trained_model = "wiki"
new_model = "essay"
data_file = "essay_data"

# These variables store the paths to different directories and files.
# They are used for organization and to ensure you have the correct paths when needed.
base_path = "/content/drive/My Drive/TEST"
checkpoint_path = f"{base_path}/checkpoints"
trained_model_path = f"{base_path}/{trained_model}"
new_model_path = f"{base_path}/{new_model}"
logs_path = f"{base_path}/logs"
csv_path = f"{base_path}/{data_file}.csv"
subset_paths = [f"{base_path}/subset_{i}.csv" for i in range(1, 5)]  # For training subsets and validation set

# Create directories if they don't exist
# This section checks if the specified directories exist, and if not, it creates them.
os.makedirs(base_path, exist_ok=True)
os.makedirs(checkpoint_path, exist_ok=True)
os.makedirs(trained_model_path, exist_ok=True)
os.makedirs(logs_path, exist_ok=True)

In [10]:
# Load new data
try:
    df = pd.read_csv(csv_path, delimiter=';')
    load_success = True
except Exception as e:
    load_success = False
    df = None
    error_message = str(e)

load_success, df if df is not None else error_message

# Check if df is loaded successfully before proceeding
if load_success:
    # Remove excess newline characters
    df['human'] = df['human'].str.replace(r'\n+', '\n')
    df['ai'] = df['ai'].str.replace(r'\n+', '\n')
    # Split the main DataFrame into a training set and a validation set
    df_subset, val_dataset = train_test_split(df, test_size=0.1, random_state=42)
    # Proceed with your code using df_subset and val_dataset
else:
    # Handle the error (e.g., print the error message or log it)
    print(error_message)

  df['human'] = df['human'].str.replace(r'\n+', '\n')
  df['ai'] = df['ai'].str.replace(r'\n+', '\n')


In [7]:
# Define dataset class
class TextDataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __getitem__(self, idx):
        text = self.texts[idx]

        # Tokenize the text on-the-fly
        encoding = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')

        # Convert the encoding to a format suitable for PyTorch
        item = {key: val.squeeze(0) for key, val in encoding.items()}  # Squeeze is used to remove batch dimension
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

    def get_labels(self):
        return self.labels

In [8]:
# Function to Calculate Metrics
def calculate_evaluation_metrics(predictions, true_labels):
    accuracy = accuracy_score(true_labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predictions, average='binary')
    conf_matrix = confusion_matrix(true_labels, predictions)
    roc_auc = roc_auc_score(true_labels, predictions)  # For binary classification

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'F1_score': f1,
        'confusion_matrix': conf_matrix.tolist(),
        'ROC_AUC': roc_auc
    }

In [9]:
# Define training arguments
training_args = TrainingArguments(
    output_dir=checkpoint_path,
    num_train_epochs=1,                    # Number of training epochs (You may adjust this)
    per_device_train_batch_size=16,        # Batch size per GPU (Adjust based on your GPU memory)
    gradient_accumulation_steps=2,         # Increase if using a larger effective batch size
    evaluation_strategy="epoch",           # Evaluation frequency (e.g., "steps" or "epoch")
    logging_dir=logs_path,                 # Directory for logs
    logging_steps=50,                      # Log training progress every N steps
    save_strategy="epoch",                 # Save checkpoints every N epochs
    load_best_model_at_end=True,           # Load the best model at the end of training
)

In [11]:
# Training
print("Preparing training data.")

# Define a function to prepare training datasets
def prepare_training_data(df_subset):
    train_labels = [0] * len(df_subset['human']) + [1] * len(df_subset['ai'])
    train_texts = df_subset['human'].tolist() + df_subset['ai'].tolist()

    # Create a training dataset using the TextDataset class
    train_dataset = TextDataset(train_texts, train_labels, tokenizer)
    return train_dataset

# Evaluation
print("Preparing validation data for final evaluation.")

# Define a function to prepare validation datasets
def prepare_validation_data(val_dataset):
    val_labels = [0] * len(val_dataset['human']) + [1] * len(val_dataset['ai'])
    val_texts = val_dataset['human'].tolist() + val_dataset['ai'].tolist()

    # Create a validation dataset using the TextDataset class
    val_dataset = TextDataset(val_texts, val_labels, tokenizer)
    return val_dataset

Preparing training data.
Preparing validation data for final evaluation.


In [12]:
# Use a small, fast model for quick training (DistilBERT)

# Check if a model checkpoint exists in your drive
if os.path.exists(trained_model_path) and os.listdir(trained_model_path):
    # If a pre-trained model checkpoint exists in the specified directory, load it
    print(f"Loading model checkpoint from {trained_model_path}")
    model = AutoModelForSequenceClassification.from_pretrained(trained_model_path)
    tokenizer = AutoTokenizer.from_pretrained(trained_model_path)
    print("Model checkpoint loaded successfully.")
else:
    # If no pre-trained model checkpoint exists, initialize a new model
    model_name = {"distilbert-base-uncased"}
    config = DistilBertConfig.from_pretrained(model_name)
    config.num_labels = 2  # Assuming you have a binary classification task
    model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    print("Initialized a new model.")

# Print the model configuration for reference
print(f"Model Configuration:\n{model.config}")

# Print the tokenizer information
print(f"Tokenizer Information:\n{tokenizer}")

# Initialize the Trainer
trainer = Trainer(
    model=model,                 # Initially, set with the base model
    args=training_args,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

Loading model checkpoint from /content/drive/My Drive/ColabData/wiki
Model checkpoint loaded successfully.
Model Configuration:
DistilBertConfig {
  "_name_or_path": "/content/drive/My Drive/ColabData/wiki",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.36.2",
  "vocab_size": 30522
}

Tokenizer Information:
DistilBertTokenizerFast(name_or_path='/content/drive/My Drive/ColabData/wiki', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_

In [13]:
# Prepare the subsets for validation
val_dataset = prepare_validation_data(val_dataset)

# Splitting 0.5% of data for training and another 0.5% for validation (optional)
# _, small_val_dataset = train_test_split(val_dataset, test_size=0.005, random_state=43)
# val_dataset = prepare_validation_data(small_val_dataset)

# Verify Dataset Initialization
if val_dataset is None or len(val_dataset) == 0:
    raise ValueError("Validation dataset is empty or not initialized.")

# Debug prints to check datasets
print("Validation dataset size:", len(val_dataset))

Validation dataset size: 80


In [None]:
# Number of manual epochs
num_manual_epochs = 5

# Checkpoints and metrics array
all_checkpoint_paths = []
all_metrics_paths = []

# Early stopping parameters
best_metric = float('inf')  # Infinity (for minimization problem)
epochs_no_improve = 0
n_epochs_stop = 2  # Number of epochs to wait for improvement before stopping

print(f"Loading and preparing data for {data_file}")
current_df = pd.read_csv(csv_path)
current_train_dataset = prepare_training_data(current_df)

# Verify Dataset Initialization
if current_train_dataset is None or len(current_train_dataset) == 0:
    raise ValueError("Training dataset is empty or not initialized.")

# Initialize the starting_epoch to 0
starting_epoch = 0
# Determine the starting epoch based on existing checkpoints
while True:
    starting_epoch += 1
    checkpoint_directory = f"{checkpoint_path}/manual_epoch_{starting_epoch}"
    if not os.path.exists(checkpoint_directory):
        break

# Print the determined starting epoch
print(f"Starting epoch: {starting_epoch}")

# Looping over the manual epochs starting from the determined epoch
for epoch in range(starting_epoch - 1, num_manual_epochs):
    # Correctly set checkpoint_directory for the current epoch
    checkpoint_directory = f"{checkpoint_path}/manual_epoch_{epoch + 1}"
    print(f"Starting manual epoch {epoch + 1}/{num_manual_epochs}")

    # Check if the specific checkpoint for this epoch exists
    previous_checkpoint_directory = f"{checkpoint_path}/manual_epoch_{epoch}"

    if os.path.exists(previous_checkpoint_directory) and epoch != 0:
        print(f"Loading checkpoint for epoch {epoch + 1} from {previous_checkpoint_directory}")
        model = AutoModelForSequenceClassification.from_pretrained(previous_checkpoint_directory)
        tokenizer = AutoTokenizer.from_pretrained(previous_checkpoint_directory)
        trainer.model = model
        trainer.tokenizer = tokenizer
    else:
        # If no specific checkpoint found, use the base DistilBERT model
        print(f"No checkpoint found for epoch {epoch}, continuing with base model. Checkpoint directory: {previous_checkpoint_directory}")
        model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", config=config)
        tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

    # Update the Trainer's datasets for the current epoch
    trainer.train_dataset = current_train_dataset
    trainer.eval_dataset = val_dataset

    # Train the model for one epoch on the current dataset
    print("Starting training...")
    trainer.train()

    # After completing the training for the current epoch, save the model
    print(f"Saving model and tokenizer after manual epoch {epoch + 1}, data {data_file}")
    trainer.save_model(checkpoint_directory)
    tokenizer.save_pretrained(checkpoint_directory)

    # Save metrics for this epoch
    metrics_path = f"{logs_path}/{data_file}/metrics_epoch_{epoch + 1}.json"
    with open(metrics_path, 'w') as file:
        json.dump(trainer.state.log_history[ -1], file, indent=4)

    # Add the path to the metrics file to the list for later use
    all_metrics_paths.append(metrics_path)

    # Add the checkpoint directory to the list
    all_checkpoint_paths.append(checkpoint_directory)

    # validation loss is the metric to monitor
    current_metric = trainer.state.log_history[-1]['eval_loss']

    # Check for improvement
    if current_metric < best_metric:
        best_metric = current_metric
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1

    # Early stopping check
    if epochs_no_improve >= n_epochs_stop:
        print(f"Early stopping triggered after {epoch + 1} epochs.")
        break

    # Free up memory
    print("Freeing up memory.")
    gc.collect()
    torch.cuda.empty_cache()

print("Training process complete.")

Starting epoch: 1
Starting manual epoch 1/4
Loading and preparing data for subset 1


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training dataset size: 67500
No checkpoint found for epoch 0, continuing with base model. Checkpoint directory: /content/drive/My Drive/ColabData/checkpoints/manual_epoch_0


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Starting training...


Epoch,Training Loss,Validation Loss


In [None]:
# After completing all epochs, save the final model state
final_checkpoint_directory = f"{new_model_path}"
print(f"Saving final model and tokenizer to {final_checkpoint_directory}")

# Save the final model and tokenizer
trainer.save_model(final_checkpoint_directory)
tokenizer.save_pretrained(final_checkpoint_directory)

print("Final model and checkpoint directories saved. Intermediate checkpoints deleted.")

if os.path.exists(checkpoint_path):
    shutil.rmtree(checkpoint_path)
    print(f"Deleted entire checkpoint directory: {checkpoint_path}")
os.makedirs(checkpoint_path, exist_ok=True)

In [None]:
# Function to initialize the Trainer for evaluation
def initialize_trainer_for_evaluation(model, training_args, eval_dataset):
    """
    Initialize a Trainer object for model evaluation.

    Args:
        model (PreTrainedModel): The pre-trained model to evaluate.
        training_args (TrainingArguments): Training arguments for evaluation.
        eval_dataset (Dataset): The evaluation dataset.

    Returns:
        Trainer: A Trainer object for evaluation.
    """
    return Trainer(
        model=model,
        args=training_args,
        eval_dataset=eval_dataset,
    )

# Function to get the next available file name
def get_next_file_name(file_prefix, model_name):
    """
    Get the next available file name by appending an index.

    Args:
        file_prefix (str): The prefix for the file name.

    Returns:
        str: The next available file name.
    """
    index = 0
    while True:
        file_name = f"{file_prefix}_{model_name}_{index}.json"
        if not os.path.exists(file_name):
            return file_name
        index += 1

In [None]:
# Load the final model for evaluation after all epochs
final_model_path = all_checkpoint_paths[-1] if all_checkpoint_paths else None

if final_model_path:
    print("Loading final model for evaluation from:", final_checkpoint_directory)
    model = AutoModelForSequenceClassification.from_pretrained(final_checkpoint_directory, config=config)
    tokenizer = AutoTokenizer.from_pretrained(final_checkpoint_directory)

    # Initialize the Trainer for final evaluation with the validation dataset
    eval_trainer = initialize_trainer_for_evaluation(model, training_args, val_dataset)

    # Predict on the validation dataset
    print("Predicting on the validation dataset.")
    predictions = eval_trainer.predict(val_dataset)

    # Extract the predicted labels from the predictions
    final_predictions = np.argmax(predictions.predictions, axis=1)
    final_true_labels = val_dataset.get_labels()  # Get the true labels from the validation dataset

    # Calculate final evaluation metrics
    final_metrics = calculate_evaluation_metrics(final_predictions, final_true_labels)

    # Save final metrics with a sequentially numbered file name
    final_metrics_path = get_next_file_name(f"{logs_path}/final_evaluation_metrics", new_model)
    try:
        with open(final_metrics_path, 'w') as file:
            json.dump(final_metrics, file, indent=4)
            print(f"Final Evaluation Metrics Saved as {final_metrics_path}")
    except Exception as e:
        print(f"An error occurred while saving the final metrics: {str(e)}")

    # Add the path to the final evaluation metrics file to the list for later use
    all_metrics_paths.append(final_metrics_path)

    # Print the final evaluation metrics
    print("Final Evaluation Metrics:")
    print("Accuracy:", final_metrics['accuracy'])
    print("Precision:", final_metrics['precision'])
    print("Recall:", final_metrics['recall'])
    print("F1 Score:", final_metrics['F1_score'])
    print("Confusion Matrix:")
    print(final_metrics['confusion_matrix'])
    print("ROC AUC:", final_metrics['ROC_AUC'])

    # # Print the last saved final metrics
    # if os.path.exists(final_metrics_path):
    #     print(f"Last Saved Final Metrics ({final_metrics_path}):")
    #     with open(final_metrics_path, 'r') as file:
    #         last_saved_metrics = json.load(file)
    #         print("Accuracy:", last_saved_metrics['accuracy'])
    #         print("Precision:", last_saved_metrics['precision'])
    #         print("Recall:", last_saved_metrics['recall'])
    #         print("F1 Score:", last_saved_metrics['F1_score'])
    #         print("Confusion Matrix:")
    #         print(last_saved_metrics['confusion_matrix'])
    #         print("ROC AUC:", last_saved_metrics['ROC_AUC'])
else:
    print("No model checkpoint found for evaluation.")

In [None]:
def load_metrics(file_path):
    """Loads metrics from a given file."""
    with open(file_path, 'r') as file:
        return json.load(file)

def compare_models_metrics(model1, model2, logs_path):
    """Compares metrics between two models."""
    model1_metrics_path = get_metrics_file_name(model1, logs_path)
    model2_metrics_path = get_metrics_file_name(model2, logs_path)

    if os.path.exists(model1_metrics_path) and os.path.exists(model2_metrics_path):
        model1_metrics = load_metrics(model1_metrics_path)
        model2_metrics = load_metrics(model2_metrics_path)

        # Comparison logic (example for accuracy)
        accuracy_change = model1_metrics['accuracy'] - model2_metrics['accuracy']
        return {
            "accuracy_change": accuracy_change,
            # ... other metrics comparisons ...
        }
    else:
        return None

# Example usage for comparing metrics
metrics_comparison = compare_models_metrics(trained_model, new_model, logs_path)
if metrics_comparison:
    print("Metrics Comparison:", metrics_comparison)
else:
    print("Comparison not possible. One or both metrics files are missing.")
