<a href="https://colab.research.google.com/github/Di9mar/ada4b/blob/main/Wiki_Classification_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
# Source:
# https://huggingface.co/datasets/aadityaubhat/GPT-wiki-intro/viewer/default/train

In [None]:
# Upgrade pip
!pip install --upgrade pip

# Install required packages
!pip install datasets transformers torch scikit-learn accelerate

# If you specifically need the 'torch' extras from transformers
!pip install transformers[torch] --upgrade

In [22]:
from datasets import load_dataset
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer, DistilBertConfig, TrainerCallback
from torch.utils.data import Dataset
import os
import gc
from datetime import datetime
import subprocess
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, roc_auc_score, f1_score
import numpy as np
import json
import shutil

In [23]:
# Import the 'drive' module from the 'google.colab' library
# This module allows you to mount your Google Drive in the Colab environment.
# Make sure you have the necessary authorization to access your Drive.
# If not already installed, you may need to install the 'google-colab' package.
from google.colab import drive

# Mount Google Drive to '/content/drive'
# This will make your Google Drive files accessible from within the Colab environment.
# You'll be prompted to authenticate and grant necessary permissions.
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [24]:
# Define paths
model_name = "wiki"
# These variables store the paths to different directories and files.
# They are used for organization and to ensure you have the correct paths when needed.
base_path = "/content/drive/My Drive/ColabData"
checkpoint_path = f"{base_path}/checkpoints"
trained_model_path = f"{base_path}/trained_model"
logs_path = f"{base_path}/logs"
csv_path = f"{base_path}/wiki_data.csv"
subset_paths = [f"{base_path}/subset_{i}.csv" for i in range(1, 5)]  # For training subsets and validation set

# Create directories if they don't exist
# This section checks if the specified directories exist, and if not, it creates them.
os.makedirs(base_path, exist_ok=True)
os.makedirs(checkpoint_path, exist_ok=True)
os.makedirs(trained_model_path, exist_ok=True)
os.makedirs(logs_path, exist_ok=True)

# Check if the main CSV file exists and load or create it
# This code checks if the main CSV file (wiki_data.csv) exists. If not, it loads data from a specified source
# and saves it as a CSV file in the specified directory on Google Drive.
if not os.path.exists(csv_path):
    # Load data from directory on huggingface.co
    dataset = load_dataset("aadityaubhat/GPT-wiki-intro")

    # Combine all splits into one DataFrame
    df = pd.concat([dataset[split].to_pandas() for split in dataset.keys()])

    # Save the DataFrame as a CSV file in the specified directory on Google Drive
    df.to_csv(csv_path, index=False)
else:
    # Load the DataFrame from the CSV file
    df = pd.read_csv(csv_path)

# Split the data into training and validation sets
# Here, the main DataFrame is split into a training set (df_subset) and a validation set (val_dataset).
df_subset, val_dataset = train_test_split(df, test_size=0.1, random_state=42)

# Save the validation set as a separate CSV file
val_dataset.to_csv(f"{base_path}/validation_set.csv", index=False)

# Check if subsets already exist, if not create and save them
# This section checks if the training subsets (subset_1.csv, subset_2.csv, subset_3.csv) exist.
# If not, it splits the training data into these subsets and saves them as CSV files.
if not all(os.path.exists(path) for path in subset_paths):
    split_size = len(df_subset) // 4
    for i, subset_path in enumerate(subset_paths):
        subset = df_subset.iloc[i*split_size: (i+1)*split_size]
        subset.to_csv(subset_path, index=False)

    # Free memory by deleting the original DataFrame and performing garbage collection
    del df
    gc.collect()

In [25]:
# Define dataset class
class TextDataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __getitem__(self, idx):
        text = self.texts[idx]

        # Tokenize the text on-the-fly
        encoding = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')

        # Convert the encoding to a format suitable for PyTorch
        item = {key: val.squeeze(0) for key, val in encoding.items()}  # Squeeze is used to remove batch dimension
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

    def get_labels(self):
        return self.labels

In [26]:
# Function to Calculate Metrics
def calculate_evaluation_metrics(predictions, true_labels):
    accuracy = accuracy_score(true_labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predictions, average='binary')
    conf_matrix = confusion_matrix(true_labels, predictions)
    roc_auc = roc_auc_score(true_labels, predictions)  # For binary classification

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'F1_score': f1,
        'confusion_matrix': conf_matrix.tolist(),
        'ROC_AUC': roc_auc
    }

In [27]:
# Define training arguments
training_args = TrainingArguments(
    output_dir=checkpoint_path,
    num_train_epochs=1,                    # Number of training epochs (You may adjust this)
    per_device_train_batch_size=16,        # Batch size per GPU (Adjust based on your GPU memory)
    gradient_accumulation_steps=2,         # Increase if using a larger effective batch size
    evaluation_strategy="epoch",           # Evaluation frequency (e.g., "steps" or "epoch")
    logging_dir=logs_path,                 # Directory for logs
    logging_steps=50,                      # Log training progress every N steps
    save_strategy="epoch",                 # Save checkpoints every N epochs
    load_best_model_at_end=True,           # Load the best model at the end of training
)

In [28]:
# Training
print("Preparing training data.")

# Define a function to prepare training datasets
def prepare_training_data(df_subset):
    train_labels = [0] * len(df_subset['wiki_intro']) + [1] * len(df_subset['generated_intro'])
    train_texts = df_subset['wiki_intro'].tolist() + df_subset['generated_intro'].tolist()

    # Create a training dataset using the TextDataset class
    train_dataset = TextDataset(train_texts, train_labels, tokenizer)
    return train_dataset

# Evaluation
print("Preparing validation data for final evaluation.")

# Define a function to prepare validation datasets
def prepare_validation_data(val_dataset):
    val_texts = val_dataset['wiki_intro'].tolist() + val_dataset['generated_intro'].tolist()
    val_labels = [0] * len(val_dataset['wiki_intro']) + [1] * len(val_dataset['generated_intro'])

    # Create a validation dataset using the TextDataset class
    val_dataset = TextDataset(val_texts, val_labels, tokenizer)
    return val_dataset

Preparing training data.
Preparing validation data for final evaluation.


In [29]:
# Prepare the subsets for validation
val_dataset = prepare_validation_data(val_dataset)

# Splitting 0.5% of data for training and another 0.5% for validation (optional)
# _, small_val_dataset = train_test_split(val_dataset, test_size=0.005, random_state=43)
# val_dataset = prepare_validation_data(small_val_dataset)

# Verify Dataset Initialization
if val_dataset is None or len(val_dataset) == 0:
    raise ValueError("Validation dataset is empty or not initialized.")

# Debug prints to check datasets
print("Validation dataset size:", len(val_dataset))

Validation dataset size: 30000


In [30]:
# Use a small, fast model for quick training (DistilBERT)

# Check if a model checkpoint exists in your drive
if os.path.exists(trained_model_path) and os.listdir(trained_model_path):
    # If a pre-trained model checkpoint exists in the specified directory, load it
    print(f"Loading model checkpoint from {trained_model_path}")
    model = AutoModelForSequenceClassification.from_pretrained(trained_model_path)
    tokenizer = AutoTokenizer.from_pretrained(trained_model_path)
    print("Model checkpoint loaded successfully.")
else:
    # If no pre-trained model checkpoint exists, initialize a new model
    model_name = "distilbert-base-uncased"
    config = DistilBertConfig.from_pretrained(model_name)
    config.num_labels = 2  # Assuming you have a binary classification task
    model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    print("Initialized a new model.")

# Print the model configuration for reference
print(f"Model Configuration:\n{model.config}")

# Print the tokenizer information
print(f"Tokenizer Information:\n{tokenizer}")

# Initialize the Trainer
trainer = Trainer(
    model=model,                 # Initially, set with the base model
    args=training_args,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Initialized a new model.
Model Configuration:
DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.36.2",
  "vocab_size": 30522
}

Tokenizer Information:
DistilBertTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=Fals

In [89]:
# Define the number of manual epochs and initialize a list to store checkpoint paths
num_manual_epochs = 4  # Set the number of manual epochs
all_checkpoint_paths = []
epoch_metrics = {}
all_metrics_paths = []

# Initialize the starting_epoch to 0
starting_epoch = 0
# Determine the starting epoch based on existing checkpoints
while True:
    starting_epoch += 1
    checkpoint_directory = f"{checkpoint_path}/manual_epoch_{starting_epoch}"
    if not os.path.exists(checkpoint_directory):
        break

# Print the determined starting epoch
print(f"Starting epoch: {starting_epoch}")

# Looping over the manual epochs starting from the determined epoch
for epoch in range(starting_epoch - 1, num_manual_epochs):
    # Correctly set checkpoint_directory for the current epoch
    checkpoint_directory = f"{checkpoint_path}/manual_epoch_{epoch + 1}"
    print(f"Starting manual epoch {epoch + 1}/{num_manual_epochs}")

    # Determine subset index based on the epoch and starting_epoch
    subset_index = (epoch + 1) % len(subset_paths)  # Calculate the subset index

    # Load and prepare data for the current epoch
    subset_path = subset_paths[subset_index]
    print(f"Loading and preparing data for subset {subset_index}")
    current_df = pd.read_csv(subset_path)
    current_train_dataset = prepare_training_data(current_df)

    # Splitting 0.2% of data for training
    # _, small_train_dataset = train_test_split(current_df, test_size=0.002, random_state=42)
    # Prepare the subsets for training and validation
    # current_train_dataset = prepare_training_data(small_train_dataset)

    # Verify Dataset Initialization
    if current_train_dataset is None or len(current_train_dataset) == 0:
        raise ValueError("Training dataset is empty or not initialized.")

    # Debug prints to check datasets
    print("Training dataset size:", len(current_train_dataset))

    # Check if the specific checkpoint for this epoch exists
    if os.path.exists(checkpoint_directory) and epoch != 0:
        previous_checkpoint_directory = f"{checkpoint_path}/manual_epoch_{epoch}"
        print(f"Loading checkpoint for epoch {epoch + 1} from {previous_checkpoint_directory}")
        model = AutoModelForSequenceClassification.from_pretrained(previous_checkpoint_directory)
        tokenizer = AutoTokenizer.from_pretrained(previous_checkpoint_directory)
        trainer.model = model
        trainer.tokenizer = tokenizer
    else:
        # If no specific checkpoint found, use the base DistilBERT model
        print(f"No checkpoint found for epoch {epoch}, continuing with base model. Checkpoint directory: {checkpoint_directory}")
        model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", config=config)
        tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

    # Update the Trainer's datasets for the current epoch
    trainer.train_dataset = current_train_dataset
    trainer.eval_dataset = val_dataset

    # Train the model for one epoch on the current dataset
    print("Starting training...")
    trainer.train()

    # # Optionally, evaluate the model after the epoch
    # print("Evaluating model after training on current subset.")
    # results = trainer.evaluate()
    # print(f"Evaluation results: {results}")

    # # Check if "predictions" and "label_ids" exist in the results dictionary
    # if 0 in results and 1 in results:
    #     predictions = np.argmax(results["predictions"], axis=1)
    #     true_labels = results["label_ids"]
    # else:
    #     print("Warning: Predictions and label_ids are not available for this evaluation.")
    #     predictions = None
    #     true_labels = None

    # # Calculate metrics if true_labels are available
    # if true_labels is not None:
    #     # Calculate metrics
    #     epoch_metrics = calculate_evaluation_metrics(predictions, true_labels)
    # else:
    #     # Handle the case when true_labels is None (e.g., print a message or skip metrics calculation)
    #     print("True labels are not available for this evaluation.")

    # After completing the training for the current epoch, save the model
    print(f"Saving model and tokenizer after manual epoch {epoch + 1}, subset {subset_index + 1}")
    trainer.save_model(checkpoint_directory)
    tokenizer.save_pretrained(checkpoint_directory)

    # Save metrics for this epoch
    metrics_path = f"{logs_path}/metrics_epoch_{epoch + 1}.json"
    with open(metrics_path, 'w') as file:
        json.dump(epoch_metrics, file, indent=4)

    # Add the path to the metrics file to the list for later use
    all_metrics_paths.append(metrics_path)

    # Add the checkpoint directory to the list
    all_checkpoint_paths.append(checkpoint_directory)

    # Free up memory
    print("Freeing up memory.")
    del current_df, current_train_dataset
    gc.collect()
    torch.cuda.empty_cache()

print("Training process complete.")

Starting epoch: 2
Starting manual epoch 3/4
Loading and preparing data for subset 3
Training dataset size: 67500
No checkpoint found for epoch 3, continuing with base model. Checkpoint directory: /content/drive/My Drive/ColabData/checkpoints/manual_epoch_3


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training...


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# After completing all epochs, save the final model state
final_checkpoint_directory = f"{trained_model_path}{model_name}"
print(f"Saving final model and tokenizer to {final_checkpoint_directory}")

# Save the final model and tokenizer
trainer.save_model(final_checkpoint_directory)
tokenizer.save_pretrained(final_checkpoint_directory)

# Clean up intermediate checkpoint directories
# for i in range(num_manual_epochs):
#     checkpoint_directory = f"{checkpoint_path}/manual_epoch_{i + 1}"
#     if os.path.exists(checkpoint_directory):
#         shutil.rmtree(checkpoint_directory)
#         print(f"Deleted checkpoint directory: {checkpoint_directory}")

# print("Final model and checkpoint directories saved. Intermediate checkpoints deleted.")

if os.path.exists(checkpoint_path):
    shutil.rmtree(checkpoint_path)
    print(f"Deleted entire checkpoint directory: {checkpoint_path}")
os.makedirs(checkpoint_path, exist_ok=True)

In [None]:
# Function to initialize the Trainer for evaluation
def initialize_trainer_for_evaluation(model, training_args, eval_dataset):
    """
    Initialize a Trainer object for model evaluation.

    Args:
        model (PreTrainedModel): The pre-trained model to evaluate.
        training_args (TrainingArguments): Training arguments for evaluation.
        eval_dataset (Dataset): The evaluation dataset.

    Returns:
        Trainer: A Trainer object for evaluation.
    """
    return Trainer(
        model=model,
        args=training_args,
        eval_dataset=eval_dataset,
    )

# Function to get the next available file name
def get_next_file_name(file_prefix):
    """
    Get the next available file name by appending an index.

    Args:
        file_prefix (str): The prefix for the file name.

    Returns:
        str: The next available file name.
    """
    index = 0
    while True:
        file_name = f"{file_prefix}_{index}.json"
        if not os.path.exists(file_name):
            return file_name
        index += 1

In [None]:
# Load the final model for evaluation after all epochs
final_model_path = all_checkpoint_paths[-1] if all_checkpoint_paths else None

if final_model_path:
    print("Loading final model for evaluation from:", final_checkpoint_directory)
    model = AutoModelForSequenceClassification.from_pretrained(final_checkpoint_directory, config=config)
    tokenizer = AutoTokenizer.from_pretrained(final_checkpoint_directory)

    # Initialize the Trainer for final evaluation with the validation dataset
    eval_trainer = initialize_trainer_for_evaluation(model, training_args, val_dataset)

    # Predict on the validation dataset
    print("Predicting on the validation dataset.")
    predictions = eval_trainer.predict(val_dataset)

    # Extract the predicted labels from the predictions
    final_predictions = np.argmax(predictions.predictions, axis=1)
    final_true_labels = val_dataset.get_labels()  # Get the true labels from the validation dataset

    # Calculate final evaluation metrics
    final_metrics = calculate_evaluation_metrics(final_predictions, final_true_labels)

    # Save final metrics with a sequentially numbered file name
    final_metrics_path = get_next_file_name(f"{logs_path}/final_evaluation_metrics")
    try:
        with open(final_metrics_path, 'w') as file:
            json.dump(final_metrics, file, indent=4)
            print(f"Final Evaluation Metrics Saved as {final_metrics_path}")
    except Exception as e:
        print(f"An error occurred while saving the final metrics: {str(e)}")

    # Add the path to the final evaluation metrics file to the list for later use
    all_metrics_paths.append(final_metrics_path)

    # Print the final evaluation metrics
    print("Final Evaluation Metrics:")
    print("Accuracy:", final_metrics['accuracy'])
    print("Precision:", final_metrics['precision'])
    print("Recall:", final_metrics['recall'])
    print("F1 Score:", final_metrics['F1_score'])
    print("Confusion Matrix:")
    print(final_metrics['confusion_matrix'])
    print("ROC AUC:", final_metrics['ROC_AUC'])

    # Print the last saved final metrics
    if os.path.exists(final_metrics_path):
        print(f"Last Saved Final Metrics ({final_metrics_path}):")
        with open(final_metrics_path, 'r') as file:
            last_saved_metrics = json.load(file)
            print("Accuracy:", last_saved_metrics['accuracy'])
            print("Precision:", last_saved_metrics['precision'])
            print("Recall:", last_saved_metrics['recall'])
            print("F1 Score:", last_saved_metrics['F1_score'])
            print("Confusion Matrix:")
            print(last_saved_metrics['confusion_matrix'])
            print("ROC AUC:", last_saved_metrics['ROC_AUC'])
else:
    print("No model checkpoint found for evaluation.")