<a href="https://colab.research.google.com/github/Di9mar/ada4b/blob/main/text%20classification%20continious%20run.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Upgrade pip
!pip install --upgrade pip

# Install required packages
!pip install datasets transformers torch scikit-learn accelerate

# If you specifically need the 'torch' extras from transformers
!pip install transformers[torch] --upgrade

In [None]:
# Attach Google Drive to this Colab runtime so the paths used later are reachable
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
# Import necessary libraries
import inspect
import os

import pandas as pd
import torch
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments

In [None]:
# Define dataset class
class TextDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized texts with their labels.

    All texts are tokenized once, up front, in ``__init__``; ``__getitem__``
    only converts the already-computed values of one sample into tensors.

    Args:
        texts: Sized collection of input texts, passed to ``tokenizer`` as-is.
        labels: Indexable, sized collection of labels, one per text.
        tokenizer: Callable invoked as
            ``tokenizer(texts, truncation=True, padding=True)``; its result
            must expose ``.items()`` yielding ``(name, per-sample values)``.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer):
        # Fail fast: __len__ is driven by the labels, so a mismatch would
        # otherwise either silently ignore the extra texts or surface as an
        # IndexError on some arbitrary sample in the middle of training.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} texts and {len(labels)} labels"
            )
        self.encodings = tokenizer(texts, truncation=True, padding=True)
        self.labels = labels

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, including a ``'labels'`` entry."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.labels)

In [None]:
# Storage locations (these should match the paths from your first script)
base_path = "/content/drive/My Drive/ColabData/MyModel"

# Every other location sits directly underneath base_path
checkpoint_path, trained_model_path, csv_path = (
    "/".join((base_path, leaf_name))
    for leaf_name in ("checkpoints", "trained_model", "wiki_data.csv")
)

In [None]:
# Function to get the path of the last checkpoint directory, if one exists
def get_last_checkpoint(output_dir):
    """Return the path of the most recent checkpoint under ``output_dir``.

    A checkpoint candidate is any sub-directory whose name contains
    ``'checkpoint'``. Candidates named ``checkpoint-<step>`` are ranked by
    their step number; only when no candidate carries a step number does the
    function fall back to the directory modification time.

    Args:
        output_dir: Directory that holds the checkpoint sub-directories.
            It is created (and ``None`` is returned) if it does not exist.

    Returns:
        The path of the latest checkpoint directory, or ``None`` when
        ``output_dir`` was just created or contains no checkpoint.
    """
    # Create the directory if it does not exist
    if not os.path.exists(output_dir):
        print(f"The directory {output_dir} does not exist. Creating it now.")
        os.makedirs(output_dir, exist_ok=True)
        return None  # No checkpoints if the directory has just been created

    def _step_of(path):
        # Step number encoded in a 'checkpoint-<step>' directory name, else None.
        prefix, _, suffix = os.path.basename(path).rpartition("-")
        if prefix == "checkpoint" and suffix.isdecimal():
            return int(suffix)
        return None

    candidates = [
        os.path.join(output_dir, name)
        for name in os.listdir(output_dir)
        if 'checkpoint' in name and os.path.isdir(os.path.join(output_dir, name))
    ]
    if not candidates:
        return None

    # A directory's mtime changes whenever entries inside it are added or
    # removed, so it is not a reliable "newest" signal; the step number in
    # the directory name is.
    numbered = [path for path in candidates if _step_of(path) is not None]
    if numbered:
        return max(numbered, key=_step_of)

    # No step-numbered directory: keep the modification-time ordering.
    return max(candidates, key=os.path.getmtime)

# Look up the newest checkpoint directory (None when there is none)
last_checkpoint = get_last_checkpoint(checkpoint_path)

# The model and tokenizer are always restored from the saved trained-model directory
model = AutoModelForSequenceClassification.from_pretrained(trained_model_path)
tokenizer = AutoTokenizer.from_pretrained(trained_model_path)

# Report what was found
print(f"Last checkpoint: {last_checkpoint}" if last_checkpoint else "No checkpoints found.")

In [None]:
# Prepare the dataset (assuming the same fraction and preprocessing as the first run)
TEXT_COLUMNS = ['wiki_intro', 'generated_intro']

# Fixed seed so that every session of this notebook draws the same
# train/validation split; with an unseeded split, samples used for validation
# in one session end up in the training set of the next one.
# NOTE(review): the first script must use the same seed for its split to match — confirm.
SPLIT_SEED = 42

# Rows with a missing text are dropped: pandas reads an empty cell as float NaN,
# which is not a string the tokenizer can be given.
df = pd.read_csv(csv_path).dropna(subset=TEXT_COLUMNS)

# Use only a fraction of the data for faster training iterations
fraction = 0.50  # Adjust this to use, e.g., 50% of the data

# Calculate the number of samples to include for each class
num_human_samples = int(len(df) * fraction)
num_ai_samples = int(len(df) * fraction)

# Take the leading rows of each column as the human / AI-generated samples
human_texts = df['wiki_intro'].tolist()[:num_human_samples]
ai_texts = df['generated_intro'].tolist()[:num_ai_samples]

# Create a balanced dataset with an equal number of human and AI-generated samples.
# Labels are sized from the slices actually taken, so texts and labels stay
# aligned even if the requested count exceeds the number of available rows.
balanced_texts = human_texts + ai_texts
balanced_labels = [0] * len(human_texts) + [1] * len(ai_texts)

# Split the balanced dataset into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    balanced_texts,
    balanced_labels,
    test_size=0.1,
    random_state=SPLIT_SEED,
)

train_dataset = TextDataset(train_texts, train_labels, tokenizer)
val_dataset = TextDataset(val_texts, val_labels, tokenizer)

In [None]:
# Set up Training Arguments and Trainer

# The evaluation-schedule argument is called `eval_strategy` in newer
# transformers releases and `evaluation_strategy` in older ones; use whichever
# name the installed TrainingArguments actually accepts.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"

training_args = TrainingArguments(
    output_dir=checkpoint_path,
    num_train_epochs=5,  # Adjust as needed
    per_device_train_batch_size=16,
    gradient_accumulation_steps=2,
    logging_dir=f"{base_path}/logs",
    logging_steps=50,
    save_strategy="epoch",  # must match the evaluation schedule for load_best_model_at_end
    load_best_model_at_end=True,
    # Kept for reference only: Trainer does not act on this field by itself.
    resume_from_checkpoint=last_checkpoint,
    **{_eval_strategy_key: "epoch"},
)

# Trainer.__init__ has no `resume_from_checkpoint` parameter (passing it raises
# TypeError); a checkpoint is resumed by passing it to Trainer.train().
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [None]:
# Continue Training
# The checkpoint has to be handed to train() for the run to actually resume
# (optimizer, scheduler and step counter included). When last_checkpoint is
# None, training simply starts from the weights loaded into `model`.
trainer.train(resume_from_checkpoint=last_checkpoint)

# Evaluate and Save Results
eval_results = trainer.evaluate()
print(f"Results after continuing training: {eval_results}")

# Persist the model and tokenizer side by side so the next session can reload both
trainer.save_model(trained_model_path)
tokenizer.save_pretrained(trained_model_path)