In [1]:
!pip install transformers datasets
!pip install torch
!pip install accelerate -u


Usage:   
  pip3 install [options] <requirement specifier> [package-index-options] ...
  pip3 install [options] -r <requirements file> [package-index-options] ...
  pip3 install [options] [-e] <vcs project url> ...
  pip3 install [options] [-e] <local project path> ...
  pip3 install [options] <archive url/path> ...

no such option: -u


In [2]:
# Attach Google Drive to the Colab filesystem; the last cell of the notebook
# writes the fine-tuned model to a path under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import inspect

from datasets import load_dataset
from transformers import (
    DataCollatorForLanguageModeling,
    GPT2LMHeadModel,
    GPT2Tokenizer,
    Trainer,
    TrainingArguments,
)



In [None]:

# Load the "train" split of the 0xAIT/SIDAC dataset and report its size.
print("Loading dataset...")
dataset = load_dataset("0xAIT/SIDAC", split="train")
example_count = len(dataset)
print(f"Dataset loaded with {example_count} examples.")


In [None]:
# Checkpoint name shared by the tokenizer and the model so the two always match.
model_name = "gpt2"  # "distilgpt2" is a smaller alternative

# Load the pretrained GPT-2 tokenizer and language-model head from that checkpoint.
print("Loading GPT-2 tokenizer and model...")
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)
print("Tokenizer and model loaded.")

In [None]:
# Tokenize the 'text' column of the dataset in batches.
def tokenize_function(examples):
    """Tokenize the ``'text'`` entry of a batch with truncation enabled.

    Args:
        examples: Batch supplied by ``dataset.map(..., batched=True)``; its
            ``'text'`` entry is passed to the tokenizer unchanged.

    Returns:
        The tokenizer's output for that batch.
    """
    texts = examples['text']
    encoded = tokenizer(texts, truncation=True)
    return encoded


print("Tokenizing dataset...")
tokenized_datasets = dataset.map(tokenize_function, batched=True)
print("Dataset tokenized.")

In [None]:
# Sanity check: show the first tokenized example.
first_example = tokenized_datasets[0]
print("Sample tokenized example:", first_example)

In [None]:
# Set up training arguments
print("Setting up training arguments...")

# The evaluation-schedule keyword was renamed from `evaluation_strategy` to
# `eval_strategy` in newer transformers releases. The install cell does not pin
# a version, so use whichever name the installed TrainingArguments accepts.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",  # Directory to save the model
    per_device_train_batch_size=4,  # Adjust based on your GPU memory
    num_train_epochs=3,  # Number of training epochs
    save_steps=10_000,  # Save model every 10,000 steps
    save_total_limit=2,  # Keep only the last 2 models
    logging_dir='./logs',  # Directory for storing logs
    logging_steps=500,  # Log every 500 steps
    eval_steps=500,  # Evaluate every 500 steps
    # Evaluate during training. Step-based evaluation requires the Trainer to
    # be given an eval_dataset.
    **{_eval_strategy_key: "steps"},
)

In [None]:
# Create a Trainer instance
print("Creating Trainer instance...")

# GPT-2's tokenizer has no padding token, but batching examples of different
# lengths requires one; reuse the end-of-sequence token for padding.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Causal-LM collator (mlm=False): pads every batch to a common length and
# copies input_ids into `labels`, so the model returns a loss to optimise.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# training_args asks for evaluation every `eval_steps`, which needs an
# evaluation set. Only the train split is loaded, so hold out 10% of it;
# the fixed seed makes the split identical on every run.
split_datasets = tokenized_datasets.train_test_split(test_size=0.1, seed=42)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_datasets["train"],
    eval_dataset=split_datasets["test"],
    data_collator=data_collator,
)

In [None]:
# Run the fine-tuning loop on the Trainer built in the previous cell.
print("Starting training...")
train_result = trainer.train()
print("Training completed.")


In [None]:
# Save the model
print("Saving the model...")

# Single definition of the checkpoint directory on the mounted Drive.
save_dir = "/content/drive/MyDrive/LLM_Tasks/ChatBot/GPT-2 model/fine_tuned_gpt2"
trainer.save_model(save_dir)

# The Trainer was created without a tokenizer, so save_model() only writes the
# model files. Store the tokenizer next to them so the directory can be
# reloaded on its own with from_pretrained().
tokenizer.save_pretrained(save_dir)
print("Model saved successfully.")