In [1]:
!pip install transformers
!pip install torch
!pip install accelerate -U



In [2]:
import torch
from transformers import Trainer, TrainingArguments

In [3]:
!pip install datasets
from datasets import load_dataset



In [4]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config

In [11]:
import ipywidgets as widgets
from IPython.display import display

# Define your widgets
prompt_input = widgets.Textarea(
    value='',
    placeholder='Type your prompt here',
    description='Prompt:',
    layout={'width': '100%', 'height': '100px'}
)
batch_size_input = widgets.IntText(value=1, description='Batch Size:')
epoch_num_input = widgets.IntText(value=1, description='Epoch Num:')
run_button = widgets.Button(description='Set Values')

# Display your widgets
display(prompt_input, batch_size_input, epoch_num_input, run_button)

# Placeholder variables to store the values
prompt = ''
batch_size = 1
epoch_num = 1

# Define a function to update the variables with current widget values
def on_run_button_clicked(b):
    global prompt, batch_size, epoch_num
    prompt_value = prompt_input.value
    batch_size_value = batch_size_input.value
    epoch_num_value = epoch_num_input.value
    with output:
        print(f"Values set. Prompt: {prompt_value}, Batch Size: {batch_size_value}, Epoch Num: {epoch_num_value}")

# Create an output widget to display the status messages
output = widgets.Output()

run_button.on_click(on_run_button_clicked)
display(output)

Textarea(value='', description='Prompt:', layout=Layout(height='100px', width='100%'), placeholder='Type your â€¦

IntText(value=1, description='Batch Size:')

IntText(value=1, description='Epoch Num:')

Button(description='Set Values', style=ButtonStyle())

Output()

In [8]:
# Load the Microsoft ORCA Math Word Problems dataset
dataset = load_dataset("microsoft/orca-math-word-problems-200k")

# Access the training split of the dataset
training_examples = dataset["train"]

# Tokenize the input questions and answers from the dataset
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # Set the padding token to the EOS token

# Specify the maximum length for padding or truncation
max_length = 100

subset_size = 5000
subset_data = training_examples[:subset_size]

tokenized_data = [
    tokenizer(question, return_tensors="pt", max_length=max_length, padding="max_length", truncation=True)
    for question in subset_data["question"] # if want to do whole data set, change subset_data to training_example
]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [9]:
input_ids = torch.stack([item["input_ids"].squeeze(0) for item in tokenized_data], dim=0)
labels = input_ids.clone()  # Use the same input_ids for labels in language modeling tasks

# Create a GPT-2 configuration
config = GPT2Config.from_pretrained("gpt2")

# Initialize a GPT-2 model with the same configuration
model = GPT2LMHeadModel(config)

In [None]:


# Fine-tune the model on your dataset
training_args = TrainingArguments(
    output_dir="./fine-tuned-model",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=1,
    save_steps=10_000,
    save_total_limit=2,
)

# Increase Batch Size
training_args.per_device_train_batch_size = batch_size

# Gradient Accumulation
training_args.gradient_accumulation_steps = 4

# FP16 Training
training_args.fp16 = True

# Reduce Training Epochs
training_args.num_train_epochs = epoch_num

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data,
    data_collator=lambda data: {
        "input_ids": torch.stack([item["input_ids"].squeeze(0) for item in data]),
        "labels": torch.stack([item["input_ids"].squeeze(0) for item in data]),
    }
)

trainer.train()

# Save the fine-tuned model
model.save_pretrained("./fine-tuned-model")
tokenizer.save_pretrained("./fine-tuned-model")

Step,Training Loss
500,2.1417


In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Specify the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the fine-tuned model and tokenizer
model_name = "gpt2"  # Update with the appropriate model name if different
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Your input question
question = prompt

# Tokenize the input question
input_ids = tokenizer.encode(question, return_tensors="pt")

# Get the length of the input sequence
input_length = len(input_ids[0])

# Generate output with the fine-tuned model
model.config.force_bos_token_to_be_generated = False

# Ensure position_ids are within the valid range
position_ids = torch.arange(input_length, dtype=torch.long, device=device)

# Set a different temperature value
temperature = 0.8

# Mask invalid positions in the attention mask
attention_mask = torch.ones_like(input_ids)
attention_mask[:, input_length:] = 0

# Generate output with the fine-tuned model
generated_ids = model.generate(
    input_ids=input_ids,
    max_length=input_length + 200,
    do_sample=True,
    pad_token_id=model.config.eos_token_id,
    eos_token_id=model.config.eos_token_id,
    attention_mask=attention_mask,
    num_beams=3,
    use_cache=True,
    position_ids=position_ids,
    temperature=temperature,
)

# Extract the generated sequence without the EOS token
generated_sequence = generated_ids[0].tolist()


# Decode the generated sequence
generated_text = tokenizer.decode(generated_sequence, skip_special_tokens=True)
print("Input Question:", question)
print("Generated Answer:", generated_text)
