In [24]:
# Step 0: Install Required Libraries
!pip install transformers datasets

# Step 1: Import Libraries
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from datasets import Dataset
import torch



In [25]:
# Step 2: Prepare Your Dataset
# Example dataset (replace with your actual data)
data = [
    {
        "input": "object: bus, distance: 15 meters, motion: approaching",
        "output": "Warning: A bus is approaching you, approximately 15 meters away. Stay alert."
    },
    {
        "input": "object: bicycle, distance: 3 meters, motion: crossing path",
        "output": "Warning: A bicycle is crossing your path, 3 meters ahead. Proceed with caution."
    },
    {
        "input": "object: pole, distance: 1 meter, motion: stationary",
        "output": "Warning: A pole is in front of you, 1 meter ahead. Adjust your path accordingly."
    },
    {
        "input": "object: child, distance: 2 meters, motion: running",
        "output": "Warning: A child is running near you, 2 meters away. Be careful."
    },
    {
        "input": "object: dog, distance: 4 meters, motion: running towards",
        "output": "Warning: A dog is running towards you, 4 meters ahead. Stay cautious."
    },
    {
        "input": "object: traffic light pole, distance: 0.5 meters, motion: stationary",
        "output": "Warning: A traffic light pole is very close, just 0.5 meters ahead. Step carefully."
    },
    {
        "input": "object: group of people, distance: 6 meters, motion: moving towards",
        "output": "Warning: A group of people is approaching you, 6 meters ahead. Be prepared."
    },
    {
        "input": "object: construction barrier, distance: 2 meters, motion: stationary",
        "output": "Warning: A construction barrier is blocking your path, 2 meters ahead. Change direction if needed."
    },
    {
        "input": "object: puddle, distance: 0.8 meters, motion: stationary",
        "output": "Warning: A puddle is ahead, 0.8 meters away. Be mindful of slipping."
    },
    {
        "input": "object: car, distance: 5 meters, motion: reversing",
        "output": "Warning: A car is reversing nearby, 5 meters behind you. Stay alert."
    },
    {
        "input": "object: electric wheelchair, distance: 2.5 meters, motion: moving towards",
        "output": "Warning: An electric wheelchair is approaching you, about 2.5 meters away."
    },
    {
        "input": "object: chair, distance: 0.3 meters, motion: stationary",
        "output": "Warning: A chair is right in front of you, only 0.3 meters away."
    },
    {
        "input": "object: bird, distance: 1 meter, motion: flying low",
        "output": "Warning: A bird is flying low near you, 1 meter away."
    },
    {
        "input": "object: ice patch, distance: 0.5 meters, motion: slippery",
        "output": "Warning: There is an ice patch 0.5 meters ahead. Watch your step."
    },
    {
        "input": "object: shopping cart, distance: 1.5 meters, motion: being pushed",
        "output": "Warning: A shopping cart is moving near you, 1.5 meters ahead. Be cautious."
    }
]


# Convert to Hugging Face Dataset format
dataset = Dataset.from_dict({
    "input": [item["input"] for item in data],
    "output": [item["output"] for item in data]
})

In [39]:
# Step 3: Load Pretrained Model and Tokenizer
model_name = "distilgpt2"  # You can use "gpt2-medium", "gpt2-large", or other models
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Add a padding token if not already present
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [41]:
# Step 4: Tokenize the Dataset
def tokenize_function(examples):
    # Tokenize the input and output separately
    inputs = tokenizer(examples["input"], padding="max_length", truncation=True, max_length=32)
    outputs = tokenizer(examples["output"], padding="max_length", truncation=True, max_length=32)

    # Return input_ids, attention_mask, and labels
    return {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "labels": outputs["input_ids"],  # Labels are the tokenized outputs
    }

tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

In [42]:
# Step 5: Set Up Training Arguments
training_args = TrainingArguments(
    output_dir="./fine_tuned_model",  # Directory to save the fine-tuned model
    overwrite_output_dir=True,
    num_train_epochs=10,              # Number of training epochs
    per_device_train_batch_size=2,   # Batch size
    save_steps=500,                  # Save model every 500 steps
    save_total_limit=2,              # Keep only the last 2 saved models
    logging_dir="./logs",            # Directory for logs
    logging_steps=100,               # Log every 100 steps
    evaluation_strategy="no",        # No evaluation during training
    learning_rate=3e-5,              # Learning rate
    weight_decay=0.01,               # Weight decay
    warmup_steps=100,                # Warmup steps
    prediction_loss_only=True,       # Only compute loss during training
    report_to="none",                # Disable W&B logging
)



In [43]:
# Step 6: Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)

  trainer = Trainer(


In [44]:
# Step 7: Fine-Tune the Model
trainer.train()

Step,Training Loss


TrainOutput(global_step=80, training_loss=8.832852172851563, metrics={'train_runtime': 160.6428, 'train_samples_per_second': 0.934, 'train_steps_per_second': 0.498, 'total_flos': 1224828518400.0, 'train_loss': 8.832852172851563, 'epoch': 10.0})

In [45]:
# Step 8: Save the Fine-Tuned Model
trainer.save_model("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")



('./fine_tuned_model/tokenizer_config.json',
 './fine_tuned_model/special_tokens_map.json',
 './fine_tuned_model/vocab.json',
 './fine_tuned_model/merges.txt',
 './fine_tuned_model/added_tokens.json')

In [46]:
# Step 9: Load the Fine-Tuned Model for Inference
fine_tuned_model = GPT2LMHeadModel.from_pretrained("./fine_tuned_model")
fine_tuned_tokenizer = GPT2Tokenizer.from_pretrained("./fine_tuned_model")

In [47]:
# Step 10: Generate Warnings
def generate_warning(input_text):
    input_ids = fine_tuned_tokenizer.encode(input_text, return_tensors="pt")
    output = fine_tuned_model.generate(
        input_ids,
        max_length=64,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        top_k=50,
        top_p=0.95,
        temperature=0.7,
    )
    return fine_tuned_tokenizer.decode(output[0], skip_special_tokens=True)

# Example Usage
input_text = "object: car, distance: 20 meters, motion: left"
warning = generate_warning(input_text)
print(warning)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


object: car, distance: 20 meters, motion: left.

The car is a car. It is the car of the future. The car will be the vehicle of future generations.
