# Fine-tune CodeT5 (Salesforce/codet5-small) for FastAPI Code Generation

## Section 1: Install Dependencies

In [None]:
!pip install -q transformers datasets accelerate
!pip install -q huggingface_hub

## Section 2: Upload `.jsonl` Dataset or Load from Google Drive

In [None]:
import json
import os

import pandas as pd
from google.colab import files

# Open Colab's upload widget. The result is a mapping keyed by the uploaded
# file names; the code below assumes each file is also available on local
# disk under that name.
uploaded = files.upload()

if not uploaded:
    raise ValueError("No file was uploaded; please upload a .jsonl dataset.")

# Only the first uploaded file is used as the dataset.
filename = next(iter(uploaded))

# Parse the JSON Lines file: one JSON object per non-empty line.
dataset = []
with open(filename, 'r', encoding='utf-8') as f:
    for line_number, line in enumerate(f, start=1):
        line = line.strip()
        if not line:
            # Blank lines (e.g. a trailing newline at the end of the file)
            # are not records and would make json.loads fail.
            continue
        try:
            dataset.append(json.loads(line))
        except json.JSONDecodeError as err:
            # Point at the offending line so a broken record is easy to find.
            raise ValueError(
                f"{filename}: invalid JSON on line {line_number}: {err}"
            ) from err

if not dataset:
    raise ValueError(f"{filename} contains no records.")

print("Sample record:", dataset[0])

## Section 3: Tokenization

In [None]:
from transformers import AutoTokenizer

# Upper bounds passed as `max_length` when tokenizing the prompts (source)
# and the expected code (target).
max_source_length = max_target_length = 512

# Tokenizer belonging to the checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained("Salesforce/codet5-small")

def preprocess_data(examples):
    """Tokenize prompts and targets and attach loss-ready labels.

    Args:
        examples: A mapping with an ``'inputs'`` entry (prompt text) and a
            ``'targets'`` entry (expected code). Each entry may be a single
            string or a list of strings (as supplied by
            ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output for the prompts, with an added ``"labels"``
        entry holding the target token ids. Padding positions in the labels
        are replaced by ``-100`` so the loss ignores them.
    """
    model_inputs = tokenizer(
        examples['inputs'],
        max_length=max_source_length,
        padding="max_length",
        truncation=True,
    )
    targets = tokenizer(
        examples['targets'],
        max_length=max_target_length,
        padding="max_length",
        truncation=True,
    )

    pad_id = tokenizer.pad_token_id

    def _mask_padding(token_ids):
        # -100 is the index the seq2seq cross-entropy loss skips; without
        # this the model is also trained to predict the padding tokens.
        return [tok if tok != pad_id else -100 for tok in token_ids]

    label_ids = targets["input_ids"]
    if label_ids and isinstance(label_ids[0], int):
        # Single (non-batched) example: one flat list of token ids.
        model_inputs["labels"] = _mask_padding(label_ids)
    else:
        # Batched call: one list of token ids per example.
        model_inputs["labels"] = [_mask_padding(seq) for seq in label_ids]
    return model_inputs

from datasets import Dataset

# Wrap the parsed records in a Hugging Face Dataset ...
hf_dataset = Dataset.from_list(dataset)

# ... and tokenize them in batches with preprocess_data.
tokenized_dataset = hf_dataset.map(
    function=preprocess_data,
    batched=True,
)

## Section 4: Model Fine-Tuning with Trainer

In [None]:
from transformers import AutoModelForSeq2SeqLM, TrainingArguments, Trainer

# Load the pretrained seq2seq checkpoint that will be fine-tuned.
model = AutoModelForSeq2SeqLM.from_pretrained("Salesforce/codet5-small")

# Training configuration, collected in one place so it is easy to tweak.
training_options = {
    "output_dir": "./codet5_finetuned_fastapi",
    "per_device_train_batch_size": 4,
    "num_train_epochs": 2,
    "logging_dir": "./logs",
    "logging_steps": 10,
    "save_strategy": "epoch",
    "report_to": "none",
}
training_args = TrainingArguments(**training_options)

# The Trainer receives the tokenized dataset produced in Section 3.
trainer = Trainer(
    tokenizer=tokenizer,
    train_dataset=tokenized_dataset,
    args=training_args,
    model=model,
)

# Run the fine-tuning loop.
trainer.train()

## Section 5: Evaluate on a New Prompt

In [None]:
def generate_code(prompt):
    """Generate code for a natural-language prompt with the fine-tuned model.

    Args:
        prompt: The prompt text to feed to the model.

    Returns:
        The decoded text of the first generated sequence, with special
        tokens stripped.
    """
    # generate() does not change the module's train/eval state, and the
    # model may still be in training mode after trainer.train(). Switch to
    # eval mode so dropout does not perturb the generated output.
    model.eval()
    # Tokenize to PyTorch tensors and move them to the model's device.
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(model.device)
    # Cap the generated sequence at 512 tokens.
    output = model.generate(**inputs, max_length=512)
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Example request: a task description followed by the table schema.
# The trailing empty item keeps the final newline of the prompt.
sample_prompt = "\n".join([
    "Create a FastAPI microservice to return all distinct environments from a devices table.",
    "",
    "Table Design:",
    "Table: devices",
    "Columns:",
    "- id: integer, primary key",
    "- device_id: varchar(64)",
    "- hostname: varchar(64)",
    "- application: varchar(32)",
    "- environment: varchar(16)",
    "- os_type: varchar(16)",
    "- os_version: varchar(16)",
    "- tech_stack: varchar(64)",
    "",
])

# Show what the fine-tuned model produces for the example request.
generated_code = generate_code(sample_prompt)
print(generated_code)

## Section 6: Save Model and Tokenizer

In [None]:
model.save_pretrained("codet5_finetuned_fastapi")
tokenizer.save_pretrained("codet5_finetuned_fastapi")
!zip -r codet5_finetuned_fastapi.zip codet5_finetuned_fastapi
from google.colab import files
files.download("codet5_finetuned_fastapi.zip")

## Section 7: Notes for GitHub + Colab

In [None]:
# Save this notebook as `codet5_fastapi_finetune.ipynb`.
# Push it to a public GitHub repo and add the following badge to the README
# (replace `yourusername/yourrepo` with your own repository path):
# [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/yourusername/yourrepo/blob/main/codet5_fastapi_finetune.ipynb)