In [None]:
!pip install transformers datasets accelerate --quiet


In [None]:
from google.colab import files
from pathlib import Path

# Expecting a file named fabric_advisor_dataset.jsonl
uploaded = files.upload()

# Fail fast with a clear message instead of a later, less obvious error from
# load_dataset. The check is on disk (not on `uploaded`) so that re-running
# this cell with the file already present and no new upload still passes.
if not Path("fabric_advisor_dataset.jsonl").exists():
    raise FileNotFoundError(
        "Expected 'fabric_advisor_dataset.jsonl' in the working directory; "
        f"files received by this upload: {sorted(uploaded)}"
    )


In [None]:
from datasets import load_dataset

# Read the uploaded JSON-lines file as a single 'train' split.
dataset = load_dataset('json', data_files='fabric_advisor_dataset.jsonl', split='train')

# Merge each record's 'input' and 'output' fields into one prompt/completion
# string stored under 'text'; this is the column the tokenizer consumes.
dataset = dataset.map(lambda x: {'text': f"Input:\n{x['input']}\nOutput:\n{x['output']}"})

# Hold out 10% of the examples for evaluation. The fixed seed makes the split
# identical on every run, so eval metrics are comparable across re-runs.
dataset = dataset.train_test_split(test_size=0.1, seed=42)


In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
# Reuse the end-of-sequence token as the padding token so that the
# padding="max_length" request in tokenize() has a token to pad with.
tokenizer.pad_token = tokenizer.eos_token


def tokenize(batch):
    """Tokenize the 'text' column of a batch of examples.

    Every sequence is truncated and padded to exactly 256 tokens.

    Args:
        batch: a batch of examples; batch["text"] holds the strings to encode.

    Returns:
        The tokenizer's output for the batch.
    """
    encoded = tokenizer(
        batch["text"],
        max_length=256,
        truncation=True,
        padding="max_length",
    )
    return encoded


# Apply the tokenizer to both splits, processing examples in batches.
tokenized = dataset.map(tokenize, batched=True)


In [None]:
import inspect

from transformers import (
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

model = AutoModelForCausalLM.from_pretrained("gpt2")

# The tokenize step adds no 'labels' column, and without labels a causal LM
# returns no loss, so Trainer.train() would fail. With mlm=False this collator
# builds the labels from input_ids for causal language modeling and sets the
# label of every pad-token position to -100 so it is ignored by the loss.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# The pip install above is unpinned, and newer transformers releases renamed
# `evaluation_strategy` to `eval_strategy`. Pick whichever name the installed
# version accepts so the cell works on both sides of the rename.
training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in training_arg_names else "evaluation_strategy"
)

args = TrainingArguments(
    output_dir="gpt2-fabric-advisor",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=1,   # keep only the most recent checkpoint on disk
    save_strategy="epoch",
    logging_dir="logs",
    **{eval_strategy_key: "epoch"},  # evaluate at the same cadence as saving
)

# Same story for Trainer: newer releases take the tokenizer as
# `processing_class`, older ones as `tokenizer`.
trainer_arg_names = inspect.signature(Trainer.__init__).parameters
tokenizer_key = "processing_class" if "processing_class" in trainer_arg_names else "tokenizer"

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    data_collator=data_collator,
    **{tokenizer_key: tokenizer},
)

trainer.train()


In [None]:
model.save_pretrained("gpt2-fabric-advisor")
tokenizer.save_pretrained("gpt2-fabric-advisor")

# Download
!zip -r gpt2-fabric-advisor.zip gpt2-fabric-advisor
files.download("gpt2-fabric-advisor.zip")
