In [16]:
import json
from torch.utils.data import Dataset

class ChartQALLMDataset(Dataset):
    """Causal-LM fine-tuning dataset built from a ChartQA-style JSONL file.

    Each non-blank line of the file must be a JSON object with the keys
    ``chart_description``, ``data_points``, ``instruction`` and ``output``.
    Every sample is rendered as ``prompt + output``, tokenized to exactly
    ``max_len`` tokens, and returned with ``labels`` in which everything
    except the answer tokens is set to ``IGNORE_INDEX`` so the loss is
    computed on the answer only.

    Args:
        jsonl_path: Path to the UTF-8 encoded JSONL file.
        tokenizer: Callable tokenizer; must accept ``truncation``,
            ``padding``, ``max_length`` and ``return_tensors`` and expose
            ``input_ids`` on its result.
        max_len: Fixed sequence length every sample is padded/truncated to.
    """

    # Label value used here to exclude a position from the loss.
    IGNORE_INDEX = -100

    def __init__(self, jsonl_path, tokenizer, max_len=1024):
        self.samples = []
        self.tokenizer = tokenizer
        self.max_len = max_len

        with open(jsonl_path, "r", encoding="utf-8") as f:
            for line in f:
                # Skip blank lines (e.g. a trailing empty line) instead of
                # letting json.loads raise on them.
                if line.strip():
                    self.samples.append(json.loads(line))

    def __len__(self):
        """Return the number of loaded samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return one sample as a dict of 1-D tensors of length ``max_len``.

        The dict holds whatever the tokenizer returned (``input_ids`` and,
        when provided, ``attention_mask``) plus ``labels``. If the prompt
        alone fills ``max_len`` the answer is truncated away and every label
        is ``IGNORE_INDEX``.
        """
        s = self.samples[idx]

        prompt = (
            "You are a ChartQA model. Use the chart description and data table to answer.\n\n"
            f"[DESCRIPTION]\n{s['chart_description']}\n\n"
            f"[DATA TABLE]\n{s['data_points']}\n\n"
            f"[QUESTION]\n{s['instruction']}\n\n"
            "[ANSWER]\n"
        )

        full_text = prompt + s["output"]

        enc = self.tokenizer(
            full_text,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt",
        )

        # The tokenizer returns tensors shaped (1, max_len). Drop the batch
        # dimension BEFORE masking: slicing the 2-D tensor with
        # labels[:n] selects rows, which would mask the whole sequence.
        item = {k: v.squeeze(0) for k, v in enc.items()}
        labels = item["input_ids"].clone()

        # Number of tokens the prompt occupies when tokenized on its own.
        prompt_len = len(self.tokenizer(prompt).input_ids)

        attention_mask = item.get("attention_mask")
        leading_pad = 0
        if attention_mask is not None:
            # Count padding that precedes the text (non-zero only for
            # left-padding tokenizers) so the prompt span is offset correctly.
            leading_pad = int((attention_mask.cumsum(0) == 0).sum())
            # Padding positions must not contribute to the loss either.
            labels[attention_mask == 0] = self.IGNORE_INDEX

        # Mask prompt tokens so loss is only on the answer.
        labels[: leading_pad + prompt_len] = self.IGNORE_INDEX

        item["labels"] = labels
        return item


In [2]:
!pip install hf-transfer
!export HF_HUB_ENABLE_HF_TRANSFER=1



In [18]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

# Hub identifier of the checkpoint used for both the weights and the tokenizer.
MODEL_NAME = "sshleifer/tiny-gpt2"

# Both objects are resolved from the same checkpoint id so the vocabulary
# matches the model's embedding table.
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


In [19]:
# The dataset pads every sample to a fixed length, so the tokenizer needs a
# pad token. Register a dedicated one only when the checkpoint ships without.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "[PAD]"})

# Keep the model config in sync with the tokenizer so code that reads
# `model.config.pad_token_id` (e.g. generation) uses the same id.
model.config.pad_token_id = tokenizer.pad_token_id

# Grow the embedding table to cover any token added above; left as the last
# expression so the resized embedding is displayed as the cell output.
model.resize_token_embeddings(len(tokenizer))

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Embedding(50258, 2)

In [20]:
from peft import LoraConfig, get_peft_model

# LoRA adapter configuration for a GPT-2 style model.
lora = LoraConfig(
    r=16,               # rank of the low-rank update matrices
    lora_alpha=32,      # scaling numerator for the LoRA update
    # Module-name suffixes to wrap with adapters.
    # NOTE(review): "c_proj" presumably matches both the attention and the
    # MLP output projections in GPT-2 — confirm that is intended.
    target_modules=["c_attn", "c_proj"],
    lora_dropout=0.05,
    bias="none",        # leave bias terms frozen
    # GPT-2 implements these projections as Conv1D, which stores weights as
    # (fan_in, fan_out); declare that explicitly rather than relying on PEFT
    # to detect it and override the default with a warning.
    fan_in_fan_out=True,
    task_type="CAUSAL_LM",
)

# Wrap the base model so only the adapter weights are trainable.
model = get_peft_model(model, lora)




In [8]:
import json

# Toy training records, one JSON object per output line. Keeping them as
# Python dicts (instead of one hand-escaped string) lets json.dumps handle
# quoting and makes individual samples reviewable.
train_records = [
    {
        "instruction": "What was the highest sales month?",
        "chart_description": "A line chart showing monthly sales from January to June with a steady upward trend.",
        "data_points": "Month,Sales Jan,120 Feb,150 Mar,170 Apr,200 May,240 Jun,300",
        "output": "June had the highest sales with 300 units.",
    },
    {
        "instruction": "Which country has the largest market share?",
        "chart_description": "A pie chart showing telecom market shares for four companies.",
        "data_points": "Country,Market Share China Telecom,53% China Unicom,34.2% China Mobile,6.2% Other,6.6%",
        "output": "China Telecom has the largest market share at 53%.",
    },
    {
        "instruction": "How many countries exceed 20 units?",
        "chart_description": "A bar chart comparing values for nine countries across one date.",
        "data_points": "Country,Value Brazil,0 Mexico,0 Russia,0 India,0 Indonesia,0 Italy,4 France,0 United States,1 United Kingdom,0",
        # The dash here was mojibake ("â€”") in the original literal.
        # NOTE(review): this answer contradicts itself mid-sentence; consider
        # replacing it with a clean target before real training.
        "output": "Only Italy exceeds 20 units? No — none exceed 20 units; all values are below 20.",
    },
]

with open("/content/llm_train.jsonl", "w", encoding="utf-8") as f:
    for record in train_records:
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        f.write(json.dumps(record, ensure_ascii=False) + "\n")

In [10]:
import os

# Turn off the Weights & Biases logging integration for this kernel.
# NOTE: the library reports this variable as deprecated (removal planned for
# v5); the `report_to` training argument is the suggested replacement.
os.environ.update({"WANDB_DISABLED": "true"})

In [21]:
from transformers import TrainingArguments, Trainer, AutoTokenizer, AutoModelForCausalLM
import torch

# Build the datasets.
# NOTE(review): validation reads the same file as training, so the eval loss
# measures fit to the training data, not generalisation. Point val_dataset at
# a held-out file once one exists.
train_dataset = ChartQALLMDataset("/content/llm_train.jsonl", tokenizer)
val_dataset = ChartQALLMDataset("/content/llm_train.jsonl", tokenizer)

# bf16 needs hardware support, not just any CUDA device; enabling it on an
# unsupported GPU makes TrainingArguments raise.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    output_dir="./chart_llm_lora",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,   # effective batch size of 8 per device
    learning_rate=2e-4,
    num_train_epochs=2,
    bf16=use_bf16,
    logging_steps=20,
    save_steps=500,
    # eval_steps is ignored unless a step-based evaluation strategy is set;
    # the default strategy never evaluates.
    eval_strategy="steps",
    eval_steps=200,
    # Explicitly disable external loggers instead of relying on the
    # deprecated WANDB_DISABLED environment variable.
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()

# Persist the trained model and the (possibly extended) tokenizer together so
# they can be reloaded from the same directory.
model.save_pretrained("./chart_llm_lora")
tokenizer.save_pretrained("./chart_llm_lora")


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss




('./chart_llm_lora/tokenizer_config.json',
 './chart_llm_lora/special_tokens_map.json',
 './chart_llm_lora/vocab.json',
 './chart_llm_lora/merges.txt',
 './chart_llm_lora/added_tokens.json',
 './chart_llm_lora/tokenizer.json')