In [1]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    HfArgumentParser,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import load_from_disk, disable_caching
disable_caching()
from dataclasses import dataclass, field
from typing import List, Union

from trl import SFTConfig, SFTTrainer

ModuleNotFoundError: No module named 'trl'

In [9]:
def prepare_dataset_for_training(dataset, tokenizer, seq_len, prompt_file):
    """Tokenize question/context/answer records into fixed-length causal-LM examples.

    Each record becomes ONE token sequence ``prompt + answer + EOS`` so that
    ``input_ids`` and ``labels`` are aligned position by position, which is what
    a decoder-only model needs. (Tokenizing prompt and answer as two separately
    padded sequences leaves the answer out of ``input_ids`` entirely.)

    Args:
        dataset: Mapping of splits (must contain ``"train"``) exposing
            ``.features`` and a batched ``.map``; rows need the columns
            ``question``, ``context`` and ``answer``.
        tokenizer: Callable tokenizer returning ``{"input_ids": ...}``. If it has
            no pad token, its EOS token is installed as the pad token (the
            tokenizer object is mutated).
        seq_len: Exact length of every produced sequence.
        prompt_file: Path to a template containing ``{user_question}`` and
            ``{table_metadata_string}`` placeholders.

    Returns:
        The mapped dataset with the original columns removed and the columns
        ``input_ids``, ``attention_mask`` and ``labels`` added. ``labels`` is
        ``-100`` on prompt and padding positions and holds the answer tokens
        (plus EOS) elsewhere.

    Note:
        When a row is too long, the answer is kept and the tail of the prompt is
        cut. A collator that regenerates labels from ``input_ids`` (for example
        ``DataCollatorForLanguageModeling(mlm=False)``) will discard the prompt
        mask built here; use a collator that keeps the ``labels`` column if the
        loss should cover the answer only.
    """
    if not tokenizer.pad_token:
        tokenizer.pad_token = tokenizer.eos_token
    with open(prompt_file, "r", encoding="utf-8") as f:
        prompt = f.read()
    # Materialize the column names: a plain list is what `remove_columns` documents.
    columns = list(dataset["train"].features.keys())

    pad_id = tokenizer.pad_token_id
    eos_id = tokenizer.eos_token_id
    # Pad on the side the tokenizer itself would use, so layouts stay consistent.
    pad_on_left = getattr(tokenizer, "padding_side", "right") == "left"

    def preprocess_function(examples):
        prompts = [
            prompt.format(user_question=q, table_metadata_string=m)
            for q, m in zip(examples["question"], examples["context"])
        ]

        answers = [a if a.endswith(";") else a + ";" for a in examples["answer"]]

        # Prompt keeps the tokenizer's special tokens (e.g. BOS); the answer is a
        # continuation, so it gets none and EOS is appended explicitly below.
        prompt_ids = tokenizer(prompts, truncation=True, max_length=seq_len)["input_ids"]
        answer_ids = tokenizer(answers, add_special_tokens=False)["input_ids"]

        # Leave one slot for EOS so truncation never removes the stop token.
        answer_budget = max(seq_len - (0 if eos_id is None else 1), 0)

        input_ids, attention_mask, labels = [], [], []
        for p_ids, a_ids in zip(prompt_ids, answer_ids):
            target = list(a_ids)[:answer_budget]
            if eos_id is not None:
                target.append(eos_id)
            # The supervised tokens have priority; the prompt gets what is left.
            context = list(p_ids)[: max(seq_len - len(target), 0)]

            ids = context + target
            # Mask by POSITION, not by token id: pad and EOS may share an id.
            lab = [-100] * len(context) + target
            mask = [1] * len(ids)

            n_pad = seq_len - len(ids)
            if n_pad > 0:
                if pad_on_left:
                    ids = [pad_id] * n_pad + ids
                    lab = [-100] * n_pad + lab
                    mask = [0] * n_pad + mask
                else:
                    ids = ids + [pad_id] * n_pad
                    lab = lab + [-100] * n_pad
                    mask = mask + [0] * n_pad

            input_ids.append(ids)
            attention_mask.append(mask)
            labels.append(lab)

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }

    tokenized_dataset = dataset.map(
        preprocess_function,
        batched=True,
        remove_columns=columns,
    )
    return tokenized_dataset

In [10]:
# Exploratory run: tokenize the SQL dataset at a 1024-token length to inspect the result.
tokenizer = AutoTokenizer.from_pretrained("codellama/CodeLlama-7b-Instruct-hf")
dataset = prepare_dataset_for_training(
    load_from_disk("./datasets/sql-create-context-split"),
    tokenizer,
    seq_len=1024,
    prompt_file="./prompts/prompt_v2.md",
)

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [11]:
# Show the columns and row count of the tokenized validation split.
dataset["val"]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 250
})

In [15]:
# Inspect the label ids of the first validation example; -100 entries are the
# positions that preprocessing masked out.
dataset["val"]["labels"][0]

[-100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,

In [21]:
import pandas as pd
from datasets import load_dataset
from datasets import Dataset

# Fetch the Red Dot product-description dataset from the HuggingFace Hub.
rd_ds = load_dataset("xiyuez/red-dot-design-award-product-description")

# Move to pandas and derive one instruction string per product from its name and category.
rd_df = pd.DataFrame(rd_ds['train'])
rd_df['instruction'] = (
    'Create a detailed description for the following product: '
    + rd_df['product']
    + ', belonging to category: '
    + rd_df['category']
)
rd_df = rd_df[['instruction', 'description']]

# Supervised fine-tuning template; the instruction fills the single placeholder.
template = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:

{}

### Response:\n"""

# Fixed-seed 5000-row subset, reshaped into prompt / response / full-text columns.
rd_df_sample = rd_df.sample(n=5000, random_state=42)
rd_df_sample = pd.DataFrame(
    {
        'prompt': rd_df_sample["instruction"].map(template.format),
        'response': rd_df_sample['description'] + "\n### End",
    }
)
rd_df_sample['text'] = rd_df_sample["prompt"] + rd_df_sample["response"]

In [22]:
# Display the sampled frame with its prompt, response and text columns.
rd_df_sample

Unnamed: 0,prompt,response,text
18952,Below is an instruction that describes a task....,The CG8565 is a gaming PC offering space for h...,Below is an instruction that describes a task....
12584,Below is an instruction that describes a task....,The iSHOXS BullBar ProX mount can be used to a...,Below is an instruction that describes a task....
5702,Below is an instruction that describes a task....,The S81 Pro focuses on two things: outstanding...,Below is an instruction that describes a task....
20503,Below is an instruction that describes a task....,The CenFlex superfinish machine is designed fo...,Below is an instruction that describes a task....
2480,Below is an instruction that describes a task....,The THALION S gas absorption heat pump uses na...,Below is an instruction that describes a task....
...,...,...,...
268,Below is an instruction that describes a task....,“The MoodPlay can be described as a record pla...,Below is an instruction that describes a task....
518,Below is an instruction that describes a task....,V23 is a switch panel that includes sockets an...,Below is an instruction that describes a task....
8137,Below is an instruction that describes a task....,The Bosch Aqua water purifier collection for u...,Below is an instruction that describes a task....
5508,Below is an instruction that describes a task....,The design concept for these kitchen knives an...,Below is an instruction that describes a task....


In [None]:
@dataclass
class ModelConfig:
    """Model, data, quantization and LoRA settings parsed alongside TrainingArguments."""

    # Checkpoint, dataset directory and prompt template used for fine-tuning.
    model: str = "codellama/CodeLlama-7b-Instruct-hf"
    dataset: str = "./datasets/sql-create-context-split"
    prompt: str = "./prompts/prompt_v2.md"
    seq_len: int = 2048

    # Quantization: `bits` is compared against 8 and 4 when the model is loaded.
    bits: int = 4
    bnb_4bit_quant_type: str = "nf4"

    # Values forwarded one-to-one to the LoRA configuration.
    r: int = 16
    lora_alpha: int = 32
    lora_dropout: float = 0.1
    # Mutable default, so it must come from a factory rather than a shared list.
    target_modules: List[str] = field(default_factory=lambda: ["q_proj", "v_proj"])
    bias: str = "none"
    init_lora_weights: Union[bool, str] = True
    task_type: str = "CAUSAL_LM"

In [4]:
# A single JSON file populates both dataclasses: ModelConfig and TrainingArguments.
parser = HfArgumentParser((ModelConfig, TrainingArguments))
model_config, training_args = parser.parse_json_file(
    json_file="./configs/sqlcoder-v0.json",
)

In [5]:
# Follow the precision flags of the training arguments: fp16 is checked first,
# then bf16, otherwise full float32.
if training_args.fp16:
    torch_dtype = torch.float16
elif training_args.bf16:
    torch_dtype = torch.bfloat16
else:
    torch_dtype = torch.float32
torch_dtype

torch.bfloat16

In [6]:
# Load the tokenizer for the checkpoint named in the parsed config.
tokenizer = AutoTokenizer.from_pretrained(model_config.model)

In [7]:
# Quantization settings: `bits` switches on 8-bit or 4-bit loading, and the
# 4-bit options reuse the dtype derived from the training arguments.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=(model_config.bits == 4),
    load_in_8bit=(model_config.bits == 8),
    bnb_4bit_quant_type=model_config.bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=torch_dtype,
)

# Load the base checkpoint with that quantization config.
model = AutoModelForCausalLM.from_pretrained(
    model_config.model,
    quantization_config=bnb_config,
    torch_dtype=torch_dtype,
    device_map="auto",
    trust_remote_code=True,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
# Prepare the quantized model for training, taking the gradient-checkpointing
# switch from the training arguments. `model` is rebound from itself, so
# re-running this cell applies the preparation to the already prepared model.
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=training_args.gradient_checkpointing)

In [9]:
# Every LoRA setting lives under the same name on ModelConfig, so copy them
# across by name instead of spelling out each keyword.
lora_config = LoraConfig(
    **{
        option: getattr(model_config, option)
        for option in (
            "r",
            "lora_alpha",
            "target_modules",
            "lora_dropout",
            "bias",
            "init_lora_weights",
            "task_type",
        )
    }
)

# Wrap the prepared base model with the adapters; `model` now refers to the PEFT model.
model = get_peft_model(model, lora_config)

In [10]:
# Report trainable versus total parameter counts for the LoRA-wrapped model.
model.print_trainable_parameters()

trainable params: 8,388,608 || all params: 6,746,935,296 || trainable%: 0.1243


In [11]:
# Load the raw splits named in the config and tokenize them. The prompt template
# path is read from the config as well (its default is "./prompts/prompt_v2.md"),
# so a `prompt` override in the JSON file is no longer silently ignored.
dataset = load_from_disk(model_config.dataset)
dataset = prepare_dataset_for_training(
    dataset,
    tokenizer,
    model_config.seq_len,
    prompt_file=model_config.prompt,
)

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [None]:
# Collator configured for causal language modelling (masked-LM disabled).
# NOTE(review): with mlm=False this collator is documented to build `labels`
# from `input_ids` for each batch, so a precomputed `labels` column is not what
# the loss sees — confirm that is intended.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

# Train on the "train" split and evaluate on "val".
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    eval_dataset=dataset["val"],
    train_dataset=dataset["train"],
)

In [None]:
# Run fine-tuning, then write the trained model to the output directory given
# in the training arguments.
trainer.train()
trainer.save_model(training_args.output_dir)