In [1]:
from datasets import load_dataset, load_from_disk
from peft import LoraConfig, get_peft_model
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
)
import requests
import os
import json

In [8]:
# Download the SQuAD v1.1 training set and store the raw bytes on disk.
url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json"
# A timeout keeps the cell from hanging forever on a stalled connection.
r = requests.get(url, timeout=60)
print("Status:", r.status_code)
# Fail loudly on a 4xx/5xx response instead of saving an error page as the dataset.
r.raise_for_status()
with open("simple_squad.json", "wb") as f:
    f.write(r.content)
print("Size:", os.path.getsize("simple_squad.json"))

Status: 200
Size: 30288272


In [11]:
# Flatten the nested SQuAD JSON into one {"question", "answer"} object per line.
DOWNLOAD_FILENAME = "simple_squad.json"
OUTPUT_JSONL_FILENAME = "simple_squad.jsonl"

processed_count = 0
with open(DOWNLOAD_FILENAME, 'r', encoding='utf-8') as infile, \
        open(OUTPUT_JSONL_FILENAME, 'w', encoding='utf-8') as outfile:
    squad_data = json.load(infile)
    for article in squad_data['data']:
        for para in article['paragraphs']:
            for qa_item in para['qas']:
                question_text = qa_item['question'].strip()
                candidate_answers = qa_item['answers']
                # Questions without any answer are skipped entirely.
                if not candidate_answers:
                    continue
                # Only the first listed answer is kept.
                record = {
                    "question": question_text,
                    "answer": candidate_answers[0]['text'].strip(),
                }
                # One JSON document per line.
                outfile.write(json.dumps(record) + '\n')
                processed_count += 1
print(processed_count)



87599


In [4]:
# ---------------------------------------------------------------------------
# Tokenization
# ---------------------------------------------------------------------------
# Used by map_function as the total length of every tokenized example.
max_length = 128

# ---------------------------------------------------------------------------
# Model loading
# ---------------------------------------------------------------------------
# NOTE(review): the model-loading cells visible in this notebook hard-code
# load_in_4bit=True and do not read this flag.
load_in_4bit = False

# ---------------------------------------------------------------------------
# LoRA
# ---------------------------------------------------------------------------
# How much to weigh LoRA params over pretrained params
lora_alpha = 16
# Dropout for LoRA weights to avoid overfitting
lora_dropout = 0.1
# Bottleneck size between A and B matrix for LoRA params
lora_r = 16
# "all" or "none" for LoRA bias
lora_bias = "all"
# falcon or llama or wizard7 or wizard13
model_type = "wizard7"
# "squad" or "reddit" or "reddit_negative"
dataset_type = "squad"

# Which modules to apply LoRA to (names of the modules in state_dict).
# Falcon gets its own list; every other model type gets the second list.
if model_type == "falcon":
    lora_target_modules = [
        "query_key_value",
        "dense",
        "dense_h_to_4h",
        "dense_4h_to_h",
    ]
else:
    lora_target_modules = [
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ]

# ---------------------------------------------------------------------------
# Trainer
# ---------------------------------------------------------------------------
# Directory to save the model
output_dir = "outputs_squad"
# Optimizer type to train with
optim_type = "adafactor"
# Model learning rate
learning_rate = 0.00005
# Model weight decay
weight_decay = 0.002
# Train batch size on each GPU
per_device_train_batch_size = 8
# Eval batch size on each GPU
per_device_eval_batch_size = 8
# Number of steps before updating model
gradient_accumulation_steps = 2
# Number of warmup steps for learning rate
warmup_steps = 5
# Number of steps before saving model
save_steps = 100
# Number of steps before logging
logging_steps = 25


In [15]:
# Load the tokenizer for the base checkpoint; downloaded files are cached in ./models.
tokenizer = AutoTokenizer.from_pretrained(
    "TheBloke/wizardLM-7B-HF",
    cache_dir="./models",
    trust_remote_code=True,
)
# Pad with the end-of-sequence token.
tokenizer.pad_token = tokenizer.eos_token

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.


In [16]:
def map_function(example):
    """Tokenize one question/answer record into a fixed-length training example.

    Relies on the module-level ``tokenizer`` (whose pad token must be set) and
    ``max_length``.

    Layout of the returned sequences (each exactly ``max_length`` long):

        [question tokens][answer tokens][one pad token][remaining pad tokens]
        |<------------- attended, label = token ------>|<- masked, label -100 ->|

    The single attended pad token after the answer lets the model learn where
    to stop. Padding is done by hand so that the layout does not depend on the
    tokenizer's ``padding_side`` setting; the previous mask arithmetic was only
    correct for right padding.

    Args:
        example: Mapping with string values under ``"question"`` and ``"answer"``.

    Returns:
        dict with ``"input_ids"``, ``"labels"`` and ``"attention_mask"`` lists.
    """
    # Build the prompt-style strings for both turns.
    question = f"#### Human: {example['question'].strip()}"
    output = f"#### Assistant: {example['answer'].strip()}"

    # Encode the question. One slot is always reserved for the terminating pad
    # token, so an over-long question is clipped instead of producing a
    # non-positive max_length for the answer below.
    question_ids = tokenizer(question)["input_ids"][: max_length - 1]

    # Encode the answer into whatever room is left (none if the question
    # already fills the sequence).
    # NOTE(review): the answer is tokenized separately with the tokenizer's
    # default special-token handling, so any leading special token it adds
    # ends up mid-sequence — confirm this matches the inference prompt format.
    answer_budget = max_length - 1 - len(question_ids)
    if answer_budget > 0:
        answer_ids = tokenizer(output, max_length=answer_budget, truncation=True)["input_ids"]
    else:
        answer_ids = []

    # Everything the model attends to: question, answer and one closing pad token.
    attended_ids = question_ids + answer_ids + [tokenizer.pad_token_id]
    num_padding = max_length - len(attended_ids)

    input_ids = attended_ids + [tokenizer.pad_token_id] * num_padding
    attention_mask = [1] * len(attended_ids) + [0] * num_padding

    # Labels equal the input ids wherever the model attends (question tokens
    # included); only the trailing padding is excluded from the loss via -100.
    labels = attended_ids + [-100] * num_padding
    assert len(labels) == len(attention_mask) and len(attention_mask) == len(input_ids), "Labels is not the correct length"

    return {
        "input_ids": input_ids,
        "labels": labels,
        "attention_mask": attention_mask,
    }

In [17]:
from datasets import load_dataset, DatasetDict

# Load the JSONL file, tokenize every record, then cut a reproducible train/test split.
jsonl_file_path = "simple_squad.jsonl"
test_split_percentage = 0.2
random_seed = 42

dataset = load_dataset("json", data_files=jsonl_file_path)
dataset = dataset["train"].map(map_function)
# Seed the shuffle so the split is identical across kernel restarts.
dataset = dataset.shuffle(seed=random_seed)

# Derive the split sizes from test_split_percentage instead of a second hard-coded ratio.
train_size = int(len(dataset) * (1 - test_split_percentage))
test_size = len(dataset) - train_size
data_train = dataset.select(range(train_size))
data_test = dataset.select(range(train_size, train_size + test_size))

In [2]:
# Quantization settings: 4-bit NF4 weights, double quantization, float16 compute.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
)

# Load the base causal LM with the quantization config above; device placement
# is left to device_map="auto" and downloads are cached under ./models.
model = AutoModelForCausalLM.from_pretrained(
    "TheBloke/wizardLM-7B-HF",
    quantization_config=bnb_config,
    device_map="auto",
    cache_dir="./models",
    trust_remote_code=True,
)

pytorch_model.bin.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/28.1k [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

In [5]:
# Describe the LoRA adapter using the hyper-parameters from the config cell.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    inference_mode=False,
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias=lora_bias,
    target_modules=lora_target_modules,
)

# Wrap the loaded model with the adapter and report how many parameters will train.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 39,976,960 || all params: 6,778,400,768 || trainable%: 0.5898


In [None]:
# Training configuration; all values come from the config cell.
training_args = TrainingArguments(
    output_dir=output_dir,
    eval_strategy="epoch",
    optim=optim_type,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    do_train=True,
    warmup_steps=warmup_steps,
    save_steps=save_steps,
    logging_steps=logging_steps,
    # Trainer cannot infer the label column for a PeftModel (it warned and fell
    # back to an empty list), so name it explicitly.
    label_names=["labels"],
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=data_train,
    eval_dataset=data_test,
    # `tokenizer=` is deprecated for Trainer.__init__; `processing_class=` is its replacement.
    processing_class=tokenizer,
)

# Train the model
trainer.train()

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss


config.json:   0%|          | 0.00/555 [00:00<?, ?B/s]

In [4]:
import peft
import torch
from peft import PeftConfig, PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, HfArgumentParser
import shutil



# Path to the LoRA weights
lora_path = "outputs_squad/checkpoint-12200"
# Path to output the merged weights
output_path = "outputs_squad/merged_model"
# Same model family identifier as in the training config.
model_type = "wizard7"

In [6]:
import os

# Read the adapter config to find out which base checkpoint it was trained on.
peft_model_id = lora_path
peft_config = PeftConfig.from_pretrained(peft_model_id)

# Load the base model in float16 on the CPU.
model = AutoModelForCausalLM.from_pretrained(
    peft_config.base_model_name_or_path,
    return_dict=True,
    torch_dtype=torch.float16,
    trust_remote_code=True,
    device_map="cpu",
    cache_dir="./models"
)
tokenizer = AutoTokenizer.from_pretrained(peft_config.base_model_name_or_path, cache_dir="./models")

# Attach the trained adapter to the base model.
model = PeftModel.from_pretrained(model, peft_model_id)
model.eval()

# Fold the LoRA deltas into the base weights and strip the adapter wrappers.
# This uses PEFT's public API instead of walking the module tree and calling
# the private `_replace_module` helper by hand.
model = model.merge_and_unload()

# Save the model
model.save_pretrained(output_path)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoTokenizer,
)
import torch

# Where to place the model ("auto" lets from_pretrained decide) and which weights to load.
device = "auto"
model_path = "outputs_squad/merged_model"

# 4-bit NF4 quantization; only applied when device == "auto" (see below).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
    bnb_4bit_use_double_quant=True,
)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    trust_remote_code=True,
    device_map=device,
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config if device == "auto" else None,
).eval()
tokenizer = AutoTokenizer.from_pretrained(
    "TheBloke/wizardLM-7B-HF",
    trust_remote_code=True,
    cache_dir="./models",
)
# Use the same padding convention as during training, so the pad_token_id later
# handed to generate() is the token the model was trained to emit at the end of an answer.
tokenizer.pad_token = tokenizer.eos_token

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [10]:
# Ask the fine-tuned model a question using the same "#### Human / #### Assistant" format as training.
prompt = "#### Human: Who is the president of France?#### Assistant: "

# Upper bound on prompt tokens + generated tokens.
limit = 128

inputs = tokenizer(prompt, return_tensors="pt")
if device != "cpu":
    inputs = inputs.to('cuda')

# input_ids has shape (batch, seq_len); len() would return the batch size (1),
# so read the sequence dimension explicitly. Always allow at least one new token.
prompt_length = inputs["input_ids"].shape[1]
max_new_tokens = max(1, limit - prompt_length)

output = model.generate(
    **inputs,
    temperature=0.1,
    do_sample=True,
    top_p=0.95,
    top_k=60,
    max_new_tokens=max_new_tokens,
    pad_token_id=tokenizer.pad_token_id,
)
output = tokenizer.decode(output[0], skip_special_tokens=True)

# Print only the text that follows the assistant marker.
print(output.split("#### Assistant:")[1].strip())

François Hollande
