In [1]:
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git 
!pip install -q -U git+https://github.com/huggingface/peft.git
#pip install -q -U git+https://github.com/huggingface/accelerate.git
#current version of Accelerate on GitHub breaks QLoRa
#Using standard pip instead
!pip install -q -U accelerate
!pip install -q -U datasets

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

In [4]:
# Hub id of the base checkpoint to fine-tune.
# Alternative checkpoint, currently disabled:
#model_name = "Birchlabs/mosaicml-mpt-7b-chat-qlora"
model_name = "huggyllama/llama-7b"

# Tokenizer belonging to the chosen base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [4]:
# Quantization recipe handed to `from_pretrained` when the base model is loaded.
# NOTE(review): the bitsandbytes log in this notebook reports a GPU with compute
# capability 7.5 and training later runs with fp16=True -- confirm that a
# bfloat16 compute dtype is what is wanted on this hardware.
quant_config = BitsAndBytesConfig(
    # Store the weights in 4 bits, using the NF4 data type.
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    # Also quantize the quantization constants (double quantization).
    bnb_4bit_use_double_quant=True,
    # Dtype used for the actual matrix multiplications.
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [5]:
!pip install einops

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting einops
  Downloading einops-0.6.1-py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: einops
Successfully installed einops-0.6.1


In [6]:
# Load the base checkpoint with the quantization recipe defined above.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quant_config,
    device_map={"": 0},  # map the root module (the whole model) to device 0
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/594 [00:00<?, ?B/s]


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda118.so
CUDA SETUP: CUDA runtime path found: /home/ec2-user/anaconda3/envs/pytorch_p310/lib/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 7.5
CUDA SETUP: Detected CUDA version 118
CUDA SETUP: Loading binary /home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda118.so...


Downloading (…)fetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [7]:
# Turn on gradient checkpointing before the model is prepared for k-bit
# training in the next cell. The training log later shows the consequence:
# `use_cache` is switched off because it is incompatible with checkpointing.
model.gradient_checkpointing_enable()

In [8]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Pre-process the quantized model so that it can be trained.
model = prepare_model_for_kbit_training(model)

# LoRA hyper-parameters: rank-8 adapters (alpha 32, 5% dropout, no bias
# training) on the q_proj and v_proj modules, for a causal-LM task.
config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# From here on `model` is the LoRA-wrapped (PEFT) model.
model = get_peft_model(model, config)

In [9]:
from datasets import load_dataset

# Local JSON-lines training file; every record must provide a "text" field,
# since that is the column tokenized below.
path_to_file = "final-space.jsonl"


def _tokenize_batch(samples):
    """Run the tokenizer over the "text" column of one batch of records."""
    return tokenizer(samples["text"])


# Read the file and add the tokenizer outputs to every split, batch by batch.
data = load_dataset('json', data_files=path_to_file).map(_tokenize_batch, batched=True)

Downloading and preparing dataset json/default to /home/ec2-user/.cache/huggingface/datasets/json/default-35d4744e250cc41e/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /home/ec2-user/.cache/huggingface/datasets/json/default-35d4744e250cc41e/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/28 [00:00<?, ? examples/s]

In [25]:
# Use the end-of-sequence token as the padding token.
# NOTE(review): this cell's execution count (25) is out of sequence and the
# training cell below repeats the same assignment -- this cell looks redundant.
tokenizer.pad_token = tokenizer.eos_token

In [10]:
import transformers

# Batches are padded by the collator, so the tokenizer needs a pad token;
# the end-of-sequence token is reused for that.
tokenizer.pad_token = tokenizer.eos_token

# Short smoke-test schedule: 10 optimizer steps, logged after every step.
training_args = transformers.TrainingArguments(
    output_dir="outputs",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,  # 1 sample x 8 accumulation steps per update
    warmup_steps=2,
    max_steps=10,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=1,
    optim="paged_adamw_8bit",
)

# mlm=False selects causal (next-token) language modeling, not masked LM.
causal_lm_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=data["train"],
    data_collator=causal_lm_collator,
)
trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
1,2.7867
2,2.7817
3,2.3832
4,2.4016
5,2.2811
6,2.1657
7,2.044
8,2.2148
9,2.0236
10,1.7503


TrainOutput(global_step=10, training_loss=2.283282923698425, metrics={'train_runtime': 142.5303, 'train_samples_per_second': 0.561, 'train_steps_per_second': 0.07, 'total_flos': 102380047220736.0, 'train_loss': 2.283282923698425, 'epoch': 2.86})

In [12]:
# Upload the trained model to the Hub repo "NeoXAdapter". The recorded upload
# is a ~35 MB adapter_model.bin, i.e. the adapter only, not the base weights.
# NOTE(review): the repo name says "NeoX" but the base checkpoint selected
# earlier is huggyllama/llama-7b -- confirm the name is intended.
model.push_to_hub("NeoXAdapter")

adapter_model.bin:   0%|          | 0.00/34.6M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/FarziBuilder/NeoXAdapter/commit/dd9b10cca7fa090708f515ff7cfc8bf4dd879c26', commit_message='Upload model', commit_description='', oid='dd9b10cca7fa090708f515ff7cfc8bf4dd879c26', pr_url=None, pr_revision=None, pr_num=None)

In [11]:
# Save the trained model to the local "myAdapter" directory. Later cells read
# the adapter configuration and weights back from this directory.
model.save_pretrained("myAdapter")

In [13]:
# Keep an extra reference to the model object held by the trainer.
# NOTE(review): `model_save` is not used by any cell visible here -- confirm
# it is still needed.
model_save = trainer.model

In [15]:
# Read back the LoRA configuration that was saved together with the adapter.
lora_config = LoraConfig.from_pretrained('myAdapter')

# `model` is already the LoRA-wrapped model that was just trained, so it is
# deliberately NOT passed through get_peft_model() a second time. Re-wrapping
# an already adapted model re-creates the adapter layers under the same
# adapter name, which throws away the trained LoRA weights; anything saved
# afterwards would contain a freshly initialized adapter instead.
# To attach the saved adapter to a freshly loaded base model, use
# PeftModel.from_pretrained(base_model, 'myAdapter').

In [16]:
# Save the current `model` to the local "perhapsModel" directory.
model.save_pretrained("perhapsModel")

In [2]:
from peft import PeftConfig, PeftModel


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda118.so
CUDA SETUP: CUDA runtime path found: /home/ec2-user/anaconda3/envs/pytorch_p310/lib/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 7.5
CUDA SETUP: Detected CUDA version 118
CUDA SETUP: Loading binary /home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda118.so...


In [5]:
import os

# Directory that holds the saved LoRA adapter (config + weights).
folder_name = "myAdapter"
peft_config = PeftConfig.from_pretrained(folder_name)

# Prefer the base checkpoint recorded in the adapter config, so the adapter is
# always merged into the model it was trained on. `model_name` (set earlier in
# the notebook) is only consulted when the config does not record one.
base_model_name = peft_config.base_model_name_or_path or model_name

# Folder for weights that device_map="auto" cannot place on the available
# devices. It is created up front and handed to from_pretrained below.
offload_dir = "directory"  # replace with your actual directory path
os.makedirs(offload_dir, exist_ok=True)

# Reload the base model un-quantized in float16, as the target for the merge.
# NOTE(review): trust_remote_code=True lets code shipped in the model repo run
# locally -- confirm it is really required for this checkpoint.
model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    torch_dtype=torch.float16,
    load_in_8bit=False,
    device_map="auto",
    offload_folder=offload_dir,
    trust_remote_code=True,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
# Attach the saved LoRA adapter to the reloaded base model and put the result
# into evaluation mode (`eval()` returns the module itself).
folder_name = "myAdapter"
check_model = PeftModel.from_pretrained(model, folder_name).eval()
print("PEFT model loaded successfully.")

# Fold the adapter weights into the base weights and drop the PEFT wrapper.
print("Merging LoRA and base model...")
merged_model = check_model.merge_and_unload()

PEFT model loaded successfully.
Merging LoRA and base model...


In [21]:
# Write the merged model to the local "myLLaMa" directory.
merged_model.save_pretrained("myLLaMa")

In [22]:
# Upload the merged model to the Hub repo "fastInferencetry9".
# NOTE(review): the recorded upload is a single 667 MB pytorch_model.bin,
# whereas the later upload to "fastInferencetry10" is ~13.5 GB in two shards --
# this output presumably comes from a different `merged_model`; verify.
merged_model.push_to_hub("fastInferencetry9")

pytorch_model.bin:   0%|          | 0.00/667M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/FarziBuilder/fastInferencetry9/commit/a4b37544e4f7cce62eed452a5a1abacc53265d78', commit_message='Upload LlamaForCausalLM', commit_description='', oid='a4b37544e4f7cce62eed452a5a1abacc53265d78', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Write the merged model to the local "savedModel" directory.
# NOTE(review): this cell has no execution count (never run) and another cell
# already saves the merged model to "myLLaMa" -- confirm both copies are needed.
merged_model.save_pretrained("savedModel")

In [27]:
# Upload the tokenizer to the Hub repo "fastInferencetry9", the same repo the
# merged model is pushed to.
tokenizer.push_to_hub("fastInferencetry9")

CommitInfo(commit_url='https://huggingface.co/FarziBuilder/fastInferencetry9/commit/519692aa33ae821b8a70e01abc2de9b168aaa5fc', commit_message='Upload tokenizer', commit_description='', oid='519692aa33ae821b8a70e01abc2de9b168aaa5fc', pr_url=None, pr_revision=None, pr_num=None)

In [8]:
# Prompt for a quick sanity check of the merged model.
text = "Who is the PM of UK?(Answer:Rishi Sunak is the Prime Minister of UK)"

# Send the inputs to the device the merged model actually lives on instead of
# assuming GPU 0: the base model was placed with device_map="auto".
device = merged_model.device

# Reuse the end-of-sequence token for padding so the tokenizer is able to pad.
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the prompt into PyTorch tensors; the `input_ids` and
# `attention_mask` entries of the encoding are passed to generate() below.
inputs = tokenizer(text, return_tensors="pt", padding='longest', truncation=True).to(device)

# pad_token_id is passed explicitly (EOS id) so generate() does not have to
# pick a padding id on its own. At most 100 new tokens are produced.
outputs = merged_model.generate(
    input_ids=inputs['input_ids'],
    attention_mask=inputs['attention_mask'],
    pad_token_id=tokenizer.eos_token_id,
    max_new_tokens=100,
)

# Decode the first (only) sequence, leaving out special tokens.
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Who is the PM of UK?(Answer:Rishi Sunak is the Prime Minister of UK)
Who is the PM of UK?
Rishi Sunak is the Prime Minister of UK.
Rishi Sunak is the 30th and current Prime Minister of the United Kingdom. He is the Member of Parliament for Richmond (Yorks) and the Chief Secretary to the Treasury.
Rishi Sunak is the 30th and current Prime Minister of the United Kingdom. He is the Member of Parliament for Richmond (Yorks) and the Chief Secretary


In [9]:
# Upload the merged model to the Hub repo "fastInferencetry10"; the recorded
# upload consists of two weight shards (9.98 GB and 3.50 GB).
merged_model.push_to_hub("fastInferencetry10")

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/FarziBuilder/fastInferencetry10/commit/43c4b1b045f64bfe0195131297cc90168b6b5489', commit_message='Upload LlamaForCausalLM', commit_description='', oid='43c4b1b045f64bfe0195131297cc90168b6b5489', pr_url=None, pr_revision=None, pr_num=None)

In [10]:
# Upload the tokenizer to the Hub repo "fastInferencetry10", the same repo the
# merged model was just pushed to.
tokenizer.push_to_hub("fastInferencetry10")

CommitInfo(commit_url='https://huggingface.co/FarziBuilder/fastInferencetry10/commit/2459ab079efe3725cd1032cb695f5020cbd50fb7', commit_message='Upload tokenizer', commit_description='', oid='2459ab079efe3725cd1032cb695f5020cbd50fb7', pr_url=None, pr_revision=None, pr_num=None)