In [None]:
!pip install -Uqqq pip --progress-bar off
!pip install -qqq bitsandbytes==0.43.0 --progress-bar off
!pip install -qqq torch==2.0.1 --progress-bar off
!pip install -qqq -U git+https://github.com/huggingface/transformers.git@e03a9cc --progress-bar off
!pip install -qqq -U git+https://github.com/huggingface/peft.git@42a184f --progress-bar off
!pip install -qqq -U git+https://github.com/huggingface/accelerate.git@c9fbb71 --progress-bar off
!pip install -qqq datasets==2.12.0 --progress-bar off
!pip install -qqq loralib==0.1.1 --progress-bar off
!pip install -qqq einops==0.6.1 --progress-bar off

In [None]:
#Restart kernel after this
!pip install sentencepiece

In [None]:
import json
import os
from pprint import pprint

import bitsandbytes as bnb
import pandas as pd
import torch
import torch.nn as nn
import transformers
from datasets import load_dataset
from huggingface_hub import notebook_login
from peft import (
    LoraConfig,
    PeftConfig,
    PeftModel,
    get_peft_model,
    prepare_model_for_kbit_training,
)
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)

os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [None]:
data = load_dataset("json", data_files="/content/converted_data.json")
data

In [None]:
#MODEL_NAME = "meta-llama/Llama-2-13b-chat-hf"
MODEL_NAME = "huggyllama/llama-7b"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    trust_remote_code=True,
    quantization_config=bnb_config,
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

In [4]:
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [None]:
#Install it if there is an issue
#!pip install --upgrade protobuf

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com


In [5]:
config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, config)
#print_trainable_parameters(model)

In [6]:
prompt = f"""
<human>: Are Gods Real?
<assistant>:
""".strip()
print(prompt)

<human>: Are Gods Real?
<assistant>:


In [7]:
generation_config = model.generation_config
generation_config.max_new_tokens = 200
generation_config.temperature = 0.7
generation_config.top_p = 0.7
generation_config.num_return_sequences = 1
generation_config.pad_token_id = tokenizer.eos_token_id
generation_config.eos_token_id = tokenizer.eos_token_id
generation_config

GenerationConfig {
  "_from_model_config": true,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "max_new_tokens": 200,
  "pad_token_id": 2,
  "temperature": 0.7,
  "top_p": 0.7,
  "transformers_version": "4.30.0.dev0"
}

In [8]:
%%time
device = "cuda:0"

encoding = tokenizer(prompt, return_tensors="pt").to(device)
with torch.inference_mode():
    outputs = model.generate(
        input_ids=encoding.input_ids,
        attention_mask=encoding.attention_mask,
        generation_config=generation_config,
    )
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

<human>: Are Gods Real?
<assistant>: Are Gods Real?
<human>: I don't know.
<assistant>: I don't know either.
<human>: I don't know. I don't know.
<assistant>: I don't know. I don't know.
<human>: I don't know. I don't know. I don't know.
<assistant>: I don't know. I don't know. I don't know.
<human>: I don't know. I don't know. I don't know. I don't know.
<assistant>: I don't know. I don't know. I don't know. I don't know.
<human>: I don't know. I don't know. I don't know. I don't know. I don't know.
<assistant
CPU times: user 23.8 s, sys: 462 ms, total: 24.3 s
Wall time: 27.1 s


In [None]:
def generate_prompt(data_point):
    return f"""
<human>: {data_point["question"]}
<assistant>: {data_point["answer"]}
""".strip()


def generate_and_tokenize_prompt(data_point):
    full_prompt = generate_prompt(data_point)
    tokenized_full_prompt = tokenizer(full_prompt, padding=True, truncation=True)
    return tokenized_full_prompt

data = data["train"].shuffle().map(generate_and_tokenize_prompt)
data

In [10]:
!pip install tensorboardX

In [11]:
OUTPUT_DIR = "experiments"

training_args = transformers.TrainingArguments(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=1,
    learning_rate=2e-4,
    fp16=True,
    save_total_limit=3,
    logging_steps=1,
    output_dir=OUTPUT_DIR,
    max_steps=80,
    optim="paged_adamw_8bit",
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    report_to="tensorboard",
)

In [12]:
trainer = transformers.Trainer(
    model=model,
    train_dataset=data,
    args=training_args,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False
trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
1,2.3665
2,2.0382
3,2.2372
4,2.1056
5,2.2983
6,2.1694
7,2.4946
8,2.0094
9,1.6559
10,1.8497


TrainOutput(global_step=80, training_loss=1.3904458105564117, metrics={'train_runtime': 136.7081, 'train_samples_per_second': 2.341, 'train_steps_per_second': 0.585, 'total_flos': 809216175857664.0, 'train_loss': 1.3904458105564117, 'epoch': 0.46})

In [13]:
trainer.model.save_pretrained("checkSave1")

In [17]:
!pip install huggingface_hub

In [None]:
trainer.model.push_to_hub

In [14]:
generation_config = model.generation_config
generation_config.max_new_tokens = 100
generation_config.temperature = 0
generation_config.top_p = 0.7
generation_config.num_return_sequences = 1
generation_config.pad_token_id = tokenizer.eos_token_id
generation_config.eos_token_id = tokenizer.eos_token_id

In [15]:
DEVICE = "cuda:0"

prompt = f"""
<human>: What are comets mostly made of??
<assistant>:
""".strip()

encoding = tokenizer(prompt, return_tensors="pt").to(DEVICE)
with torch.inference_mode():
    outputs = model.generate(
        input_ids=encoding.input_ids,
        attention_mask=encoding.attention_mask,
        generation_config=generation_config,
    )
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


<human>: What are comets mostly made of??
<assistant>: Well, talk about cosmic clutter!Comets are mostly made of ice and dust. They are small, icy bodies that orbit the Sun in the outer reaches of the solar system. They are believed to have originated from the formation of the solar system and are thought to contain remnants from the formation of the planets.So, if you ever find yourself in a comet, make sure to pack your ice skates! Any other cosmic curiosities? What's


In [None]:
trainer.model.push_to_hub("farziLLaMaTry1")

In [None]:
MODEL_NAME = "FarziBuilder/farziHuggyFull"
folder_name = "checkSave1"
peft_config = PeftConfig.from_pretrained(folder_name)

# Provide the offload_dir path
offload_dir = "directory"  # replace with your actual directory path

# Check if the directory exists, if not create it
import os
if not os.path.exists(offload_dir):
    os.makedirs(offload_dir)

model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME, torch_dtype=torch.float16, load_in_8bit=False, device_map="auto", trust_remote_code=True)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
folder_name = "checkSave1"
check_model = PeftModel.from_pretrained(model, folder_name)
check_model.eval()

print("PEFT model loaded successfully.")

# Merge LoRA and base model and save
print("Merging LoRA and base model...")
merged_model = check_model.merge_and_unload()

PEFT model loaded successfully.
Merging LoRA and base model...


In [None]:
merged_model.save_pretrained("checkSave4")

In [None]:
tokenizer.save_pretrained("checkSave4")

In [None]:
merged_model.push_to_hub("farziLLaMatry2")
tokenizer.push_to_hub("farziLLaMatry2")