In [None]:
#!pip install datasets
#!pip install bitsandbytes
#!pip install peft
#!pip install trl

In [None]:
import torch

from pprint import pprint

from datasets import Dataset
from datasets import load_dataset

from transformers import AutoTokenizer
from transformers import TFAutoModelForCausalLM
from transformers import AutoModelForCausalLM
from transformers import BitsAndBytesConfig
from transformers import DataCollatorForLanguageModeling
from transformers import TrainingArguments
from transformers import Trainer

#from trl import SFTConfig, SFTTrainer
from peft import LoraConfig

import json
import psutil

In [None]:
import os

In [None]:
from huggingface_hub import notebook_login

In [None]:
model_id = "meta-llama/Meta-Llama-3-8B"

In [None]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Quantisation settings used when loading the base model: 4-bit weights in
# NF4 format, no double quantisation, float16 as the compute dtype.
compute_dtype = torch.float16

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=compute_dtype,
)

In [None]:
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map="auto")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/177 [00:00<?, ?B/s]

In [None]:
# Load the tokenizer that belongs to the base checkpoint.
# `trust_remote_code=True` was dropped: it allows code from the model
# repository to run locally and should only be enabled for checkpoints that
# actually ship custom tokenizer code (NOTE(review): Llama 3 is expected to use
# the tokenizer classes bundled with transformers — confirm on first run).
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Reuse the end-of-sequence token for padding and pad on the right-hand side.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
def print_memory_used_by_process():
  """Print the resident set size (RSS) of the current process in GB."""
  bytes_per_gb = 1024 * 1024 * 1024
  rss_bytes = psutil.Process(os.getpid()).memory_info().rss
  print('{} GB'.format(rss_bytes / bytes_per_gb))

In [None]:
print_memory_used_by_process()

1.7413215637207031 GB


In [None]:
osho_dataset = load_dataset('DhruvDancingBuddha/osho_discourses')

Downloading readme:   0%|          | 0.00/2.50k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/41.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1965 [00:00<?, ? examples/s]

In [None]:
osho_dataset

DatasetDict({
    train: Dataset({
        features: ['char_url', 'topic_name', 'topic_lesson_name', 'topic_lesson_url', 'all_txt'],
        num_rows: 1965
    })
})

In [None]:
def tokenizer_osho(examples):
  """Tokenize the ``all_txt`` entry of a batch with the notebook's tokenizer.

  Args:
    examples: Mapping with an ``'all_txt'`` entry holding the text to encode.

  Returns:
    The tokenizer's output for that text, unmodified.
  """
  return tokenizer(examples['all_txt'])

In [None]:
osho_dataset = osho_dataset['train'].map(tokenizer_osho, batched=True, remove_columns=['all_txt', 'char_url', 'topic_name', 'topic_lesson_name', 'topic_lesson_url'])

Map:   0%|          | 0/1965 [00:00<?, ? examples/s]

In [None]:
osho_dataset

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 1965
})

In [None]:
print_memory_used_by_process()

2.7938880920410156 GB


In [None]:
def chunk_text(examples, chunk_size=128):
  """Concatenate a batch of tokenized texts and cut it into fixed-size chunks.

  Every column in ``examples`` is flattened into one long list and sliced into
  consecutive pieces of ``chunk_size`` items. The tail that does not fill a
  whole chunk is dropped. A ``labels`` column holding a copy of the chunked
  ``input_ids`` is added to the result.

  Args:
    examples: Mapping of column name to a list of per-example lists. Must
      contain ``"input_ids"``.
    chunk_size: Number of items per chunk (defaults to 128).

  Returns:
    Dict with the same columns as ``examples`` plus ``"labels"``; each value
    is a list of chunks of length ``chunk_size``.

  Raises:
    ValueError: If ``chunk_size`` is not a positive integer.
    IndexError: If ``examples`` has no columns.
    KeyError: If ``examples`` has no ``"input_ids"`` column.
  """
  from itertools import chain

  if chunk_size <= 0:
    raise ValueError('chunk_size must be a positive integer')

  # chain.from_iterable flattens in linear time; sum(lists, []) copies the
  # growing accumulator on every addition and is quadratic in batch size.
  concatenated = {
    key: list(chain.from_iterable(examples[key])) for key in examples.keys()
  }

  # The usable length is taken from the first column and rounded down to a
  # whole number of chunks.
  first_column = list(concatenated.values())[0]
  usable_len = (len(first_column) // chunk_size) * chunk_size

  results = {
    key: [tokens[start:start + chunk_size]
          for start in range(0, usable_len, chunk_size)]
    for key, tokens in concatenated.items()
  }

  # Copy each chunk so labels do not alias the input_ids lists.
  results["labels"] = [list(chunk) for chunk in results["input_ids"]]

  return results

In [None]:
osho_dataset = osho_dataset.map(chunk_text, batched=True)

Map:   0%|          | 0/1965 [00:00<?, ? examples/s]

In [None]:
osho_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 133200
})

In [None]:
print_memory_used_by_process()

2.8693885803222656 GB


In [None]:
# Split sizes are derived from len(osho_dataset) // divide_by, so a value
# above 1 yields proportionally smaller train/test sizes. Floor division
# keeps every count an integer (true division produced a float total).
divide_by = 1
total_len = len(osho_dataset) // divide_by
train_len = int(0.9 * total_len)   # 90% of the rows for training
test_len  = total_len - train_len  # the remainder for evaluation

print(f'Total Length is {total_len}\n\nTrain Len is    {train_len}\n\nTest Len is     {test_len}')

Total Length is 133200.0

Train Len is    119880

Test Len is     13320


In [None]:
osho_dataset = osho_dataset.train_test_split(train_size=train_len,test_size=test_len, seed=42)

In [None]:
osho_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 119880
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 13320
    })
})

In [None]:
print_memory_used_by_process()

2.8685989379882812 GB


In [None]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [None]:
# LoRA adapter configuration for a causal-language-modelling task.
peft_config_1 = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

In [None]:
model.add_adapter(peft_config_1)

In [None]:
# Training hyper-parameters; outputs are written under ./first_one and
# evaluation is scheduled once per epoch.
training_args = TrainingArguments(
    output_dir="./first_one",
    num_train_epochs=2,
    per_device_train_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.01,
    eval_strategy="epoch",
)

In [None]:
# Wire the model, the training arguments, both dataset splits and the
# language-modelling collator into a single Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=osho_dataset["train"],
    eval_dataset=osho_dataset["test"],
)

In [None]:
trainer.train()

We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


Epoch,Training Loss,Validation Loss
1,2.4556,2.449855
2,2.4421,2.442821




TrainOutput(global_step=14986, training_loss=2.4601961668129837, metrics={'train_runtime': 8345.734, 'train_samples_per_second': 28.728, 'train_steps_per_second': 1.796, 'total_flos': 1.3869444944520806e+18, 'train_loss': 2.4601961668129837, 'epoch': 2.0})

In [None]:
trainer.save_model()



In [None]:
from peft import AutoPeftModelForCausalLM

In [None]:
# Reload the saved model from the 'first_one' directory in float16,
# letting `device_map="auto"` decide the device placement.
new_model = AutoPeftModelForCausalLM.from_pretrained(
    'first_one',
    torch_dtype=torch.float16,
    device_map="auto",
    low_cpu_mem_usage=True,
    return_dict=True,
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
merged_model = new_model.merge_and_unload()

In [None]:
# Write the merged weights (safetensors format) and the tokenizer files into
# the same local directory.
merged_dir = "metallama3-8b-tuned-merged"
merged_model.save_pretrained(merged_dir, safe_serialization=True)
tokenizer.save_pretrained(merged_dir)

('metallama3-8b-tuned-merged/tokenizer_config.json',
 'metallama3-8b-tuned-merged/special_tokens_map.json',
 'metallama3-8b-tuned-merged/tokenizer.json')

In [None]:
# Candidate prompts for a quick qualitative check. The last entry is the one
# used; previously the first prompt was assigned and immediately overwritten.
prompt_candidates = (
    "Catholic churches and pope together have taught us to be ",
    'saints, pandits and mullahs have given us',
)
prompt = prompt_candidates[-1]

In [None]:
# Encode the prompt and move it to the device the merged model lives on
# (instead of assuming CUDA). `truncation=True` was removed: with no
# max_length given it performed no truncation and only emitted a warning.
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(merged_model.device)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [None]:
# Generate up to 200 new tokens for the prompt.
# - attention_mask: the input is a single, un-padded prompt, so every position
#   is a real token; passing the mask explicitly avoids generate() having to
#   guess it (pad token == EOS token makes that ambiguous).
# - pad_token_id: set explicitly to the EOS id, the same value generate()
#   would otherwise fall back to with a warning.
# - do_sample / top_p are not passed, so the model's generation defaults apply.
outputs = merged_model.generate(
    input_ids=input_ids,
    attention_mask=torch.ones_like(input_ids),
    pad_token_id=tokenizer.eos_token_id,
    max_new_tokens=200,
    temperature=0.6,
)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


In [None]:
# Move the generated ids to host memory, decode the batch without special
# tokens and keep the first sequence for display.
generated_ids = outputs.detach().cpu().numpy()
decoded_batch = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
result = decoded_batch[0]
result

'saints, pandits and mullahs have given us the message that we are separate from existence. You are not a part of it. You are not one with it. You are a stranger, you are a guest here. You are not the host. You are here, but you are here on sufferance. You can be thrown out any moment, you can be kicked out any moment. Existence is not your home, you are just a guest here. You are a stranger here, and you are always afraid that you can be thrown out any moment, that you can be kicked out any moment. This is the basic fear. And because of this basic fear, you are always trying to make yourself secure, you are always trying to make a shelter for yourself. You are always trying to hide, you are always trying to escape from existence. You are always trying to find a corner where you can be safe and secure. You are always trying to make a fortress around yourself, and you are always afraid that existence may destroy it any moment'

In [None]:
model_create_id = 'DhruvDancingBuddha/osho_discourses_llama3_8b_peft'

In [None]:
# Upload the merged model and then the tokenizer to the same Hub repository
# (`model_create_id`). The tokenizer call is the cell's last expression, so
# its return value is what the cell displays.
merged_model.push_to_hub(model_create_id)
tokenizer.push_to_hub(model_create_id)

README.md:   0%|          | 0.00/31.0 [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/DhruvDancingBuddha/osho_discourses_llama3_8b_peft/commit/5a65a0be649a132a52bcfcae57c9c2b63ce21780', commit_message='Upload tokenizer', commit_description='', oid='5a65a0be649a132a52bcfcae57c9c2b63ce21780', pr_url=None, pr_revision=None, pr_num=None)