Load keys/tokens from `.env` and sign in to Hugging Face

In [1]:
from huggingface_hub import login
from dotenv import load_dotenv
import os

# Read the key/value pairs stored in the local .env file into the process environment.
load_dotenv()

# Look up the Hugging Face access token in the environment and authenticate with the Hub.
hf_token = os.environ.get("HUGGINGFACE_API_TOKEN")
login(token=hf_token)

Set paths to original base model and fine-tuned model

In [2]:
# Path to the original base model. Defaults to the author's local checkpoint;
# set BASE_MODEL_PATH in the environment (or in .env) to point somewhere else
# without editing the notebook.
base_model = os.getenv(
    "BASE_MODEL_PATH",
    "/home/arlamb/.llama/checkpoints/Llama3.1-8B-hf",
)
# Directory holding the fine-tuned weights that are loaded on top of the base model.
new_model = "./llama-3-8b-chat-doctor/"

Load the tokenizer and the original base model. Then set up the chat format using the trl library. Finally, load the fine-tuned model (adapter) and merge it with the base model using the PEFT library.

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from peft import PeftModel
import torch
from trl import setup_chat_format

# Reload the tokenizer and the base model from the base checkpoint path.
tokenizer = AutoTokenizer.from_pretrained(base_model)
base_model_reload = AutoModelForCausalLM.from_pretrained(
    base_model,
    return_dict=True,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,  # load weights in half precision
    device_map="auto",          # let the library decide device placement
    # trust_remote_code is deliberately left at its default (off): the checkpoint
    # loads as the library's own LlamaForCausalLM class, so there is no need to
    # allow code shipped alongside the checkpoint to be executed.
)

# Set up the chat format on both the model and the tokenizer. Both objects are
# returned and must be re-bound, because the model's embeddings are resized
# for the added chat tokens (see the resize notice in the cell output).
base_model_reload, tokenizer = setup_chat_format(base_model_reload, tokenizer)

# Load the fine-tuned adapter on top of the base model, then merge it in so
# that `model` is a plain causal LM rather than a PEFT wrapper.
model = PeftModel.from_pretrained(base_model_reload, new_model)
model = model.merge_and_unload()

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [None]:
# Display the merged model's module tree as a sanity check of the architecture.
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128258, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (n

Model Inference

In [4]:
# Sampling is enabled below, so seed torch's random number generators first;
# otherwise every run of this cell prints a different answer.
torch.manual_seed(42)

# Single-turn conversation used as the test prompt.
messages = [{"role": "user", "content": "Hello doctor, I have bad acne. How do I get rid of it?"}]

# Render the conversation to a plain string with the tokenizer's chat template,
# ending with the generation prompt so the model continues as the assistant.
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map="auto",
)

# Generate up to 120 new tokens with temperature / top-k / top-p sampling.
outputs = pipe(prompt, max_new_tokens=120, do_sample=True, temperature=0.7, top_k=50, top_p=0.95)
# The generated text includes the rendered prompt followed by the model's reply.
print(outputs[0]["generated_text"])

<|im_start|>user
Hello doctor, I have bad acne. How do I get rid of it?<|im_end|>
<|im_start|>assistant
Hello. For treating acne, it is very important to follow a proper regimen. It consists of washing your face twice daily with a good cleanser, applying benzoyl peroxide (2.5 to 10 percent) gel or cream, and avoiding oily skin care products. Also, do not pick or squeeze pimples. You can apply a 5 percent benzoyl peroxide gel on the face twice daily for a few weeks. If that does not work, then add a 0.5 percent retinol cream to your regimen. If that also does not work, then a


Save and Push model

In [5]:
# Write the merged model and then the tokenizer files into the local
# "llama-3-8b-chat-doctor" directory. The tokenizer call stays last so the
# cell still displays the tuple of files it wrote.
model.save_pretrained(save_directory="llama-3-8b-chat-doctor")
tokenizer.save_pretrained(save_directory="llama-3-8b-chat-doctor")

('llama-3-8b-chat-doctor/tokenizer_config.json',
 'llama-3-8b-chat-doctor/special_tokens_map.json',
 'llama-3-8b-chat-doctor/tokenizer.json')

In [6]:
# Upload the merged model and then the tokenizer to the Hub repository
# "llama-3-8b-chat-doctor". The tokenizer call stays last so the cell still
# displays the CommitInfo it returns.
model.push_to_hub(repo_id="llama-3-8b-chat-doctor", use_temp_dir=False)
tokenizer.push_to_hub(repo_id="llama-3-8b-chat-doctor", use_temp_dir=False)

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]
[A

[A[A


model-00002-of-00004.safetensors:   0%|          | 1.05M/5.00G [00:00<08:08, 10.2MB/s]

[A[A


model-00002-of-00004.safetensors:   0%|          | 2.08M/5.00G [00:00<08:59, 9.27MB/s]


[A[A[A

model-00002-of-00004.safetensors:   0%|          | 3.01M/5.00G [00:00<09:30, 8.76MB/s]


[A[A[A

model-00002-of-00004.safetensors:   0%|          | 4.60M/5.00G [00:00<08:10, 10.2MB/s]

[A[A


model-00002-of-00004.safetensors:   0%|          | 6.08M/5.00G [00:00<07:33, 11.0MB/s]

[A[A


model-00002-of-00004.safetensors:   0%|          | 7.55M/5.00G [00:00<07:12, 11.5MB/s]

[A[A


model-00002-of-00004.safetensors:   0%|          | 9.21M/5.00G [00:00<06:47, 12.2MB/s]

[A[A


model-00002-of-00004.safetensors:   0%|          | 10.9M/5.00G [00:00<06:32, 12.7MB/s]

model-00002-of-00004.safetensors:   0%|          | 12.5M/5.00G [00:01<06:03, 13.7MB/s]

model-00002-of-00004.safetensors:   0%|        

CommitInfo(commit_url='https://huggingface.co/AustianoTrojan/llama-3-8b-chat-doctor/commit/892bad3688a5d4e362028332cd31a6693647cbfb', commit_message='Upload tokenizer', commit_description='', oid='892bad3688a5d4e362028332cd31a6693647cbfb', pr_url=None, repo_url=RepoUrl('https://huggingface.co/AustianoTrojan/llama-3-8b-chat-doctor', endpoint='https://huggingface.co', repo_type='model', repo_id='AustianoTrojan/llama-3-8b-chat-doctor'), pr_revision=None, pr_num=None)