# Requirements

In [None]:
# %pip installs into the environment of the running kernel, whereas a shell
# `pip3` may belong to a different Python installation.
# accelerate is needed by the `device_map="auto"` argument used when the LLM is loaded.
%pip install -U bitsandbytes transformers accelerate

In [None]:
# %pip installs into the environment of the running kernel.
# beautifulsoup4 is imported by WebBaseLoader; sentence-transformers is the
# backend HuggingFaceEmbeddings relies on, listed explicitly in case it is not
# pulled in transitively.
%pip install chromadb langchain langchain-community langchain-chroma langchain-huggingface beautifulsoup4 sentence-transformers

# Load documents

In [None]:
from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:

# Web pages (Hugging Face PEFT tutorial) that load_all_docs() hands to WebBaseLoader.
URLS = [
    "https://huggingface.co/docs/peft/tutorial/peft_model_config",
    "https://huggingface.co/docs/peft/tutorial/peft_integrations"
]


def load_all_docs(urls=None):
    """Fetch web pages with ``WebBaseLoader`` and return the loaded documents.

    Args:
        urls: Page URL(s) to pass to ``WebBaseLoader``. When omitted, the
            module-level ``URLS`` list is used, which keeps the original
            zero-argument call working unchanged.

    Returns:
        Whatever ``WebBaseLoader(urls).load()`` returns for the given pages.
    """
    if urls is None:
        urls = URLS
    loader = WebBaseLoader(urls)
    return loader.load()


def split_text(docs, preview_index=10):
    """Split documents into overlapping chunks and print one chunk as a sanity check.

    The splitter is configured with ``chunk_size=1000`` and
    ``chunk_overlap=200``, measured with ``len`` (i.e. in characters), and
    with ``add_start_index=True``.

    Args:
        docs: Documents to split; passed straight to
            ``RecursiveCharacterTextSplitter.split_documents``.
        preview_index: Position of the chunk to print. It is clamped to the
            valid range, so short inputs no longer raise ``IndexError``;
            nothing is previewed when no chunks were produced.

    Returns:
        The list of chunks produced by the splitter.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(docs)
    print(f"Total {len(docs)} docs into {len(chunks)} chunks.")

    # The preview is purely informational: never let it fail the pipeline
    # when the corpus yields fewer chunks than the requested position.
    if chunks:
        position = max(0, min(preview_index, len(chunks) - 1))
        document = chunks[position]
        print(document.page_content)
        print(document.metadata)

    return chunks

## Define embeddings & create the vector database

In [None]:
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

In [None]:
import torch

# Hugging Face embedding model used for both the document chunks and the queries.
model_name = "all-MiniLM-L6-v2"
# Prefer the first CUDA device, but fall back to CPU so this cell does not
# fail on machines where no GPU is visible.
model_kwargs = {'device': 'cuda:0' if torch.cuda.is_available() else 'cpu'}
encode_kwargs = {'normalize_embeddings': False}
embedding_function = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Directory in which the Chroma vector store is persisted.
DB_PATH = "chroma_db"

def save_to_db(chunks):
    """Embed ``chunks`` and persist them in the Chroma store at ``DB_PATH``.

    Every chunk is given a deterministic id (a SHA-256 digest of its position
    in ``chunks``, its ``source`` and ``start_index`` metadata, and its text).
    Stable ids let the store match a re-run of this function against the
    entries it wrote previously instead of appending a second copy of every
    chunk under fresh random ids.

    Args:
        chunks: Document chunks exposing ``page_content`` and ``metadata``.

    Returns:
        The vector store returned by ``Chroma.from_documents``.
    """
    import hashlib

    ids = []
    for position, chunk in enumerate(chunks):
        metadata = chunk.metadata or {}
        # The position keeps ids unique even if two chunks are byte-identical.
        key = "\x1f".join(
            [
                str(position),
                str(metadata.get("source")),
                str(metadata.get("start_index")),
                chunk.page_content,
            ]
        )
        ids.append(hashlib.sha256(key.encode("utf-8")).hexdigest())

    db = Chroma.from_documents(
        chunks,
        embedding_function,
        ids=ids,
        persist_directory=DB_PATH,
    )
    print(f"Saved total of {len(chunks)} chunks to {DB_PATH}.")
    return db


def chunk_and_persist_data():
    """Run the ingestion pipeline: load the pages, split them, store the chunks."""
    save_to_db(split_text(load_all_docs()))

# Build the vector store: load the pages, split them into chunks and save them to DB_PATH.
chunk_and_persist_data()

Total 2 docs into 21 chunks.
PEFT integrations








Hugging Face







					Models

					Datasets

					Spaces

					Posts

					Docs

					Enterprise

Pricing
			






Log In
				
Sign Up
					



PEFT documentation
			
PEFT integrations



PEFT

üè° View all docsAWS Trainium & InferentiaAccelerateAmazon SageMakerArgillaAutoTrainBitsandbytesChat UICompetitionsDataset viewerDatasetsDiffusersDistilabelEvaluateGradioHubHub Python LibraryHugging Face Generative AI Services (HUGS)Huggingface.jsInference API (serverless)Inference Endpoints (dedicated)LeaderboardsLightevalOptimumPEFTSafetensorsSentence TransformersTRLTasksText Embeddings InferenceText Generation InferenceTokenizersTransformersTransformers.jssmolagentstimm

Search documentation


mainv0.14.0v0.13.0v0.12.0v0.11.0v0.10.0v0.9.0v0.8.2v0.7.1v0.6.2
EN








Get started


ü§ó PEFT
Quicktour
Installation


Tutorial


Configurations and models
Integrations


PEFT method guides


Prompt-based methods
LoRA methods
IA3
{'source'

# Play with the embeddings

In [None]:
from scipy.spatial.distance import cosine

In [None]:
# Embed three probe phrases so their vectors can be compared below.
embedding1, embedding2, embedding3 = (
    embedding_function.embed_query(phrase)
    for phrase in ("computer", "screen", "Alan Turing")
)

In [None]:
# scipy's cosine() is a distance, so 1 - distance gives the cosine similarity
# of "computer" against each of the other two phrases.
result1, result2 = (
    1 - cosine(embedding1, other) for other in (embedding2, embedding3)
)
print(result1, result2)

0.48443418498334656 0.39471319246750625


# Query the database

In [None]:
# Question that the retrieval step and the LLM are asked to answer.
query = "How to create a PeftModel?"
# Open the store persisted under DB_PATH, using the same embedding function
# for the query as was used for the stored chunks.
db = Chroma(
    embedding_function=embedding_function,
    persist_directory=DB_PATH,
)

In [None]:
# Fetch the three best-matching chunks as (document, score) pairs, then join
# their texts into one context string; the scores themselves are not used here.
results = db.similarity_search_with_relevance_scores(query, k=3)
all_context = "\n---\n".join(pair[0].page_content for pair in results)
print(all_context)


lora_config = LoraConfig(
    r=16,
    target_modules=["q_proj", "v_proj"],
    task_type=TaskType.CAUSAL_LM,
    lora_alpha=32,
    lora_dropout=0.05
)   PEFT models With a PEFT configuration in hand, you can now apply it to any pretrained model to create a PeftModel. Choose from any of the state-of-the-art models from the Transformers library, a custom model, and even new and unsupported transformer architectures. For this tutorial, load a base facebook/opt-350m model to finetune.   Copied from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m") Use the get_peft_model() function to create a PeftModel from the base facebook/opt-350m model and the lora_config you created earlier.   Copied from peft import get_peft_model
---
lora_model = AutoPeftModelForCausalLM.from_pretrained("ybelkada/opt-350m-lora") Take a look at the AutoPeftModel API reference to learn more about the AutoPeftModel classes.  Next steps With the appropriate Pe

# Prepare LLaMA

In [None]:
# Prompt skeleton with two placeholders: {context} for the retrieved chunks and
# {question} for the user query. It ends on "Answer:" so the model continues
# from there.
PROMPT_TEMPLATE = "\n".join(
    [
        "Using the following context, answer the question. Be concise and accurate.",
        "",
        "Context:",
        "{context}",
        "",
        "Question:",
        "{question}",
        "",
        "Answer:",
    ]
)

# Fill the template with the retrieved context and the query.
prompt = PROMPT_TEMPLATE.format(context=all_context, question=query)

In [None]:
from langchain_huggingface.llms import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch

from transformers import BitsAndBytesConfig

# Instruction-tuned Llama checkpoint used to generate the answer.
model_name = "meta-llama/Llama-3.2-1B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Request 4-bit quantization through an explicit config object; passing
# `load_in_4bit=True` straight to `from_pretrained` is deprecated.
quantization_config = BitsAndBytesConfig(load_in_4bit=True)

# `trust_remote_code` is intentionally not set: this architecture ships with
# transformers, so there is no need to execute code from the model repository.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=quantization_config,
    torch_dtype=torch.float16,
)

# Create the pipeline. `return_full_text=False` makes it return only the newly
# generated tokens instead of echoing the whole prompt before the answer.
text_pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=500,
    return_full_text=False,
)

# Wrap the transformers pipeline so it can be called through LangChain.
hf_pipe = HuggingFacePipeline(pipeline=text_pipe)

In [None]:
# Send the filled-in prompt through the LangChain pipeline wrapper and print the result.
response = hf_pipe.invoke(prompt)
print(response)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Using the following context, answer the question. Be concise and accurate.

Context:
lora_config = LoraConfig(
    r=16,
    target_modules=["q_proj", "v_proj"],
    task_type=TaskType.CAUSAL_LM,
    lora_alpha=32,
    lora_dropout=0.05
)   PEFT models With a PEFT configuration in hand, you can now apply it to any pretrained model to create a PeftModel. Choose from any of the state-of-the-art models from the Transformers library, a custom model, and even new and unsupported transformer architectures. For this tutorial, load a base facebook/opt-350m model to finetune.   Copied from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m") Use the get_peft_model() function to create a PeftModel from the base facebook/opt-350m model and the lora_config you created earlier.   Copied from peft import get_peft_model
---
lora_model = AutoPeftModelForCausalLM.from_pretrained("ybelkada/opt-350m-lora") Take a look at the AutoPeftModel API referen

In [None]:
# List the "source" metadata of every retrieved chunk, in result order
# (None when a chunk carries no source entry).
sources = [pair[0].metadata.get("source") for pair in results]
print(sources)

['https://huggingface.co/docs/peft/tutorial/peft_model_config', 'https://huggingface.co/docs/peft/tutorial/peft_model_config', 'https://huggingface.co/docs/peft/tutorial/peft_model_config']


#