In [3]:
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cuda


# Prepare the data

In [4]:
from dotenv import load_dotenv
import os

# Loading the .env file
load_dotenv()
ACCESS_TOKEN = os.getenv("GITHUB_TOKEN")

* By default, pull requests are considered issues as well, here we chose to exclude them from data by setting include_prs=False
* Setting state = "all" means we will load both open and closed issues.

In [5]:
from langchain.document_loaders import GitHubIssuesLoader

loader = GitHubIssuesLoader(repo="huggingface/peft", access_token=ACCESS_TOKEN, include_prs=False, state="all")
docs = loader.load()

* Content of individual GitHub issues may be longer than what an embedding model can take as input.
* If we want to embed all of the available content, we need to chunk the documents into   appropriately sized pieces.
* The recommended splitter for generic text is the RecursiveCharacterTextSplitter

In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=30)
chunked_docs = splitter.split_documents(docs)

# Create the embeddings + retriever

In [7]:
from huggingface_hub import login
load_dotenv()
hf_token = os.getenv("HF_TOKEN")
login(token=hf_token)  # Authenticate with Hugging Face hub

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [8]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={"device": device}
)
db = FAISS.from_documents(chunked_docs, embeddings)

* search_type="similarity" means we want to perform similarity search between the query and documents
* search_kwargs={'k': 4} instructs the retriever to return top 4 results.

In [9]:
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 4})

# Load quantized model

In [10]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

model_name = "HuggingFaceH4/zephyr-7b-beta"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

# Setup the LLM chain

In [11]:
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from transformers import pipeline
from langchain_core.output_parsers import StrOutputParser

In [20]:

text_generation_pipeline = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    temperature=0.2,
    do_sample=True,
    repetition_penalty=1.1,         
    return_full_text=True,              # return_full_text=False,
    max_new_tokens=400
)

llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

Device set to use cuda


In [21]:
prompt_template = """
<|system|>
Answer the question based on your knowledge. Use the following context to help:

{context}

</s>
<|user|>
{question}
</s>
<|assistant|>

 """

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

In [22]:
llm_chain = prompt | llm | StrOutputParser()

In [23]:
from langchain_core.runnables import RunnablePassthrough

retriever = db.as_retriever()
rag_chain = {"context": retriever, "question": RunnablePassthrough()} | llm_chain

In [24]:
question = "How do you combine multiple adapters?"

In [25]:
print(llm_chain.invoke({"context": "", "question": question}))


<|system|>
Answer the question based on your knowledge. Use the following context to help:



</s>
<|user|>
How do you combine multiple adapters?
</s>
<|assistant|>

  To combine multiple adapters, you need to ensure that they are compatible with each other and with the devices you want to connect. Here's how you can do it:

1. Determine which adapter(s) you need: Consider the types of connections you need to make. For example, if you want to connect a USB device to an Ethernet network, you'll need a USB-to-Ethernet adapter. If you want to connect a VGA monitor to an HDMI input, you'll need a VGA-to-HDMI adapter.

2. Check compatibility: Make sure that the adapters you choose are compatible with each other and with the devices you want to connect. For example, if you have a USB-to-Ethernet adapter and a USB hub, you may need to check whether the hub is compatible with the adapter.

3. Connect the adapters: Plug one end of the first adapter into the device you want to connect (such as 

In [26]:
print(rag_chain.invoke(question))


<|system|>
Answer the question based on your knowledge. Use the following context to help:

[Document(id='6069499a-eaa5-498c-9315-9f07ec95144e', metadata={'url': 'https://github.com/huggingface/peft/issues/1497', 'title': 'Cannot load adapters from Peft.from_pretrained', 'creator': 'dame-cell', 'created_at': '2024-02-21T13:37:31Z', 'comments': 0, 'state': 'closed', 'labels': [], 'assignee': None, 'milestone': None, 'locked': False, 'number': 1497, 'is_pull_request': False}, page_content='### Expected behavior\r\n\r\nI think I would want the adapters to be downloaded and then I would be able to merge my adapters into one'), Document(id='f303020f-944e-495f-a8ed-ca88e21a3874', metadata={'url': 'https://github.com/huggingface/peft/issues/1802', 'title': 'Issues when switching between multiple adapters LoRAs ', 'creator': 'JhonDan1999', 'created_at': '2024-05-26T19:18:13Z', 'comments': 8, 'state': 'closed', 'labels': [], 'assignee': None, 'milestone': None, 'locked': False, 'number': 1802,