In [13]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core import Settings
from llama_index.core.retrievers import VectorIndexRetriever, VectorIndexAutoRetriever
from llama_index.core.query_engine.retriever_query_engine import RetrieverQueryEngine

# from transformers import RagTokenizer, RagRetriever, RagSequenceForGeneration

# import chromadb

In [4]:
# HF pointers

### EMBEDDERS ###
EMBED = "BAAI/bge-small-en-v1.5"
# Used in llama index local exam ple
# https://docs.llamaindex.ai/en/stable/getting_started/starter_example_local/


### NON CODE LLMs ###

ZEPHYR = 'HuggingFaceH4/zephyr-7b-beta'
# https://huggingface.co/HuggingFaceH4/zephyr-7b-beta
# bf16, mistral
# Used in the llama index using a huggingface llm guide: 
# https://docs.llamaindex.ai/en/stable/module_guides/models/llms/usage_custom/#example-using-a-huggingface-llm

STABLELM = 'stabilityai/stablelm-tuned-alpha-3b'
# https://huggingface.co/stabilityai/stablelm-tuned-alpha-3b
# float32?, stablelm
# Default model under the hood in llama index if pointing to use HF


### CODE LLMs ###
# HF AI coding leaderboard: https://huggingface.co/spaces/mike-ravkine/can-ai-code-results

MAGICODER = 'ise-uiuc/Magicoder-DS-6.7B'
# https://huggingface.co/ise-uiuc/Magicoder-DS-6.7B
# float32, deepseek
# First <30B param model on the leaderboard

CODEBOOGA = 'LoneStriker/CodeBooga-34B-v0.1-4.0bpw-h6-exl2'
# https://huggingface.co/LoneStriker/CodeBooga-34B-v0.1-4.0bpw-h6-exl2
# dtype unknown,  merge of Phind-CodeLlama-34B-v2 and WizardCoder-Python-34B-V1.0
# First <100B param model on the leaderboard

DEEPSEEK = 'deepseek-ai/deepseek-coder-6.7b-instruct'
# https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-instruct
# bf16, deepseek?
# Base model for e.g. magicoder, and half the size by dtype?



In [18]:
import chromadb
print(chromadb.__version__)
# from llama_index.core import VectorStoreIndex
from chromadb.utils import embedding_functions
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore

cdb_embed_model = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=EMBED)
li_embed_model = HuggingFaceEmbedding(model_name=EMBED)

embedding_functions = cdb_embed_model

chroma_client = chromadb.Client()
collection = chroma_client.create_collection(name="my_collection", embedding_function=embedding_functions)


# dirs = ['data', 'data_other']

0.3.23


Using embedded DuckDB without persistence: data will be transient


In [17]:
collection.add_documents
# collection.add(
#     documents=["one text", "data/capital_dutch.txt"],
#     ids=["id1", "id2"]
# )
# collection.peek()

AttributeError: 'Collection' object has no attribute 'add_documents'

In [29]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
from pathlib import Path

Settings.embed_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-small-en-v1.5"
)

dirs = ['data', 'data_other']
files = [str(file) for dir in dirs for file in Path(dir).glob('**/*.*')]

documents = SimpleDirectoryReader(input_files=files).load_data()
index = VectorStoreIndex.from_documents(documents)


chroma_client = chromadb.EphemeralClient()
chroma_collection = chroma_client.create_collection("quickstart")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
# vector_store = ChromaVectorStore.from_documents(documents)

index = VectorStoreIndex(nodes, storage_context=storage_context)



retriever_1 = VectorIndexRetriever(
    index=index,
    similarity_top_k=1,
)
retriever_2 = index.as_retriever(similarity_top_k=2)

# llm = ''

# RetrieverQueryEngine.from_args(retriever_1, llm=llm)

['data/capital_dutch.txt', 'data/more_data/anner_english.txt', 'data_other/superlong_wiki_falcon9.txt']


In [13]:
index._vector_store.

{'_data': SimpleVectorStoreData(embedding_dict={'c97ef46c-4c32-4c67-9f9f-0b800e13ab49': [-0.0072972397319972515, -0.021587787196040154, 0.02787180431187153, -0.06509429216384888, 0.062377095222473145, -0.05582469329237938, -0.054570890963077545, 0.01638600416481495, -0.02473445236682892, -0.03491535037755966, 0.018772520124912262, -0.07770910114049911, -0.07376110553741455, -0.020011719316244125, 0.03185313940048218, 0.01929979771375656, -0.009023652411997318, 0.008343811146914959, 0.0034812057856470346, -0.012100396677851677, -0.002528983633965254, -0.06383613497018814, 0.037887200713157654, -0.023994233459234238, -0.016549447551369667, 0.019509386271238327, 0.037420257925987244, -0.002680876525118947, -0.08526109904050827, -0.16394858062267303, -0.0022120734211057425, -0.08573837578296661, 0.022172067314386368, 0.005322342272847891, 0.038057658821344376, 0.01615142449736595, -0.01171411108225584, 0.021940231323242188, 0.014599014073610306, 0.007439506705850363, 0.0019368540961295366,

In [3]:
embeddings = Settings.embed_model.get_text_embedding(["It is raining cats and dogs here!", "another"])
# list of 384
query = 'What is the capital of Netherland?'
retr_1 = retriever_1.retrieve(query)
retr_2 = retriever_2.retrieve(query)

## Example: Using a HuggingFace LLM
https://docs.llamaindex.ai/en/stable/module_guides/models/llms/usage_custom.html#example-using-a-huggingface-llm

In [1]:
# from https://huggingface.co/HuggingFaceH4/zephyr-7b-beta

import torch
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM

zephyr = 'HuggingFaceH4/zephyr-7b-beta'
default = 'stabilityai/stablelm-tuned-alpha-3b'
code = 'deepseek-ai/deepseek-coder-6.7b-instruct'

model_name = default

device = "cuda:0" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

# pipe = pipeline("text-generation", model=zephyr, torch_dtype=torch.bfloat16, device_map="auto")
# du -sh ~/.cache/huggingface/hub/

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 8/8 [03:18<00:00, 24.85s/it]


In [7]:
type(prompt)

str

In [10]:
messages = [
    {
        "role": "system",
        "content": "You are a friendly chatbot who always responds in the style of a pirate",
    },
    {"role": "user", "content": "How many helicopters can a human eat in one sitting?"},
]
prompt = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt")
# print(prompt)
outputs = model.generate(prompt, max_new_tokens=256, do_sample=True, temperature=0.7, top_k=50, top_p=0.95)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


KeyboardInterrupt: 

In [None]:
from llama_index.core import PromptTemplate


# Transform a string into input zephyr-specific input
def completion_to_prompt(completion):
    return f"<|system|>\n</s>\n<|user|>\n{completion}</s>\n<|assistant|>\n"


# Transform a list of chat messages into zephyr-specific input
def messages_to_prompt(messages):
    prompt = ""
    for message in messages:
        if message.role == "system":
            prompt += f"<|system|>\n{message.content}</s>\n"
        elif message.role == "user":
            prompt += f"<|user|>\n{message.content}</s>\n"
        elif message.role == "assistant":
            prompt += f"<|assistant|>\n{message.content}</s>\n"

    # ensure we start with a system prompt, insert blank if needed
    if not prompt.startswith("<|system|>\n"):
        prompt = "<|system|>\n</s>\n" + prompt

    # add final assistant prompt
    prompt = prompt + "<|assistant|>\n"

    return prompt


import torch
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core import Settings

Settings.llm = HuggingFaceLLM(
    model_name="HuggingFaceH4/zephyr-7b-beta",
    tokenizer_name="HuggingFaceH4/zephyr-7b-beta",
    context_window=3900,
    max_new_tokens=256,
    generate_kwargs={"temperature": 0.7, "top_k": 50, "top_p": 0.95},
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    device_map="auto",
)

### Huggingface only

In [None]:
# https://github.com/huggingface/transformers/blob/main/examples/research_projects/rag/use_own_knowledge_dataset.py

rag_model_name = 'facebook/rag-sequence-nq' # or 'facebook/rag-token-nq
embedding_model_name = 'facebook/dpr-ctx_encoder-single-nq-base' # or 'facebook/dpr-ctx_encoder-multiset-base'

tokenizer = RagTokenizer.from_pretrained(rag_model_name)
retriever = RagRetriever.from_pretrained(rag_model_name, index_name="custom", indexed_dataset=dataset)
model = RagSequenceForGeneration.from_pretrained(rag_model_name, retriever=retriever)


input_ids = tokenizer.question_encoder(query, return_tensors="pt")["input_ids"]
generated = model.generate(input_ids)
generated_string = tokenizer.batch_decode(generated, skip_special_tokens=True)[0]

In [4]:
# https://medium.com/@pankaj_pandey/introduction-to-retrieval-augmented-generation-rag-9209bf8a076d

rag_model_name = 'facebook/rag-token-base'
# tokenizer = RagTokenizer.from_pretrained(rag_model_name)
retriever = RagRetriever.from_pretrained(rag_model_name, use_dummy_dataset=True)
# model = RagSequenceForGeneration.from_pretrained(rag_model_name)

# # Retrieve relevant information
# input_text = "What is retrieval-augmented generation?"
# retrieved_info = retriever.retrieve(input_text)

# # Generate response using RAG
# generated_response = model.generate(**retrieved_info)
# print(tokenizer.decode(generated_response[0], skip_special_tokens=True))

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizerFast'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called fr

: 

In [6]:



# OTHER
tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")
input_dict = tokenizer.prepare_seq2seq_batch("who holds the record in 100m freestyle", return_tensors="pt") 
retriever = RagRetriever.from_pretrained("facebook/rag-token-base")

384