In [1]:
import os
from llama_index.embeddings.huggingface import HuggingFaceEmbedding


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Embedding model, loaded by checkpoint name through HuggingFaceEmbedding.
EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
embed_model = HuggingFaceEmbedding(model_name=EMBED_MODEL_NAME)

In [4]:
from llama_index.core import Settings
# Register the embedding model on the global llama_index Settings object.
Settings.embed_model = embed_model

In [None]:
! mkdir -p 'data/paul_graham/'
! curl 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt' -O 'data/paul_graham/paul_graham_essay.txt'

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 75042  100 75042    0     0   142k      0 --:--:-- --:--:-- --:--:--  143k
curl: (6) Could not resolve host: data


In [19]:
! pip install llama-index-llms-huggingface

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [6]:
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex

In [7]:
# Read every file found in the essay folder into llama_index documents.
essay_reader = SimpleDirectoryReader("./data/paul_graham/")
documents = essay_reader.load_data()

In [None]:
import logging
import sys

# Send INFO-level log records to stdout through a single root handler.
# basicConfig already attaches a stdout StreamHandler; adding a second one by
# hand made every record print twice. force=True replaces any handlers left
# over from a previous run, so re-running this cell does not stack handlers.
logging.basicConfig(stream=sys.stdout, level=logging.INFO, force=True)

from llama_index.llms.huggingface import HuggingFaceLLM


In [9]:
# Chat-style wrapper built from <|system|> / <|user|> / <|assistant|> role
# markers, for use as the LLM's query_wrapper_prompt.
# A query wrapper is formatted with `query_str` only. The retrieved context is
# already part of that string by the time it reaches the LLM, so the template
# must not carry a separate {context_str} placeholder: it would never be
# filled and would reach the model as literal text.
from llama_index.core import PromptTemplate
chat_prompt = PromptTemplate(
    "<|system|>\nYou are a helpful assistant that answers questions based on the provided context.\n<|user|>\n{query_str}\n<|assistant|>\n"
)

In [10]:
import torch

In [11]:
# Local Hugging Face LLM; the tokenizer and the weights come from the same checkpoint id.
MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Sampling options forwarded to generation (do_sample=True turns sampling on).
sampling_options = {
    "temperature": 0.7,
    "do_sample": True,
    "top_k": 50,
    "top_p": 0.95,
}

llm = HuggingFaceLLM(
    model_name=MODEL_ID,
    tokenizer_name=MODEL_ID,
    context_window=2048,
    max_new_tokens=256,
    generate_kwargs=sampling_options,
    query_wrapper_prompt=chat_prompt,  # wraps each prompt in the chat template
    device_map="auto",
    tokenizer_kwargs={"max_length": 2048},
    model_kwargs={"torch_dtype": torch.bfloat16},
)

Some parameters are on the meta device because they were offloaded to the disk.


In [12]:
# Global llama_index settings: the LLM to use and the chunk size.
Settings.llm = llm
Settings.chunk_size = 512

In [13]:
# Build a vector store index from the loaded documents.
index = VectorStoreIndex.from_documents(documents=documents)

In [17]:
# Create a query engine from the index and run a single question through it.
question = "What did the author say about HTML in the essay?"
query_engine = index.as_query_engine()
response = query_engine.query(question)

In [18]:
# Print the response as plain text.
print(str(response))

The author mentioned that HTML was a derivative of SGML in the context of the fall of 1992 and the introduction of art school at RISD.
