# HuggingFace Token Authentication

In [6]:
# Step 0: Hugging Face token authentication

import os
from dotenv import load_dotenv
from huggingface_hub import login

# Run python-dotenv's loader before the token is looked up in the environment
load_dotenv()

# Hugging Face access token; None when the HF_TOKEN variable is not set
HF_TOKEN = os.environ.get("HF_TOKEN")

# Use the token for login or other operations
def login(token):
    """Authenticate against the Hugging Face Hub with ``token``.

    This helper has the same name as ``huggingface_hub.login`` and therefore
    replaces that import in the notebook namespace. The real Hub function is
    re-imported locally under an alias so the token is actually used instead
    of only being reported.

    Args:
        token: Hugging Face access token, or a falsy value when none is set.

    Returns:
        None. When ``token`` is falsy a message is printed and no login is
        attempted.
    """
    if not token:
        print("No token provided.")
        return

    print("Logging in with available token")
    # Local aliased import: the module-level name `login` now refers to this
    # function, so calling it here would recurse instead of reaching the Hub.
    from huggingface_hub import login as _hf_login

    _hf_login(token=token)

# Call the `login` helper defined in this cell with the token read from the environment
login(token=HF_TOKEN)

Logging in with available token


# Initialize LLM - HuggingFace 

In [7]:
# Step 1: Initialize LLM Model

from llama_index.llms.huggingface import HuggingFaceInferenceAPI

def initialize_llm(model_name, token):
    """Create a ``HuggingFaceInferenceAPI`` client for ``model_name``.

    Args:
        model_name: Model identifier forwarded to ``HuggingFaceInferenceAPI``.
        token: Hugging Face API token; must be truthy.

    Returns:
        The constructed ``HuggingFaceInferenceAPI`` instance.

    Raises:
        ValueError: If ``token`` is empty or ``None``.
        RuntimeError: If constructing the client fails. The original
            exception is attached as ``__cause__``.
    """
    if not token:
        raise ValueError("API token must be provided.")

    try:
        llm = HuggingFaceInferenceAPI(model_name=model_name, token=token)
    except Exception as e:
        # Explicit chaining keeps the underlying error as the direct cause
        raise RuntimeError(f"Failed to initialize LLM: {e}") from e

    print(f"Successfully initialized LLM with model: {model_name}")
    return llm

# Identifier of the hosted model to use
MODEL_NAME = "mistralai/Mixtral-8x7B-Instruct-v0.1"

# Build the client once; `llm_instance` is reused by the indexing and query cells below
try:
    llm_instance = initialize_llm(MODEL_NAME, HF_TOKEN)
except Exception as init_error:
    print(init_error)

Successfully initialized LLM with model: mistralai/Mixtral-8x7B-Instruct-v0.1


  llm = HuggingFaceInferenceAPI(


# Initialize Embedding Model - HuggingFace

In [9]:
# Step 2: Initialize Embedding model if required

from llama_index.core import Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
import time


def initialize_embedding_model(embed_model_name):
    """Create a ``HuggingFaceEmbedding`` and register it as the global default.

    Args:
        embed_model_name: Model identifier forwarded to ``HuggingFaceEmbedding``.

    Returns:
        The constructed ``HuggingFaceEmbedding`` instance, which is also
        assigned to ``Settings.embed_model``.

    Raises:
        RuntimeError: If the model cannot be constructed or registered. The
            original exception is attached as ``__cause__``.
    """
    try:
        embed_model = HuggingFaceEmbedding(model_name=embed_model_name)
        Settings.embed_model = embed_model  # Set it globally for use in LlamaIndex
    except Exception as e:
        raise RuntimeError(f"Failed to initialize embedding model: {e}") from e

    # Report the model *name*; interpolating the object prints its full repr
    print(f"Embedding model '{embed_model_name}' initialized successfully.")
    return embed_model
    

# Identifier of the embedding model to use
EMBED_MODEL_NAME = "BAAI/bge-small-en-v1.5"

# Build the embedding model once; `embedding_model` is reused by the indexing cells below
try:
    embedding_model = initialize_embedding_model(EMBED_MODEL_NAME)
except Exception as init_error:
    print(init_error)

Embedding model 'model_name='BAAI/bge-small-en-v1.5' embed_batch_size=10 callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x000001EB50871510> num_workers=None max_length=512 normalize=True query_instruction=None text_instruction=None cache_folder=None' initialized successfully.


# Downloading a PDF file from a URL and saving it under a new name

In [11]:
# Step 3: Downloading and Renaming a PDF file
import requests

def download_pdf(pdf_url, download_dir, new_filename, timeout=30):
    """Download a PDF from ``pdf_url`` and save it as ``download_dir/new_filename``.

    Args:
        pdf_url: URL to fetch with an HTTP GET request.
        download_dir: Target directory; created (including parents) if missing.
        new_filename: File name to store the downloaded bytes under.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Returns:
        None. A status message is printed for both success and a non-200
        response; nothing is written to disk in the latter case.
    """
    # exist_ok avoids the check-then-create race of a separate os.path.exists() test
    os.makedirs(download_dir, exist_ok=True)

    # Without a timeout the request can block indefinitely on an unresponsive server
    response = requests.get(pdf_url, timeout=timeout)
    if response.status_code != 200:
        print(f"Failed to download PDF: {response.status_code}")
        return

    with open(os.path.join(download_dir, new_filename), 'wb') as f:
        f.write(response.content)
    print(f"PDF downloaded and saved as {new_filename} in {download_dir}.")

# URL of the PDF to fetch
pdf_url = "https://arxiv.org/pdf/1706.03762"

# Destination folder and the file name the download is stored under
download_dir = "../artifacts"
new_filename = "attention-is-all-you-need.pdf"

# Run the download; any exception raised along the way is printed instead of propagated
try:
    download_pdf(pdf_url, download_dir, new_filename)
except Exception as download_error:
    print(f"An error occurred: {download_error}")


PDF downloaded and saved as attention-is-all-you-need.pdf in ../artifacts.


# Loading a PDF file

In [12]:
# Step 4: Load PDF File

from llama_index.core import SimpleDirectoryReader

def load_documents_from_directory(input_files):
    """Load documents from an explicit list of file paths.

    Despite the function name, no directory is scanned: ``input_files`` is
    passed straight to ``SimpleDirectoryReader(input_files=...)``.

    Args:
        input_files: List of file paths to read.

    Returns:
        The value returned by ``SimpleDirectoryReader.load_data()``.

    Raises:
        RuntimeError: If a file is missing or loading fails for any other
            reason. The original exception is attached as ``__cause__``.
    """
    try:
        documents = SimpleDirectoryReader(input_files=input_files).load_data()
    except FileNotFoundError as e:
        raise RuntimeError(f"File not found: {e}") from e
    except Exception as e:
        raise RuntimeError(f"An error occurred while loading documents: {e}") from e

    print(f"Successfully loaded {len(documents)} document(s).")
    return documents

# Explicit list of files to ingest (the PDF saved by the download step)
input_files = ["../artifacts/attention-is-all-you-need.pdf"]

# Load the file and print every resulting document; `documents` is reused by later cells
try:
    documents = load_documents_from_directory(input_files)
    for document in documents:
        print("\n")
        print(document)
except Exception as load_error:
    print(load_error)


Successfully loaded 15 document(s).


Doc ID: e05229d7-f2b8-4499-bcc0-548ed4f4a623
Text: Provided proper attribution is provided, Google hereby grants
permission to reproduce the tables and figures in this paper solely
for use in journalistic or scholarly works. Attention Is All You Need
Ashish Vaswani∗ Google Brain avaswani@google.com Noam Shazeer∗ Google
Brain noam@google.com Niki Parmar∗ Google Research nikip@google.com
Jakob Usz...


Doc ID: e47bcfac-4893-413f-8bdb-db50bd45b253
Text: 1 Introduction Recurrent neural networks, long short-term memory
[13] and gated recurrent [7] neural networks in particular, have been
firmly established as state of the art approaches in sequence modeling
and transduction problems such as language modeling and machine
translation [ 35, 2, 5]. Numerous efforts have since continued to push
the bo...


Doc ID: 301c344d-897e-47dd-b773-a1ebab2f2667
Text: Figure 1: The Transformer - model architecture. The Transformer
follows this overall architecture using

# Creating a VectorStoreIndex and querying documents

In [13]:
from llama_index.core import VectorStoreIndex

def initialize_index_and_create_query_engine(documents):
    """Build a ``VectorStoreIndex`` over ``documents`` and return a query engine.

    Relies on the notebook-level ``embedding_model`` and ``llm_instance``
    objects created in earlier cells.

    Args:
        documents: Documents passed to ``VectorStoreIndex.from_documents``.

    Returns:
        The query engine produced by ``index.as_query_engine(llm=llm_instance)``.

    Raises:
        RuntimeError: If indexing, persisting or query-engine creation fails.
            The original exception is attached as ``__cause__``. Raising
            (rather than printing and returning ``None``) keeps the caller
            from failing later with an unrelated ``AttributeError``.
    """
    try:
        index = VectorStoreIndex.from_documents(
            documents=documents,
            embed_model=embedding_model,
        )
        # Store the index locally; persist() is called without arguments, so
        # the library's default location is used. Comment this line out if
        # the index should not be stored.
        index.storage_context.persist()
        return index.as_query_engine(llm=llm_instance)
    except Exception as e:
        raise RuntimeError(f"Failed to create index or query engine: {e}") from e

# Build the query engine, ask one fixed question and print the answer
try:
    query_engine = initialize_index_and_create_query_engine(documents)
    # For interactive use, read the question with input(...) instead
    query = "What is the document is all about?"
    response = query_engine.query(query)
    print(response)
except Exception as query_error:
    print(f"An error occurred during index initialization or querying: {query_error}")




 The document is about a neural network architecture for machine translation. The architecture is based on attention mechanisms and does not use recurrence or convolutions. The document also discusses the performance of the architecture on several machine translation tasks and compares it to other state-of-the-art models.


# Loading Stored Index
Note: The code below only works after `index.storage_context.persist()` has been executed.

In [14]:
from llama_index.core import StorageContext, load_index_from_storage

# Point a storage context at the ./storage directory and load the index from it
storage_context = StorageContext.from_defaults(persist_dir="./storage")
index = load_index_from_storage(storage_context=storage_context)
# Query the loaded index with the LLM client created in the earlier cell
query_engine = index.as_query_engine(llm=llm_instance)
response = query_engine.query("What is the document is all about?")
print(response)

 The document is about a neural network architecture for machine translation. The architecture is based on attention mechanisms and does not use recurrence or convolutions. The document also discusses the performance of the architecture on several machine translation tasks and compares it to other state-of-the-art models.


# Customize the index
By changing values on the global `Settings` object

In [15]:
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import Settings

# Register the LLM and embedding model created earlier on the global Settings object
Settings.llm = llm_instance
Settings.embed_model = embedding_model
# Use a SentenceSplitter with chunk_size=512 and chunk_overlap=20 as the node parser
Settings.node_parser = SentenceSplitter(chunk_size=512, chunk_overlap=20)
# NOTE(review): presumably limits for generated output and prompt context size —
# confirm the exact meaning against the LlamaIndex Settings documentation
Settings.num_output = 512
Settings.context_window = 3900

# Build a fresh index after the settings above are in place, then run the same query
index = VectorStoreIndex.from_documents(
            documents=documents,
            embed_model=embedding_model
        )
query_engine = index.as_query_engine(llm=llm_instance)
response = query_engine.query("What is the document is all about?")
print(response)


 The document is about a neural network architecture for machine translation. The architecture is based on attention mechanisms and does not use recurrence or convolutions. The document also discusses the performance of the architecture on several machine translation tasks and compares it to other state-of-the-art models.


# Locally running open-source LLMs
The code below needs a GPU- or TPU-based configuration to run.

In [None]:
# Loading LLM from HuggingFace locally

from llama_index.llms.huggingface import HuggingFaceLLM ##########################################################################################################################
from llama_index.core import PromptTemplate

# System prompt text handed to HuggingFaceLLM via `system_prompt` below
system_prompt = """<|SYSTEM|># StableLM Tuned (Alpha version)
- StableLM is a helpful and harmless open-source AI language model developed by StabilityAI.
- StableLM is excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.
- StableLM is more than just an information source, StableLM is also able to write poetry, short stories, and make jokes.
- StableLM will refuse to participate in anything that could harm a human.
"""

# This will wrap the default prompts that are internal to llama-index
query_wrapper_prompt = PromptTemplate("<|USER|>{query_str}<|ASSISTANT|>")

# Local model wrapper; tokenizer and model both use the same StableLM checkpoint name
llm_locally = HuggingFaceLLM(
    context_window=4096,
    max_new_tokens=256,
    # NOTE(review): temperature is set while do_sample is False; sampling
    # parameters presumably have no effect without sampling — confirm intent
    generate_kwargs={"temperature": 0.7, "do_sample": False},
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name="StabilityAI/stablelm-tuned-alpha-3b",
    model_name="StabilityAI/stablelm-tuned-alpha-3b",
    device_map="auto",
    # NOTE(review): presumably token ids at which generation should stop for
    # this tokenizer — verify against the model card
    stopping_ids=[50278, 50279, 50277, 1, 0],
    tokenizer_kwargs={"max_length": 4096},
    # Uncomment this if using CUDA to reduce memory usage; `torch` is not
    # imported in the cells shown here, so add `import torch` first
    # model_kwargs={"torch_dtype": torch.float16}
)

In [None]:
# Indexing and querying with the locally running LLM
# Build a new index over the loaded documents with the embedding model created earlier
index = VectorStoreIndex.from_documents(
            documents=documents,
            embed_model=embedding_model
        )
# Same question as in the earlier cells, answered by `llm_locally` instead of the hosted LLM
query_engine = index.as_query_engine(llm=llm_locally)
response = query_engine.query("What is the document is all about?")
print(response)
