### This is an Example for Building a Retrieval-Augmented Generation (RAG) App with Unstructured IO
### A tool for extracting information from unstructured data and serves as the foundation for a wide range of applications, including information retrieval, data mining, and natural language processing.



In [4]:
!pip install -qU "unstructured-ingest[pdf]" unstructured langchain langchain-community transformers accelerate bitsandbytes sentence-transformers 

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gradio 5.12.0 requires aiofiles<24.0,>=22.0, but you have aiofiles 24.1.0 which is incompatible.
pymilvus 2.5.3 requires grpcio<=1.67.1,>=1.49.1, but you have grpcio 1.69.0 which is incompatible.[0m[31m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [13]:
import os

os.environ["UNSTRUCTURED_API_KEY"] = "" # Add your key here
os.environ["UNSTRUCTURED_API_URL"] ="https://api.unstructured.io/general/v0/general" # You can find the URL in your personalized dashboard

In [None]:
os.environ["UNSTRUCTURED_API_KEY"]

In [15]:
from unstructured_ingest.v2.pipeline.pipeline import Pipeline
from unstructured_ingest.v2.interfaces import ProcessorConfig
from unstructured_ingest.v2.processes.connectors.local import (
    LocalIndexerConfig,
    LocalDownloaderConfig,
    LocalConnectionConfig,
    LocalUploaderConfig
)
from unstructured_ingest.v2.processes.partitioner import PartitionerConfig
from unstructured_ingest.v2.processes.chunker import ChunkerConfig

In [16]:
directory_with_pdfs="content/data"
directory_with_results="content/output"

Pipeline.from_configs(
    context=ProcessorConfig(),
    indexer_config=LocalIndexerConfig(input_path=directory_with_pdfs),
    downloader_config=LocalDownloaderConfig(),
    source_connection_config=LocalConnectionConfig(),
    partitioner_config=PartitionerConfig(
        partition_by_api=True,
        api_key=os.getenv("UNSTRUCTURED_API_KEY"),
        partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"),
        strategy="hi_res",
        additional_partition_args={
            "split_pdf_page": True,
            "split_pdf_concurrency_level": 15,
            },
        ),
    uploader_config=LocalUploaderConfig(output_dir=directory_with_results)
).run()


Overriding of current TracerProvider is not allowed
2025-01-14 16:07:34,213 MainProcess INFO     created index with configs: {"input_path": "content/data", "recursive": false}, connection configs: {"access_config": "**********"}
2025-01-14 16:07:34,213 MainProcess INFO     Created download with configs: {"download_dir": null}, connection configs: {"access_config": "**********"}
2025-01-14 16:07:34,214 MainProcess INFO     created partition with configs: {"strategy": "hi_res", "ocr_languages": null, "encoding": null, "additional_partition_args": {"split_pdf_page": true, "split_pdf_concurrency_level": 15}, "skip_infer_table_types": null, "fields_include": ["element_id", "text", "type", "metadata", "embeddings"], "flatten_metadata": false, "metadata_exclude": [], "element_exclude": [], "metadata_include": [], "partition_endpoint": "https://api.unstructured.io/general/v0/general", "partition_by_api": true, "api_key": "*******", "hi_res_model_name": null}
2025-01-14 16:07:34,215 MainProcess

In [1]:
import os
directory_with_results="content/output"

from unstructured.staging.base import elements_from_json

def load_processed_files(directory_path):
    elements = []
    for filename in os.listdir(directory_path):
        if filename.endswith('.json'):
            file_path = os.path.join(directory_path, filename)
            try:
                elements.extend(elements_from_json(filename=file_path))
            except IOError:
                print(f"Error: Could not read file {filename}.")

    return elements

elements = load_processed_files(directory_with_results)

In [2]:
elements

[<unstructured.documents.elements.Title at 0x7af578db7620>,
 <unstructured.documents.elements.NarrativeText at 0x7af5735cfd70>,
 <unstructured.documents.elements.Title at 0x7af5735cfe00>,
 <unstructured.documents.elements.NarrativeText at 0x7af5735cfe90>,
 <unstructured.documents.elements.NarrativeText at 0x7af5735cff20>,
 <unstructured.documents.elements.NarrativeText at 0x7af5735cffb0>,
 <unstructured.documents.elements.Title at 0x7af573638080>,
 <unstructured.documents.elements.Image at 0x7af573638110>,
 <unstructured.documents.elements.NarrativeText at 0x7af5736381a0>,
 <unstructured.documents.elements.NarrativeText at 0x7af573638230>,
 <unstructured.documents.elements.Title at 0x7af5736382c0>,
 <unstructured.documents.elements.NarrativeText at 0x7af573638350>,
 <unstructured.documents.elements.NarrativeText at 0x7af5736383e0>,
 <unstructured.documents.elements.NarrativeText at 0x7af573638470>,
 <unstructured.documents.elements.Image at 0x7af573638500>,
 <unstructured.documents.ele

In [7]:
elements[7].text

'Zero-shot One-shot Few-shot 175B Params Natural Language 60 Prompt ~ - 50 40 30 No Prompt Accuracy (%) " 13B Params 20 ~ 10 - rmmmm———— __———‘,___—————-—__, ..__-____________-.‘—---____ ——moe 1.3B Params e ———— R ———— ) 10 10 Number of Examples in Context (K)'

In [8]:
!pip install -qU langchain-voyageai


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [11]:
from langchain_voyageai import VoyageAIEmbeddings
import getpass
if not os.environ.get("VOYAGE_API_KEY"):
  os.environ["VOYAGE_API_KEY"] = getpass.getpass("Enter API key for Voyage AI: ")



In [None]:
os.environ["VOYAGE_API_KEY"]

In [13]:
embeddings = VoyageAIEmbeddings(model="voyage-3")

In [17]:
embedded_query=embeddings.embed_query("hey there")

In [19]:
len(embedded_query)

1024

In [24]:
pip install -qU langchain-cohere

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [29]:
import getpass
import os

if not os.environ.get("COHERE_API_KEY"):
  os.environ["COHERE_API_KEY"] = getpass.getpass("Enter API key for Cohere: ")

from langchain_cohere import CohereEmbeddings

embedding = CohereEmbeddings(model="embed-english-v3.0")

In [30]:
res=embedding.embed_query("hey there")

In [31]:
len(res)

1024

In [32]:
from langchain_core.documents import Document
from langchain_community.vectorstores import FAISS
from langchain_cohere import CohereEmbeddings


documents = []
for element in elements:
    metadata = element.metadata.to_dict()
    documents.append(Document(page_content=element.text, metadata=metadata))

db = FAISS.from_documents(documents,  CohereEmbeddings(model="embed-english-v3.0") )
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 4})

In [34]:
db.save_local("faiss_index_prompt_engineering")

In [38]:
retriever.get_relevant_documents("A prompt is composed with the following components:")

[Document(id='ca8ca968-4abd-4b61-ac9f-07ee48c858fd', metadata={'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 8, 'parent_id': 'c60ae520e2e7819c45eab33f5d7b4134', 'filename': '22-promptengg.pdf', 'data_source': {'record_locator': {'path': '/workspaces/LLM_Examples/Huggingface_cookbooks/RAG/content/data/22-promptengg.pdf'}, 'date_modified': '1736870615.0958474', 'date_processed': '1736870854.234318', 'permissions_data': [{'mode': 33206}]}}, page_content='• A prompt is composed with the following components:'),
 Document(id='7d0784f7-557e-4e98-a501-4184078dad62', metadata={'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 8, 'filename': '22-promptengg.pdf', 'data_source': {'record_locator': {'path': '/workspaces/LLM_Examples/Huggingface_cookbooks/RAG/content/data/22-promptengg.pdf'}, 'date_modified': '1736870615.0958474', 'date_processed': '1736870854.234318', 'permissions_data': [{'mode': 33206}]}}, page_content='Elements of a Prompt'),
 Document(id

In [39]:
!pip install langchain-huggingface

Collecting langchain-huggingface
  Downloading langchain_huggingface-0.1.2-py3-none-any.whl.metadata (1.3 kB)
Downloading langchain_huggingface-0.1.2-py3-none-any.whl (21 kB)
Installing collected packages: langchain-huggingface
Successfully installed langchain-huggingface-0.1.2

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [46]:
import bs4
from langchain import hub
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import ChatHuggingFace

from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint

llm = HuggingFaceEndpoint(
    repo_id="meta-llama/Llama-3.1-8B-Instruct",
    task="text-generation",
)

prompt = hub.pull("rlm/rag-prompt")

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)


rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)



In [47]:
rag_chain.invoke("what is prompt engineering?")

ReadTimeout: (ReadTimeoutError("HTTPSConnectionPool(host='api-inference.huggingface.co', port=443): Read timed out. (read timeout=120)"), '(Request ID: f5493bad-507f-43d6-b72a-ef361909a1d3)')