# Local RAG

In [1]:
# Checking the available device and then installing the appropriate version of FAISS
import torch

# Check if GPU is available
device = "cuda" if torch.cuda.is_available() else "cpu"

device

'cpu'

## Config Local LLM API access

In [2]:
from langchain_community.chat_models import ChatOllama
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

custom_url = 'http://localhost:11434'

llm = ChatOllama(
    base_url = custom_url,
    model = 'mistral'
)
print("## Local LLM loaded ##")

# Test API
prompt = ChatPromptTemplate.from_template("Tell me a short joke")
chain = prompt | llm | StrOutputParser()
print(chain.invoke({"topic": "Space travel"}))

## Local LLM loaded ##
 Why don't scientists trust atoms?

Because they make up everything!


## Load Docs

In [3]:
from langchain_community.document_loaders import DirectoryLoader

path = "./docs"
loader = DirectoryLoader(path)
docs = loader.load()

In [4]:
len(docs)

1

In [5]:
from langchain.text_splitter import SentenceTransformersTokenTextSplitter

text_splitter = SentenceTransformersTokenTextSplitter(chunk_overlap=0, tokens_per_chunk=256)
documents = text_splitter.split_documents(docs)

print (f'You have {len(documents)} document(s) in your data')

  from .autonotebook import tqdm as notebook_tqdm


You have 272 document(s) in your data


In [6]:
documents[10]

Document(page_content='being a global - scale cloud, azure uniquely offers hybrid consistency, developer productivity, ai capabilities, and trusted security and compliance. we see more emerging use cases and needs for compute and security at the edge and are accelerating our innovation across the spectrum of intelligent edge devices, from internet of things ( “ iot ” ) sensors to gateway devices and edge hardware to build, manage, and secure edge workloads. with azure stack, organizations can extend azure into their own datacenters to create a consistent stack across the public cloud and the intelligent edge. 4 part i item 1 our hybrid infrastructure consistency spans security, compliance, identity, and management, helping to support the real - world needs and evolving regulatory requirements of commercial customers and enterprises. our industry clouds bring together capabilities across the entire microsoft cloud, along with industry - specific customizations, to improve time to value,

In [7]:
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import SentenceTransformerEmbeddings

!mkdir -p vector-db
persist_directory = "./vector-db"

embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
db = Chroma.from_documents(documents, embeddings, persist_directory=persist_directory)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [8]:
# !mkdir -p vector-db
# client = Chroma.PersistentClient(path="./vector-db/")

In [9]:
query = "What was the total revenue?"
query_embedding = embeddings.embed_query(query) # create vector embedding of query

docs = db.similarity_search_by_vector(query_embedding)
docs_page_content = " ".join([d.page_content for d in docs]) # extract and combine results into one doc

print(docs_page_content)

in millions ) year ended june 30, united states ( a ) other countries $ 2022 100, 218 $ 98, 052 2021 83, 953 $ 84, 135 total $ 198, 270 $ 168, 088 $ ( a ) includes billings to oems and certain multinational organizations because of the nature of these businesses and the impracticability of determining the geographic source of the revenue. 94 2020 46, 398 48, 366 48, 251 143, 015 18, 724 18, 324 15, 911 52, 959 2020 73, 160 69, 855 143, 015 part ii item 8 revenue, classified by significant product and service offerings, was as follows : ( in millions ) year ended june 30, server products and cloud services office products and cloud services windows gaming linkedin search and news advertising enterprise services devices other $ 2022 67, 321 $ 44, 862 24, 761 16, 230 13, 816 11, 591 7, 407 6, 991 5, 291 2021 52, 589 $ 39, 872 22, 488 15, 370 10, 289 9, 267 6, 943 6, 791 4, 479 total $ 198, 270 $ 168, 088 $ we in millions ) year ended june 30, united states ( a ) other countries $ 2022 100

In [10]:
from langchain import PromptTemplate
from langchain.chains import LLMChain

prompt = PromptTemplate(
    input_variables=["question", "docs"],
    template="""
      This bot engages in discussions on a wide range of topics, including cultural, philosophical, and political matters. It analyzes provided articles to inform its responses. Please adhere to the truth. If no resources are available, share your personal opinion.

      Question to be answered: {question}

      Referenced articles for analysis: {docs}

      Instructions for the bot:
      1. Extract and use only factual information from the specified documents.
      2. Highlight key phrases and evidence from the articles to support your answers.
      3. If the articles do not sufficiently cover the topic to provide an informed response, please state, "I don't have enough information to answer this question."

      Remember, the goal is to provide well-informed, accurate, and thoughtful responses based on the available resources. If personal opinion is necessary due to a lack of information, it should be clearly identified as such.
      """,
    )

chain = LLMChain(llm=llm, prompt=prompt)

response = chain.run(question=query, docs=docs_page_content,return_source_documents=True)
response_text = str(response)

print(response_text)

  warn_deprecated(


 Based on the provided articles, the total revenue for the year ended June 30, 2022, for both the United States and other countries was $198,270 million. However, it is important to note that this figure includes billings to Original Equipment Manufacturers (OEMs) and certain multinational organizations due to the nature of their business and the impracticability of determining the geographic source of the revenue.

Additionally, for the same time period, the revenue was classified as follows: server products and cloud services - $67,321 million; office products and cloud services - $44,862 million; windows gaming - $16,230 million; linkedin search and news advertising - $13,816 million; enterprise services - $11,479 million; and more personal computing - $4,479 million.

Moreover, as of June 30, 2022, the unearned revenue by segment was: productivity and business processes - $24,558 million; intelligent cloud - $19,371 million; and more personal computing - $4,479 million. The changes

In [11]:
prompt_eval = PromptTemplate(
        input_variables=["answer", "docs"],
        template="""
          Your task is to assess whether the provided response accurately and faithfully reflects the context of a given question or statement.

          Evaluate the following response: {answer}
          Reference article for evaluation: {docs}

          Instructions for the evaluation:
          1. Start your evaluation with a clear "Yes" or "No" to indicate if the response is faithful to the context provided by the reference article.
          2. Provide a detailed reason for your judgment. Mention specific aspects of the response and the article that support your evaluation. Highlight any direct correlations, discrepancies, or notable omissions in the response compared to the factual content of the article.
          3. If the response incorporates elements not found in the article but remains relevant and truthful to the broader topic, please acknowledge this as a factor in your assessment.

          Your evaluation should focus on the accuracy, relevance, and completeness of the response in relation to the information presented in the referenced article. This ensures a thorough and reasoned assessment of the response's faithfulness to the context.
          """,
    )

eval_chain = LLMChain(llm=llm, prompt=prompt_eval)

evals = eval_chain.run(answer=response_text, docs=docs_page_content)
eval_text = str(evals)

print(eval_text)

 Yes, the response is generally faithful to the context provided by the reference article. However, there are some discrepancies and notable omissions that should be addressed:

1. The response accurately reports Microsoft's revenue for the year ended June 30, 2022 ($198.3 billion) and the respective revenue breakdown by product and service offerings. This information is directly from the article.

2. However, there are some discrepancies in the figures reported for unearned revenue and changes in unearned revenue between the response and the article. The article states that Microsoft's part II item 8 revenue allocated to remaining performance obligations was $193 billion as of June 30, 2022. In contrast, the response reports a figure of $193 billion for unearned revenue, which is not explicitly mentioned in the article and seems to include both deferred revenue and amounts that will be invoiced and recognized as revenue in future periods (as indicated by "remaining performance obligat

In [12]:
from pprint import pprint

# Print question, answer, and evaluations
print("\n\n> Question:")
pprint(query)
print("\n> Answer:")
pprint(response)
print("\n> Eval:")
pprint(evals)

# Print the relevant sources used for the answer
print("----------------------------------SOURCE DOCUMENTS---------------------------")
for document in docs:
    print("\n> " + document.metadata["source"])
    pprint(document.page_content[:1000])
print("----------------------------------SOURCE DOCUMENTS---------------------------")



> Question:
'What was the total revenue?'

> Answer:
(' Based on the provided articles, the total revenue for the year ended June '
 '30, 2022, for both the United States and other countries was $198,270 '
 'million. However, it is important to note that this figure includes billings '
 'to Original Equipment Manufacturers (OEMs) and certain multinational '
 'organizations due to the nature of their business and the impracticability '
 'of determining the geographic source of the revenue.\n'
 '\n'
 'Additionally, for the same time period, the revenue was classified as '
 'follows: server products and cloud services - $67,321 million; office '
 'products and cloud services - $44,862 million; windows gaming - $16,230 '
 'million; linkedin search and news advertising - $13,816 million; enterprise '
 'services - $11,479 million; and more personal computing - $4,479 million.\n'
 '\n'
 'Moreover, as of June 30, 2022, the unearned revenue by segment was: '
 'productivity and business proces

In [None]:
def get_response_from_query(db, query):

    # embed query, find k nearest docs, combine docs
    query_embedding = embeddings.embed_query(query)
    docs = db.similarity_search_by_vector(query_embedding)
    docs_page_content = " ".join([d.page_content for d in docs])


    # generate LLM answer based on similar docs
    prompt = PromptTemplate(
        input_variables=["question", "docs"],
        template="""
      This bot engages in discussions on a wide range of topics, including cultural, philosophical, and political matters. It analyzes provided articles to inform its responses. Please adhere to the truth. If no resources are available, share your personal opinion.

      Question to be answered: {question}

      Referenced articles for analysis: {docs}

      Instructions for the bot:
      1. Extract and use only factual information from the specified documents.
      2. Highlight key phrases and evidence from the articles to support your answers.
      3. If the articles do not sufficiently cover the topic to provide an informed response, please state, "I don't have enough information to answer this question."

      Remember, the goal is to provide well-informed, accurate, and thoughtful responses based on the available resources. If personal opinion is necessary due to a lack of information, it should be clearly identified as such.
      """
      ,
    )

    chain = LLMChain(llm=llm, prompt=prompt)
    response = chain.run(question=query, docs=docs_page_content,return_source_documents=True)
    r_text = str(response)

    # use LLM to evaluate answer
    prompt_eval = PromptTemplate(
        input_variables=["answer", "docs"],
        template="""
          Your task is to assess whether the provided response accurately and faithfully reflects the context of a given question or statement.

          Evaluate the following response: {answer}
          Reference article for evaluation: {docs}

          Instructions for the evaluation:
          1. Start your evaluation with a clear "Yes" or "No" to indicate if the response is faithful to the context provided by the reference article.
          2. Provide a detailed reason for your judgment. Mention specific aspects of the response and the article that support your evaluation. Highlight any direct correlations, discrepancies, or notable omissions in the response compared to the factual content of the article.
          3. If the response incorporates elements not found in the article but remains relevant and truthful to the broader topic, please acknowledge this as a factor in your assessment.

          Your evaluation should focus on the accuracy, relevance, and completeness of the response in relation to the information presented in the referenced article. This ensures a thorough and reasoned assessment of the response's faithfulness to the context.
          """
          ,
    )

    chain_part_2 = LLMChain(llm=llm, prompt=prompt_eval)
    evals = chain_part_2.run(answer=r_text, docs=docs_page_content)

    return response,docs,evals


import gradio as gr

def greet(query):
    answer,sources,evals = get_response_from_query(db,query)
    return answer,sources,evals


demo = gr.Interface(fn=greet,
                    title="Local-RAG",
                    inputs=["text"],
                    outputs=[gr.components.Textbox(lines=3, label="Response"),
                             gr.components.Textbox(lines=3, label="Source"),
                             gr.components.Textbox(lines=3, label="Evaluation")],
                   )

demo.launch(share=True, debug=True)

Running on local URL:  http://127.0.0.1:7860


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Running on public URL: https://41bd5f6b511b6fc36a.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)
