In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain_community.vectorstores import FAISS
from langchain.schema import Document
from langchain.vectorstores import Chroma
import sys
# sys.path.append('Your Virtual Environment Python-site-packages Path')
import chromadb


import os
from langchain.schema import Document

## Steps for Setting Up and Running Retrieval-Augmented Generation Pipeline

1. **Load the Dataset**: Start by reading all text files from a specified directory. Each file's content is stored in a list to prepare it for processing.

2. **Chunk the Data**: Use `RecursiveCharacterTextSplitter` to break down documents into smaller, manageable chunks. This ensures context is preserved with a consistent chunk size and overlap.

3. **Embed Data with Hugging Face Embeddings**: Transform each chunk into a vector using Hugging Face embeddings, and store these vectors in a Chroma database. This allows for efficient retrieval of relevant information.

4. **Perform Similarity Search**: Query the Chroma vector store to identify relevant documents that closely match the query, leveraging vector similarity.

5. **Set Up Language Model for QA**: Use a Hugging Face language model to answer questions. This is done with a prompt template that encourages the model to answer only based on the provided context.

6. **Run RetrievalQA Pipeline**: Execute the RetrievalQA pipeline to retrieve and generate answers based on context-rich information from the dataset. This delivers accurate responses to the input queries.


In [3]:
# Retrieve data from source file
# Define the directory path
directory_path = './Research_Papers'

# List to hold the contents of all text files
documents = []

# Walk through the directory and subdirectories
for root, dirs, files in os.walk(directory_path):
    for file in files:
        if file.endswith('.txt'):  # Check if the file is a text file
            file_path = os.path.join(root, file)  # Get the full file path
            with open(file_path, 'r', encoding='utf-8') as f:
                documents.append(f.read())  # Read and store the file content

# Convert raw text documents to Document objects
document_objects = [Document(page_content=text) for text in documents]

len(document_objects)


8

In [4]:
# transform the data into chunks

splittered_text = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = splittered_text.split_documents(document_objects)
len(chunks)




869

In [5]:
# embedding using Huggingface
huggingface_embeddings=HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-small-en-v1.5",
    model_kwargs={'device':'cpu'},
    encode_kwargs={'normalize_embeddings':True}
)

  from tqdm.autonotebook import tqdm, trange
  return re.sub(r"([a-z])([A-Z])", "\g<1> \g<2>", class_name)
  if dataset_name and re.match("_dataset_\d+", dataset_name):
  """


In [6]:
# check if huggingface embedding is working
import numpy as np
np.array(huggingface_embeddings.embed_query(chunks[0].page_content))


array([-6.53987825e-02,  6.02318812e-03,  1.64900180e-02, -5.47329383e-03,
       -3.80241685e-02, -9.95601248e-03, -1.56732462e-02,  3.44463019e-03,
        3.37717384e-02, -1.05048819e-02,  4.84604500e-02, -3.55521664e-02,
        5.99556305e-02,  4.92328815e-02,  7.00458288e-02, -2.94353236e-02,
       -6.17482550e-02,  2.51754411e-02,  4.39898521e-02,  4.98125190e-03,
        1.53710805e-02, -2.38625649e-02,  5.13210834e-04, -1.43946223e-02,
       -3.33078541e-02, -5.02960756e-03, -4.24624160e-02, -1.70046296e-02,
       -2.11626776e-02, -2.13753298e-01,  3.35602611e-02,  2.64325868e-02,
        3.69183980e-02,  1.02715958e-02, -3.36751081e-02,  9.36793089e-02,
        1.80202462e-02, -2.30113082e-02, -2.64201388e-02,  1.11987246e-02,
       -2.20576376e-02,  6.34450465e-02,  1.76248886e-02, -2.46591661e-02,
       -7.13710964e-04, -6.09243475e-02, -3.36745847e-03, -4.76193391e-02,
       -6.56262040e-02, -1.86457094e-02,  3.17258835e-02, -8.14341232e-02,
       -2.43747532e-02,  

In [7]:



# Initialize the Chroma vector store
vectorstore = Chroma(embedding_function=huggingface_embeddings)

# Add documents to the vector store
# Assuming 'chunks' is a list of Document objects with page_content attributes
documents = [Document(page_content=str(chunk)) for chunk in chunks]
vectorstore.add_documents(documents)


  warn_deprecated(


['9faecd1a-4542-4300-8bbe-3876398f5bc6',
 '08b38f75-bf9f-43e8-8cdb-91b8776577e9',
 'd4827770-0cec-4e98-b44c-7f8ed6365a31',
 '282b080d-79a9-4ce6-a2bc-b8fed3fc6824',
 'd7ea372b-3c90-4e9f-9882-751b2ea8a018',
 '6d8b6e52-527d-4d00-9c74-4ba2f5dde73e',
 '4303ae09-a609-4c42-8911-9892264f270b',
 'c4314df0-a772-4ae7-8bc3-d2142f2d85b3',
 '5cad06bb-8a7d-4efb-91b0-3fa1897adac0',
 'bdd58e78-0a7c-4305-a02f-c2722cf5e3d9',
 'a4ddfdf2-bcaa-4e83-9a80-8dd565edc873',
 '95406f38-78a8-427c-866d-46b783feb4ea',
 'b2b4e6e2-59e9-419e-bc02-28fb447a2b47',
 'e1a2b5fc-20b8-494f-afc2-e19dd8ebea4c',
 'fbc86efe-f094-426e-b565-fbacd9ddc59d',
 '76d6feed-9abe-4684-aa0c-f1a5d0a35299',
 'c8344ee6-111b-42e1-89be-21e9d52c829a',
 'b5074fd6-1a04-4b87-98f4-37d9d888f3fc',
 '9cced60b-00fd-4924-b4fb-26d4b4ce4299',
 'e574f586-86d4-41fd-9ccc-5ace4b0b1643',
 '805d735c-8e0a-48dd-b9ea-4e2d661686f4',
 '4709ead8-cacb-4139-a601-dd45137a1e61',
 '82963508-ab34-404d-b3a3-3cbbf849de31',
 '3cafad95-65ca-4662-be02-a516cb708ad0',
 '1f1e7797-526b-

In [8]:
# Assuming your Chroma vectorstore is already initialized as 'vectorstore'

query = "What are the security implications on DNS in Australia?"

# Perform similarity search using Chroma
relevant_documents = vectorstore.similarity_search(query)

# Print the content of the most relevant document
print(relevant_documents[0].page_content)


page_content='1
Analysis of DNS Dependencies and their Security
Implications in Australia: A Comparative Study of
General and Indigenous Populations
Niousha Nazemi , Member, IEEE, Omid Tavallaie , Anna Maria Mandalari ,
Hamed Haddadi , Member, IEEE, Ralph Holz , and Albert Y. Zomaya , Fellow, IEEE
Abstract—This paper investigates the impact of internet centralization on DNS provisioning, particularly its effects on vulnerable populations such as the indigenous people of Australia. We
analyze the DNS dependencies of Australian government domains
that serve indigenous communities compared to those serving
the general population. Our study categorizes DNS providers
into leading (hyperscaler, US-headquartered companies), nonleading (smaller Australian-headquartered or non-Australian
companies), and Australian government-hosted providers. Then,
we build dependency graphs to demonstrate the direct dependency between Australian government domains and their DNS'


In [9]:
retriever=vectorstore.as_retriever(search_type="similarity",search_kwargs={"k":3})
print(retriever)

tags=['Chroma', 'HuggingFaceBgeEmbeddings'] vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x72ef97940dd0> search_kwargs={'k': 3}


In [None]:
import os
os.environ['HUGGINGFACEHUB_API_TOKEN']="Your Huggingface API Token"

from langchain_community.llms import HuggingFaceHub

hf=HuggingFaceHub(
    repo_id="mistralai/Mistral-7B-v0.1",
    model_kwargs={"temperature":0.1,"max_length":500}

)
query="What are the security implications on DNS in Australia?"
hf.invoke(query)

'What are the security implications on DNS in Australia?\n\nThe Australian Government has recently released a report on the security implications of DNS in Australia. The report outlines the risks associated with DNS and provides recommendations for mitigating those risks.\n\nThe report finds that DNS is a critical component of the internet infrastructure and that it is vulnerable to a number of security threats. These threats include DDoS attacks, man-in-the-middle attacks, and spoofing.\n\nThe report recommends that the Australian Government'

In [12]:
prompt_template="""
Use the following piece of context to answer the question asked.
Please try to provide the answer only based on the context

{context}
Question:{question}

Helpful Answers:
 """

In [13]:
prompt=PromptTemplate(template=prompt_template,input_variables=["context","question"])


In [14]:
retrievalQA=RetrievalQA.from_chain_type(
    llm=hf,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt":prompt}
)

In [15]:
query="Tell me about plasticity Loss in Deep Reinforcement Learning"
# Call the QA chain with our query.
result = retrievalQA.invoke({"query": query})
print(result['result'])


Use the following piece of context to answer the question asked.
Please try to provide the answer only based on the context

page_content='deep RL, such as training instabilities, scaling failures, overestimation bias, and insufficient
exploration. With this survey, we aim to provide an overview of the emerging research on
plasticity loss for academics and practitioners of deep reinforcement learning. First, we
propose a unified definition of plasticity loss based on recent works, relate it to definitions
from the literature, and discuss metrics for measuring plasticity loss. Then, we categorize
and discuss numerous possible causes of plasticity loss before reviewing currently employed
mitigation strategies. Our taxonomy is the first systematic overview of the current state of
the field. Lastly, we discuss prevalent issues within the literature, such as a necessity for
broader evaluation, and provide recommendations for future research like gaining a better
understanding between an ag