In [4]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA


In [7]:
loader = PyPDFDirectoryLoader("./data")
documents = loader.load()


In [9]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
final_documents = text_splitter.split_documents(documents)

In [11]:
len(final_documents)

173

In [14]:
embeddings = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-small-en-v1.5",
                                      model_kwargs={"device": "cpu"},
                                       encode_kwargs={"normalize_embeddings":True})

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [16]:
import numpy as np
np.array(embeddings.embed_query(final_documents[0].page_content))

array([-7.54101425e-02,  1.54635729e-03, -3.05978116e-02, -1.22472504e-02,
       -5.44820577e-02,  4.50910553e-02,  5.79731306e-03,  2.39261910e-02,
       -1.22917453e-02,  1.06775798e-02,  5.07736877e-02, -3.55921648e-02,
        1.68812759e-02, -1.66421086e-02, -4.29419018e-02,  3.02981474e-02,
        2.41099093e-02, -4.68799919e-02, -8.71084724e-03,  4.49832194e-02,
       -5.82353957e-02, -4.84270714e-02, -7.50156771e-03, -3.01131476e-02,
        4.60392982e-02,  3.59244719e-02,  2.34590881e-02,  1.42563591e-02,
       -4.93128784e-02, -1.81305006e-01, -8.82033445e-03,  1.46436300e-02,
        3.11213583e-02,  9.15861130e-03, -3.27881947e-02, -2.58409847e-02,
       -7.41937431e-03, -2.82753967e-02,  4.58376994e-03,  2.27838419e-02,
        2.23458074e-02,  5.42101935e-02, -9.42254439e-03,  6.21082215e-03,
        6.84641525e-02,  1.23670027e-02,  3.18416208e-02, -6.85660839e-02,
        2.24419329e-02,  1.72630791e-02, -5.25366096e-03, -7.36971721e-02,
        2.17331006e-04,  

In [20]:
vectorstore = FAISS.from_documents(final_documents[:120],embeddings)


In [21]:
query="what is Prognostic Scoring Systems"
relevant_docments=vectorstore.similarity_search(query)

print(relevant_docments[0].page_content)

previous thrombosis [10]. Recent data shows high-risk mutations in ET of SH2B3, SF3B1, U2AF1, TP53,
IDH2 and EZH2 and show an eﬀect independent of the current scoring systems [7]. Thus, the MIPSS-ET
has been recently developed to include these updated criteria (Table 1) [9].
Several prognostic scoring systems have been developed for MF. Initially, the International
Prognostic Scoring System (IPSS) was developed in 2009, followed by the dynamic IPSS (DIPSS),
with the advantage of the DIPSS being able to be performed at any time point and with the IPSS being
only validated at diagnosis [11]. The advent of increased molecular prognostic markers has led to the
development of scoring systems that incorporate these such as the MIPSS70 (mutation enhanced IPSS)
or rely solely on cytogenetic and molecular markers, i.e., GIPSS (genetically inspired prognostic score


In [22]:
retriever=vectorstore.as_retriever(search_type="similarity",search_kwargs={"k":3})
print(retriever)

tags=['FAISS', 'HuggingFaceBgeEmbeddings'] vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x00000200358F4C20> search_kwargs={'k': 3}


In [33]:
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline

hf = HuggingFacePipeline.from_model_id(
    model_id="mistralai/Mistral-7B-v0.1",
    task="text-generation",
    pipeline_kwargs={"temperature": 0, "max_new_tokens": 300}
)

llm = hf 
llm.invoke(query)

OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/mistralai/Mistral-7B-v0.1.
401 Client Error. (Request ID: Root=1-67abc63f-600d0d666e780639235b3963;ed73b96b-e7bd-4a7a-8f1c-c16bb2b3ad6d)

Cannot access gated repo for url https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/config.json.
Access to model mistralai/Mistral-7B-v0.1 is restricted. You must have access to it and be authenticated to access it. Please log in.

In [24]:
prompt_template="""
Use the following piece of context to answer the question asked.
Please try to provide the answer only based on the context

{context}
Question:{question}

Helpful Answers:
 """

In [None]:
prompt=PromptTemplate(template=prompt_template,input_variables=["context","question"])

OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/mistralai/Mistral-7B-v0.1.
401 Client Error. (Request ID: Root=1-67abc5da-73176b9812fb432e07166baf;35353774-34a9-438b-a8bc-d9ef28c19f74)

Cannot access gated repo for url https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/config.json.
Access to model mistralai/Mistral-7B-v0.1 is restricted. You must have access to it and be authenticated to access it. Please log in.

In [None]:
retrievalQA=RetrievalQA.from_chain_type(
    llm=hf,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt":prompt}
)

ValidationError: 2 validation errors for LLMChain
llm.is-instance[Runnable]
  Input should be an instance of Runnable [type=is_instance_of, input_value='mistralai/Mistral-7B-v0.1', input_type=str]
    For further information visit https://errors.pydantic.dev/2.10/v/is_instance_of
llm.is-instance[Runnable]
  Input should be an instance of Runnable [type=is_instance_of, input_value='mistralai/Mistral-7B-v0.1', input_type=str]
    For further information visit https://errors.pydantic.dev/2.10/v/is_instance_of