In [1]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_classic.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_classic.prompts import PromptTemplate
from langchain_classic.chains import retrieval_qa
from langchain_classic.vectorstores import FAISS

In [2]:
# Load the PDF documents found in the ./us_census directory.
loader = PyPDFDirectoryLoader("./us_census")
documents = loader.load()

# Split the loaded documents into overlapping chunks
# (chunk_size=1000, chunk_overlap=200).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
final_documents = text_splitter.split_documents(documents)

# Display the number of chunks produced by the splitter.
len(final_documents)

316

In [3]:
# Embedding model used for both indexing and querying: runs on CPU and
# asks the encoder to normalize the vectors it returns.
# Alternative model to try: sentence-transformers/all-MiniLM-L6-v2
# NOTE(review): the saved cell output echoes this constructor line, which
# looks like a truncated warning -- check whether HuggingFaceBgeEmbeddings
# is deprecated in the installed langchain_community version.
huggingface_embedding = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-small-en-v1.5",
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": True},
)

  huggingface_embedding = HuggingFaceBgeEmbeddings(


In [4]:
import numpy as np

# Sanity check: embed the text of the first chunk and display the resulting
# vector as a NumPy array.
# NOTE(review): this goes through embed_query (the query-side API) even
# though the input is a document chunk -- fine for a smoke test; confirm
# that is intended.
first_chunk_text = final_documents[0].page_content
first_chunk_vector = huggingface_embedding.embed_query(first_chunk_text)
np.array(first_chunk_vector)

array([-0.07903482, -0.01134113, -0.02312095,  0.02844462,  0.05053345,
        0.05317827, -0.01907792,  0.03456027, -0.10211367, -0.02915701,
        0.08524261,  0.05650727, -0.02545437, -0.0330849 , -0.00635737,
        0.04090865, -0.00628106,  0.00356743, -0.03854126,  0.03667682,
       -0.04289803,  0.03425248, -0.03116897, -0.03793732,  0.01728392,
        0.01214926,  0.0065312 ,  0.01463566, -0.05529055, -0.15320709,
        0.00730848,  0.03202945, -0.04701129, -0.01595975,  0.01874448,
        0.02642939, -0.02306378,  0.0843804 ,  0.04182491,  0.05278175,
       -0.03057606,  0.01564261, -0.01689075,  0.00529407, -0.02417431,
        0.00412995, -0.01889938, -0.00150626, -0.00836939, -0.03390063,
        0.03515958, -0.00553127,  0.04910937,  0.05971859,  0.0561596 ,
       -0.05105156,  0.01475134, -0.0184996 , -0.03284643,  0.03576627,
        0.04947709, -0.00938881, -0.26202112,  0.09750332,  0.01715691,
        0.04781387, -0.00556316, -0.00298307, -0.02207358, -0.04

In [5]:
# Build the FAISS index from the first 120 chunks only.
# NOTE(review): the splitter cell reports 316 chunks, so most of the corpus
# is not searchable -- presumably a cap to keep the demo fast; confirm.
indexed_documents = final_documents[:120]
vectorstore = FAISS.from_documents(indexed_documents, huggingface_embedding)


In [7]:
# Question used to probe the vector store. The text is embedded as-is, so
# spelling matters: the earlier "Indurance" typo is corrected to "Insurance"
# so the query vector reflects the intended topic.
query = "What is health Insurance Coverage"

# Retrieve the chunks most similar to the query and print the best match.
relevant_documents = vectorstore.similarity_search(query)
print(relevant_documents[0].page_content)

private health insurance as a plan provided through an employer 
or a union, coverage purchased directly by an individual from an 
insurance company or through an exchange (such as healthcare.
gov), or coverage through TRICARE. Public insurance coverage 
includes federal programs (such as Medicare, Medicaid, and the 
Children’s Health Insurance Program or CHIP), individual state 
health plans, and CHAMPVA (Civilian Health and Medical Program 
at the Department of Veterans Affairs), as well as care provided 
by the Department of Veterans Affairs. In the ACS, people are 
considered insured if they were covered by any of these types 
of health insurance at time of interview. People are considered 
uninsured if they were not covered by any of these types of health 
insurance at time of interview or if they only had coverage through 
the Indian Health Service (IHS), as IHS coverage is not considered 
comprehensive.


In [9]:
# Expose the vector store as a retriever using similarity search with k=3.
# The variable keeps its original spelling ("retriver") because other cells
# may reference it by that name.
retriver = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

In [15]:
import os
from dotenv import load_dotenv

# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()

# Read the Hugging Face token and fail with a clear message when it is absent.
# Assigning None into os.environ raises an opaque
# "TypeError: str expected, not NoneType", which hides the real problem.
hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
if hf_token is None:
    raise RuntimeError(
        "HUGGINGFACEHUB_API_TOKEN is not set; define it in your .env file "
        "or in the environment before running this cell."
    )
os.environ["HUGGINGFACEHUB_API_TOKEN"] = hf_token

In [27]:
from langchain_core.messages import HumanMessage
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint

# Hugging Face endpoint for the Mistral instruct model, with the sampling
# temperature, response-length cap and request timeout set explicitly.
endpoint = HuggingFaceEndpoint(
    repo_id="mistralai/Mistral-7B-Instruct-v0.2",
    task="conversational",  # Featherless AI requirement
    temperature=0.6,
    max_new_tokens=256,
    timeout=120,
)

# The chat wrapper takes the endpoint through its `llm` parameter.
llm = ChatHuggingFace(llm=endpoint)

# Smoke test: send a single human message and print the text of the reply.
# NOTE(review): the saved reply stops mid-list, which is consistent with the
# max_new_tokens=256 cap above -- raise it if complete answers are needed.
messages = [HumanMessage(content="What is the capital of USA?")]
response = llm.invoke(messages)
print(response.content)

 The United States is not a single entity with a capital city. It is a federal republic composed of 50 states, each with its own capital city. The federal government, which is based in Washington, D.C., is not a state and does not have a capital city in the same way that the 50 states do.

Washington, D.C., is often called the "capital city" of the United States due to the presence of the national government there. However, it is not a state and does not have the same status as a state capital.

Here is a list of the state capitals in the United States:

* Alabama: Montgomery
* Alaska: Juneau
* Arizona: Phoenix (state capital) and Phoenix or Tucson (legislative seat, depending on the session)
* Arkansas: Little Rock
* California: Sacramento
* Colorado: Denver
* Connecticut: Hartford
* Delaware: Dover
* Florida: Tallahassee
* Georgia: Atlanta (executive and judicial) and Savannah or Atlanta (legislatve)
* Hawaii: Honolulu (state capital) or Honolulu or Hilo (legislative session)
* Idaho

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]