In [3]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS

from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain.prompts import PromptTemplate

from langchain.chains import RetrievalQA

In [6]:
# Load the PDF files found under ./pdfs into a list of documents.
loader = PyPDFDirectoryLoader("./pdfs")
documents = loader.load()

# Split the loaded documents with a recursive character splitter
# (chunk_size=1000, chunk_overlap=200).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
final_documents = text_splitter.split_documents(documents)

# Display the first chunk as the cell output.
final_documents[0]

Document(page_content='Cheat Sheets for AI, Neural Networks, Machine Learning, Deep Learning & Big  Data', metadata={'source': 'pdfs\\AI-Neural-Networks.pdf', 'page': 0})

In [7]:
# Show how many chunks are held in final_documents.
len(final_documents)

429

# Embedding Techniques

In [10]:
# BGE embedding model configured to run on the CPU, with normalized
# output vectors requested through encode_kwargs.
# (Alternative model: sentence-transformers/all-MiniLM-L6-v2)
huggingface_embeddings = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-small-en-v1.5",
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": True},
)

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [11]:
import numpy as np

# Embed the text of the first chunk; the resulting vector is the cell output.
huggingface_embeddings.embed_query(
    final_documents[0].page_content
)

[-0.06496848165988922,
 0.009685522876679897,
 0.0183071568608284,
 -0.022862335667014122,
 0.06531897187232971,
 0.05138126388192177,
 -0.046815723180770874,
 -0.011037111282348633,
 0.047871559858322144,
 0.0019477356690913439,
 0.009998082183301449,
 -0.032154977321624756,
 0.023536011576652527,
 0.0015819245018064976,
 0.03849292919039726,
 -0.0073563032783567905,
 -0.023745521903038025,
 -0.026339871808886528,
 0.014911078847944736,
 -0.0015388522297143936,
 0.012137327343225479,
 -0.026989618316292763,
 -0.011186380870640278,
 -0.022582747042179108,
 -0.01691521517932415,
 -0.01641758717596531,
 -0.014608647674322128,
 -0.11293288320302963,
 -0.06145765259861946,
 -0.19976578652858734,
 0.013145742937922478,
 0.009974789805710316,
 0.04144194349646568,
 -0.018224624916911125,
 -0.010325283743441105,
 -0.009513627737760544,
 0.024716386571526527,
 0.035321373492479324,
 0.03823104500770569,
 0.03034066967666149,
 0.03265480324625969,
 -0.03739696368575096,
 -0.004691861569881439,


In [13]:
import numpy as np

# Embed the first chunk once and reuse the vector for both printouts;
# previously the same text was embedded twice, doubling the model work.
first_chunk_vector = np.array(
    huggingface_embeddings.embed_query(final_documents[0].page_content)
)
print(first_chunk_vector)
print(first_chunk_vector.shape)

[-6.49684817e-02  9.68552288e-03  1.83071569e-02 -2.28623357e-02
  6.53189719e-02  5.13812639e-02 -4.68157232e-02 -1.10371113e-02
  4.78715599e-02  1.94773567e-03  9.99808218e-03 -3.21549773e-02
  2.35360116e-02  1.58192450e-03  3.84929292e-02 -7.35630328e-03
 -2.37455219e-02 -2.63398718e-02  1.49110788e-02 -1.53885223e-03
  1.21373273e-02 -2.69896183e-02 -1.11863809e-02 -2.25827470e-02
 -1.69152152e-02 -1.64175872e-02 -1.46086477e-02 -1.12932883e-01
 -6.14576526e-02 -1.99765787e-01  1.31457429e-02  9.97478981e-03
  4.14419435e-02 -1.82246249e-02 -1.03252837e-02 -9.51362774e-03
  2.47163866e-02  3.53213735e-02  3.82310450e-02  3.03406697e-02
  3.26548032e-02 -3.73969637e-02 -4.69186157e-03 -5.25643602e-02
  8.09933841e-02 -4.69744131e-02  6.81101391e-03 -6.31030230e-03
  4.46074568e-02 -1.44969728e-02 -4.45305407e-02 -3.76432464e-02
 -4.44370843e-02  2.43647825e-02  5.07888338e-03 -2.32454371e-02
  9.50971097e-02  9.67675671e-02  5.47625236e-02  3.30229104e-02
  3.67403999e-02  1.87007

In [14]:
# Build the FAISS vector store from the first 120 chunks only,
# embedding them with the BGE model configured earlier.
vectorstore = FAISS.from_documents(
    final_documents[:120],
    huggingface_embeddings,
)

In [16]:
# Run a similarity search for the query and print the text of the
# best-matching chunk.
query = "give me classification algorithm?"
relevant_docments = vectorstore.similarity_search(query)

print(relevant_docments[0].page_content)

Cheat Sheets for AI, Neural Networks, Machine Learning, Deep Learning & Big  Data


In [17]:
# Expose the vector store as a similarity retriever with k=3.
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)
print(retriever)

tags=['FAISS', 'HuggingFaceBgeEmbeddings'] vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x000001F6E7245F60> search_kwargs={'k': 3}


In [27]:

import os
import warnings

# Copy the token stored under HUGGINGFACE_API_KEY to HUGGINGFACEHUB_API_TOKEN.
# Assigning os.getenv(...) directly raises "TypeError: str expected, not
# NoneType" when the source variable is unset, so only copy a real value and
# otherwise emit a clear warning (unless the target is already configured).
if 'HUGGINGFACE_API_KEY' in os.environ:
    os.environ['HUGGINGFACEHUB_API_TOKEN'] = os.environ['HUGGINGFACE_API_KEY']
elif 'HUGGINGFACEHUB_API_TOKEN' not in os.environ:
    warnings.warn(
        "HUGGINGFACE_API_KEY is not set; HUGGINGFACEHUB_API_TOKEN was left "
        "unset, so Hugging Face Hub calls may fail to authenticate."
    )

In [23]:
from langchain_community.llms import HuggingFaceHub

# Mistral-7B accessed through the HuggingFaceHub wrapper.
hf = HuggingFaceHub(
    repo_id="mistralai/Mistral-7B-v0.1",
    model_kwargs={"temperature": 0.1, "max_length": 500},
)

# Send the question straight to the LLM (no retrieval involved);
# the response is the cell output.
query = "How can I generate primary key values for my table?"
hf.invoke(query)

KeyboardInterrupt: 

In [28]:
import numpy as np
from langchain_community.llms import HuggingFaceHub
import os
import warnings

# Copy the token stored under HUGGINGFACE_API_KEY to HUGGINGFACEHUB_API_TOKEN.
# Assigning os.getenv(...) directly raises "TypeError: str expected, not
# NoneType" when the source variable is unset, so only copy a real value and
# otherwise emit a clear warning (unless the target is already configured).
if 'HUGGINGFACE_API_KEY' in os.environ:
    os.environ['HUGGINGFACEHUB_API_TOKEN'] = os.environ['HUGGINGFACE_API_KEY']
elif 'HUGGINGFACEHUB_API_TOKEN' not in os.environ:
    warnings.warn(
        "HUGGINGFACE_API_KEY is not set; HUGGINGFACEHUB_API_TOKEN was left "
        "unset, so Hugging Face Hub calls may fail to authenticate."
    )

## VectorStore Creation
# Rebuild the FAISS index over the first 200 chunks; this rebinds
# `vectorstore`. The embed_query call that used to precede this line was
# dropped because its result was discarded, and the commented-out
# experiments that followed were removed as dead code.
vectorstore=FAISS.from_documents(final_documents[:200],huggingface_embeddings)



In [29]:
# Similarity search against the current vector store; print the text of
# the best-matching chunk.
query = "How can I eliminate duplicate values in a table"
relevant_documents = vectorstore.similarity_search(query)
print(relevant_documents[0].page_content)

# Create the k=3 similarity retriever from the current `vectorstore`.
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)
print(retriever)

11. To reverse
12. How can I eliminate duplicate values in a table?
Choose one of the following queries to identify or remove duplicate rows from a table leaving one
record:
Method 1:
DELETE FROM table_name A WHERE ROWID > (SELECT min (rowid) FROM table_name B
tags=['FAISS', 'HuggingFaceBgeEmbeddings'] vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x000001F6E9257EB0> search_kwargs={'k': 3}


In [30]:
from langchain_community.llms import HuggingFaceHub

# Mistral-7B through the HuggingFaceHub wrapper, with temperature 0.1 and
# max_length 500 passed as model kwargs.
hf = HuggingFaceHub(
    repo_id="mistralai/Mistral-7B-v0.1",
    model_kwargs={"temperature": 0.1, "max_length": 500},
)

# Query the LLM directly; the returned text is the cell output.
query = "How can I generate primary key values for my table?"
hf.invoke(query)

HfHubHTTPError: 429 Client Error: Too Many Requests for url: https://api-inference.huggingface.co/models/mistralai/Mistral-7B-v0.1 (Request ID: -wLdQVm6PWXu1WO1nrZe5)

Rate limit reached. You reached free usage limit (reset hourly). Please subscribe to a plan at https://huggingface.co/pricing to use the API at this rate

In [None]:
# Run the Hugging Face model locally via the HuggingFacePipeline class.
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline

hf = HuggingFacePipeline.from_model_id(
    task="text-generation",
    model_id="mistralai/Mistral-7B-v0.1",
    pipeline_kwargs={"temperature": 0, "max_new_tokens": 300},
)

# `llm` is a second name for the same pipeline object; invoking it with
# the current `query` produces the cell output.
llm = hf
llm.invoke(query)

In [None]:
# Prompt text for the QA chain. It has two placeholders, {context} and
# {question}; the leading newline and the trailing "\n " are part of the
# template text.
prompt_template = (
    "\n"
    "Use the following piece of context to answer the question asked.\n"
    "Please try to provide the answer only based on the context\n"
    "\n"
    "{context}\n"
    "Question:{question}\n"
    "\n"
    "Helpful Answers:\n"
    " "
)

In [None]:
# Wrap the template text in a PromptTemplate that takes the "context" and
# "question" variables.
prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"],
)

In [None]:
# Assemble the retrieval QA chain from the LLM, the retriever and the
# custom prompt; source documents are requested alongside the answer.
retrievalQA = RetrievalQA.from_chain_type(
    llm=hf,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs={"prompt": prompt},
    return_source_documents=True,
)

In [None]:
# Three-line query text; the line breaks are part of the string.
query = (
    "DIFFERENCES IN THE\n"
    "UNINSURED RATE BY STATE\n"
    "IN 2022"
)

In [None]:
# Run the QA chain on the current query and print only the generated
# answer (the value stored under the 'result' key).
result = retrievalQA.invoke({"query": query})
print(result["result"])