In [31]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS

from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain.prompts import PromptTemplate

from langchain.chains import RetrievalQA

In [32]:
# Read every PDF found under ./pdfs into LangChain documents.
loader = PyPDFDirectoryLoader("./pdfs")
documents = loader.load()

# Split the loaded documents with a recursive character splitter
# (chunk_size=1000, chunk_overlap=200).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
final_documents = text_splitter.split_documents(documents)

# Display the first chunk as the cell output.
final_documents[0]

Document(page_content="COMPLEX QUERIES\n- 1 -1.To find The Nth Maximum Salary.\nSELECT DISTINCT SAL FROM EMP A WHERE &N=(SELECT COUNT (DISTINCT B.SAL)\nFROM EMP B WHERE A.SAL<=B.SAL);\n2.To find the no. of columns for particular table.\nSELECT COUNT (COLUMN_NAME) FROM USER_TAB_COLUMNS\nWHERE TABLE_NAME = 'DEPT'\n3.To use Exists Clause.\nSELECT DNAME, DEPTNO FROM DEPT WHERE EXISTS (SELECT * FROM EMP WHERE\nDEPT.DEPTNO = EMP.DEPTNO)\n4. To Find The Not Null Column Alone In A Table.SELECT COLUMN_NAME FROM\nUSER_TAB_COLUMNS WHERE NULLABLE = 'N' AND TABLE_NAME = 'COUNTRY'\n5.To delete The Duplicate Rows Alone in A Table.\nDELETE DEPT WHERE ROWID NOT IN (SELECT MAX (ROWID) FROM DEPT GROUP BY \nDEPTNO HAVING COUNT (*) >=1)\n6.To find The Max Salary without MAX Function.\n   1. SELECT DISTINCT SAL FROM EMP1 WHERE SAL NOT IN\n      (SELECT SAL FROM EMP1 WHERE SAL < ANY (SELECT SAL FROM EMP1))\n   2.SELECT SAL FROM EMP WHERE SAL >= ALL (SELECT SAL FROM EMP)\n7. Alternate for DESC.", metadata={'s

In [33]:
# Number of chunks returned by text_splitter.split_documents.
len(final_documents)

371

# Embedding Techniques

In [34]:
# Embedding model: BGE small (English, v1.5), run on CPU with normalized output vectors.
# Alternative noted by the author: sentence-transformers/all-MiniLM-L6-v2
# (the original note read "all-MiniLM-16-v2", presumably a typo; verify before switching).
huggingface_embeddings = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-small-en-v1.5",
    model_kwargs=dict(device="cpu"),
    encode_kwargs=dict(normalize_embeddings=True),
)

In [35]:
import numpy as np

# Embed the text of the first chunk; as the final expression of the cell,
# the resulting vector is displayed as the cell output.
huggingface_embeddings.embed_query(
    final_documents[0].page_content
)

[-0.07861349731683731,
 -0.005937101785093546,
 -0.031239187344908714,
 -0.05031199008226395,
 0.02245890535414219,
 -0.01985127665102482,
 0.03910008817911148,
 0.006538356654345989,
 0.050013743340969086,
 0.0024426879826933146,
 0.093374103307724,
 0.010018597356975079,
 0.02274908497929573,
 -0.03802958130836487,
 -0.03988877683877945,
 -0.02865653671324253,
 -0.01467342209070921,
 0.026870576664805412,
 -0.02407204546034336,
 0.01641157455742359,
 0.04104001447558403,
 -0.052799031138420105,
 -0.043114665895700455,
 -0.05160573497414589,
 0.09217653423547745,
 0.050475139170885086,
 0.011854428797960281,
 0.007469608448445797,
 0.008622185327112675,
 -0.2090425342321396,
 -0.0030440737027674913,
 0.015457197092473507,
 0.0076028392650187016,
 -0.008266400545835495,
 0.04264729097485542,
 -0.042836882174015045,
 0.05192799121141434,
 -0.035362206399440765,
 0.03540781885385513,
 0.04437562823295593,
 -0.018238490447402,
 -0.028581663966178894,
 -0.021125834435224533,
 -0.0077523887

In [36]:
import numpy as np

# Embed the first chunk once and reuse the vector for both prints.
# The original ran the embedding model twice on identical input.
first_chunk_embedding = np.array(
    huggingface_embeddings.embed_query(final_documents[0].page_content)
)
print(first_chunk_embedding)
print(first_chunk_embedding.shape)

[-7.86134973e-02 -5.93710179e-03 -3.12391873e-02 -5.03119901e-02
  2.24589054e-02 -1.98512767e-02  3.91000882e-02  6.53835665e-03
  5.00137433e-02  2.44268798e-03  9.33741033e-02  1.00185974e-02
  2.27490850e-02 -3.80295813e-02 -3.98887768e-02 -2.86565367e-02
 -1.46734221e-02  2.68705767e-02 -2.40720455e-02  1.64115746e-02
  4.10400145e-02 -5.27990311e-02 -4.31146659e-02 -5.16057350e-02
  9.21765342e-02  5.04751392e-02  1.18544288e-02  7.46960845e-03
  8.62218533e-03 -2.09042534e-01 -3.04407370e-03  1.54571971e-02
  7.60283927e-03 -8.26640055e-03  4.26472910e-02 -4.28368822e-02
  5.19279912e-02 -3.53622064e-02  3.54078189e-02  4.43756282e-02
 -1.82384904e-02 -2.85816640e-02 -2.11258344e-02 -7.75238872e-03
  8.80819466e-03 -1.90681815e-02 -3.10141221e-02 -3.40304412e-02
  4.55043800e-02  6.46890104e-02 -6.65351152e-02  1.39998402e-02
 -4.88021150e-02  4.36044410e-02  4.72431332e-02  2.47383546e-02
  7.38465935e-02  2.30981018e-02 -5.48869744e-02 -5.85781150e-02
  2.28234772e-02  1.04729

In [37]:
## VectorStore creation
# Build a FAISS index from the chunk embeddings.
# NOTE(review): only the first 200 chunks are indexed, while len(final_documents)
# reported 371 above -- presumably to keep indexing fast; confirm this is intended,
# since the remaining chunks can never be retrieved.
vectorstore = FAISS.from_documents(
    documents=final_documents[:200],
    embedding=huggingface_embeddings,
)

In [39]:
## Query using similarity search
query = "How does one select the TOP N rows from a table?"

# Fixed the misspelled variable name (`relevant_docments`) so it matches the
# `relevant_documents` name used by the later search cell.
relevant_documents = vectorstore.similarity_search(query)

# Print the text of the best-matching chunk.
print(relevant_documents[0].page_content)

(TEMP.ROWNUM, 4) = 0;
How does one select the TOP N rows from a table?
SELECT * FROM MY_TABLE A WHERE 10 >= (SELECT COUNT (DISTINCT MAXCOL) FROM
MY_TABLE B WHERE B.MAXCOL >= A.MAXCOL) ORDER BY MAXCOL DESC;
How does one code a tree-structured query?
This is definitely non-relational (enough to kill Codd and then make him roll in his grave) and is a
feature I have not seen in the competition.


In [40]:
# Wrap the vector store as a similarity retriever configured with k=3.
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)
print(retriever)

tags=['FAISS', 'HuggingFaceBgeEmbeddings'] vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x000001F6FF75E0E0> search_kwargs={'k': 3}


In [41]:

import os
from getpass import getpass

# Never hardcode the token in the notebook. The original assignment also
# overwrote any real token already exported in the environment with a
# placeholder string. Keep an existing value; otherwise prompt for it
# without echoing, so it does not end up in the saved notebook.
if not os.environ.get("HUGGINGFACEHUB_API_TOKEN"):
    os.environ["HUGGINGFACEHUB_API_TOKEN"] = getpass("Hugging Face API token: ")

In [None]:
from langchain_community.llms import HuggingFaceHub

# LLM served through the Hugging Face Hub wrapper (Mistral-7B).
# NOTE(review): HuggingFaceHub is reported as deprecated in recent
# langchain-community releases in favour of HuggingFaceEndpoint -- confirm
# against the installed version.
hf = HuggingFaceHub(
    repo_id="mistralai/Mistral-7B-v0.1",
    model_kwargs={"temperature": 0.1, "max_length": 500},
)

# Ask the model directly, without any retrieved context.
query = "How does one select the TOP N rows from a table?"
hf.invoke(query)

In [43]:
# Query using similarity search, then expose the store as a retriever.
query = "How does one select the TOP N rows from a table?"

# Print the text of the best-matching chunk.
relevant_documents = vectorstore.similarity_search(query)
print(relevant_documents[0].page_content)

# Similarity retriever configured with k=3.
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)
print(retriever)

(TEMP.ROWNUM, 4) = 0;
How does one select the TOP N rows from a table?
SELECT * FROM MY_TABLE A WHERE 10 >= (SELECT COUNT (DISTINCT MAXCOL) FROM
MY_TABLE B WHERE B.MAXCOL >= A.MAXCOL) ORDER BY MAXCOL DESC;
How does one code a tree-structured query?
This is definitely non-relational (enough to kill Codd and then make him roll in his grave) and is a
feature I have not seen in the competition.
tags=['FAISS', 'HuggingFaceBgeEmbeddings'] vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x000001F6FF75E0E0> search_kwargs={'k': 3}


In [None]:
# Hugging Face models can be run locally through the HuggingFacePipeline class.
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline

hf = HuggingFacePipeline.from_model_id(
    # model_id="mistralai/Mistral-7B-v0.1",
    model_id="meta-llama/Meta-Llama-3-8B",
    task="text-generation",
    # Deterministic output is requested with greedy decoding (do_sample=False).
    # The original passed temperature=0, which transformers does not accept as a
    # sampling temperature (it must be strictly positive) and which fails when
    # the model's generation config enables sampling.
    pipeline_kwargs={"do_sample": False, "max_new_tokens": 300},
)

# `llm` is an alias for the local pipeline; `query` comes from the preceding cell.
llm = hf
llm.invoke(query)

In [48]:
# Prompt text for the QA chain. `{context}` and `{question}` are the two
# placeholders declared as input variables when the PromptTemplate is built.
prompt_template="""
Use the following piece of context to answer the question asked.
Please try to provide the answer only based on the context

{context}
Question:{question}

Helpful Answers:
 """

In [49]:
# Wrap the raw template string, declaring the two variables it expects.
prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"],
)

In [50]:
# Assemble the retrieval QA chain from the LLM, the retriever and the custom
# prompt; source documents are returned alongside the answer.
retrievalQA = RetrievalQA.from_chain_type(
    llm=hf,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs={"prompt": prompt},
    return_source_documents=True,
)

In [51]:
# Question that is passed to the RetrievalQA chain.
query="How does one select the TOP N rows from a table?"

In [None]:
# Run the QA chain on the query and print only the generated answer
# (the returned mapping's "result" entry).
result = retrievalQA.invoke(
    {"query": query}
)
print(result["result"])