# Installing libraries

In [49]:
# pip install lark
# pip install chromadb
# pip install langchain-cohere
# pip install pypdf
# pip install langchain
# pip install -U langchain-community
# pip install scikit-learn
# pip install "langchain[docarray]"

## Importing libraries

In [50]:
import os
import sys
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain_cohere import CohereEmbeddings
from langchain.vectorstores import Chroma

In [51]:
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo
from langchain_cohere import ChatCohere
from langchain_core.messages import HumanMessage
from langchain.retrievers import SVMRetriever
from langchain.retrievers import TFIDFRetriever
from langchain.chains import RetrievalQA

In [52]:
from langchain.prompts import PromptTemplate
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.vectorstores import DocArrayInMemorySearch


In [53]:
API_KEY=""
persist_directory = 'chroma/'

### Loading documents -> PDF

In [54]:

loader = PyPDFLoader("PromptNER : Prompting For Named Entity Recognition.pdf")
pages = loader.load()

In [55]:
len(pages)

20

In [56]:
page = pages[0]

In [57]:
type(page)

langchain_core.documents.base.Document

In [58]:
page.metadata

{'source': 'PromptNER : Prompting For Named Entity Recognition.pdf', 'page': 0}

### Document Splitting

In [59]:
chunk_size =26
chunk_overlap = 4

In [60]:
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap
)
c_splitter = CharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap
)

In [61]:
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=150,
    length_function=len
)

In [62]:
docs = text_splitter.split_documents(pages)

### Embeddings

In [63]:
sentence1 = "i like cats"

In [64]:
embeddings_model = CohereEmbeddings(cohere_api_key=API_KEY, model="embed-english-light-v3.0")

In [65]:
query_result = embeddings_model.embed_query(sentence1)
print(query_result)


[0.045532227, -0.09069824, 0.03414917, 0.04711914, 0.00022697449, -0.016235352, 0.055419922, -0.003786087, -0.035980225, -0.006427765, -0.03363037, -0.071899414, -0.0016222, 0.090148926, 0.02168274, 0.052246094, 0.029556274, -0.124816895, -0.06903076, -0.025985718, -0.15734863, 0.00447464, 0.028442383, -0.014556885, -0.060577393, -0.007949829, -0.09069824, -0.013214111, -0.06762695, 0.118896484, -0.09942627, -0.059448242, 0.03643799, 0.009109497, 0.02128601, 0.0028762817, 0.0023822784, -0.004989624, 0.07421875, -0.041778564, -0.041168213, -0.048675537, -0.020690918, -0.03567505, -0.04574585, 0.003660202, 0.03656006, -0.079711914, 0.099853516, 0.01828003, -0.017623901, 0.018814087, 0.0007200241, -0.01108551, -0.017669678, 0.07543945, 0.019836426, 0.10284424, -0.037139893, -0.057159424, -0.051513672, -0.011436462, -0.052246094, 0.07513428, -0.020385742, 0.015136719, 0.011054993, -0.014656067, -0.018722534, 0.0024700165, 0.017990112, 0.019424438, -0.0024738312, 0.038604736, 0.05517578, 0.

In [66]:
pip show hnswlib


Name: hnswlib
Version: 0.8.0
Summary: hnswlib
Home-page: https://github.com/yurymalkov/hnsw
Author: Yury Malkov and others
Author-email: 
License: 
Location: /home/elaheh/miniconda3/envs/myenv/lib/python3.12/site-packages
Requires: numpy
Required-by: 
Note: you may need to restart the kernel to use updated packages.


In [67]:
#pip install chromadb==0.4.3


In [68]:
#pip show chromadb


In [69]:
vectordb = Chroma.from_documents(
    documents=docs,
    embedding=embeddings_model,
    persist_directory=persist_directory
)

In [70]:
print(vectordb._collection.count())

128


In [71]:
question = "what is NER?"

### Retrieval

In [72]:
retreived_docs = vectordb.similarity_search(question,k=5)

In [None]:
retreived_docs 

In [74]:
for doc in retreived_docs:
    print(doc.metadata)

{'page': 1, 'source': 'PromptNER : Prompting For Named Entity Recognition.pdf'}
{'page': 1, 'source': 'PromptNER : Prompting For Named Entity Recognition.pdf'}
{'page': 7, 'source': 'PromptNER : Prompting For Named Entity Recognition.pdf'}
{'page': 7, 'source': 'PromptNER : Prompting For Named Entity Recognition.pdf'}
{'page': 1, 'source': 'PromptNER : Prompting For Named Entity Recognition.pdf'}


In [75]:
retreived_docs_MMR = vectordb.max_marginal_relevance_search(question,k=2, fetch_k=3)

In [None]:
retreived_docs_MMR

In [77]:
# Working with metadata

retreived_docs3 = vectordb.similarity_search(
    question,
    k=3,
    filter={"source": "PromptNER : Prompting For Named Entity Recognition.pdf"}
)
for d in retreived_docs3:
    print(d.metadata)

{'page': 1, 'source': 'PromptNER : Prompting For Named Entity Recognition.pdf'}
{'page': 1, 'source': 'PromptNER : Prompting For Named Entity Recognition.pdf'}
{'page': 7, 'source': 'PromptNER : Prompting For Named Entity Recognition.pdf'}


In [78]:
metadata_field_info = [
    AttributeInfo(
        name="source",
        description="The lecture the chunk is from, should be from 'PromptNER : Prompting For Named Entity Recognition.pdf'",
        type="string",
    ),
    AttributeInfo(
        name="page",
        description="The page from the paper",
        type="integer",
    ),
]

In [79]:
document_content_description = "NER"
llm = ChatCohere(cohere_api_key= API_KEY)
retriever = SelfQueryRetriever.from_llm(
    llm,
    vectordb,
    document_content_description,
    metadata_field_info,
    verbose=True
)

In [80]:
retreived_docs4 = retriever.get_relevant_documents(question)
for d in retreived_docs4:
    print(d.metadata)

{'page': 1, 'source': 'PromptNER : Prompting For Named Entity Recognition.pdf'}
{'page': 1, 'source': 'PromptNER : Prompting For Named Entity Recognition.pdf'}
{'page': 7, 'source': 'PromptNER : Prompting For Named Entity Recognition.pdf'}
{'page': 7, 'source': 'PromptNER : Prompting For Named Entity Recognition.pdf'}


In [81]:
all_page_text=[p.page_content for p in pages]
joined_page_text=" ".join(all_page_text)

# Split
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1500,chunk_overlap = 150)
splits = text_splitter.split_text(joined_page_text)

In [82]:
svm_retriever = SVMRetriever.from_texts(splits,embeddings_model)
tfidf_retriever = TFIDFRetriever.from_texts(splits)

In [None]:
docs_svm=svm_retriever.get_relevant_documents(question)
docs_svm[0]
docs_tfidf=tfidf_retriever.get_relevant_documents(question)
docs_tfidf[0]

### Question Answering

In [None]:
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever()
)
result = qa_chain({"query": question})
result["result"]

### Prompting

In [85]:
# Build prompt
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. Always say "thanks for asking!" at the end of the answer. 
{context}
Question: {question}
Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

In [86]:
question2= "what is in-context learning"

In [87]:
# Run chain
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
)

In [88]:
result = qa_chain({"query": question})

In [89]:
result["result"]

'NER stands for Named Entity Recognition. Thanks for asking!'

In [90]:
result2 = qa_chain({"query": question2})
result2["result"]

'In-context learning is a strategy used with LLMs to improve their performance on few-shot learning tasks by providing examples in the prompt or context window of the input. \n\nThanks for asking!'

In [None]:
result["source_documents"][0]

### RetrievalQA chain types

In [92]:
qa_chain_mr = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever(),
    chain_type="map_reduce"
)
result = qa_chain_mr({"query": question})
result["result"]

'NER stands for Named Entity Recognition.'

### Chat

In [93]:

memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True
)

In [94]:
retriever=vectordb.as_retriever()
qa = ConversationalRetrievalChain.from_llm(
    llm,
    retriever=retriever,
    memory=memory
)

In [95]:
result = qa({"question": question})
result['answer']

'NER stands for Named Entity Recognition. It is a task in natural language processing that involves identifying and classifying named entities in text into predefined categories such as person names, organizations, locations, dates, and more.'