# Load PDF and indexing

In [100]:
from langchain_community.document_loaders import PyMuPDFLoader

In [101]:
# Read the source PDF into LangChain documents; `docs` feeds both retrievers below.
PDF_PATH = 'introduction to apple.pdf'
loader = PyMuPDFLoader(PDF_PATH)
docs = loader.load()

In [102]:
# Sanity check: show the extracted text of the first loaded document.
print(docs[0].page_content)

Introduction of Apple. Inc.
 
Apple Inc.
 
Is an American multinational corporation, one of the greatest in the world that designs and 
manufactures consumer electronics and computer software products? The company's best-known
hardware products Macintosh computers, iPod, iPhone, software’s including the Mac OS X 
operating system, iTunes and other creativity software’s like iWork represent the face of the 
music, phone, and computing industry. The company operates more than 250 retail stores in 
sixteen countries and an online store where hardware and software products are sold.
 
Established in Cupertino, California on April 1, 1976 and incorporated January 3, 1977, the 
company was called Apple Computer, Inc. for its Rest 30 years, but dropped the word 
"Computer" on January 9, 2007 to reject the company's ongoing expansion into the consumer 
electronics market in addition to its traditional focus on personal computers. Apple has about 
42,800 employees worldwide and had worldwide an

In [103]:
import torch
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

model_name = "BAAI/bge-small-en-v1.5"
# Normalized (unit-length) embeddings make the dot product equal to cosine similarity.
encode_kwargs = {'normalize_embeddings': True}
# Run the embedding model on the GPU when torch can see one, otherwise on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

bge_embeddings = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs={'device': device},
    encode_kwargs=encode_kwargs,
)

# Retriever

## Hybrid Search

### Parent child retriever - Dense retriever

In [104]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore
from langchain_community.vectorstores import Chroma


# Two splitter granularities: larger "parent" chunks and smaller "child" chunks.
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000)
child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)

# Backing stores handed to the retriever: a Chroma collection embedded with the
# BGE model, and an in-memory docstore.
store = InMemoryStore()
vectorstore = Chroma(
    collection_name="full_documents",
    embedding_function=bge_embeddings,
)

parent_child_retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    parent_splitter=parent_splitter,
    child_splitter=child_splitter,
)

# Index the loaded PDF documents through the retriever.
parent_child_retriever.add_documents(docs)

In [105]:
# Number of entries currently held in the docstore.
sum(1 for _ in store.yield_keys())

14

### BM25 retriever - Sparse retriever

In [106]:
from langchain_community.retrievers import BM25Retriever

# Sparse keyword retriever built directly over the loaded documents (not the
# parent/child chunks), limited to its top 2 hits.
bm25_retriever = BM25Retriever.from_documents(docs)
bm25_retriever.k = 2

### Ensemble Retriever

In [107]:
from langchain.retrievers import EnsembleRetriever

# Hybrid search: combine the sparse (BM25) and dense (parent/child) retrievers
# with equal weight.
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, parent_child_retriever],
    weights=[0.5, 0.5],
)

In [108]:
# Smoke test of the hybrid retriever. `invoke` is the Runnable entry point used
# by every other chain call in this notebook; `get_relevant_documents` is deprecated.
ensemble_retriever.invoke("Steve Jobs returned to his company as CEO in 1997")

[Document(metadata={'source': 'introduction to apple.pdf', 'file_path': 'introduction to apple.pdf', 'page': 1, 'total_pages': 7, 'format': 'PDF 1.4', 'title': '', 'author': 'ACER-PC', 'subject': '', 'keywords': '', 'creator': 'Writer', 'producer': 'LibreOffice 4.2', 'creationDate': "D:20161211014318Z'", 'modDate': '', 'trapped': ''}, page_content="the iPhone’s success). One problem was that Apple allowed Motorola, Sharp, and Digital Ocean \nto manufacture devices that ran the Newton OS and ultimately the lack of uniformity may have \nbeen a turnoff to adopters.\n1990s, The Coming back of Steve Jobs\n \nWhen Steve Jobs returned to his company as CEO in 1997, he had learned from his mistakes, as \nwell as from Apple's mistakes over the previous decade. One of Jobs’ best decisions upon his \nreturn was to stop cloning Macintosh computers. Since 1995, Apple had been allowing the other \ncompanies to make Mac-compatible computers. The arrangement was cutting into Apple's \nbottom line and 

## HyDE

In [109]:
def remove_null(x):
    """Return a list of the truthy items of ``x``, in their original order.

    Falsy entries (empty strings, ``None``, ``0``, ...) are dropped. Strings
    that contain only whitespace are truthy and are therefore kept.
    """
    return list(filter(None, x))

In [110]:
from dotenv import find_dotenv, load_dotenv
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

# Load variables from .env before the OpenAI client is built. The notebook
# otherwise calls load_dotenv only in the Generation section, after this cell,
# so on a fresh kernel a key kept in .env (presumably OPENAI_API_KEY) would not
# be set yet. Calling load_dotenv more than once is harmless.
load_dotenv(find_dotenv())

llm = ChatOpenAI(model="gpt-3.5-turbo")
promt_template = """
Please write 4 scientific paper passages to answer the question
Question: {question}
Passage:
"""
promt = PromptTemplate.from_template(promt_template)

# HyDE: ask the LLM for hypothetical passages answering the question, then
# split the reply into one string per non-empty line.
generate_hypothetical_docs = (
    promt
    | llm
    | StrOutputParser()
    | (lambda text: text.split("\n"))
    | remove_null
)

In [111]:
# Try the HyDE chain on a short, ambiguous question. The recorded output below
# shows passages about the fruit rather than about Apple Inc.
query = "what is apple ?"
results = generate_hypothetical_docs.invoke({"question": query})

In [112]:
# Display the generated hypothetical passages (a list of strings).
results

['1. Apples are fruits that belong to the Rosaceae family, which also includes other popular fruits such as pears and peaches. They are widely cultivated in temperate regions around the world and are known for their crisp texture and sweet flavor.',
 '2. The apple tree, known scientifically as Malus domestica, is a deciduous tree that can grow up to 30 feet tall. It produces fruits that vary in color, size, and taste depending on the variety of the apple.',
 '3. Apples are a rich source of essential nutrients such as vitamin C, fiber, and antioxidants. These nutrients have been linked to various health benefits, including reduced risk of chronic diseases such as heart disease and diabetes.',
 '4. The genetic diversity of apples is vast, with thousands of different varieties cultivated worldwide. Each variety has its own unique characteristics, making apples a versatile fruit that can be enjoyed in a variety of ways, from eating fresh to cooking and baking.']

## Contextual compression retriever

In [113]:
# retriever =  (generate_hypothetical_docs | ensemble_retriever)

In [114]:
# from langchain.llms import OpenAI
# from langchain.retrievers import ContextualCompressionRetriever
# from langchain.retrievers.document_compressors import LLMChainExtractor

# # making the compressor
# llm = OpenAI(temperature=0)
# compressor = LLMChainExtractor.from_llm(llm)

# # it needs a base retriever (here the ensemble retriever built above) and a compressor (made above)
# compression_retriever = ContextualCompressionRetriever(base_compressor=compressor,
#                                                        base_retriever=ensemble_retriever)

In [115]:
# compressed_docs = compression_retriever.invoke({"question":"what was year apple without Steve Jobs?"})

In [116]:
# compressed_docs

## Fusion ranking


In [117]:
from langchain.load import dumps, loads


def reciprocal_rank_fusion(results: list[list], k=60):
    """Fuse several ranked document lists into one ranking (reciprocal rank fusion).

    Every document contributes ``1 / (rank + k)`` for each list it appears in,
    where ``rank`` is its zero-based position in that list. Documents are keyed
    by their ``dumps`` serialization, so a document returned for several
    queries accumulates one combined score.

    Args:
        results: One list of documents per query, each assumed to be sorted
            from most to least relevant.
        k: Constant added to the rank in the denominator.

    Returns:
        A list of ``(document, fused_score)`` tuples, highest score first.
    """
    fused_scores = {}
    for ranked_docs in results:
        for rank, doc in enumerate(ranked_docs):
            doc_key = dumps(doc)
            fused_scores[doc_key] = fused_scores.get(doc_key, 0) + 1 / (rank + k)

    # Highest fused score first; rebuild the documents from their serialized form.
    ordered = sorted(fused_scores.items(), key=lambda item: item[1], reverse=True)
    return [(loads(doc_key), score) for doc_key, score in ordered]

In [118]:
# question -> hypothetical passages -> one hybrid retrieval per passage -> fused ranking
ragfusion_chain = (
    generate_hypothetical_docs
    | ensemble_retriever.map()
    | reciprocal_rank_fusion
)

In [119]:
# Question used to exercise the fusion pipeline.
original_query = "what was year apple without Steve Jobs?"

In [120]:
# Run the fusion pipeline once. This rebinds `results`, which earlier held the
# HyDE passages, to the list of (document, score) pairs returned by the fusion step.
results = ragfusion_chain.invoke({"question": original_query})

In [121]:
# Alias used by the final RAG chain. Note this is the whole fusion chain: it is
# invoked with a {"question": ...} dict, not with a plain query string.
retriever = ragfusion_chain

# Generation

In [122]:
import os
from dotenv import find_dotenv, load_dotenv

# Load environment variables from the located .env file into the process environment.
load_dotenv(find_dotenv())

True

In [123]:
from langchain.prompts import PromptTemplate
# Answer-generation prompt: the model is told to answer from the retrieved context only.
prompt_template = """
Use the following piece of context to answer the question asked.
Please try to provide the answer only based on the context
{context}
Question:{question}
Helpful Answers:
 """
# The input variables ("context", "question") are inferred from the template placeholders.
prompt = PromptTemplate.from_template(prompt_template)

In [124]:
import os

from huggingface_hub import login

# SECURITY: never commit access tokens to a notebook. The token that used to be
# hard-coded here had write permission (see the recorded output) and must be
# revoked. Read the replacement from the environment (e.g. set HF_TOKEN in .env)
# so the notebook fails loudly with a KeyError when it is missing.
login(token=os.environ["HF_TOKEN"])

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to C:\Users\chung\.cache\huggingface\token
Login successful


In [125]:
from langchain_openai import ChatOpenAI
# Chat model used for answer generation; rebinds `llm` with the same model name
# that the HyDE cell uses.
llm = ChatOpenAI(model="gpt-3.5-turbo")

In [127]:
# Question passed to the final RAG chain (same text as `original_query`).
question = "what was year apple without Steve Jobs?"

In [128]:
from operator import itemgetter


def format_context(ranked_docs):
    """Turn retrieval results into plain text for the prompt's {context} slot.

    Accepts the (document, score) tuples produced by the fusion step as well as
    bare documents, and joins their page text with blank lines. Without this
    step the prompt receives the raw repr of the tuples, metadata included.
    """
    texts = []
    for item in ranked_docs:
        doc = item[0] if isinstance(item, tuple) else item
        texts.append(doc.page_content)
    return "\n\n".join(texts)


final_rag_chain = (
    {
        # `retriever` receives the same {"question": ...} dict as the chain input.
        "context": retriever | format_context,
        "question": itemgetter("question"),
    }
    | prompt
    | llm
    | StrOutputParser()
)

final_rag_chain.invoke({"question": question})

'Apple was without Steve Jobs during the 1980s.'