# Load PDF and indexing

In [2]:
# Load the source PDF; `docs` holds the Document objects returned by the loader.
from langchain_community.document_loaders import PyMuPDFLoader
loader = PyMuPDFLoader('introduction to apple.pdf')  # relative path: resolved against the current working directory
docs = loader.load()

In [3]:
# Sanity check: show the extracted text of the first loaded document.
print(docs[0].page_content)

Introduction of Apple. Inc.
 
Apple Inc.
 
Is an American multinational corporation, one of the greatest in the world that designs and 
manufactures consumer electronics and computer software products? The company's best-known
hardware products Macintosh computers, iPod, iPhone, software’s including the Mac OS X 
operating system, iTunes and other creativity software’s like iWork represent the face of the 
music, phone, and computing industry. The company operates more than 250 retail stores in 
sixteen countries and an online store where hardware and software products are sold.
 
Established in Cupertino, California on April 1, 1976 and incorporated January 3, 1977, the 
company was called Apple Computer, Inc. for its Rest 30 years, but dropped the word 
"Computer" on January 9, 2007 to reject the company's ongoing expansion into the consumer 
electronics market in addition to its traditional focus on personal computers. Apple has about 
42,800 employees worldwide and had worldwide an

In [4]:
# Import from langchain_community (as the loader and vector store cells do);
# the `langchain.embeddings` re-export of this class is deprecated.
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
import torch

model_name = "BAAI/bge-small-en-v1.5"
encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity

# Use the GPU when one is visible to torch, otherwise fall back to CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Embedding model shared by the dense (vector store) side of the retriever.
bge_embeddings = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs={'device': device},
    encode_kwargs=encode_kwargs
)


  from tqdm.autonotebook import tqdm, trange


# Retriever

## Hybrid Search

### Parent child retriever - Dense retriever

In [5]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore
from langchain_community.vectorstores import Chroma


# Vector store used by the retriever; embeddings come from `bge_embeddings`.
vectorstore = Chroma(
    collection_name="full_documents",
    embedding_function=bge_embeddings  #OpenAIEmbeddings()
)

# Two splitting granularities: larger "parent" chunks (2000 chars) and
# smaller "child" chunks (400 chars).
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000)
child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)

# In-memory docstore handed to the retriever (not persisted across kernel restarts).
store = InMemoryStore()

parent_child_retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter,
)

# Index the loaded PDF documents through the retriever.
# NOTE(review): re-running this cell presumably adds the same documents again — confirm before re-executing.
parent_child_retriever.add_documents(docs)

  vectorstore = Chroma(
  attn_output = torch.nn.functional.scaled_dot_product_attention(


In [6]:
# Number of keys currently held in the docstore (14 in the recorded run).
len(list(store.yield_keys()))

14

### BM25 retriever - Sparse retriever

In [7]:
# Import from langchain_community, consistent with the loader and vector store
# imports; the `langchain.retrievers` re-export of BM25Retriever is deprecated.
from langchain_community.retrievers import BM25Retriever

# Sparse keyword retriever built over the loaded `docs`.
bm25_retriever = BM25Retriever.from_documents(docs)
bm25_retriever.k = 2  # return the top 2 matches per query

### Ensemble Retriever

In [8]:
from langchain.retrievers import EnsembleRetriever

# Hybrid search: combine the sparse BM25 retriever and the dense parent/child
# retriever, weighted equally (0.5 each).
ensemble_retriever = EnsembleRetriever(retrievers=[bm25_retriever, parent_child_retriever],
                                       weights=[0.5, 0.5])

In [9]:
# Smoke-test the hybrid retriever. `invoke` is the supported Runnable entry
# point; `get_relevant_documents` is deprecated and emits a warning.
ensemble_retriever.invoke("Steve Jobs returned to his company as CEO in 1997")

  ensemble_retriever.get_relevant_documents("Steve Jobs returned to his company as CEO in 1997")


[Document(metadata={'source': 'introduction to apple.pdf', 'file_path': 'introduction to apple.pdf', 'page': 1, 'total_pages': 7, 'format': 'PDF 1.4', 'title': '', 'author': 'ACER-PC', 'subject': '', 'keywords': '', 'creator': 'Writer', 'producer': 'LibreOffice 4.2', 'creationDate': "D:20161211014318Z'", 'modDate': '', 'trapped': ''}, page_content="the iPhone’s success). One problem was that Apple allowed Motorola, Sharp, and Digital Ocean \nto manufacture devices that ran the Newton OS and ultimately the lack of uniformity may have \nbeen a turnoff to adopters.\n1990s, The Coming back of Steve Jobs\n \nWhen Steve Jobs returned to his company as CEO in 1997, he had learned from his mistakes, as \nwell as from Apple's mistakes over the previous decade. One of Jobs’ best decisions upon his \nreturn was to stop cloning Macintosh computers. Since 1995, Apple had been allowing the other \ncompanies to make Mac-compatible computers. The arrangement was cutting into Apple's \nbottom line and 

## HyDE

In [10]:
def remove_null(x):
    """Return a list of the truthy items of `x`, preserving their order."""
    return list(filter(None, x))

In [29]:
from dotenv import find_dotenv, load_dotenv
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

# Load variables from .env before the OpenAI client is created, so this cell
# also works on a fresh top-to-bottom run (presumably OPENAI_API_KEY lives
# there — confirm). Calling load_dotenv again later in the notebook is harmless.
load_dotenv(find_dotenv())

llm = ChatOpenAI(model="gpt-3.5-turbo")
promt_template = """
Please write 4 scientific paper passages to answer the question
Question: {question}
Passage:
"""
promt = PromptTemplate.from_template(promt_template)

# HyDE step: ask the LLM for hypothetical passages, split the reply into lines,
# and drop empty lines. Each remaining line is later used as a retrieval query.
generate_hypothetical_docs = (
    promt
    | llm
    | StrOutputParser()
    | (lambda text: text.split("\n"))
    | remove_null  # reuse the helper defined above instead of a duplicate lambda
)

In [30]:
# Try the HyDE generator on a sample question.
# NOTE(review): in the recorded output the passages describe the fruit, not the
# company — the question carries no company context.
query = "what is apple ?"
results = generate_hypothetical_docs.invoke({"question": query})

In [31]:
# Display the generated hypothetical passages (a list of non-empty lines).
results

['1. Apples are a widely cultivated fruit that belong to the Rosaceae family and the Malus genus. They are known for their crisp texture, sweet flavor, and variety of colors ranging from red, yellow, and green. ',
 '2. The anatomy of an apple consists of several parts, including the skin, flesh, core, seeds, and stem. The skin is rich in antioxidants, while the flesh contains dietary fiber and vitamins. The core houses the seeds, which are capable of germinating into new apple trees.',
 '3. Apples are a rich source of nutrients, including vitamin C, potassium, and dietary fiber. They have been linked to several health benefits, such as reducing the risk of chronic diseases like heart disease, cancer, and diabetes. ',
 '4. The cultivation of apples dates back thousands of years, with a wide variety of cultivars being developed over time. Different types of apples have unique flavors, textures, and uses, making them a versatile fruit enjoyed in various culinary dishes and beverages world

## Contextual compression retriever

In [14]:
# retriever =  (generate_hypothetical_docs | ensemble_retriever)

In [15]:
# from langchain.llms import OpenAI
# from langchain.retrievers import ContextualCompressionRetriever
# from langchain.retrievers.document_compressors import LLMChainExtractor

# # making the compressor
# llm = OpenAI(temperature=0)
# compressor = LLMChainExtractor.from_llm(llm)

# # it needs a base retriever (here the ensemble retriever defined above) and a compressor (made above)
# compression_retriever = ContextualCompressionRetriever(base_compressor=compressor,
#                                                        base_retriever=ensemble_retriever)

In [16]:
# compressed_docs = compression_retriever.invoke({"question":"what was year apple without Steve Jobs?"})

In [17]:
# compressed_docs

## Fusion ranking


In [18]:
from langchain.load import dumps, loads


def reciprocal_rank_fusion(results: list[list], k=60):
    """Fuse several ranked document lists into one ranking.

    Each document scores 1 / (rank + k) per list it appears in, where `rank`
    is its 0-based position in that list; scores are summed across lists.
    Documents are identified by their serialized form (`dumps`).

    Args:
        results: One ranked list of documents per query, most relevant first.
        k: Smoothing constant added to the rank in the denominator.

    Returns:
        A list of (document, fused_score) tuples, highest score first.
        Empty input yields an empty list.
    """
    fused_scores = {}
    for ranked_docs in results:
        # Assumes each list is already sorted by relevance.
        for rank, doc in enumerate(ranked_docs):
            doc_key = dumps(doc)
            fused_scores[doc_key] = fused_scores.get(doc_key, 0) + 1 / (rank + k)

    # The sort is stable, so ties keep their first-seen order.
    ranked = sorted(fused_scores.items(), key=lambda item: item[1], reverse=True)
    return [(loads(doc_key), score) for doc_key, score in ranked]

In [19]:
# RAG-fusion pipeline: generate hypothetical passages, run the ensemble
# retriever once per passage (`.map()`), then fuse the ranked lists.
ragfusion_chain = generate_hypothetical_docs | ensemble_retriever.map() | reciprocal_rank_fusion 

In [20]:
# Sample question used to exercise the fusion chain.
original_query = "what was year apple without Steve Jobs?"

In [21]:
# Run the fusion chain. Note this rebinds `results`, which earlier held the
# HyDE passages; it now holds (document, score) tuples.
results = ragfusion_chain.invoke({"question": original_query})

  (loads(doc), score)


In [22]:
# Alias: the generation section refers to the fusion chain as `retriever`.
retriever = ragfusion_chain

# Generation

In [23]:
import os
from dotenv import find_dotenv, load_dotenv

# Locate a .env file and load its variables into the process environment
# (returned True in the recorded run).
load_dotenv(find_dotenv())

True

In [24]:
from langchain.prompts import PromptTemplate
# Answer-generation prompt: `{context}` receives the retrieved material and
# `{question}` the user's question.
prompt_template="""
Use the following piece of context to answer the question asked.
Please try to provide the answer only based on the context
{context}
Question:{question}
Helpful Answers:
 """
prompt=PromptTemplate(template=prompt_template,input_variables=["context","question"])

In [25]:
import os
from huggingface_hub import login

# Never hardcode access tokens in a notebook: read the Hugging Face token from
# the environment (e.g. HF_TOKEN in the .env file loaded above). A missing
# variable raises KeyError rather than silently logging in with nothing.
# SECURITY: the token previously committed in this cell must be revoked.
login(token=os.environ["HF_TOKEN"])

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to C:\Users\chung\.cache\huggingface\token
Login successful


In [26]:
from langchain_openai import ChatOpenAI
# Chat model used for answer generation; same configuration as the `llm`
# created in the HyDE section (this rebinds that name).
llm = ChatOpenAI(model="gpt-3.5-turbo")

In [27]:
# Question passed to the final RAG chain.
question = "what was year apple without Steve Jobs?"

In [28]:
from operator import itemgetter


def _format_context(scored_docs):
    """Join the page text of fused (document, score) pairs into one string.

    The fusion chain returns (document, score) tuples; passing them to the
    prompt unformatted would insert their Python repr (metadata and scores
    included) instead of the passage text.
    """
    return "\n\n".join(doc.page_content for doc, _score in scored_docs)


# Full RAG pipeline: retrieve and fuse context, fill the prompt, call the LLM,
# and return the answer as a plain string.
final_rag_chain = (
    {"context": retriever | _format_context,
     "question": itemgetter("question")}
    | prompt
    | llm
    | StrOutputParser()
)

final_rag_chain.invoke({"question": question})

'Apple was without Steve Jobs in the 1980s.'