In [None]:
# Issues to address:
# - Figure out the optimal stride and text_length (max_chunk_length) for chunking
# - Apply namespaces
# - Manage which textbooks are known
# - Build the actual pipeline

In [1]:
import pinecone
from tqdm import tqdm
from dotenv import load_dotenv
import os

# Read the local .env file into the process environment.
# override=True lets values from .env replace variables that are already set,
# so an edited API key is picked up when this cell is re-run.
load_dotenv(override=True)

# Credentials for Pinecone and Hugging Face; each is None when the variable is unset.
pinecone_api_key = os.environ.get("PINECONE_API_KEY")
huggingface_api_token = os.environ.get("HUGGING_FACE_API_TOKEN")



  from tqdm.autonotebook import tqdm


In [2]:
# Connect the Pinecone client and obtain a handle on the 'haystack' index
import pinecone

pinecone.init(api_key=pinecone_api_key, environment="gcp-starter")
index = pinecone.Index("haystack")

In [3]:
# Wrap the Pinecone index handle created earlier in a Haystack document store
from haystack.document_stores import PineconeDocumentStore

document_store = PineconeDocumentStore(
    api_key=pinecone_api_key,
    pinecone_index=index,
    # Vector settings; 768 presumably matches the retriever model's
    # embedding size -- TODO confirm
    embedding_dim=768,
    similarity="cosine",
)

In [None]:
# Extract the textbook and split it into chunks that the document store can ingest
from TextBookExctraction import Process_PDF

# Chunking parameters as described by the notebook author
# (NOTE(review): confirm against Process_PDF.segment_text):
#   max_chunk_length - maximum token length of each chunk/vector in the database
#   stride           - step taken from one chunk to the next
# Example with max_chunk_length=3 and stride=2: move 2 steps forward each time,
# every chunk holds 3 tokens and overlaps its neighbour by 1:
#   [1,2,3], [3,4,5], [5,6,7], ... , [n-1,n,n+1]
pdf_processor = Process_PDF(
    pdf_path="./Textbooks/CrackingTheCodingInterview.pdf",
)
text = pdf_processor.extract_text_from_pdf()
cleaned_text = pdf_processor.preprocess_text(text)
text_chunks = pdf_processor.segment_text(
    cleaned_text,
    max_chunk_length=500,
    stride=400,
)

# Report how many chunks were produced
print(len(text_chunks))

In [4]:
# Embedding retriever bound to the Pinecone-backed document store
import torch
from haystack.nodes import EmbeddingRetriever

embedding_model_name = "flax-sentence-embeddings/all_datasets_v3_mpnet-base"

retriever = EmbeddingRetriever(
    document_store=document_store,
    embedding_model=embedding_model_name,
    model_format="sentence_transformers",
    top_k=3,
)

# A previous revision of this cell was identical except for top_k=2.

  return self.fget.__get__(instance, owner)()


In [5]:
# Question that the retrieval and generation cells run against
query: str = "What kind of Big O notation is used during a coding interview?"

In [6]:
# Retrieval-only pipeline built around the embedding retriever
from haystack.pipelines import DocumentSearchPipeline

search_pipe = DocumentSearchPipeline(
    retriever=retriever,
)


In [6]:
# Hand-built single-node pipeline (Query -> Retriever), run once with `query`
from haystack import Pipeline

pipe = Pipeline()
pipe.add_node(
    component=retriever,
    name="Retriever",
    inputs=["Query"],
)

res = pipe.run(query=query)
print(res)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

{'documents': [<Document: {'content': "(and many other companies). algorithm and coding problems form the \nlargest component of the interview process. Think of these as problem-solving questions. The interviewer \nis looking to evaluate your ability to solve algorithmic problems you haven't seen before. \nVery often, you might get through only one question in an interview. Forty-five minutes is not a long time, \nand it's difficult to get through several different questions in that time frame. \nYou should do your best to talk out loud t", 'content_type': 'text', 'score': 0.8446781335, 'meta': {'doc_type': 'vector'}, 'id_hash_keys': ['content'], 'embedding': None, 'id': '3a4a4200b73a0aab07f1c7cbd9caa735'}>, <Document: {'content': "s to explore what areas of technology you're familiar with. \nNext, you fly to Seattle (or whichever office you're interviewing for) for four or five interviews with one or \ntwo teams that have selected you based on your resume and phone interviews. You wil

In [None]:
from haystack import Document

batch_size = 256
total_doc_count = len(text_chunks)

# Wrap every text chunk in a Haystack Document up front
docs = [Document(content=chunk) for chunk in text_chunks]

# Embed and upload the documents in consecutive slices of at most `batch_size`;
# the last slice holds whatever remains.
for start in range(0, total_doc_count, batch_size):
    batch = docs[start:start + batch_size]
    embeds = retriever.embed_documents(batch)
    # Attach each embedding to its document before writing the batch
    for doc, embedding in zip(batch, embeds):
        doc.embedding = embedding
    document_store.write_documents(batch)


In [7]:
# Generation step: prompt template plus a PromptNode for mistralai/Mistral-7B-v0.1
from haystack import Pipeline
from haystack.nodes import PromptNode, PromptTemplate, AnswerParser

# The template text is passed on as written, including the indentation inside
# the string; {join(documents)} and {query} are filled in at run time.
# NOTE(review): the recorded run of this notebook produced an answer made up
# only of newlines -- the base (non-instruct) model and the heavily indented
# prompt are possible causes; investigate.
prompt = """Synthesize a comprehensive answer from the following top_k most relevant paragraphs and the given question. 
                             Provide a clear and concise response that summarizes the key points and information presented in the paragraphs. 
                             Your answer should be in your own words and be no longer than 50 words. 
                             \n\n Paragraphs: {join(documents)} \n\n Question: {query} \n\n Answer:"""

template = PromptTemplate(
    prompt=prompt,
    output_parser=AnswerParser(),
)
node = PromptNode(
    model_name_or_path="mistralai/Mistral-7B-v0.1",
    default_prompt_template=template,
    api_key=huggingface_api_token,
)

# Fresh, empty pipeline; the prompt node is attached to it in a later cell
pipe = Pipeline()



In [8]:

# Attach the prompt node directly to the query input of `pipe`
from haystack.nodes import PromptNode, PromptTemplate, AnswerParser
from haystack import Document

pipe.add_node(
    component=node,
    name="prompt_node",
    inputs=["Query"],
)


In [10]:
# Retrieve the supporting passages once (top_k overridden to 2 for this query)
# and reuse them for both the generation step and the printed context, so the
# context shown is exactly what the prompt node received and the retriever is
# not queried a second time.
retrieved_docs = search_pipe.run(
    query=query,
    params={"Retriever": {"top_k": 2}},
)["documents"]

res = pipe.run(query=query, documents=retrieved_docs)

#Produced Answer
print(res["answers"])
#Context used
print(retrieved_docs)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[<Answer {'answer': ' \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n', 'type': 'generative', 'score': None, 'context': None, 'offsets_in_document': None, 'offsets_in_context': None, 'document_ids': ['ca2a24af3ea1d88021571f06009656ac', '3b1e1c8d8a57c665a4819626a8165814'], 'meta': {'prompt': "Synthesize a comprehensive answer from the following topk most relevant paragraphs and the given question. \n                             Provide a clear and concise response that summarizes the key points and information presented in the paragraphs. \n                             Your answer should be in your own words and be no longer than 50 words. \n                             \n\n Paragraphs:  O is closer to what academics mean by 0, in that it would be seen as incorrect to describe printing an \narray as O(N2). Industry would just say this is 

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[<Document: {'content': " O is closer to what academics mean by 0, in that it would be seen as incorrect to describe printing an \narray as O(N2). Industry would just say this is O(N). \nFor this book, we will use big O in the way that industry tends to use it: By always trying to offer the tightest \ndescription of the runtime. \nBest Case, Worst Case, and Expected Case \nWe can actually describe our runtime for an algorithm in three different ways. \nCrackingTheCodinglnterview.com I 6th Edition \n39 \nVI I Big 0 \nLet's look a", 'content_type': 'text', 'score': 0.827158302, 'meta': {'doc_type': 'vector'}, 'id_hash_keys': ['content'], 'embedding': None, 'id': 'ca2a24af3ea1d88021571f06009656ac'}>, <Document: {'content': "n this. Some of the most common ones are O(log N), O(N log N), \nO(N), O(N2) and 0( 2N). There's no fixed list of possible runtimes, though. \nYou can also have multiple variables in your runtime. For example, the time to paint a fence that's w meters \nwide and h mete

In [None]:
from haystack.pipelines import Pipeline
from haystack.schema import Document

# Stand-alone pipeline containing only the prompt node; the documents are
# supplied explicitly at run time rather than through a retriever node.
pipeline = Pipeline()

# pipeline.add_node(component=retriever, name="retriever", inputs=["Query"])
# The PromptNode instance in this notebook is bound to `node`.
pipeline.add_node(component=node, name="prompt_node", inputs=["Query"])

# Fetch the two best-matching passages for this question with the
# retrieval-only pipeline and pass them to the prompt node as its documents.
interview_query = "what happens during a coding interview?"
interview_docs = search_pipe.run(
    query=interview_query,
    params={"Retriever": {"top_k": 2}},
)["documents"]

output = pipeline.run(query=interview_query, documents=interview_docs)


In [None]:
# Collect the `answer` attribute of every entry in output["answers"]
[answer.answer for answer in output["answers"]]