In [None]:
# Issues to address:
# - Figure out the optimal stride and text length (max_chunk_length) for chunking
#

In [1]:
import pinecone
from tqdm import tqdm
from dotenv import load_dotenv
import os

# Load environment variables from the .env file.
# override=True makes values from .env replace variables that are already set
# in the process environment, so a changed API key is picked up on re-run.
load_dotenv(override=True)

# Read the Pinecone API key and fail fast when it is missing: os.getenv returns
# None for an unset variable, which would otherwise only surface later as a
# harder-to-diagnose error when the Pinecone client is initialised.
api_key = os.getenv("PINECONE_API_KEY")
if not api_key:
    raise RuntimeError(
        "PINECONE_API_KEY is not set; add it to your .env file or environment."
    )


  from tqdm.autonotebook import tqdm


In [2]:
# Initialise the Pinecone client and obtain a handle to the index.
import pinecone

PINECONE_ENVIRONMENT = 'gcp-starter'
PINECONE_INDEX_NAME = 'haystack'

pinecone.init(api_key=api_key, environment=PINECONE_ENVIRONMENT)
index = pinecone.Index(PINECONE_INDEX_NAME)

In [3]:
# Build a Haystack document store on top of the Pinecone index created earlier.
from haystack.document_stores import PineconeDocumentStore

store_settings = {
    "api_key": api_key,
    "pinecone_index": index,
    "similarity": "cosine",
    # NOTE(review): presumably has to equal the vector size produced by the
    # retriever's embedding model -- confirm 768 is correct for that model.
    "embedding_dim": 768,
}
document_store = PineconeDocumentStore(**store_settings)

In [None]:
# Extract the textbook text and split it into chunks digestible by the document store.
from TextBookExctraction import Process_PDF

# Chunking parameters (meanings as described by the original author; Process_PDF
# is defined outside this notebook, so verify against its implementation):
#   max_chunk_length -- maximum token length of each chunk/vector in the database.
#   stride           -- step taken between the starts of consecutive chunks.
# Example: stride=2 with max_chunk_length=3 moves 2 steps forward each time, so
# every chunk holds 3 tokens and overlaps its neighbour by 1:
#   [1,2,3], [3,4,5], [5,6,7], ...
MAX_CHUNK_LENGTH = 500
STRIDE = 400

pdf_processor = Process_PDF(pdf_path="./Textbooks/CrackingTheCodingInterview.pdf")
text = pdf_processor.extract_text_from_pdf()
cleaned_text = pdf_processor.preprocess_text(text)
text_chunks = pdf_processor.segment_text(
    cleaned_text,
    max_chunk_length=MAX_CHUNK_LENGTH,
    stride=STRIDE,
)
print(len(text_chunks))

In [4]:
import torch
from haystack.nodes import EmbeddingRetriever

# Embedding model loaded by the retriever (sentence-transformers format).
EMBEDDING_MODEL = "flax-sentence-embeddings/all_datasets_v3_mpnet-base"

# Initialise the retriever against the Pinecone-backed document store.
retriever = EmbeddingRetriever(
    embedding_model=EMBEDDING_MODEL,
    model_format="sentence_transformers",
    document_store=document_store,
)

# import torch
# #Initialize retriever model
# from haystack.nodes import EmbeddingRetriever
# retriever = EmbeddingRetriever(
#     document_store=document_store,
#     embedding_model="flax-sentence-embeddings/all_datasets_v3_mpnet-base",
#     model_format="sentence_transformers",
#     top_k=2
# )

  return self.fget.__get__(instance, owner)()


In [None]:
from haystack import Document

batch_size = 256
total_doc_count = len(text_chunks)

# Embed and write the chunks in batches of `batch_size`; the final batch simply
# holds whatever is left over.
# NOTE(review): assumes text_chunks supports slicing (e.g. a list) -- confirm.
for batch_start in range(0, total_doc_count, batch_size):
    batch_end = batch_start + batch_size
    docs = [Document(content=chunk) for chunk in text_chunks[batch_start:batch_end]]

    # Attach each embedding to its document before writing to the store.
    embeds = retriever.embed_documents(docs)
    for doc, embedding in zip(docs, embeds):
        doc.embedding = embedding

    document_store.write_documents(docs)


In [5]:
# Example: query the retriever through a document-search pipeline.
from haystack.pipelines import DocumentSearchPipeline
from haystack.utils import print_documents

example_query = "what happens during a coding interview?"
retriever_params = {"Retriever": {"top_k": 2}}

search_pipe = DocumentSearchPipeline(retriever)
result = search_pipe.run(query=example_query, params=retriever_params)
print(type(result))

# Keep only the text of each returned document; later cells reuse this list.
content_pieces = [hit.content for hit in result['documents']]
print(content_pieces[1])

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

<class 'dict'>
s to explore what areas of technology you're familiar with. 
Next, you fly to Seattle (or whichever office you're interviewing for) for four or five interviews with one or 
two teams that have selected you based on your resume and phone interviews. You will have to code on a 
whiteboard, and some interviewers will stress other skills. Interviewers are each assigned a specific area to 
probe and may seem very different from each other. They cannot see the other feedback until they have 
submitted


In [10]:
from haystack.nodes import PromptNode, PromptTemplate, AnswerParser

# Long-form QA prompt text. The leading whitespace on the continuation lines is
# part of the string literal and is kept exactly as originally written.
LFQA_PROMPT_TEXT = """Synthesize a comprehensive answer from the following text for the given question.
                             Provide a clear and concise response that summarizes the key points and information presented in the text.
                             Your answer should be in your own words and be no longer than 50 words.
                             \n\n Related text: {join(documents)} \n\n Question: {query} \n\n Answer:"""

lfqa_prompt = PromptTemplate(prompt=LFQA_PROMPT_TEXT, output_parser=AnswerParser())

# No model name is passed, so PromptNode uses whatever model the library defaults to.
prompt_node = PromptNode(default_prompt_template=lfqa_prompt)


config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [9]:
# Sanity check: wrap the first retrieved passage in a Document and show its repr.
first_doc = Document(content_pieces[0])
print(first_doc)

<Document: id=3a4a4200b73a0aab07f1c7cbd9caa735, content='(and many other companies). algorithm and coding problems form the 
largest component of the intervi...'>


In [11]:
from haystack.pipelines import Pipeline
from haystack.schema import Document

question = "what happens during a coding interview?"
# The retriever node is disabled below, so the two passages fetched earlier are
# supplied to the prompt node by hand.
context_docs = [Document(content_pieces[position]) for position in (0, 1)]

pipeline = Pipeline()
# pipeline.add_node(component=retriever, name="retriever", inputs=["Query"])
pipeline.add_node(component=prompt_node, name="prompt_node", inputs=["Query"])
output = pipeline.run(query=question, documents=context_docs)


In [12]:
# Display only the generated answer strings from the pipeline output.
[answer.answer for answer in output["answers"]]

['You will have to code on a whiteboard, and some interviewers will stress other skills.']