In [1]:
from haystack.utils import convert_files_to_docs
#env + system imports
from dotenv import load_dotenv
import os
#pinecone
import pinecone  
from haystack.document_stores import PineconeDocumentStore
from haystack.pipelines import DocumentSearchPipeline
from haystack.nodes import PromptNode, PromptTemplate, AnswerParser, EmbeddingRetriever
from haystack import Pipeline
from haystack.nodes import TextConverter, PDFToTextConverter, DocxToTextConverter, PreProcessor
from haystack import Document




In [2]:
# Load variables from the local .env file into the process environment;
# override=True lets values from .env replace variables that are already set.
load_dotenv(override=True)

# Read the service credentials (each is None when its variable is not defined).
# The Pinecone key is used when connecting to the vector index; the Hugging Face
# token is not used by the cells visible here.
pinecone_api_key = os.environ.get("PINECONE_API_KEY")
huggingface_api_token = os.environ.get("HUGGING_FACE_API_TOKEN")


In [3]:
# Name of the existing Pinecone index and the Pinecone environment hosting it.
index_name = 'haystack'
pinecone_environment = 'gcp-starter'

# Configure the Pinecone client and open a handle on the existing index.
pinecone.init(
    api_key=pinecone_api_key,
    environment=pinecone_environment,
)
index = pinecone.Index(index_name=index_name)

# Initialize the Haystack document store on top of that index.
# The environment is passed explicitly: per the Haystack 1.x API,
# PineconeDocumentStore's `environment` defaults to "us-west1-gcp", which would
# not match the environment configured above if it were left out.
document_store = PineconeDocumentStore(
    api_key=pinecone_api_key,
    environment=pinecone_environment,
    pinecone_index=index,
    similarity="cosine",
    # Must equal the dimensionality of the vectors written to the index.
    embedding_dim=768,
)

In [4]:
# Path of the single PDF to ingest (despite the name, this is a file, not a directory).
doc_dir = "./Textbooks/CrackingTheCodingInterview.pdf"

# Extract the PDF's text; only the first element of the conversion result is kept.
converter = PDFToTextConverter(
    remove_numeric_tables=True,
    valid_languages=["en"],
)
doc_pdf = converter.convert(file_path=doc_dir, meta=None)[0]

# Clean the extracted text and split it into passages of up to 250 words.
preprocessor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=False,
    split_by="word",
    split_length=250,
    split_respect_sentence_boundary=True,  # prevents sentences from being cut off
)
docs = preprocessor.process([doc_pdf])
print(f"n_docs_output: {len(docs)}")

Preprocessing:   0%|                                                                                  | 0/1 [00:00<?, ?docs/s]We found one or more sentences whose split count is higher than the split length.
Preprocessing: 100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.52docs/s]

n_docs_output: 1096





In [5]:
# Embedding-based retriever bound to the Pinecone-backed document store.
# The sentence-transformers model below is used to embed the documents.
retriever = EmbeddingRetriever(
    document_store=document_store,
    embedding_model="flax-sentence-embeddings/all_datasets_v3_mpnet-base",
    model_format="sentence_transformers",
    top_k=2,
)

  return self.fget.__get__(instance, owner)()


In [6]:
# Number of documents embedded and written to the document store per batch.
batch_size = 256
total_doc_count = len(docs)

# Walk `docs` in consecutive slices of at most `batch_size` documents. The final
# slice holds the remainder, and an empty `docs` results in no work at all.
for batch_start in range(0, total_doc_count, batch_size):
    batch = docs[batch_start:batch_start + batch_size]

    # One embedding per document, matched to the documents by position.
    embeddings = retriever.embed_documents(batch)
    for batch_doc, embedding in zip(batch, embeddings):
        batch_doc.embedding = embedding

    # Persist the batch together with its freshly attached embeddings.
    document_store.write_documents(batch)


Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents: 100%|███████████████████████████████████████████████████████████████████| 256/256 [00:01<00:00, 200.96it/s]


Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents: 100%|███████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 299.64it/s]


Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents: 100%|███████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 260.40it/s]


Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Writing Documents: 100%|███████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 273.14it/s]


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Writing Documents: 128it [00:00, 400.74it/s]                                                                                  
