In [33]:
import os
import boto3
import pickle
import json
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import PreProcessor, EmbeddingRetriever
from haystack.utils import convert_files_to_docs


def load_json_file(file_path) -> dict:
    """Load a local JSON file and return its parsed contents.

    The file is always decoded as UTF-8 rather than with the platform's
    default encoding, so the result does not depend on the machine's locale.

    Parameters:
        file_path (str): The path to the JSON file to load.

    Returns:
        dict: The parsed contents of the JSON file. Callers in this notebook
            expect a JSON object; if the file's top-level value is another
            JSON type, that value is returned as parsed.

    Raises:
        FileNotFoundError: If the specified file path does not exist.
        ValueError: If the file contents cannot be decoded or parsed as
            valid JSON.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError as exc:
        # Chain explicitly so the original OS-level error stays attached as __cause__.
        raise FileNotFoundError(f"File not found: {file_path}") from exc
    except ValueError as exc:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueError subclasses.
        raise ValueError(f"Invalid JSON in file: {file_path}") from exc


def load_json_from_s3(bucket: str, key: str) -> dict:
    """Fetch an object from S3 and parse its body as JSON.

    Uses the module-level ``S3_RESOURCE`` handle, which must be defined
    before this function is called.

    Args:
        bucket (str): Name of the S3 bucket holding the JSON file.
        key (str): Object key (path inside the bucket) of the JSON file.

    Returns:
        dict: The parsed JSON content of the object.
    """
    response = S3_RESOURCE.Object(bucket, key).get()
    raw_bytes = response["Body"].read()
    # The object body arrives as bytes; decode as UTF-8 before parsing.
    return json.loads(raw_bytes.decode("utf-8"))


# Shared boto3 S3 resource handle; load_json_from_s3 reads it as a module-level global.
S3_RESOURCE = boto3.resource("s3")

# Local config file naming the bucket and the object key where the OpenAI secret lives.
S3_CONFIG = load_json_file("../s3_config.json")

# The API key is read from the "Key" field of a JSON object stored in S3,
# so it never appears in the notebook source.
OPENAI_API_KEY = load_json_from_s3(
    bucket=S3_CONFIG["S3_BUCKET"],
    key=S3_CONFIG["OPENAI_API_S3_KEY"],
)["Key"]

In [38]:
# Directory holding the source files, and the relative path of every entry in it.
doc_dir = "data"
files_to_index = [f"{doc_dir}/{name}" for name in os.listdir(doc_dir)]

# In-memory store for the chunked documents produced by the preprocessor below.
# NOTE(review): embedding_dim=1536 presumably matches the vector size of the
# embedding model used by the retriever — confirm.
split_doc_store = InMemoryDocumentStore(embedding_dim=1536)

# Cleans whitespace/empty lines and splits on words into ~200-word chunks with a
# 10-word overlap, keeping sentence boundaries intact.
preprocessor = PreProcessor(
    split_by="word",
    split_length=200,
    split_overlap=10,
    split_respect_sentence_boundary=True,
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=False,
)

# Convert the files to documents, chunk them, and write the chunks to the store.
raw_docs = convert_files_to_docs(dir_path=doc_dir)
proc_docs = preprocessor.process(raw_docs)
split_doc_store.write_documents(proc_docs)

Preprocessing:   0%|          | 0/11 [00:00<?, ?docs/s]

In [7]:
# Second in-memory store: the converted files are written as-is, without the
# PreProcessor pass that is applied to the documents in split_doc_store.
# Relies on doc_dir being defined by an earlier cell.
# NOTE(review): embedding_dim=1536 presumably matches the vector size of the
# embedding model used by the retriever — confirm.
whole_doc_store = InMemoryDocumentStore(embedding_dim=1536)
whole_docs = convert_files_to_docs(dir_path=doc_dir)
whole_doc_store.write_documents(whole_docs)

In [40]:
# Retriever over the chunked store, backed by the OpenAI embedding model;
# configured with top_k=4.
split_doc_retriever = EmbeddingRetriever(
    document_store=split_doc_store,
    embedding_model="text-embedding-ada-002",
    api_key=OPENAI_API_KEY,
    max_seq_len=8192,
    batch_size=8,
    top_k=4,
)

# Update the embeddings of the documents in the store using this retriever.
split_doc_store.update_embeddings(split_doc_retriever)

# Persist the embedded store so it can be reloaded without re-embedding.
# NOTE(review): only unpickle this file from a trusted location — pickle.load
# can execute arbitrary code.
with open("split_doc_store.pkl", "wb") as f:
    pickle.dump(split_doc_store, f)

Updating Embedding:   0%|          | 0/77 [00:00<?, ? docs/s]

Calculating embeddings:   0%|          | 0/10 [00:00<?, ?it/s]

In [9]:
# Retriever over the unsplit-document store, backed by the same OpenAI
# embedding model; configured with top_k=1.
whole_doc_retriever = EmbeddingRetriever(
    document_store=whole_doc_store,
    embedding_model="text-embedding-ada-002",
    api_key=OPENAI_API_KEY,
    max_seq_len=8192,
    batch_size=8,
    top_k=1,
)

# Update the embeddings of the documents in the store using this retriever.
whole_doc_store.update_embeddings(whole_doc_retriever)

# Persist the embedded store so it can be reloaded without re-embedding.
# NOTE(review): only unpickle this file from a trusted location — pickle.load
# can execute arbitrary code.
with open("whole_doc_store.pkl", "wb") as f:
    pickle.dump(whole_doc_store, f)

Updating Embedding:   0%|          | 0/11 [00:00<?, ? docs/s]

Calculating embeddings:   0%|          | 0/2 [00:00<?, ?it/s]