# DatabaseManager

> This will be the interface between an application and the database.  

> For now, it will initialise the database, read the Travel Advice JSON, ingest it into the database in chunks, and perform searches given an embedding.

In [None]:
#| default_exp DatabaseManager

In [None]:
%pip install pymilvus

Collecting pymilvus
  Using cached pymilvus-2.3.3-py3-none-any.whl.metadata (4.3 kB)
Collecting grpcio<=1.58.0,>=1.49.1 (from pymilvus)
  Downloading grpcio-1.58.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.0 kB)
Collecting protobuf>=3.20.0 (from pymilvus)
  Downloading protobuf-4.25.1-cp37-abi3-manylinux2014_x86_64.whl.metadata (541 bytes)
Collecting environs<=9.5.0 (from pymilvus)
  Using cached environs-9.5.0-py2.py3-none-any.whl (12 kB)
Collecting ujson>=2.0.0 (from pymilvus)
  Downloading ujson-5.8.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.7 kB)
Collecting minio>=7.0.0 (from pymilvus)
  Using cached minio-7.2.0-py3-none-any.whl.metadata (4.3 kB)
Collecting marshmallow>=3.0.0 (from environs<=9.5.0->pymilvus)
  Using cached marshmallow-3.20.1-py3-none-any.whl.metadata (7.8 kB)
Collecting python-dotenv (from environs<=9.5.0->pymilvus)
  Using cached python_dotenv-1.0.0-py3-none-any.whl (19 kB)
Collecting pycryptodome (from minio>

In [None]:
#| export
import json
from bs4 import BeautifulSoup
import re
import pymilvus

from pymilvus import Collection, CollectionSchema, FieldSchema, DataType

'''

%pip install -qU \
  tiktoken==0.4.0 \
  openai==0.27.7 \
  langchain==0.0.179 \
  pinecone-client==2.2.1 \
  datasets==2.13.1 \
  cohere
'''

'\n\n%pip install -qU   tiktoken==0.4.0   openai==0.27.7   langchain==0.0.179   pinecone-client==2.2.1   datasets==2.13.1   cohere\n'

In [None]:
#| export
class DatabaseManager:
    """
    Interface between an application and the vector database.

    Only ``__init__`` and ``ingest_json`` do real work at the moment; every
    other method is a placeholder whose docstring describes the intended
    behaviour.
    """

    def __init__(self, config):
        """
        Initialize the DatabaseManager with configuration settings.
        :param config: Configuration details for database connection and other settings.
        """
        self.config = config
        # Initialize database connection here

    def chunk_json(self, json_data):
        """
        Divide the JSON data into manageable chunks.
        :param json_data: The parsed JSON data.
        :return: List of chunks (currently always empty: not implemented yet).
        """
        # Implement chunking logic here
        chunks = []
        return chunks

    def embed_chunks(self, chunks):
        """
        Create embeddings for each chunk of data.
        :param chunks: List of data chunks.
        :return: List of embedded chunks (currently always empty: not implemented yet).
        """
        # Implement embedding logic here
        embedded_chunks = []
        return embedded_chunks

    def initialize_database(self):
        """
        Set up the Milvus database, including connection and schema.

        Placeholder: currently does nothing and returns None.
        """
        # Implement database initialization here

    def store_in_milvus(self, embedded_chunks):
        """
        Store embedded chunks in the Milvus database.

        Placeholder: currently does nothing and returns None.
        :param embedded_chunks: List of embedded chunks.
        """
        # Implement storage logic here

    def search_database(self, query, k):
        """
        Search the database for K nearest chunks based on the query embedding.

        Placeholder: currently does nothing and returns None.
        :param query: Search query.
        :param k: Number of nearest chunks to find.
        :return: Search results.
        """
        # Implement search logic here

    def retrieve_data(self, search_results):
        """
        Fetch chunk data and metadata based on search results.

        Placeholder: currently does nothing and returns None.
        :param search_results: Results from the database search.
        :return: Corresponding data and metadata.
        """
        # Implement data retrieval logic here

    def __del__(self):
        """
        Cleanup when an instance is destroyed, like closing database connections.

        Placeholder: there is nothing to clean up yet.
        """
        # Implement cleanup logic here

    def ingest_json(self, file_path):
        """
        Read and parse a JSON file.
        :param file_path: Path to the JSON file.
        :return: Parsed JSON data.
        :raises OSError: If the file cannot be opened.
        :raises json.JSONDecodeError: If the file does not contain valid JSON.
        """
        # Decode explicitly as UTF-8 instead of relying on the locale default:
        # the travel-advice text contains non-ASCII typography (curly quotes,
        # ellipses) that a non-UTF-8 default encoding would fail on or garble.
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
        return data



## Configure the database

I'm not sure how we'll do it in the future.

For the time being, we are going to do two things:
1. ingest the travel advice json
2. turn that into a local database
   





In [None]:
# Settings handed to the manager; for now just the location of the raw
# travel-advice dump.
config = dict(
    travel_file="./ingest_data_sources/travel-advice-all-countries.json",
)

# Manager instance shared by the cells below.
db_manager = DatabaseManager(config)


### Ingesting

Here's what I'm doing

1. We load the travel advice json into memory (data)
2. We loop around it to turn it into a new structure that better fits our needs.

   Each item will have this structure:
   ```json
   {
       "url": "/foreign-travel-advice/british-indian-ocean-territory",
       "country_name": "British Indian Ocean Territory",
       "content_title": "Summary",
       "part_id": 0,
       "content": "Before you travel, check the 'Entry requirements'....",
       "content_html": "\n<div class=\"example\">\n<p>Before you travel, ..."
   }
   ```

3. Note that for any individual country there are several "parts" - this is why we have so much metadata. It's so we can pull back each chunk.
4. We save the newly formatted JSON as chunk_json_format.json in case we want to skip all this in the future - for the time being, I'm assuming a 'dump' of this may change.


In [None]:
# Load the raw travel-advice dump into memory.
data = db_manager.ingest_json(db_manager.config['travel_file'])

# Typographic Unicode characters found in the text so far, mapped to ASCII
# equivalents (add anything else as required).
replacements = {
    '\u2018': "'", '\u2019': "'",  # Single quotes
    '\u201c': '"', '\u201d': '"',  # Double quotes
    '\u2026': '...',               # Ellipsis
}
# Built once, outside the loops, so every part is cleaned in a single
# str.translate pass instead of one str.replace call per character.
ascii_table = str.maketrans(replacements)

# Flatten the dump into one record per (country, part).
new_data_structure = []
for element in data:
    country_info = element["countryInfo"]
    base_path = country_info["base_path"]
    details = country_info["details"]
    country_name = details["country"]["name"]
    for index, part in enumerate(details["parts"]):
        html_content = part["body"]
        soup = BeautifulSoup(html_content, 'html.parser')

        # Plain text: collapse every run of whitespace (newlines, tabs, ...)
        # into one space, trim the ends, then swap the typographic characters.
        text_content = re.sub(r'\s+', ' ', soup.get_text()).strip()
        text_content = text_content.translate(ascii_table)

        new_data_structure.append({
            "url": base_path,
            "country_name": country_name,
            "content_title": part["title"],
            "part_id": index,  # position of this part within its country
            "content": text_content,
            "content_html": html_content,
        })

# Save the reformatted records so this step can be skipped in the future.
new_json_string = json.dumps(new_data_structure, indent=4)

file_name = "chunk_json_format.json"
with open(file_name, 'w', encoding='utf-8') as file:
    file.write(new_json_string)

#print(new_json_string[:5000])  # Adjust the slice as needed

## Milvus Stuff

1. Let's start by connecting to Milvus and creating an appropriate collection


In [None]:

# `connections` is not among the pymilvus names imported at the top of the
# notebook, so bring it into scope here.
from pymilvus import connections

# Register a connection to a local Milvus server under the "default" alias.
# NOTE(review): user/password are placeholder values - replace before use.
connections.connect(
    alias="default",
    user='username',
    password='password',
    host='127.0.0.1',
    port='19530',
)

# Dimension of the stored vectors. 1536 is the output size of OpenAI's
# "text-embedding-ada-002", the embedding model used further down in this
# notebook; change it if a different embedding model is adopted.
VECTOR_DIM = 1536

# Two fields: a caller-supplied integer primary key and the embedding itself.
fields = [
    FieldSchema(name="document_id", dtype=DataType.INT64, is_primary=True, auto_id=False),
    FieldSchema(name="content_vector", dtype=DataType.FLOAT_VECTOR, dim=VECTOR_DIM),
]

schema = CollectionSchema(fields, description="Collection for document embeddings with metadata in JSON")
collection_name = "your_document_collection"
document_collection = Collection(name=collection_name, schema=schema)





SyntaxError: invalid syntax (352051605.py, line 11)

In [None]:

# Neither package is imported by any cell visible in this notebook, so import
# them where they are used.
import openai
import tiktoken

# NOTE(review): `openai_key` and `debug` are not defined in any visible cell -
# they must be set before this cell is run.

# Look up the encoding used by gpt-4, then fetch the tokenizer by its name.
tokenizer_name = tiktoken.encoding_for_model('gpt-4')
tokenizer = tiktoken.get_encoding(tokenizer_name.name)

openai.api_key = openai_key
if debug:
    print(openai.Engine.list())  # check we have authenticated

embed_model = "text-embedding-ada-002"

# Smoke test: embed a small batch of sample phrases in a single request.
res = openai.Embedding.create(
    input=[
        "Sample document text goes here",
        "there will be several phrases in each batch",
    ],
    engine=embed_model,
)

In [None]:
# Imported here so this cell does not rely on an earlier cell having brought
# `connections` into scope (the top-of-notebook imports do not include it).
from pymilvus import connections

# Close the connection registered under the "default" alias.
connections.disconnect("default")


In [None]:
#| hide
# Keep this as the last cell so the code is auto-exported.
import nbdev

nbdev.nbdev_export()