# DatabaseManager

> This will be the interface between an application and the database.  

> In the immediate term, it will initialise the database, read the Travel Advice JSON, ingest it into the database in chunks, and perform searches given an embedding.

In [None]:
#| default_exp DatabaseManager

In [None]:
#| export
import json
from bs4 import BeautifulSoup
import re

In [None]:
#| export
class DatabaseManager:
    """
    Interface between an application and the database.

    Only ``__init__`` and ``ingest_json`` do real work at the moment; every
    other method is a placeholder whose body is still to be written.
    """

    def __init__(self, config: dict):
        """
        Initialize the DatabaseManager with configuration settings.
        :param config: Configuration details for database connection and other settings.
        """
        self.config = config
        # Initialize database connection here

    def chunk_json(self, json_data) -> list:
        """
        Divide the JSON data into manageable chunks.

        Placeholder: the chunking logic is not implemented yet, so an empty
        list is always returned regardless of the input.
        :param json_data: The parsed JSON data.
        :return: List of chunks (currently always empty).
        """
        # Implement chunking logic here
        chunks = []
        return chunks

    def embed_chunks(self, chunks) -> list:
        """
        Create embeddings for each chunk of data.

        Placeholder: the embedding logic is not implemented yet, so an empty
        list is always returned regardless of the input.
        :param chunks: List of data chunks.
        :return: List of embedded chunks (currently always empty).
        """
        # Implement embedding logic here
        embedded_chunks = []
        return embedded_chunks

    def initialize_database(self) -> None:
        """
        Set up the Milvus database, including connection and schema.

        Placeholder: currently does nothing.
        """
        # Implement database initialization here

    def store_in_milvus(self, embedded_chunks) -> None:
        """
        Store embedded chunks in the Milvus database.

        Placeholder: currently does nothing.
        :param embedded_chunks: List of embedded chunks.
        """
        # Implement storage logic here

    def search_database(self, query, k):
        """
        Search the database for K nearest chunks based on the query embedding.

        Placeholder: currently does nothing and returns None.
        :param query: Search query.
        :param k: Number of nearest chunks to find.
        :return: Search results (currently None).
        """
        # Implement search logic here

    def retrieve_data(self, search_results):
        """
        Fetch chunk data and metadata based on search results.

        Placeholder: currently does nothing and returns None.
        :param search_results: Results from the database search.
        :return: Corresponding data and metadata (currently None).
        """
        # Implement data retrieval logic here

    def __del__(self):
        """
        Cleanup when an instance is destroyed, like closing database connections.

        Placeholder: currently does nothing.
        """
        # Implement cleanup logic here

    def ingest_json(self, file_path):
        """
        Read and parse a JSON file.

        The file is decoded as UTF-8 explicitly so the result does not depend
        on the platform's default locale encoding.
        :param file_path: Path to the JSON file.
        :return: Parsed JSON data.
        """
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
        return data



## Configure the database

I'm not sure how we'll do it in the future.

For the time being, we are going to do two things:
1. ingest the travel advice json
2. turn that into a local database
   





In [None]:
# Settings handed to DatabaseManager; for now only the location of the raw
# travel-advice JSON dump is needed.
config = dict(
    travel_file="./ingest_data_sources/travel-advice-all-countries.json",
)

# Single manager instance reused by the cells below.
db_manager = DatabaseManager(config)


### Ingesting

Here's what I'm doing

1. We load the travel advice json into memory (data)
2. We loop around it to turn it into a new structure that better fits our needs.

   Each item will have this structure:
   ```json
   {
       "url": "/foreign-travel-advice/british-indian-ocean-territory",
       "country_name": "British Indian Ocean Territory",
       "content_title": "Summary",
       "part_id": 0,
       "content": "Before you travel, check the 'Entry requirements'....",
       "content_html": "\n<div class=\"example\">\n<p>Before you travel, ..."
   }
   ```

3. Note that for any individual country there are several "parts" - this is why we have so much metadata. It's so we can pull back each chunk.
4.  We save the newly formatted JSON as chunk_json_format.json in case we want to skip all this in the future - for the time being, I'm assuming the source 'dump' may change, so we regenerate it each time.


In [None]:
# Load the raw travel-advice dump from the path held in the manager's config.
data = db_manager.ingest_json(db_manager.config['travel_file'])

# Both helpers below are loop-invariant, so they are built once up front
# instead of once per part of every country.
whitespace_run = re.compile(r'\s+')

# Replace the Unicode characters I found with ASCII equivalents (maybe there are others)
replacements = {
    '\u2018': "'", '\u2019': "'",  # Single quotes
    '\u201c': '"', '\u201d': '"',  # Double quotes
    '\u2026': '...',               # Ellipsis (add anything else as required)
}
# A translation table applies every replacement in a single pass over the text.
ascii_table = str.maketrans(replacements)

# Flatten the per-country structure into one record per "part" of a country.
new_data_structure = []
for element in data:
    country_info = element["countryInfo"]
    base_path = country_info["base_path"]
    details = country_info["details"]
    country_name = details["country"]["name"]
    for index, part in enumerate(details["parts"]):
        html_content = part["body"]

        # Strip the markup, keeping only the text nodes.
        text_content = BeautifulSoup(html_content, 'html.parser').get_text()

        # Collapse newlines, tabs and repeated spaces into one space, then
        # trim leading/trailing whitespace.
        text_content = whitespace_run.sub(' ', text_content).strip()
        text_content = text_content.translate(ascii_table)

        new_data_structure.append({
            "url": base_path,
            "country_name": country_name,
            "content_title": part["title"],
            "part_id": index,  # position of this part within its country
            "content": text_content,
            "content_html": html_content,
        })

new_json_string = json.dumps(new_data_structure, indent=4)

# Persist the reshaped records so the steps above can be skipped later.
file_name = "chunk_json_format.json"
with open(file_name, 'w', encoding='utf-8') as file:
    file.write(new_json_string)

#print(new_json_string[:5000])  # Adjust the slice as needed

## Milvus Stuff

In [None]:
!ifconfig lo


import pymilvus

from pymilvus import connections
connections.connect(
  alias="default",
  user='username',
  password='password',
  host='127.0.0.1',
  port='19530'
)


lo: flags=73<UP,LOOPBACK,RUNNING>  mtu 65536
        inet 127.0.0.1  netmask 255.0.0.0
        inet6 ::1  prefixlen 128  scopeid 0x10<host>
        loop  txqueuelen 1000  (Local Loopback)
        RX packets 196621  bytes 1191881694 (1.1 GB)
        RX errors 0  dropped 0  overruns 0  frame 0
        TX packets 196621  bytes 1191881694 (1.1 GB)
        TX errors 0  dropped 0 overruns 0  carrier 0  collisions 0



MilvusException: <MilvusException: (code=2, message=Fail connecting to server on 127.0.0.1:19530. Timeout)>

In [None]:
#| hide
# Leave this to the bottom so we auto-export code
import nbdev

nbdev.nbdev_export()