# DatabaseManager

> This will be the interface between an application and the database.  

> In the immediate term, it will initialise the database, read the Travel Advice JSON, ingest it into the database in chunks, and perform searches given an embedding.

In [None]:
#| default_exp DatabaseManager

In [None]:
#| export
import json
from bs4 import BeautifulSoup
import re

In [None]:
#| export
class DatabaseManager:
    """
    Interface between an application and the database.

    Only `ingest_json` is implemented so far. The remaining methods are
    placeholders that document the intended pipeline (chunk, embed, store,
    search, retrieve) and currently do no real work.
    """

    def __init__(self, config):
        """
        Initialize the DatabaseManager with configuration settings.
        :param config: Configuration details for database connection and other settings.
        """
        self.config = config
        # Initialize database connection here

    def chunk_json(self, json_data):
        """
        Divide the JSON data into manageable chunks.

        Placeholder: currently ignores its input and returns an empty list.
        :param json_data: The parsed JSON data.
        :return: List of chunks.
        """
        # Implement chunking logic here
        chunks = []
        return chunks

    def embed_chunks(self, chunks):
        """
        Create embeddings for each chunk of data.

        Placeholder: currently ignores its input and returns an empty list.
        :param chunks: List of data chunks.
        :return: List of embedded chunks.
        """
        # Implement embedding logic here
        embedded_chunks = []
        return embedded_chunks

    def initialize_database(self):
        """
        Set up the Milvus database, including connection and schema.

        Placeholder: currently does nothing.
        """
        # Implement database initialization here

    def store_in_milvus(self, embedded_chunks):
        """
        Store embedded chunks in the Milvus database.

        Placeholder: currently does nothing.
        :param embedded_chunks: List of embedded chunks.
        """
        # Implement storage logic here

    def search_database(self, query, k):
        """
        Search the database for K nearest chunks based on the query embedding.

        Placeholder: currently does nothing and returns None.
        :param query: Search query.
        :param k: Number of nearest chunks to find.
        :return: Search results.
        """
        # Implement search logic here

    def retrieve_data(self, search_results):
        """
        Fetch chunk data and metadata based on search results.

        Placeholder: currently does nothing and returns None.
        :param search_results: Results from the database search.
        :return: Corresponding data and metadata.
        """
        # Implement data retrieval logic here

    def __del__(self):
        """
        Cleanup when an instance is destroyed, like closing database connections.

        Placeholder: there is nothing to release yet.
        """
        # Implement cleanup logic here

    def ingest_json(self, file_path):
        """
        Read and parse a JSON file.
        :param file_path: Path to the JSON file.
        :return: Parsed JSON data.
        :raises OSError: If the file cannot be opened.
        :raises json.JSONDecodeError: If the file does not contain valid JSON.
        """
        # Decode explicitly as UTF-8: without `encoding`, open() falls back to
        # the platform's locale encoding, which can mis-decode or reject
        # non-ASCII characters in the file.
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
        return data



## Configure the database

I'm not sure how we'll do it in the future.

For the time being, we are going to do two things:
1. ingest the travel advice json
2. turn that into a local database
   





In [None]:
# Notebook-level settings; 'travel_file' is the path handed to
# DatabaseManager.ingest_json further down.
config = dict(
    travel_file="./ingest_data_sources/travel-advice-all-countries.json",
)

# Manager instance shared by the cells below
db_manager = DatabaseManager(config)


### Ingesting

Here's what I'm doing

1. We load the travel advice json into memory (data)
2. We loop over it to turn it into a new structure that better fits our needs.
```json
{
    "url": "/foreign-travel-advice/british-indian-ocean-territory",
    "country_name": "British Indian Ocean Territory",
    "content_title": "Summary",
    "part_id": 0,
    "content": "Before you travel, check the 'Entry requirements' section for the British Indian Ocean Territory's current entry restrictions and requirements. Due to COVID-19 these may change with little warning. Monitor this advice for the latest updates and stay in contact with your travel provider. If you plan to pass through another country on your journey, check the travel advice for the country you're transiting. It is more important than ever to get travel insurance and check it provides appropriate cover. See the FCDO's guidance on foreign travel insurance. The British Indian Ocean Territory is a UK Overseas Territory. It is administered from London and there is no British diplomatic or consular representation there. Although there's no recent history of terrorism in the British Indian Ocean Territory, attacks can't be ruled out. See Terrorism",
    "content_html": "\n<div class=\"example\">\n<p>Before you travel, <a href=\"/foreign-travel-advice/british-indian-ocean-territory/entry-requirements#entry-rules-in-response-to-coronavirus-covid-19\">check the \u2018Entry requirements\u2019 section</a> for the British Indian Ocean Territory\u2019s current entry restrictions and requirements. Due to COVID-19 these may change with little warning. Monitor this advice for the latest updates and stay in contact with your travel provider.</p>\n\n<p>If you plan to pass through another country on your journey, check the travel advice for the country you\u2019re transiting.</p>\n</div>\n\n<p>It is more important than ever to get travel insurance and check it provides appropriate cover. See the <abbr title=\"Foreign, Commonwealth &amp; Development Office\">FCDO</abbr>\u2019s <a href=\"/guidance/foreign-travel-insurance\">guidance on foreign travel insurance</a>.</p>\n\n<p>The British Indian Ocean Territory is a UK Overseas Territory. It is administered from London and there is no British diplomatic or consular representation there.</p>\n\n<p>Although there\u2019s no recent history of terrorism in the British Indian Ocean Territory, attacks can\u2019t be ruled out. See <a href=\"/foreign-travel-advice/british-indian-ocean-territory/terrorism\">Terrorism</a></p>\n\n"
}
```

In [None]:
# Load the raw travel-advice export; `data` is reused by later cells.
data = db_manager.ingest_json(db_manager.config['travel_file'])

# Replace common Unicode characters with ASCII equivalents.
# Built once, outside the loops, because the mapping never changes.
replacements = {
    '\u2018': "'", '\u2019': "'",  # Single quotes
    '\u201c': '"', '\u201d': '"',  # Double quotes
    '\u2026': '...',               # Ellipsis
    # Add more replacements as needed
}
# Every key is a single character, so the whole mapping can be applied in one
# pass with str.translate instead of one .replace() call per entry.
ascii_table = str.maketrans(replacements)
whitespace_run = re.compile(r'\s+')

# Flatten the per-country records into one record per advice "part".
new_data_structure = []
for element in data:
    country_info = element["countryInfo"]
    base_path = country_info["base_path"]
    country_name = country_info["details"]["country"]["name"]
    for index, part in enumerate(country_info["details"]["parts"]):
        html_content = part["body"]
        text_content = BeautifulSoup(html_content, 'html.parser').get_text()

        # Collapse newlines, tabs and repeated spaces into a single space,
        # and strip leading/trailing whitespace
        text_content = whitespace_run.sub(' ', text_content).strip()
        text_content = text_content.translate(ascii_table)

        new_data_structure.append({
            "url": base_path,
            "country_name": country_name,
            "content_title": part["title"],
            "part_id": index,  # position of the part within its country
            "content": text_content,
            "content_html": html_content,
        })

new_json_string = json.dumps(new_data_structure, indent=4)

# Persist the flattened structure next to the notebook
file_name = "new_json2.json"
with open(file_name, 'w', encoding='utf-8') as file:
    file.write(new_json_string)

# Preview only the start of the output; the full string is large
print(new_json_string[:5000])  # Adjust the slice as needed

[
    {
        "url": "/foreign-travel-advice/british-indian-ocean-territory",
        "country_name": "British Indian Ocean Territory",
        "content_title": "Summary",
        "part_id": 0,
    },
    {
        "url": "/foreign-travel-advice/british-indian-ocean-territory",
        "country_name": "British Indian Ocean Territory",
        "content_title": "Coronavirus",
        "part_id": 1,
        "content": "Coronavirus travel health See the TravelHealthPro website for further advice on travel abroad and reducing spread of respiratory viruses during the COVID-19 pandemic. International travel There are no commercial flights to and from Diego Garcia. Access is only allowed for those on pre-authorised official duty. Entry and borders See Entry requirements to find out what you will need to do when you arrive in the British Indian Ocean Territory. Be prepared for your plans to change No travel is risk-free during COVID-19. Countries may further restrict travel or bring in new rul

In [None]:

# Report the top-level container type of the parsed JSON and how many
# entries it holds.
if isinstance(data, list):
    # A list: count its items
    number_of_elements = len(data)
    print("list")
elif isinstance(data, dict):
    # A dictionary: count its key-value pairs
    number_of_elements = len(data)
    print("dictionary")  # label previously carried a stray ')'
else:
    # Any other top-level type is reported as zero elements
    number_of_elements = 0
    print("else")

print("Number of elements in the JSON file:", number_of_elements)


def clear_string_values(data):
    """
    Recursively replace every string value in a JSON-like structure with "".

    Dictionaries and lists are rebuilt rather than modified, so the input is
    left untouched. Dictionary keys are kept as they are. Strings are blanked
    wherever they occur as values: as dictionary values, as list items, or as
    the top-level argument itself. All other values (numbers, booleans, None)
    are returned unchanged.

    :param data: The JSON-like dictionary, list or scalar.
    :return: A structure of the same shape with every string value emptied.
    """
    # Handling strings here, rather than only while walking dict values,
    # also blanks strings that sit directly inside lists.
    if isinstance(data, str):
        return ""
    if isinstance(data, dict):
        return {key: clear_string_values(value) for key, value in data.items()}
    if isinstance(data, list):
        return [clear_string_values(item) for item in data]
    return data

# Small stand-in record used to try the helper out
json_data = {
    "Country Slug": "british-indian-ocean-territory",
    # ... rest of the JSON data
}

# Run the sample through clear_string_values and show the result
cleared_data = clear_string_values(json_data)
cleared_text = json.dumps(cleared_data, indent=4)
print(cleared_text)

# Do the same for the first record of the loaded file; `item0` is written to
# disk by a later cell
item0 = clear_string_values(data[0])
item0_text = json.dumps(item0, indent=4)
print(item0_text)

{
    "Country Slug": "",
    "Country Name": "",
    "countryInfo": {
        "analytics_identifier": null,
        "base_path": "",
        "content_id": "",
        "description": "",
        "details": {
            "alert_status": [],
            "change_description": "",
            "change_history": [
                {
                    "note": "",
                    "public_timestamp": ""
                },
                {
                    "note": "",
                    "public_timestamp": ""
                },
                {
                    "note": "",
                    "public_timestamp": ""
                },
                {
                    "note": "",
                    "public_timestamp": ""
                },
                {
                    "note": "",
                    "public_timestamp": ""
                },
                {
                    "note": "",
                    "public_timestamp": ""
                },
                {


In [None]:

# Destination file for the blanked-out first record
file_name = "empty.json"

# Serialise item0 straight into that file
with open(file_name, 'w') as out_file:
    json.dump(item0, out_file)

In [None]:
#| hide
# Keep this as the last cell so the code is auto-exported
import nbdev

nbdev.nbdev_export()