# What

As part of production challenge #172, I need to be able to upload the results to an external server.


## Modules

In [None]:
# Local

# Third Party
import lancedb
import pandas as pd

# Built in
# NOTE: `dotenv` (the python-dotenv package) is third-party, not part of the
# standard library; only `os` below is actually built in.
import dotenv
import os

# Load variables from a .env file into the process environment. The db_URI
# variable read later in this notebook via os.getenv is expected to come from
# there. override=True is python-dotenv's flag for letting .env values replace
# variables that are already set in the environment.
dotenv.load_dotenv(override=True)

# Upload output to viewer vector_db
The end goal will be to take the embeddings datasets and put them into a vector database that is in the cloud.

However, right now I am not working on that, so I can just create this local script that will take the embeddings folder and move it into a vector_db database within the viewer.

In [None]:
# Connect to the LanceDB database stored in the viewer's vector_db folder
# (path is relative to the notebook's working directory).
db = lancedb.connect("../../viewer/vector_db")
# Cell output: the names of the tables currently in that database.
db.table_names()

## Important report text

In [None]:
# Load the pickled important-text embeddings and expose the embedding column
# under the name "vector" before handing the frame to LanceDB.
important_text_embeddings = pd.read_pickle(
    "../../output/embeddings/important_text_embeddings.pkl"
).rename(columns={'important_text_embedding': 'vector'})

# mode="overwrite" replaces any table of the same name left by an earlier run.
db.create_table(
    "important_text_embeddings",
    important_text_embeddings,
    mode="overwrite",
)

# Cell output: the DataFrame that was just written.
important_text_embeddings

## Recommendations

In [None]:
# Load the pickled recommendation embeddings and expose the embedding column
# under the name "vector" before handing the frame to LanceDB.
recommendations_embeddings = pd.read_pickle(
    "../../output/embeddings/recommendations_embeddings.pkl"
).rename(columns={'recommendation_embedding': 'vector'})

# Note the table name is singular ("recommendation_embeddings") while the
# variable is plural; the name is kept exactly as is.
db.create_table(
    "recommendation_embeddings",
    recommendations_embeddings,
    mode="overwrite",
)

# Cell output: the DataFrame that was just written.
recommendations_embeddings

## Report sections

In [None]:
# Load the pickled report-section embeddings and expose the embedding column
# under the name "vector".
# NOTE(review): unlike the other embedding sets in this notebook, no standalone
# table is created for report sections here; they only reach the database via
# the combined table. Confirm this is intentional.
report_sections_embeddings = pd.read_pickle(
    "../../output/embeddings/report_sections_embeddings.pkl"
).rename(columns={'section_embedding': 'vector'})

# Cell output: the loaded DataFrame.
report_sections_embeddings

In [None]:
# Give every report section an identifier of the form "<report_id>_<section>".
# `section` is cast to str so it can be concatenated with the report id.
report_sections_embeddings = report_sections_embeddings.assign(
    id=lambda frame: frame['report_id'] + '_' + frame['section'].astype(str)
)

# Cell output: the DataFrame including the new `id` column.
report_sections_embeddings

## Safety issues

In [None]:
# Load the pickled safety-issue embeddings produced by the embedding step.
safety_issues_embeddings = pd.read_pickle("../../output/embeddings/safety_issues_embeddings.pkl")
# Cell output: the raw DataFrame, before any columns are renamed or dropped.
safety_issues_embeddings

In [None]:

# Prepare the safety-issue embeddings for LanceDB and store them in their own
# table. The embedding column is exposed under the name "vector".
safety_issues_embeddings.rename(columns={'safety_issue_embedding': 'vector'}, inplace=True)

# The token-length helper column is not stored. errors="ignore" keeps this cell
# re-runnable: after the first run the column is already gone from the
# in-memory frame, and drop would otherwise raise KeyError.
safety_issues_embeddings.drop(
    columns=['safety_issue_embedding_token_length'],
    inplace=True,
    errors="ignore",
)

# mode="overwrite" replaces any table of the same name left by an earlier run.
si_table = db.create_table("safety_issue_embeddings", safety_issues_embeddings, mode="overwrite")

# Cell output: the table read back from the database as a DataFrame.
si_table.to_pandas()

## Combined table

In [None]:
# Build one combined frame holding every document type under a common schema.

# Columns every source frame contributes unchanged.
_shared_columns = ['vector', 'report_id', 'year', 'mode', 'type']

# Column names of the combined frame, in order.
_combined_columns = ["document_id", "document", "vector", 'report_id', "year", "mode", "type", "document_type"]

# (source frame, id column, text column, document_type label) for each source.
# Important text has no id of its own, so `report_id` doubles as document id.
_sources = [
    (safety_issues_embeddings, 'safety_issue_id', 'safety_issue', "safety_issue"),
    (report_sections_embeddings, 'id', 'section_text', "report_section"),
    (recommendations_embeddings, 'recommendation_id', 'recommendation', "recommendation"),
    (important_text_embeddings, 'report_id', 'important_text', "important_text"),
]

# Select id/text/shared columns, tag each row with its document type and
# rename positionally onto the common schema.
all_document_dfs = [
    frame[[id_column, text_column, *_shared_columns]]
    .assign(document_type=label)
    .set_axis(_combined_columns, axis=1)
    for frame, id_column, text_column, label in _sources
]

# Stack the per-type frames and renumber rows from 0.
all_document_types = pd.concat(all_document_dfs, axis = 0, ignore_index = True)

# Cell output: the combined DataFrame.
all_document_types

In [None]:
# Store the combined frame in the currently connected database, replacing any
# table of the same name left by an earlier run.
all_document_types_table = db.create_table(
    "all_document_types",
    all_document_types,
    mode="overwrite",
)

# Build a full-text-search index on the `document` text column; replace=True
# allows the cell to be re-run when an index already exists.
all_document_types_table.create_fts_index('document', replace=True)

# Cell output: the stored table read back as a DataFrame.
all_document_types_table.to_pandas()

## Creating test vector_db

In [None]:
# Location of the small LanceDB database used as test data
# (path is relative to the notebook's working directory).
uri = '../../tests/data/vector_db'

# Separate connection so the test database is written independently of `db`.
test_db = lancedb.connect(uri)

In [None]:
# Take a 10% sample of the combined documents for the test database.
# random_state is fixed so the same rows are drawn on every run.
all_document_types_test = all_document_types.sample(
    frac=0.1,
    random_state=42,
)

# Write the sample to the test database under the same table name as the full
# data, replacing any earlier copy.
all_document_types_test_table = test_db.create_table(
    "all_document_types",
    all_document_types_test,
    mode="overwrite",
)

# Same full-text-search index as on the full table.
all_document_types_test_table.create_fts_index('document', replace=True)


# Deploy to Azure

In [None]:
# Cell output: the combined DataFrame, shown again before it is uploaded.
all_document_types

In [None]:
# Connect to the deployment database whose URI is held in the db_URI
# environment variable.
# NOTE: this rebinds the notebook-global `db`, which earlier pointed at the
# local viewer database. Any earlier cell re-run after this point will write
# to this database instead.
db_uri = os.getenv("db_URI")
if not db_uri:
    # os.getenv returns None when the variable is missing; fail here with a
    # clear message instead of handing None to lancedb.connect.
    raise RuntimeError(
        "Environment variable 'db_URI' is not set; "
        "define it in .env or the environment before deploying."
    )
db = lancedb.connect(db_uri)

# Cell output: the names of the tables currently in that database.
db.table_names()

In [None]:
# Upload the combined frame to whichever database `db` currently points at,
# replacing any table of the same name. This also rebinds
# `all_document_types_table` to the newly created table.
all_document_types_table = db.create_table("all_document_types", all_document_types, mode="overwrite")

In [None]:
# Build a vector index on the table using the cosine metric and 64 sub-vectors;
# replace = True allows the cell to be re-run when an index already exists.
# NOTE(review): accelerator="cuda" presumably requires a CUDA-capable GPU on
# the machine running this notebook; confirm before running elsewhere.
# NOTE(review): num_sub_vectors=64 presumably assumes the embedding dimension
# is divisible by 64; verify against the embedding model in use.
all_document_types_table.create_index(num_sub_vectors=64, metric="cosine", accelerator="cuda", replace = True)

In [None]:
# Re-open the table by name from the current `db` connection, rather than
# reusing the handle returned by create_table.
table = db.open_table("all_document_types")

In [None]:
# Cell output: the re-opened table read back as a DataFrame.
table.to_pandas()