This notebook assumes you've already built the LlamaIndex application and extracted OpenAI embeddings using the `build_database` script.

In [None]:
import numpy as np
import pandas as pd
import os
import sys
import zipfile

from google.cloud import storage

from build_database import download_squad_training_data

# Download both SQuAD splits and give each one a fresh 0..n-1 index so that
# later positional column assignments line up with the rows.
database_df, query_df = (
    split_frame.reset_index(drop=True) for split_frame in download_squad_training_data()
)

In [None]:
# Preview the first rows of the database dataframe as the cell output.
database_df.head()

In [None]:
# Preview the first rows of the query dataframe as the cell output.
query_df.head()

In [None]:
# Attach the pre-computed embeddings to each split as a "text_vector" column.
# One .npy file is read per granular subject, in the order the subjects first
# appear in the dataframe, and the arrays are concatenated and assigned by position.
# NOTE(review): this assumes each subject's rows are contiguous and that the saved
# arrays follow the dataframe's row order — TODO confirm against build_database.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays; only point
# this at embedding files you produced yourself.
split_to_dataframe = {"database": database_df, "query": query_df}
for split, split_df in split_to_dataframe.items():
    per_subject_embeddings = [
        np.load(
            os.path.expanduser(
                f"~/Desktop/openai-embeddings/splits/{split}/{granular_subject}.npy"
            ),
            allow_pickle=True,
        )
        for granular_subject in split_df["granular_subject"].unique()
    ]
    split_df["text_vector"] = np.concatenate(per_subject_embeddings)
database_df = split_to_dataframe["database"]
query_df = split_to_dataframe["query"]

In [None]:
# granular_subjects = list(
#     set(database_df["granular_subject"].unique().tolist()).union(
#         set(query_df["granular_subject"].unique().tolist())
#     )
# )
# granular_subject_to_count_map = {granular_subject: 1 for granular_subject in granular_subjects}
# granular_subject_to_count_map["Arsenal_F.C."] = 2
# granular_subject_to_count_map["FC_Barcelona"] = 3
# granular_subject_to_count_map["Chicago_Cubs"] = 4

In [None]:
# Sample database paragraphs that have a corresponding query entry.
# A paragraph is identified by its (granular_subject, paragraph_index) pair.
NUM_SAMPLED_DATABASE_PARAGRAPHS = 1000
# Fixed seed so that Restart & Run All regenerates the same published dataset
# instead of a different random subset on every run.
DATABASE_SAMPLING_SEED = 42

query_granular_subject_paragraph_index_pairs = set(
    zip(query_df["granular_subject"], query_df["paragraph_index"])
)
# Build the membership mask column-wise; a row-wise DataFrame.apply is much slower
# and does not return a boolean Series when the frame is empty.
database_row_has_query = pd.Series(
    [
        pair in query_granular_subject_paragraph_index_pairs
        for pair in zip(database_df["granular_subject"], database_df["paragraph_index"])
    ],
    index=database_df.index,
    dtype=bool,
)
# Like the unseeded original, this raises ValueError when fewer eligible
# paragraphs than NUM_SAMPLED_DATABASE_PARAGRAPHS remain.
database_df = database_df[database_row_has_query].sample(
    n=NUM_SAMPLED_DATABASE_PARAGRAPHS,
    random_state=DATABASE_SAMPLING_SEED,
)
# Keys of the sampled paragraphs; the query-sampling cell filters against this set.
database_granular_subject_paragraph_index_pairs = set(
    zip(database_df["granular_subject"], database_df["paragraph_index"])
)
database_df

In [None]:
# Subjects whose paragraphs are removed from the database in a later cell while
# their queries are kept, creating clusters of queries that are NOT covered by
# the database.
# Commented-out entries are subjects that are currently not dropped.
dropped_database_granular_subjects = [
    "Neptune",
    # "Beyoncé",
    # "American_Idol",
    # "Marvel_Comics",
    "Richard_Feynman",
    "PlayStation_3",
]

In [None]:
# Sample one query entry per paragraph.
# Keep queries whose paragraph is in the sampled database, plus every query about a
# subject listed in dropped_database_granular_subjects.
paragraph_key_columns = ["granular_subject", "paragraph_index"]
# Column-wise membership test; avoids the slow row-wise DataFrame.apply, which also
# fails to return a boolean Series on an empty frame.
query_has_sampled_paragraph = pd.Series(
    [
        pair in database_granular_subject_paragraph_index_pairs
        for pair in zip(query_df["granular_subject"], query_df["paragraph_index"])
    ],
    index=query_df.index,
    dtype=bool,
)
query_is_about_dropped_subject = query_df["granular_subject"].isin(
    dropped_database_granular_subjects
)
query_df = query_df[query_has_sampled_paragraph | query_is_about_dropped_subject]
print(len(query_df))
# Keep the first *row* of each paragraph group. GroupBy.first() is not used because
# it takes the first non-null value of every column independently, which can stitch
# one output row together from several different queries when a column has nulls.
# dropna + stable sort reproduce groupby's handling of NaN keys and its key ordering.
query_df = (
    query_df.dropna(subset=paragraph_key_columns)
    .sort_values(paragraph_key_columns, kind="stable")
    .drop_duplicates(subset=paragraph_key_columns, keep="first")
    .reset_index(drop=True)
)
# groupby(..., as_index=False) placed the key columns first; keep that column layout.
query_df = query_df[
    paragraph_key_columns
    + [column for column in query_df.columns if column not in paragraph_key_columns]
]
print(len(query_df))
query_df.head()

In [None]:
# Remove every database paragraph whose subject is in the dropped-subjects list.
database_row_is_dropped = database_df["granular_subject"].isin(
    dropped_database_granular_subjects
)
database_df = database_df[~database_row_is_dropped]

In [None]:
# Remove the columns that should not be displayed in the tutorial notebook.
hidden_query_columns = ["id", "paragraph_index", "is_answerable"]
query_df = query_df.drop(hidden_query_columns, axis="columns")

In [None]:
def upload_to_gcs(bucket_name, source_file_path, destination_blob_name):
    """Upload a local file to a Google Cloud Storage bucket and print a confirmation.

    Args:
        bucket_name: Name of the destination bucket.
        source_file_path: Path of the local file to upload.
        destination_blob_name: Object name (path inside the bucket) to write to.
    """
    # The client is always created against the "public-assets" project.
    gcs_client = storage.Client(project="public-assets")
    target_blob = gcs_client.bucket(bucket_name).blob(destination_blob_name)
    target_blob.upload_from_filename(source_file_path)
    confirmation = (
        f"File {source_file_path} uploaded to {bucket_name}/{destination_blob_name} successfully!"
    )
    print(confirmation)

In [None]:
# Write each split to a local parquet file and upload it to GCS.
# bucket_name and llama_index_gcs_path are reused when uploading the database index.
bucket_name = "arize-assets"
llama_index_gcs_path = "phoenix/datasets/unstructured/llm/llama-index"

for split, dataframe in (("database", database_df), ("query", query_df)):
    file_name = f"{split}.parquet"
    # file_name = f"{split}_full.parquet"
    save_path = f"/tmp/{file_name}"
    # Reset the index so the saved file carries a clean 0..n-1 index.
    dataframe.reset_index(drop=True).to_parquet(save_path)
    upload_to_gcs(bucket_name, save_path, f"{llama_index_gcs_path}/{file_name}")

In [None]:
# upload database index
def zip_directory(directory_path, output_path):
    """Create a deflate-compressed zip of every ``.json`` file under a directory.

    The directory is walked recursively; files with any other extension are skipped.
    Archive member names are the file paths relative to ``directory_path``.

    Args:
        directory_path: Root directory to search for ``.json`` files.
        output_path: Path of the zip file to create (overwritten if it exists).
    """
    with zipfile.ZipFile(output_path, "w", zipfile.ZIP_DEFLATED) as archive:
        for parent_dir, _subdirs, file_names in os.walk(directory_path):
            json_names = (name for name in file_names if name.endswith(".json"))
            for json_name in json_names:
                source_path = os.path.join(parent_dir, json_name)
                member_name = os.path.relpath(source_path, directory_path)
                archive.write(source_path, member_name)


# Zip the locally built database index, then upload the archive next to the
# parquet files in the same bucket and path prefix.
database_index_zip_path = "/tmp/database_index.zip"

print("Zipping database index...")
zip_directory(
    os.path.expanduser("~/Desktop/llama-index-data-full/indexes/database_index"),
    database_index_zip_path,
)

print("Uploading database index...")
upload_to_gcs(
    bucket_name,
    database_index_zip_path,
    f"{llama_index_gcs_path}/database_index.zip",
)