In [None]:
import textwrap

import pandas as pd
import phoenix as px
from langchain import OpenAI
from llama_index import StorageContext, load_index_from_storage
from llama_index.response.schema import Response

In [None]:
# Load the database and query splits from local parquet files.
# NOTE(review): the database split is read from "llama-index-data-full" while the
# query split is read from "llama-index-data" — confirm the two directories are
# meant to differ. Both paths are absolute and machine-specific.
database_df = pd.read_parquet(
    "/Users/xandersong/Desktop/llama-index-data-full/splits/database.parquet"
)
query_df = pd.read_parquet("/Users/xandersong/Desktop/llama-index-data/splits/query.parquet")
# Alternative inputs, currently disabled:
# database_df = pd.read_parquet("/Users/xandersong/Downloads/database_openai.parquet")
# query_df = pd.read_parquet("/Users/xandersong/Downloads/query_openai.parquet")

In [None]:
# Number the rows inside each run of consecutive, identical granular_subject
# values: the first row of a run gets 0, the next 1, and so on. A subject that
# re-appears after a different subject starts again from 0.
subjects = database_df["granular_subject"].to_list()
paragraph_indexes = []
for position, granular_subject in enumerate(subjects):
    starts_new_run = position == 0 or granular_subject != subjects[position - 1]
    paragraph_indexes.append(0 if starts_new_run else paragraph_indexes[-1] + 1)
database_df["paragraph_index"] = paragraph_indexes

In [None]:
# Load two pairs of database/query splits from local parquet files: one pair
# from the Desktop "llama-index-data*" directories and one pair from the
# "*_openai.parquet" files in Downloads.
# NOTE(review): none of these four variables is referenced again in the
# visible cells — confirm whether this cell is still needed.
llama_database_df = pd.read_parquet(
    "/Users/xandersong/Desktop/llama-index-data-full/splits/database.parquet"
)
llama_query_df = pd.read_parquet("/Users/xandersong/Desktop/llama-index-data/splits/query.parquet")
openai_api_database_df = pd.read_parquet("/Users/xandersong/Downloads/database_openai.parquet")
openai_api_query_df = pd.read_parquet("/Users/xandersong/Downloads/query_openai.parquet")

In [None]:
# Text of the first rows (DataFrame.head default) whose `is_answerable` equals
# the string "True".
# NOTE(review): the comparison is against a string; if the column holds
# booleans this filter matches nothing — confirm the column dtype.
query_df[(query_df["is_answerable"] == "True")].head()["text"].to_list()

In [None]:
# Text of the first rows (DataFrame.head default) whose `is_answerable` equals
# the string "False".
# NOTE(review): the comparison is against a string; if the column holds
# booleans this filter matches nothing — confirm the column dtype.
query_df[(query_df["is_answerable"] == "False")].head()["text"].to_list()

In [None]:
# Load the persisted index from a local directory and build a query engine on
# top of it.
storage_context = StorageContext.from_defaults(
    persist_dir="/Users/xandersong/Desktop/llama-index-data-full/indexes/database_index"
)
# model_name = "text-davinci-003"
model_name = "gpt-4"
llm = OpenAI(temperature=0, model_name=model_name)
# NOTE(review): confirm that `load_index_from_storage` actually honours the
# `llm=` keyword; nothing in this notebook shows how the argument is consumed.
index = load_index_from_storage(storage_context, llm=llm)
query_engine = index.as_query_engine()

In [None]:
def display_llama_index_response(response: Response) -> None:
    """
    Displays a LlamaIndex response and its source nodes.

    Prints the response text wrapped to 80 columns under a "Response" header,
    then, under a "Source Nodes" header, each source node's doc_id, score and
    wrapped text.

    Args:
        response: The object returned by a query engine. Only its `response`
            and `source_nodes` attributes are read. A `response` of None (or
            an empty string) is printed as an empty body instead of raising.
    """

    print("Response")
    print("========")
    # `response.response` may be None; fall back to an empty string so that
    # `.strip()` cannot raise AttributeError.
    response_text = (response.response or "").strip()
    for line in textwrap.wrap(response_text, width=80):
        print(line)
    print()

    print("Source Nodes")
    print("============")
    print()

    for source_node in response.source_nodes:
        print(f"doc_id: {source_node.node.doc_id}")
        print(f"score: {source_node.score}")
        print()
        for line in textwrap.wrap(source_node.node.text, width=80):
            print(line)
        print()

In [None]:
# Run a single sample question through the query engine and print the answer
# together with its source nodes.
# query = 'What is the name of the character Microsoft used to make Windows 8 seem more personable?'
query = "On what street does the Santa Monica Freeway begin?"
response = query_engine.query(query)
display_llama_index_response(response)

In [None]:
# Shuffle both splits: sample(frac=1) returns every row in random order. No
# random_state is passed, so the order differs between runs.
# database_df_ = database_df[database_df["broad_subject"] == "Politics and Government"]
database_df_ = database_df.sample(frac=1)
# Database rows get the placeholder tag "unknown" for `is_answerable`.
database_df_["is_answerable"] = "unknown"
# query_df_ = query_df[query_df["broad_subject"] == "Politics and Government"]
query_df_ = query_df.sample(frac=1)
# Cast to str so the column holds strings in both splits.
query_df_["is_answerable"] = query_df_["is_answerable"].astype(str)

In [None]:
# Schema used for both Phoenix datasets: one embedding feature named
# "text_embedding" whose vectors come from the "text_vector" column and whose
# raw data comes from the "text" column, plus three tag columns.
schema = px.Schema(
    embedding_feature_column_names={
        "text_embedding": px.EmbeddingColumnNames(
            vector_column_name="text_vector",
            raw_data_column_name="text",
        )
    },
    tag_column_names=["granular_subject", "broad_subject", "is_answerable"],
)

In [None]:
# Wrap the prepared dataframes as named Phoenix datasets that share one schema.
database_ds = px.Dataset(database_df_, schema, name="database")
query_ds = px.Dataset(query_df_, schema, name="query")

In [None]:
import phoenix as px

# Launch the Phoenix app with the query dataset as primary and the database
# dataset as reference.
px.launch_app(primary=query_ds, reference=database_ds)

In [None]:
# For every dropped subject, report whether it is present in the working
# database split, whether it is present in the working query split, and the
# shape of its slice of the query split.
# NOTE(review): `dropped_database_granular_subjects` is assigned in a cell
# further down the notebook, so this cell fails on a fresh top-to-bottom run.
# Build each membership set once instead of re-materialising a list twice per
# iteration and scanning it linearly.
database_subjects = set(database_df_["granular_subject"])
query_subjects = set(query_df_["granular_subject"])
for granular_subject in dropped_database_granular_subjects:
    print(granular_subject in database_subjects)
    print(granular_subject in query_subjects)
    print(query_df_[query_df_["granular_subject"] == granular_subject].shape)

In [None]:
# Display the granular_subject column of the working database dataframe.
database_df_["granular_subject"]

In [None]:
# Print the granular_subject column of the working database dataframe as
# plain text.
print(database_df_["granular_subject"])

In [None]:
# Restrict both splits to rows whose granular_subject is in `random_subjects`.
# NOTE(review): `random_subjects` is not defined in any visible cell — this
# cell raises NameError on a fresh kernel unless it is defined elsewhere.
database_df_ = database_df[database_df["granular_subject"].isin(random_subjects)]
query_df_ = query_df[query_df["granular_subject"].isin(random_subjects)]

In [None]:
# Draw 1000 random database rows (unseeded) and collect the distinct
# (granular_subject, paragraph_index) pairs they cover.
database_df_ = database_df.sample(n=1000)
paragraphs = set(zip(database_df_["granular_subject"], database_df_["paragraph_index"]))
paragraphs

In [None]:
!pip install tiktoken
import tiktoken


def get_token_count(text):
    """
    Returns the number of tokens `text` encodes to under tiktoken's
    "cl100k_base" encoding.
    """
    return len(tiktoken.get_encoding("cl100k_base").encode(text))


# Token count of every database text, showing the ten largest.
database_df["text"].map(get_token_count).sort_values(ascending=False).head(10)

In [None]:
import numpy as np
import pandas as pd
import os

import sys

# Add a local examples directory to the import path so that `build_database`
# can be imported. The path is absolute and machine-specific.
sys.path.append("/Users/xandersong/phoenix/examples/llama-index/")

from build_database import download_squad_training_data

# Rebinds `database_df` / `query_df`, replacing the frames that earlier cells
# loaded from parquet.
# NOTE(review): later cells read a "paragraph_index" column from both frames;
# presumably `download_squad_training_data` provides it — verify.
database_df, query_df = download_squad_training_data()
# Discard the existing index of each frame in favour of the default one.
database_df = database_df.reset_index(drop=True)
query_df = query_df.reset_index(drop=True)

In [None]:
# Attach a "text_vector" column to each split. For every split, one .npy file
# per granular subject is loaded (subjects taken in order of first appearance),
# the arrays are concatenated, and the result is assigned as the column.
# NOTE(review): this only lines up with the rows if each subject's rows are
# contiguous and each file holds that subject's rows in order — confirm.
# NOTE(review): np.load(..., allow_pickle=True) unpickles file contents; only
# use it on files from a trusted source.
split_to_dataframe = {"database": database_df, "query": query_df}
for split, dataframe in split_to_dataframe.items():
    per_subject_embeddings = [
        np.load(
            f"/Users/xandersong/Desktop/openai-embeddings/splits/{split}/{granular_subject}.npy",
            allow_pickle=True,
        )
        for granular_subject in dataframe["granular_subject"].unique()
    ]
    dataframe["text_vector"] = np.concatenate(per_subject_embeddings)
database_df = split_to_dataframe["database"]
query_df = split_to_dataframe["query"]

In [None]:
# Every granular subject that occurs in either split.
granular_subjects = list(
    set(database_df["granular_subject"].unique().tolist())
    | set(query_df["granular_subject"].unique().tolist())
)
# Per-subject count: 1 by default, with explicit overrides for three subjects.
granular_subject_to_count_map = dict.fromkeys(granular_subjects, 1)
granular_subject_to_count_map.update(
    {
        "Arsenal_F.C.": 2,
        "FC_Barcelona": 3,
        "Chicago_Cubs": 4,
    }
)

In [None]:
# Subjects that other cells remove from the database split while keeping all
# of their queries in the query sample. Commented-out entries are disabled.
dropped_database_granular_subjects = [
    "Neptune",
    # "Beyoncé",
    # "American_Idol",
    # "Marvel_Comics",
    "Richard_Feynman",
    "PlayStation_3",
]

In [None]:
# All (granular_subject, paragraph_index) pairs that appear in the query split.
query_granular_subject_paragraph_index_pairs = set(
    query_df.apply(lambda row: (row["granular_subject"], row["paragraph_index"]), axis=1).to_list()
)
# Keep the database rows whose pair also appears in the query split, then draw
# 1000 of them at random. No random_state is passed, so the sample differs
# between runs; sample(n=1000) raises if fewer than 1000 rows remain.
sample_database_df = database_df[
    database_df.apply(
        lambda row: (row["granular_subject"], row["paragraph_index"])
        in query_granular_subject_paragraph_index_pairs,
        axis=1,
    )
].sample(n=1000)
# Pairs covered by the sampled database rows.
database_granular_subject_paragraph_index_pairs = set(
    sample_database_df.apply(
        lambda row: (row["granular_subject"], row["paragraph_index"]), axis=1
    ).to_list()
)
sample_database_df

In [None]:
# Keep the queries whose (granular_subject, paragraph_index) pair is covered by
# the sampled database rows, plus every query that belongs to one of the
# dropped subjects.
sample_query_df = query_df[
    (
        query_df.apply(
            lambda row: (row["granular_subject"], row["paragraph_index"])
            in database_granular_subject_paragraph_index_pairs,
            axis=1,
        )
    )
    | (query_df["granular_subject"].isin(dropped_database_granular_subjects))
]
print(len(sample_query_df))
# sample_query_df = sample_query_df.groupby(["granular_subject", "paragraph_index"], as_index=False).apply(lambda group: group.head(granular_subject_to_count_map[group.name[0]])).reset_index(drop=True)
# Collapse to one row per (granular_subject, paragraph_index) pair.
# NOTE(review): GroupBy.first() takes the first non-null value of each column,
# so a result row may mix values from different source rows when nulls are
# present — confirm this is acceptable.
sample_query_df = (
    sample_query_df.groupby(["granular_subject", "paragraph_index"], as_index=False)
    .first()
    .reset_index(drop=True)
)
print(len(sample_query_df))
sample_query_df

In [None]:
# Show the sampled queries for one subject; switch subject by changing which
# assignment is uncommented.
# granular_subject = "Beyoncé"
# granular_subject = "Richard_Feynman"
granular_subject = "Neptune"
sample_query_df[sample_query_df["granular_subject"] == granular_subject]

In [None]:
# Shuffle the sampled frames (sample(frac=1.0) returns every row in random
# order; unseeded) and give `is_answerable` string values in both.
database_df_ = sample_database_df.sample(frac=1.0)
query_df_ = sample_query_df.sample(frac=1.0)
# Database rows get the placeholder tag "unknown".
database_df_["is_answerable"] = "unknown"
query_df_["is_answerable"] = query_df_["is_answerable"].astype(str)

In [None]:
# Remove every row of a dropped subject from the working database split.
# The working query split is not filtered here.
database_df_ = database_df_[
    ~database_df_["granular_subject"].isin(dropped_database_granular_subjects)
]

In [None]:
from google.cloud import storage


def upload_to_gcs(bucket_name, source_file_path, destination_blob_name):
    """
    Uploads a local file to a Google Cloud Storage bucket and prints a
    confirmation line.

    The storage client is created for the "public-assets" project.

    Args:
        bucket_name: Name of the destination bucket.
        source_file_path: Path of the local file to upload.
        destination_blob_name: Name of the blob to create inside the bucket.
    """
    gcs_client = storage.Client(project="public-assets")
    target_blob = gcs_client.bucket(bucket_name).blob(destination_blob_name)
    target_blob.upload_from_filename(source_file_path)
    confirmation = (
        f"File {source_file_path} uploaded to {bucket_name}/{destination_blob_name} successfully!"
    )
    print(confirmation)

In [None]:
# Show the working query rows for one subject; switch subject by changing
# which assignment is uncommented.
# granular_subject = "Beyoncé"
granular_subject = "Neptune"
# granular_subject = "Richard_Feynman"
query_df_[query_df_["granular_subject"] == granular_subject]

In [None]:
# Destination bucket and blob prefix for the uploaded artifacts.
bucket_name = "arize-assets"
llama_index_gcs_path = "phoenix/datasets/unstructured/llm/llama-index"

# Write each working split to /tmp as "<split>.parquet" and upload it under the
# prefix above.
for split, dataframe in {"database": database_df_, "query": query_df_}.items():
    file_name = f"{split}.parquet"
    save_path = f"/tmp/{file_name}"
    dataframe.to_parquet(save_path)
    upload_to_gcs(
        bucket_name=bucket_name,
        source_file_path=save_path,
        destination_blob_name=f"{llama_index_gcs_path}/{file_name}",
    )

In [None]:
import zipfile
import os


def zip_directory(directory_path, output_path):
    """
    Archives every file ending in ".json" found anywhere under
    `directory_path` into a deflate-compressed zip written to `output_path`.
    Entry names inside the archive are relative to `directory_path`.
    """
    with zipfile.ZipFile(output_path, "w", zipfile.ZIP_DEFLATED) as archive:
        for parent_dir, _, file_names in os.walk(directory_path):
            for file_name in file_names:
                # Only JSON files are archived; everything else is skipped.
                if not file_name.endswith(".json"):
                    continue
                source_path = os.path.join(parent_dir, file_name)
                archive.write(source_path, os.path.relpath(source_path, directory_path))


# Zip the persisted database index directory into /tmp, then upload the zip.
print("Zipping database index...")
zip_directory(
    directory_path="/Users/xandersong/Desktop/llama-index-data-full/indexes/database_index",
    output_path="/tmp/database_index.zip",
)

# `bucket_name` and `llama_index_gcs_path` are the values assigned in the cell
# that uploads the parquet splits; that cell must have run first.
print("Uploading database index...")
upload_to_gcs(
    bucket_name=bucket_name,
    source_file_path="/tmp/database_index.zip",
    destination_blob_name=f"{llama_index_gcs_path}/database_index.zip",
)