In [None]:
!pip install llama-index

In [None]:
import textwrap

import pandas as pd
import phoenix as px
from llama_index import StorageContext, load_index_from_storage
from llama_index.response.schema import Response

In [None]:
# Read the "database" and "query" parquet splits into two DataFrames.
database_df, query_df = map(
    pd.read_parquet,
    (
        "/Users/xandersong/Desktop/llama-index-data/splits/database.parquet",
        "/Users/xandersong/Desktop/llama-index-data/splits/query.parquet",
    ),
)

In [None]:
# Preview the first rows of the database split loaded from database.parquet.
database_df.head()

In [None]:
# Preview the first rows of the query split loaded from query.parquet.
query_df.head()

In [None]:
# Point a storage context at the persisted index directory, load the index
# from it, and build a query engine with the index's default settings.
storage_context = StorageContext.from_defaults(
    persist_dir="/Users/xandersong/Desktop/llama-index-data/indexes/database_index",
)
index = load_index_from_storage(storage_context=storage_context)
query_engine = index.as_query_engine()

In [None]:
# Serialise the index's vector store with to_dict() and show its top-level keys.
data = index.storage_context.vector_store.to_dict()
[key for key in data.keys()]

In [None]:
# Distinct lengths of the values stored under data["embedding_dict"]
# (presumably one embedding vector per node — a single length means they all
# share the same dimensionality).
lengths = {len(embedding) for embedding in data["embedding_dict"].values()}
lengths

In [None]:
def display_llama_index_response(response: Response) -> None:
    """
    Prints a LlamaIndex response followed by each of its source nodes.

    The answer text is printed with surrounding whitespace stripped. For every
    entry in ``response.source_nodes`` the node's ``doc_id`` and the retrieval
    ``score`` are printed, followed by the node text wrapped to 80 columns.

    Args:
        response: Object exposing ``response`` (the answer text, or None) and
            ``source_nodes`` (an iterable whose items have ``.node.doc_id``,
            ``.node.text`` and ``.score``).

    Returns:
        None. Everything is written to stdout.
    """

    # The answer text can be None (presumably when no answer was produced);
    # print an empty line in that case instead of failing on None.strip().
    answer = response.response

    print("Response")
    print("========")
    print(answer.strip() if answer is not None else "")
    print()

    print("Source Nodes")
    print("============")
    print()

    for source_node in response.source_nodes:
        print(f"doc_id: {source_node.node.doc_id}")
        print(f"score: {source_node.score}")
        print()
        # textwrap.wrap returns no lines for empty text, so nothing is printed
        # for an empty node body besides the trailing separator below.
        for line in textwrap.wrap(source_node.node.text, width=80):
            print(line)
        print()

In [None]:
# Run a single sample question through the query engine and pretty-print the
# answer together with the source nodes it was built from.
query = "Where did Beyonce attend elementary school?"
response = query_engine.query(query)
display_llama_index_response(response)

In [None]:
# NOTE(review): `missing_database` is not assigned in any cell above, so on a
# fresh kernel (Restart & Run All) this cell raises NameError — confirm where
# the variable should come from, or remove the cell.
missing_database

In [None]:
import pandas as pd


def sample_by_percentage(df, percentage_map, random_state=None):
    """
    Samples a per-subject fraction of rows from a DataFrame.

    Rows are grouped by the "broad_subject" column and each group is sampled
    without replacement using the fraction configured for its subject.

    Args:
        df: DataFrame containing a "broad_subject" column.
        percentage_map: Mapping from subject to the fraction of that subject's
            rows to keep. Subjects absent from the mapping use a fraction of 0,
            i.e. all of their rows are dropped.
        random_state: Optional seed or random generator forwarded to
            ``DataFrame.sample`` for every group. The default of None keeps the
            sampling non-deterministic.

    Returns:
        A new DataFrame with the sampled rows of every group, concatenated in
        sorted subject order, with a fresh 0..n-1 index. All columns of ``df``
        (including "broad_subject") are kept.

    Raises:
        KeyError: If ``df`` has no "broad_subject" column.
        ValueError: Propagated from ``DataFrame.sample`` for invalid fractions
            (e.g. a fraction above 1, since sampling is without replacement).
    """
    # Iterate over the groups instead of using GroupBy.apply: the sub-frames
    # yielded here always include the grouping column, whereas apply's handling
    # of grouping columns is deprecated in recent pandas releases.
    sampled_groups = [
        group.sample(frac=percentage_map.get(subject, 0), random_state=random_state)
        for subject, group in df.groupby("broad_subject")
    ]

    # pd.concat rejects an empty list, so an input without any group yields an
    # empty frame with the same columns.
    if not sampled_groups:
        return df.iloc[0:0].reset_index(drop=True)

    return pd.concat(sampled_groups).reset_index(drop=True)


# Example usage
# The toy frame is built inline so that the notebook-level `data` variable
# (the vector-store dump assigned in an earlier cell) is not overwritten.
df = pd.DataFrame(
    {
        "broad_subject": ["Math", "Science", "Math", "Science", "Math", "Science"],
        "score": [90, 85, 95, 88, 92, 80],
    }
)

# Define the percentage map
percentage_map = {
    "Math": 0.5,  # Sample 50% of rows for Math
    "Science": 0.3,  # Sample 30% of rows for Science
}

# Apply the function to the DataFrame
sampled_df = sample_by_percentage(df, percentage_map)
print(sampled_df)

In [None]:
# Fraction of rows to sample for each broad subject. Every value is a fraction
# in [0, 1]: DataFrame.sample(frac=...) raises for fractions above 1 unless
# sampling with replacement.
# NOTE(review): "Architecture" was 5.0, the only entry above 1 — treated as a
# typo for 0.5; confirm if oversampling was intended.
broad_subject_to_sample_percentage = {
    "Architecture": 0.5,
    "Business and Economy": 0.5,
    "Education": 0.5,
    "Entertainment and Arts": 0.5,
    "Geography and Places": 0.5,
    "Health and Medicine": 0.5,
    "History": 0.5,
    "Language and Linguistics": 0.5,
    "Law and Legal": 0.5,
    "Literature": 0.5,
    "Media and Communication": 0.5,
    "Music": 0.5,
    "Nature and Environment": 0.5,
    "Nonprofit Organizations": 0.5,
    "People and Ethnicity": 0.5,
    "Philosophy": 0.5,
    "Politics and Government": 0.5,
    "Religion and Spirituality": 0.5,
    "Science and Technology": 0.5,
    "Social Sciences": 0.5,
    "Sports": 0.5,
}