## SCRIPT TO EXTRACT EXISTING TEXT EMBEDDINGS INTO A NEW WORKFLOW WITH NEW LOOKUP TABLES

In [1]:
import sys
sys.path.insert(1, '../../')

In [2]:
import pandas as pd

### SET VALUES FOR THE INDEX FOLDER TO BE EXTRACTED

In [3]:
# Local folder where the index data (parquet files) is located.
# NOTE: this is a machine-specific absolute path — change it before running.
LOCAL_ROOT = "/Users/gaudy-microsoft/Repositories/unified-copilot/app/data/CHRISTMAS-CAROL"

# Whether the source file is rewritten without the embedding column (True)
# or left untouched (False) after the embeddings have been extracted.
REMOVE_ORIGINAL_EMBEDDING_COLUMN_IN_SOURCE_FILE = True

# Name of the identifier column copied into every extracted embeddings table.
STANDARD_IDENTIFIER_FIELD = "id"

# Column name given to the embedding in the extracted embeddings table.
NEW_STANDARD_EMBEDDING_FIELD = "embedding"

In [4]:
def extract_text_embedding_from_table(input_path: str, embedding_field: str, embeddings_parquet_output_field: str) -> None:
    """Split one embedding column of a parquet table out into its own parquet file.

    Reads the table at ``input_path`` and writes a two-column table to
    ``embeddings_parquet_output_field``: the ``STANDARD_IDENTIFIER_FIELD`` column
    plus the embedding column renamed to ``NEW_STANDARD_EMBEDDING_FIELD``.
    When ``REMOVE_ORIGINAL_EMBEDDING_COLUMN_IN_SOURCE_FILE`` is set, the file at
    ``input_path`` is then rewritten in place without the embedding column.

    Args:
        input_path: Path of the parquet file that holds the embedding column.
        embedding_field: Name of the embedding column to extract.
        embeddings_parquet_output_field: Path (despite the ``_field`` suffix) of
            the parquet file to write the extracted embeddings to. A file that
            already exists at this path is replaced, so calling this function
            for several embedding columns with the same output path keeps only
            the last one.

    Raises:
        KeyError: If the identifier column or ``embedding_field`` is not present
            in the table read from ``input_path``.
    """
    original_df = pd.read_parquet(input_path)

    # Check up front so the error names the missing column(s) and the file,
    # and so nothing is written when the table is not in the expected shape
    # (e.g. the cell was already run and the column has been removed).
    missing_columns = [
        column
        for column in (STANDARD_IDENTIFIER_FIELD, embedding_field)
        if column not in original_df.columns
    ]
    if missing_columns:
        raise KeyError(
            f"Column(s) {missing_columns} not found in '{input_path}'; "
            f"available columns: {list(original_df.columns)}"
        )

    embeddings_df = original_df[[STANDARD_IDENTIFIER_FIELD, embedding_field]]
    embeddings_df = embeddings_df.rename(columns={embedding_field: NEW_STANDARD_EMBEDDING_FIELD})  # type: ignore
    embeddings_df.to_parquet(embeddings_parquet_output_field, index=False)

    # The embeddings file is written first; only then is the source overwritten,
    # so the embedding data always exists in at least one file on disk.
    if REMOVE_ORIGINAL_EMBEDDING_COLUMN_IN_SOURCE_FILE:
        no_embeddings_df = original_df.drop(columns=[embedding_field])
        no_embeddings_df.to_parquet(input_path, index=False)

### EMBEDDINGS TO MIGRATE IN FILE: `create_final_community_reports.parquet`

In [None]:
# Input parquet file that currently holds the embedding column.
INPUT_PATH = f"{LOCAL_ROOT}/create_final_community_reports.parquet"

# Output parquet file that receives the extracted embeddings.
# NOTE(review): the cells that extract "summary_embedding" and "title_embedding"
# use this same output path, so each run replaces the file written by the
# previous one — confirm whether one output file per embedding field is intended.
EMBEDDINGS_PARQUET_OUTPUT_PATH = f"{LOCAL_ROOT}/create_final_community_reports_embeddings.parquet"

# Path of the table without embeddings (same file as INPUT_PATH).
# NOTE(review): this value is not passed to the call below; the function rewrites
# INPUT_PATH in place when REMOVE_ORIGINAL_EMBEDDING_COLUMN_IN_SOURCE_FILE is True.
NO_EMBEDDINGS_PARQUET_OUTPUT_PATH = f"{LOCAL_ROOT}/create_final_community_reports.parquet"

# Name of the embedding column to extract.
EMBEDDING_FIELD = "full_content_embedding"

extract_text_embedding_from_table(INPUT_PATH, EMBEDDING_FIELD, EMBEDDINGS_PARQUET_OUTPUT_PATH)

In [None]:
# Input parquet file that currently holds the embedding column.
INPUT_PATH = f"{LOCAL_ROOT}/create_final_community_reports.parquet"

# Output parquet file that receives the extracted embeddings.
# NOTE(review): the cells that extract "full_content_embedding" and
# "title_embedding" use this same output path, so each run replaces the file
# written by the previous one — confirm whether one output file per embedding
# field is intended.
EMBEDDINGS_PARQUET_OUTPUT_PATH = f"{LOCAL_ROOT}/create_final_community_reports_embeddings.parquet"

# Path of the table without embeddings (same file as INPUT_PATH).
# NOTE(review): this value is not passed to the call below; the function rewrites
# INPUT_PATH in place when REMOVE_ORIGINAL_EMBEDDING_COLUMN_IN_SOURCE_FILE is True.
NO_EMBEDDINGS_PARQUET_OUTPUT_PATH = f"{LOCAL_ROOT}/create_final_community_reports.parquet"

# Name of the embedding column to extract.
EMBEDDING_FIELD = "summary_embedding"

extract_text_embedding_from_table(INPUT_PATH, EMBEDDING_FIELD, EMBEDDINGS_PARQUET_OUTPUT_PATH)

In [None]:
# Input parquet file that currently holds the embedding column.
INPUT_PATH = f"{LOCAL_ROOT}/create_final_community_reports.parquet"

# Output parquet file that receives the extracted embeddings.
# NOTE(review): the cells that extract "full_content_embedding" and
# "summary_embedding" use this same output path, so each run replaces the file
# written by the previous one — confirm whether one output file per embedding
# field is intended.
EMBEDDINGS_PARQUET_OUTPUT_PATH = f"{LOCAL_ROOT}/create_final_community_reports_embeddings.parquet"

# Path of the table without embeddings (same file as INPUT_PATH).
# NOTE(review): this value is not passed to the call below; the function rewrites
# INPUT_PATH in place when REMOVE_ORIGINAL_EMBEDDING_COLUMN_IN_SOURCE_FILE is True.
NO_EMBEDDINGS_PARQUET_OUTPUT_PATH = f"{LOCAL_ROOT}/create_final_community_reports.parquet"

# Name of the embedding column to extract.
EMBEDDING_FIELD = "title_embedding"

extract_text_embedding_from_table(INPUT_PATH, EMBEDDING_FIELD, EMBEDDINGS_PARQUET_OUTPUT_PATH)

### EMBEDDINGS TO MIGRATE IN FILE: `create_final_documents.parquet`

In [None]:
# Input parquet file that currently holds the embedding column.
INPUT_PATH = f"{LOCAL_ROOT}/create_final_documents.parquet"

# Output parquet file that receives the extracted embeddings.
EMBEDDINGS_PARQUET_OUTPUT_PATH = f"{LOCAL_ROOT}/create_final_documents_embeddings.parquet"

# Path of the table without embeddings (same file as INPUT_PATH).
# NOTE(review): this value is not passed to the call below; the function rewrites
# INPUT_PATH in place when REMOVE_ORIGINAL_EMBEDDING_COLUMN_IN_SOURCE_FILE is True.
NO_EMBEDDINGS_PARQUET_OUTPUT_PATH = f"{LOCAL_ROOT}/create_final_documents.parquet"

# Name of the embedding column to extract.
EMBEDDING_FIELD = "raw_content_embedding"

extract_text_embedding_from_table(INPUT_PATH, EMBEDDING_FIELD, EMBEDDINGS_PARQUET_OUTPUT_PATH)

### EMBEDDINGS TO MIGRATE IN FILE: `create_final_entities.parquet`

In [None]:
# Input parquet file that currently holds the embedding column.
INPUT_PATH = f"{LOCAL_ROOT}/create_final_entities.parquet"

# Output parquet file that receives the extracted embeddings.
# NOTE(review): the cell that extracts "description_embedding" uses this same
# output path, so its run replaces the file written here — confirm whether one
# output file per embedding field is intended.
EMBEDDINGS_PARQUET_OUTPUT_PATH = f"{LOCAL_ROOT}/create_final_entities_embeddings.parquet"

# Path of the table without embeddings (same file as INPUT_PATH).
# NOTE(review): this value is not passed to the call below; the function rewrites
# INPUT_PATH in place when REMOVE_ORIGINAL_EMBEDDING_COLUMN_IN_SOURCE_FILE is True.
NO_EMBEDDINGS_PARQUET_OUTPUT_PATH = f"{LOCAL_ROOT}/create_final_entities.parquet"

# Name of the embedding column to extract.
EMBEDDING_FIELD = "name_embedding"

extract_text_embedding_from_table(INPUT_PATH, EMBEDDING_FIELD, EMBEDDINGS_PARQUET_OUTPUT_PATH)

In [None]:
# Input parquet file that currently holds the embedding column.
INPUT_PATH = f"{LOCAL_ROOT}/create_final_entities.parquet"

# Output parquet file that receives the extracted embeddings.
# NOTE(review): the cell that extracts "name_embedding" uses this same output
# path, so this run replaces the file it wrote — confirm whether one output
# file per embedding field is intended.
EMBEDDINGS_PARQUET_OUTPUT_PATH = f"{LOCAL_ROOT}/create_final_entities_embeddings.parquet"

# Path of the table without embeddings (same file as INPUT_PATH).
# NOTE(review): this value is not passed to the call below; the function rewrites
# INPUT_PATH in place when REMOVE_ORIGINAL_EMBEDDING_COLUMN_IN_SOURCE_FILE is True.
NO_EMBEDDINGS_PARQUET_OUTPUT_PATH = f"{LOCAL_ROOT}/create_final_entities.parquet"

# Name of the embedding column to extract.
EMBEDDING_FIELD = "description_embedding"

extract_text_embedding_from_table(INPUT_PATH, EMBEDDING_FIELD, EMBEDDINGS_PARQUET_OUTPUT_PATH)

### EMBEDDINGS TO MIGRATE IN FILE: `create_final_text_units.parquet`

In [None]:
# Input parquet file that currently holds the embedding column.
INPUT_PATH = f"{LOCAL_ROOT}/create_final_text_units.parquet"

# Output parquet file that receives the extracted embeddings.
EMBEDDINGS_PARQUET_OUTPUT_PATH = f"{LOCAL_ROOT}/create_final_text_units_embeddings.parquet"

# Path of the table without embeddings (same file as INPUT_PATH).
# NOTE(review): this value is not passed to the call below; the function rewrites
# INPUT_PATH in place when REMOVE_ORIGINAL_EMBEDDING_COLUMN_IN_SOURCE_FILE is True.
NO_EMBEDDINGS_PARQUET_OUTPUT_PATH = f"{LOCAL_ROOT}/create_final_text_units.parquet"

# Name of the embedding column to extract.
EMBEDDING_FIELD = "text_embedding"

extract_text_embedding_from_table(INPUT_PATH, EMBEDDING_FIELD, EMBEDDINGS_PARQUET_OUTPUT_PATH)