In [1]:
# DON'T RERUN THE SAVE FOR TERMINOLOGY VECTOR STORE IF IT ALREADY EXISTS

In [None]:
from langchain.schema import Document

# Glossary entries (term, meaning)
glossary_entries = [
    ("NR", "Not recommended"),
    ("PF", "Personalized feed"),
    ("GH", "Geo-handler; a module responsible for routing features based on user region"),
    ("CDS", "Compliance Detection System"),
    ("DRT", "Data retention threshold; duration for which logs can be stored"),
    ("LCP", "Local compliance policy"),
    ("Redline", "Flag for legal review (different from its traditional business use for 'financial loss')"),
    ("Softblock", "A user-level limitation applied silently without notifications"),
    ("Spanner", "A synthetic name for a rule engine (not to be confused with Google Spanner)"),
    ("ShadowMode", "Deploy feature in non-user-impact way to collect analytics only"),
    ("T5", "Tier 5 sensitivity data; more critical than T1–T4 in this internal taxonomy"),
    ("ASL", "Age-sensitive logic"),
    ("Glow", "A compliance-flagging status, internally used to indicate geo-based alerts"),
    ("NSP", "Non-shareable policy (content should not be shared externally)"),
    ("Jellybean", "Feature name for internal parental control system"),
    ("EchoTrace", "Log tracing mode to verify compliance routing"),
    ("BB", "Baseline Behavior; standard user behavior used for anomaly detection"),
    ("Snowcap", "A synthetic codename for the child safety policy framework"),
    ("FR", "Feature rollout status"),
    ("IMT", "Internal monitoring trigger"),
]

# Convert into documents
terminology_docs = [
    Document(page_content=meaning, metadata={"term": term})
    for term, meaning in glossary_entries
]


In [3]:
# azure open AI key
import os
from dotenv import load_dotenv

load_dotenv()

if not os.environ.get("AZURE_OPENAI_API_KEY"):
  os.environ["AZURE_OPENAI_API_KEY"] = os.getenv("AZURE_OPENAI_API_KEY")

In [4]:
# embed/store
from langchain_openai import AzureOpenAIEmbeddings
from langchain_community.vectorstores import Chroma
import os

embeddings = AzureOpenAIEmbeddings(
    azure_endpoint=os.environ["AZURE_OPENAI_EMBEDDINGS_ENDPOINT"],
    azure_deployment=os.environ["AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME"],
    openai_api_version=os.environ["AZURE_OPENAI_EMBEDDINGS_API_VERSION"],
)

vectordb = Chroma.from_documents(
    terminology_docs, embeddings, persist_directory="terminology_vector_store"
)
vectordb.persist()


  vectordb.persist()


In [5]:
embeddings.embed_query("PF")

[-0.017269421368837357,
 -0.022051503881812096,
 0.03016827441751957,
 0.05060210078954697,
 0.011302459985017776,
 -0.046855900436639786,
 -0.01709914021193981,
 0.05267386510968208,
 -0.007889727130532265,
 -0.008875943720340729,
 0.04594773054122925,
 0.014218537136912346,
 0.010961896739900112,
 0.006211741361767054,
 0.016063258051872253,
 -0.016545724123716354,
 0.008897228166460991,
 0.015978116542100906,
 0.010302053764462471,
 0.037036310881376266,
 0.02303062565624714,
 0.020859530195593834,
 -0.02840869501233101,
 -0.044755756855010986,
 -0.03890940919518471,
 -0.02911820262670517,
 -0.03215489536523819,
 -0.028139080852270126,
 0.010429765097796917,
 0.028749259188771248,
 0.05153864994645119,
 -0.02864992804825306,
 -0.015524031594395638,
 -0.021838651970028877,
 0.022874534130096436,
 -0.0016398500883951783,
 -0.054092880338430405,
 0.038285043090581894,
 0.009181031957268715,
 0.06663697957992554,
 0.012941423803567886,
 0.0008013004553504288,
 -0.0034482080955058336,
 0