In [1]:
import openai
from dotenv import load_dotenv
import os
import sys

sys.path.append("../find_similar_risk")
from embedding_providers import (
    OpenAIEmbeddingProvider,
    SentenceTransformerProvider,
    GeminiEmbeddingProvider,
)

# Location of the .env file holding the API credentials. The DOTENV_PATH
# environment variable overrides the original machine-specific default, so the
# notebook can run from another checkout without editing this cell.
env_path = os.getenv("DOTENV_PATH", "/Users/ford/Documents/coding/confidential/.env")
load_dotenv(env_path)
api_key = os.getenv("OPENAI_API_KEY")
# Fail fast, before any embedding request is attempted.
assert api_key, "API key is missing"
# Set your OpenAI API key
openai.api_key = api_key

# Initialize embedding providers with caching
# Maps a short model alias to the provider object used by later cells.
embedding_models = {
    "openai-large": OpenAIEmbeddingProvider(model_name="text-embedding-3-large"),
}

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Smoke test: request a single embedding from the configured provider.
test_input_sentence = "test_input_sentence"
openai_large_provider = embedding_models["openai-large"]
result = openai_large_provider.get_embedding(test_input_sentence)

In [3]:
# test_tuple_key_dict
# Sorting before building the tuple makes the key order-insensitive, so
# [1, 2] and [2, 1] address the same dictionary entry.
a_dict = {tuple(sorted([1, 2])): "a"}
a_dict[tuple(sorted([2, 1]))]

'a'

In [4]:
import json
import os

# Load the company risk records from the selected JSON snapshot.
# file_name = "result/250327-company_risk_data"
file_name = "result/250520-company_risk_data"
# file_name = "result/250528-company_risk_data"
data_path = f"{file_name}.json"
# Close the handle deterministically and decode as UTF-8 explicitly: the
# records contain non-ASCII (Chinese) text, so relying on the platform default
# encoding is not portable.
with open(data_path, encoding="utf-8") as f:
    data = json.load(f)
data[:3]

[{'company': 'lotus_south',
  'risk_cat': 'Operational Risk',
  'risk': 'Business interruption from fire hazards',
  'risk_desc': 'The business interruption due to a fire incident that occurs within the workplace premises can result in damage to buildings, property, and various equipment. This damage may necessitate a temporary or permanent cessation of business operations.\n\nBusiness interruption: refers to a period during which normal business operations are disrupted or halted, resulting in reduced operational efficiency, loss of revenue, or increased costs. 火灾事故导致的业务中断，可能会对建筑物、财产和各种设备造成损害。这种损害可能需要暂时或永久停止商业运营。',
  'risk_level': 2,
  'rootcause': 'rootcause :Insufficient Maintenance of Fire Safety Equipment: 消防措施不足；电气系统维护不当。',
  'process': 'process :Facilities Management: 安全维保部, Maintenance: 安全维保部'},
 {'company': 'lotus_south',
  'risk_cat': 'Operational Risk',
  'risk': 'Business interruption from natural disasters',
  'risk_desc': 'Business interruption from natural disasters such

In [5]:
from itertools import combinations
from tqdm import tqdm

# Copy each record, not just the outer list: `data.copy()` is shallow, so the
# embedding keys written below would otherwise also land on the rows of `data`.
data_with_embedding = [dict(row) for row in data]
interest_columns = ["risk_desc", "rootcause", "process"]  # example list

# Every column set to embed: "risk" alone, then "risk" followed by each
# non-empty combination of the optional columns, ordered by combination size.
all_combinations = [["risk"]] + [
    ["risk", *extra_columns]
    for r in range(1, len(interest_columns) + 1)
    for extra_columns in combinations(interest_columns, r)
]

# Look the provider up once instead of on every iteration.
embedding_provider = embedding_models["openai-large"]

for row in tqdm(data_with_embedding, desc="Embedding rows"):
    for combination in all_combinations:
        # Result key: sorted tuple of the embedded column names plus the
        # "embedding" marker, e.g. ("embedding", "risk", "risk_desc").
        key = tuple(sorted(combination + ["embedding"]))
        data_list = [row[column] for column in combination]
        try:
            # Use a dedicated name here: rebinding `data` / `interest_columns`
            # inside the loop clobbered the loaded records and the base column
            # list, which made this cell fail when executed a second time.
            text = "\n".join(data_list)
        except TypeError:
            # str.join raises TypeError for non-string parts; show them.
            print(data_list)
            raise
        row[key] = embedding_provider.get_embedding(text)

Embedding rows: 100%|██████████| 269/269 [00:02<00:00, 112.00it/s]


In [6]:
# Peek at the first record only: besides the original fields, each row now
# carries the tuple-keyed embedding arrays added by the embedding loop.
data_with_embedding[:1]

[{'company': 'lotus_south',
  'risk_cat': 'Operational Risk',
  'risk': 'Business interruption from fire hazards',
  'risk_desc': 'The business interruption due to a fire incident that occurs within the workplace premises can result in damage to buildings, property, and various equipment. This damage may necessitate a temporary or permanent cessation of business operations.\n\nBusiness interruption: refers to a period during which normal business operations are disrupted or halted, resulting in reduced operational efficiency, loss of revenue, or increased costs. 火灾事故导致的业务中断，可能会对建筑物、财产和各种设备造成损害。这种损害可能需要暂时或永久停止商业运营。',
  'risk_level': 2,
  'rootcause': 'rootcause :Insufficient Maintenance of Fire Safety Equipment: 消防措施不足；电气系统维护不当。',
  'process': 'process :Facilities Management: 安全维保部, Maintenance: 安全维保部',
  ('embedding',
   'risk'): array([ 0.00782128, -0.01295227, -0.00216153, ..., -0.00683146,
          0.00437382, -0.00690337], shape=(3072,)),
  ('embedding',
   'risk',
   'risk_desc')

In [7]:
# Persist the embedded records next to the source JSON as a pickle file.
# Pickle is used because the rows hold tuple keys and embedding arrays.
import pickle

pickle_path = f"{file_name}_with_embedding.pkl"
with open(pickle_path, mode="wb") as f:
    pickle.dump(data_with_embedding, f)