In [1]:
import openai
from dotenv import load_dotenv
import os
import sys

sys.path.append("../find_similar_risk")
from embedding_providers import (
    OpenAIEmbeddingProvider,
    SentenceTransformerProvider,
    GeminiEmbeddingProvider,
)

env_path = "/Users/ford/Documents/coding/confidential/.env"
load_dotenv(env_path)
api_key = os.getenv("OPENAI_API_KEY")
assert api_key, "API key is missing"
# Set your OpenAI API key
openai.api_key = api_key

# Initialize embedding providers with caching
embedding_models = {
    "openai-large": OpenAIEmbeddingProvider(model_name="text-embedding-3-large"),
}

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
test_input_sentence = "test_input_sentence"
result = embedding_models["openai-large"].get_embedding(test_input_sentence)

In [3]:
# test_tuple_key_dict
a_dict = {}
a_dict[tuple(sorted([1, 2]))] = "a"
a_dict[tuple(sorted([2, 1]))]

'a'

In [4]:
import json
import os

# file_name = "result/250327-company_risk_data"
# file_name = "result/250520-company_risk_data"
file_name = "result/250528-company_risk_data"
data_path = f"{file_name}.json"
data = json.load(open(data_path))
# remove risk_level key
# for i in data:
#     del i["risk_level"]
data[:3]

[{'company': 'PCG',
  'risk_cat': 'Operational Risk',
  'risk': 'Accounting errors',
  'risk_desc': "Accounting error refers to inaccuracies in the recording, calculation, or reporting of financial transactions within an organization's accounting system. These errors can vary from simple mathematical mistakes to more complex issues like misapplication of accounting principles or estimation errors. การแจ้งรายละเอียดค่าใช้จ่ายที่เกิดขึ้นในคลังมีความคลาดเคลื่อน ส่งผลให้ค่าใช้จ่ายด้านโลจิสติกส์ (Logistic Cost) ไม่สะท้อนถึงความเป็นจริง และอาจทำให้การคำนวณค่าใช้จ่ายไม่สมเหตุสมผล",
  'risk_level': 1,
  'rootcause': 'rootcause :Mistakes in calculations, data entry, or judgment errors during the accounting process: -',
  'process': 'process :Finance and Accounting: -'},
 {'company': 'PCG',
  'risk_cat': 'Operational Risk',
  'risk': 'Business interruption from fire hazards',
  'risk_desc': 'The business interruption due to a fire incident that occurs within the workplace premises can result in 

In [5]:
from itertools import combinations
from tqdm import tqdm

data_with_embedding = data.copy()
interest_columns = ["risk_desc", "rootcause", "process"]  # example list
all_combinations = []
for r in range(1, len(interest_columns) + 1):
    all_combinations.extend(combinations(interest_columns, r))
all_combinations = [list(c) for c in all_combinations]
all_combinations = [["risk"] + c for c in all_combinations]
all_combinations = [["risk"]] + all_combinations
for i in tqdm(range(len(data_with_embedding)), desc="Embedding rows"):
    for combination in all_combinations:
        interest_columns = combination
        key = tuple(sorted(interest_columns + ["embedding"]))
        data_list = []
        for column in interest_columns:
            data_list.append(data_with_embedding[i][column])
        try:
            data = "\n".join(data_list)
        except:
            print(data_list)
            raise
        data_with_embedding[i][key] = embedding_models["openai-large"].get_embedding(
            data
        )

Embedding rows:   1%|          | 2/300 [00:00<00:20, 14.47it/s]

Embedding rows: 100%|██████████| 300/300 [00:03<00:00, 88.11it/s] 


In [6]:
print(type(data))
data

<class 'str'>


'Non-compliance with laws/regulations related to OT\nThis risk refers to situation where the organization does not comply with laws and regulations related to Operational Technology, such as\n- The Cybersecurity Act (especially for Critical Services)\n\n- Lack of standardized OT Policies/Processes/Procedures\n- Lack of OT related Laws/Regulations Awareness\n- Lack of effective monitoring for Cybersecurity Act (Critical Services)\n- Inadequate compliance management systems\n'

In [7]:
data_with_embedding[:1]

[{'company': 'PCG',
  'risk_cat': 'Operational Risk',
  'risk': 'Accounting errors',
  'risk_desc': "Accounting error refers to inaccuracies in the recording, calculation, or reporting of financial transactions within an organization's accounting system. These errors can vary from simple mathematical mistakes to more complex issues like misapplication of accounting principles or estimation errors. การแจ้งรายละเอียดค่าใช้จ่ายที่เกิดขึ้นในคลังมีความคลาดเคลื่อน ส่งผลให้ค่าใช้จ่ายด้านโลจิสติกส์ (Logistic Cost) ไม่สะท้อนถึงความเป็นจริง และอาจทำให้การคำนวณค่าใช้จ่ายไม่สมเหตุสมผล",
  'risk_level': 1,
  'rootcause': 'rootcause :Mistakes in calculations, data entry, or judgment errors during the accounting process: -',
  'process': 'process :Finance and Accounting: -',
  ('embedding',
   'risk'): array([-0.01385025,  0.04698864,  0.00528706, ...,  0.00213661,
         -0.0024173 ,  0.00280692], shape=(3072,)),
  ('embedding',
   'risk',
   'risk_desc'): array([-0.01377645,  0.01044742, -0.01182

In [8]:
# save data_with_embedding to pickle
import pickle

with open(f"{file_name}_with_embedding.pkl", "wb") as f:
    pickle.dump(data_with_embedding, f)