In [None]:
import json
import os
import random

import pandas as pd
import pyarrow as pa
from dotenv import dotenv_values
from openai import OpenAI
from pyarrow import parquet as pq
from tqdm import tqdm

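## Idea
Load all the risks previously generated with OpenAI from `./data/staging/risks`, collect the distinct risk texts, and compute an embedding for each one with the OpenAI embeddings API. The resulting risk-to-embedding table is written to a Parquet file.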

In [None]:
# Read settings from the local .env file so the API key is not hard-coded here.
config = dotenv_values(".env")
# NOTE(review): the key is spelled "OPEN_API_KEY" (not "OPENAI_API_KEY");
# it must match the name used in .env, otherwise this lookup raises KeyError.
api_key = config["OPEN_API_KEY"]
client = OpenAI(api_key=api_key)

In [None]:
def get_embedding(text, model="text-embedding-3-large"):
    """Return the embedding of ``text`` from the module-level OpenAI ``client``.

    Args:
        text: Value passed as ``input`` to ``client.embeddings.create``.
        model: Name of the embedding model to use.

    Returns:
        The ``embedding`` attribute of the first entry in the response's
        ``data`` list (requested with ``encoding_format="float"``).
    """
    response = client.embeddings.create(
        input=text,
        model=model,
        encoding_format="float",
    )
    first_item = response.data[0]
    return first_item.embedding

In [None]:
# Load every risk file from the staging folder into a list of dicts.
data_folder = './data/staging/risks'
risk_dicts = []

# isdir() is already False for a missing path, so no separate exists() check.
if os.path.isdir(data_folder):
    for file_name in tqdm(os.listdir(data_folder)):
        file_path = os.path.join(data_folder, file_name)
        # open() is inside the try so that a subdirectory or unreadable entry
        # is reported and skipped instead of aborting the whole load.
        try:
            with open(file_path, encoding="utf-8") as fp:
                content = fp.read()
            # Two decodes: presumably each file stores a JSON document that was
            # itself serialized as a JSON string -- TODO confirm with the writer.
            data = json.loads(json.loads(content))
            data['symbol_name'] = file_name  # Add the file name field
            risk_dicts.append(data)
        except (OSError, ValueError, TypeError) as e:
            # ValueError covers JSONDecodeError and UnicodeDecodeError; TypeError
            # covers payloads that do not decode to a str and then a dict.
            print(f"{file_name}: {type(e).__name__}: {e}")
else:
    print(f"Data folder not found: {data_folder}")

In [None]:
# One row per loaded risk file; columns come from the keys of each dict.
df = pd.DataFrame(risk_dicts)

In [None]:
# explode() expands list-like entries of 'risks' into one row per element, but
# yields NaN for empty lists and missing values; drop those so NaN is never
# treated as a risk (and never sent to the embeddings API later).
# Assumes 'risks' holds lists of hashable values (e.g. strings) -- TODO confirm.
all_risks = set(df['risks'].explode().dropna())
# Peek at up to two distinct risks: sample() never repeats an element, and the
# min() guard avoids an error when fewer than two risks are available.
random.sample(list(all_risks), k=min(2, len(all_risks)))

In [None]:
# Number of distinct risks collected from all loaded files.
len(all_risks)

In [None]:
# Embed every distinct risk. A failing request is recorded and printed rather
# than stopping the loop, so one bad risk does not discard the rest.
risk_to_embedding = []
exceptions_risks = []

for risk in tqdm(all_risks):
    try:
        vector = get_embedding(risk)
    except Exception as err:
        exceptions_risks.append(risk)
        print(err)
    else:
        risk_to_embedding.append({"risk_name": risk, "embedding": vector})

In [None]:
# Number of risks for which get_embedding raised an exception.
len(exceptions_risks)

In [None]:
# Build an Arrow table from the {"risk_name", "embedding"} records.
result = pa.Table.from_pylist(risk_to_embedding)

In [None]:
# Persist the risk/embedding table as a Parquet file in the staging folder.
output_path = "./data/staging/risk_to_embedding_openai.parquet"
pq.write_table(result, output_path)