In [None]:
import os
import math
import glob
import json

import pandas as pd
import numpy as np

LINES_PER_FILE = 50000

df_sample = pd.read_csv("../data/focal_papers.csv")
df_generated = pd.read_csv("../data/generated_references.csv")
df_ground_truth = pd.read_csv("../data/ground_truth_references.csv")

In [15]:
# # focal papers
# with open("post_openai_embeddings_focal_papers.jsonl", 'w') as focal_file:
#     for idx, row in df_sample.iterrows():
#         payload = {
#             "custom_id":f"focal_id_{idx}",
#             "method": "POST",
#             "url": "/v1/embeddings",
#             "body": {
#                     "input": row["PaperTitle"],  
#                     "model": "text-embedding-3-large",
#                     "encoding_format": "float",
#                     'dimensions': 3072,
#                 }
#         }
#         focal_file.write(json.dumps(payload) + '\n')

In [3]:
# focal abstract papers
with open("post_openai_embeddings_focal_abstract_papers.jsonl", 'w') as focal_file:
    for idx, row in df_sample.iterrows():
        payload = {
            "custom_id":f"focal_id_{idx}",
            "method": "POST",
            "url": "/v1/embeddings",
            "body": {
                    # add abstract to the input
                    # we need to open abstract and read from file, "f{row["PaperID"]}.txt" is the filename
                    "input": row["PaperTitle"] + " " + open(f"abstracts/{row['PaperID']}.txt", 'r').read(), 
                    "model": "text-embedding-3-large",
                    "encoding_format": "float",
                    'dimensions': 3072,
                }
        }
        focal_file.write(json.dumps(payload) + '\n')

In [None]:
# # ground truth
# total_files = math.ceil(len(df_ground_truth) / LINES_PER_FILE)

# for file_num in range(total_files):
#     start_idx = file_num * LINES_PER_FILE
#     end_idx = min(
#         (file_num + 1) * LINES_PER_FILE, len(df_ground_truth)
#     )

#     filename = f"post_openai_embeddings_ground_truth_{file_num + 1}.jsonl"
    
#     with open(filename, 'w') as ground_truth_file:
#         for idx in range(start_idx, end_idx):
#             payload = {
#                 "custom_id": f"ground_truth_id_{idx}",
#                 "method": "POST",
#                 "url": "/v1/embeddings",
#                 "body": {
#                     "input": df_ground_truth.iloc[idx]["PaperTitle"],
#                     "model": "text-embedding-3-large",
#                     "encoding_format": "float",
#                     'dimensions': 3072,
#                 }
#             }
#             ground_truth_file.write(json.dumps(payload) + '\n')
    
#     print(f"Created file {filename} with {end_idx - start_idx} entries")

# print(f"Successfully created {total_files} files")

In [None]:
# # generated
# total_files = math.ceil(len(df_generated) / LINES_PER_FILE)

# for file_num in range(total_files):
#     start_idx = file_num * LINES_PER_FILE
#     end_idx = min(
#         (file_num + 1) * LINES_PER_FILE, len(df_generated)
#     )

#     filename = f"post_openai_embeddings_generated_{file_num + 1}.jsonl"

#     with open(filename, 'w') as generated_file:
#         for idx in range(start_idx, end_idx):
#             payload = {
#                 "custom_id": f"generated_id_{idx}",
#                 "method": "POST",
#                 "url": "/v1/embeddings",
#                 "body": {
#                     "input": df_generated.iloc[idx]["Title"],
#                     "model": "text-embedding-3-large",
#                     "encoding_format": "float",
#                     'dimensions': 3072,
#                 }
#             }
#             generated_file.write(json.dumps(payload) + '\n')
    
#     print(f"Created file {filename} with {end_idx - start_idx} entries")

# print(f"Successfully created {total_files} files")

In [None]:
# assemble generated embeddings
jsonl_pattern = "openai_embeddings_results/*generated*.jsonl"

embeddings_list = []

for file_path in sorted(glob.glob(jsonl_pattern)):
    with open(file_path, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            data = json.loads(line)

            embedding = data["response"]["body"]["data"][0]["embedding"]
            embeddings_list.append(embedding)

        print(line)

embeddings_array = np.array(embeddings_list, dtype=np.float32)
print("Shape of embeddings array:", embeddings_array.shape)

# save embeddings_array
np.save("generated_embeddings_openai.npy", embeddings_array)

In [None]:
# assemble ground truth embeddings
jsonl_pattern = "openai_embeddings_results/*ground*.jsonl"

embeddings_list = []

for file_path in sorted(glob.glob(jsonl_pattern)):
    with open(file_path, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            data = json.loads(line)

            embedding = data["response"]["body"]["data"][0]["embedding"]
            embeddings_list.append(embedding)
        
        print(line)

embeddings_array = np.array(embeddings_list, dtype=np.float32)
print("Shape of embeddings array:", embeddings_array.shape)

# save embeddings_array
np.save("ground_truth_embeddings_openai.npy", embeddings_array)

In [None]:
# assemble focal embeddings
embeddings_list = []
with open("openai_embeddings_results/results_openai_embeddings_focal_papers.jsonl", 'r') as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        data = json.loads(line)

        embedding = data["response"]["body"]["data"][0]["embedding"]
        embeddings_list.append(embedding)
    
    print(line)

embeddings_array = np.array(embeddings_list, dtype=np.float32)
print("Shape of embeddings array:", embeddings_array.shape)

# save embeddings_array
np.save("focal_embeddings_openai.npy", embeddings_array)

In [2]:
# assemble focal embeddings
embeddings_list = []
with open(
    "openai_embeddings_results/results_openai_embeddings_focal_abstracts_papers.jsonl",
    'r',
) as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        data = json.loads(line)

        embedding = data["response"]["body"]["data"][0]["embedding"]
        embeddings_list.append(embedding)
    
    print(line)

embeddings_array = np.array(embeddings_list, dtype=np.float32)
print("Shape of embeddings array:", embeddings_array.shape)

# save embeddings_array
np.save("focal_abstracts_embeddings_openai.npy", embeddings_array)

{"id": "batch_req_678292b80b90819093161c8c01679212", "custom_id": "focal_id_10000", "response": {"status_code": 200, "request_id": "446796ff48540d5a8471580d547f2acb", "body": {"object": "list", "data": [{"object": "embedding", "index": 0, "embedding": [0.0030913423, -0.001623259, -0.021213423, 0.008555959, 0.009949498, 0.012474905, -0.033006772, 0.023696234, -0.008525533, -0.00942616, -0.036463235, -0.009176662, 0.04013877, -0.018742785, -0.030694352, 0.033931743, 0.019704266, -0.0033590964, -0.009833876, -0.0198138, -0.0051025404, -0.024998493, -0.026312921, -0.02609385, -0.015907025, 0.0043601315, -0.017197113, 0.004731336, -0.020130238, -0.0038398367, 0.011775093, -0.0008382528, -0.040893346, 0.035903387, -0.035659973, 0.00790483, -0.0016156523, 0.011166561, -0.0422808, -0.034783687, -0.017367503, -0.028503638, 0.03957892, 0.060463734, 0.005072114, 0.011117879, -0.008543788, -0.0045213923, -0.0028555363, -0.023367627, 0.013472898, -0.022588706, -0.0071806773, 0.012803513, 0.01016248