In [1]:
import pandas as pd
import numpy as np
import faiss
import torch
from sentence_transformers import SentenceTransformer

In [2]:
# Root of the GCAP data directory.
gcap_data = "/oak/stanford/groups/maggiori/GCAP/data"

# Bond table with a text `summary` column and a `summary_embedding` vector column
# (both visible in the preview this cell displays).
bonds_path = f"{gcap_data}/ecb1/temp/bond_isin_with_embeddings.parquet"
df_bonds = pd.read_parquet(bonds_path)
df_bonds.head()

Unnamed: 0,fsym_id,isin,issuance_currency,mat_date_q,iss_date_q,mat_date_d,iss_date_d,factset_entity_id,entity_name,iso_country,coupon_rate,summary,summary_embedding
0,CPN1XX-S,JP376542ANJ5,JPY,2027-04-01,2022-04-01,2027-06-18,2022-06-20,08ZVB2-E,&DO HOLDINGS CO LTD,JPN,0.52,&DO HOLDINGS CO LTD: Maturity Date 2027-06-18 ...,"[-0.024423853, 0.03775837, -0.0051595797, -0.0..."
1,DB1NV0-S,JP376542ANP2,JPY,2027-10-01,2022-10-01,2027-11-25,2022-11-25,08ZVB2-E,&DO HOLDINGS CO LTD,JPN,0.43,&DO HOLDINGS CO LTD: Maturity Date 2027-11-25 ...,"[-0.018307377, 0.03722839, -0.0063556195, -0.0..."
2,QSX908-S,JP376542APM4,JPY,2028-07-01,2023-07-01,2028-09-25,2023-09-25,08ZVB2-E,&DO HOLDINGS CO LTD,JPN,0.47,&DO HOLDINGS CO LTD: Maturity Date 2028-09-25 ...,"[-0.020252984, 0.039390735, -0.0020778805, -0...."
3,W8NLN3-S,JP376542AQF6,JPY,2027-01-01,2024-01-01,2027-03-27,2024-03-27,08ZVB2-E,&DO HOLDINGS CO LTD,JPN,1.31,&DO HOLDINGS CO LTD: Maturity Date 2027-03-27 ...,"[-0.019532233, 0.036194503, -0.005714808, -0.0..."
4,KZYCNM-S,JP376542AQE9,JPY,2029-01-01,2024-01-01,2029-02-20,2024-02-20,08ZVB2-E,&DO HOLDINGS CO LTD,JPN,0.39,&DO HOLDINGS CO LTD: Maturity Date 2029-02-20 ...,"[-0.022470271, 0.04027177, -0.0033593718, -0.0..."


In [3]:
# Stack the per-row embedding vectors into one 2-D matrix whose row order matches
# df_bonds (row i of the matrix is the embedding of df_bonds.iloc[i]).
# Faiss works on C-contiguous float32 matrices, so enforce dtype and layout here
# instead of relying on whatever dtype the parquet file happened to store.
# This is a no-op copy-wise when the data is already contiguous float32.
summary_embeddings = np.ascontiguousarray(
    np.stack(df_bonds['summary_embedding'].to_numpy()),
    dtype=np.float32,
)
summary_embeddings.shape

(851092, 1024)

In [4]:
# Flat (exhaustive) L2 index over the bond summary embeddings.
# The dimensionality is read from the matrix itself rather than hard-coded as 1024,
# so the index stays consistent if the embedding model (and vector width) changes.
d = summary_embeddings.shape[1]
index = faiss.IndexFlatL2(d)
print(index.is_trained)  # a flat index needs no training step
index.add(summary_embeddings)
print(index.ntotal)  # number of stored vectors; should equal len(summary_embeddings)

True
851092


In [5]:
# Fund bond holdings whose free-text `name` is to be matched to an ISIN.
# Build the path from `gcap_data` (set when the bond table is loaded) instead of
# repeating the absolute data root; the resulting path is unchanged.
df_bond_holdings = pd.read_csv(f"{gcap_data}/scratch/xwfeng/bond_holdings_with_isin.csv")
df_bond_holdings.head()

Unnamed: 0,name,market_value,net_assets,fund_isin,date_q,ISIN
0,3M 0.95%,2635.0,0.25,LU2051031982,2021q4,
1,3M 0.95%,3059.0,0.23,LU2051031982,2023q4,
2,3M 0.95%,3096.0,0.23,LU2051031982,2022q4,
3,A!ac Inc. 3.625% - 15/Nov/24,3305493.0,0.52,LU0323453950,2014q4,
4,A2A SpA - Reg 1.000% 16/07/2029,312917.0,0.01,LU1720795126,2020q4,


In [6]:
# Worked example: take the holding at positional row 35 and use its free-text
# name as the query for the retrieval + LLM matching steps that follow.
x = df_bond_holdings.iloc[35]
bond_name = x.loc["name"]
bond_name

'ADLER Group 1.875%'

In [9]:
# Run the encoder on the GPU when torch can see one, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = SentenceTransformer('BAAI/bge-large-en-v1.5', device=device)

# Number of nearest neighbours to retrieve as ISIN candidates.
k = 50

# Embed the holding name and add a leading batch axis: index.search takes a
# 2-D (n_queries, dim) matrix, and here there is a single query.
xq = model.encode(bond_name)[np.newaxis, :]

# D holds the distances and I the positions (rows of the indexed matrix) of the
# k nearest bond summaries, one row per query.
D, I = index.search(xq, k)

In [10]:
# Map the faiss hits for the single query back to their bond summaries.
# The index was filled from df_bonds in row order, so labels are positional rows.
# faiss pads the label array with -1 when it finds fewer than k neighbours; passing
# -1 to .iloc would silently select the LAST row of df_bonds, so drop those slots.
hit_rows = I[0][I[0] >= 0]
top_matches = df_bonds['summary'].iloc[hit_rows].tolist()
top_matches

['ADLER GROUP SA: Maturity Date 2026-01-14 00:00:00, Coupon Rate 1.875%, Currency EUR, ISIN number is XS2283224231',
 'ADLER GROUP SA: Maturity Date 2029-01-14 00:00:00, Coupon Rate 2.25%, Currency EUR, ISIN number is XS2283225477',
 'ADLER GROUP SA: Maturity Date 2026-11-13 00:00:00, Coupon Rate 2.75%, Currency EUR, ISIN number is XS2248826294',
 'ADLER GROUP SA: Maturity Date 2023-11-23 00:00:00, Coupon Rate 2.0%, Currency EUR, ISIN number is DE000A2RUD79',
 'ADLER GROUP SA: Maturity Date 2025-07-31 00:00:00, Coupon Rate nan%, Currency EUR, ISIN number is DE000A3LMVH5',
 'ADLER FINANCING SARL: Maturity Date 2028-12-31 00:00:00, Coupon Rate 6.25%, Currency EUR, ISIN number is DE000A3L3AD6',
 'ADLER FINANCING SARL: Maturity Date 2025-06-30 00:00:00, Coupon Rate 12.5%, Currency EUR, ISIN number is DE000A3LF6J0',
 'ADLER REAL ESTATE GMBH: Maturity Date 2021-12-06 00:00:00, Coupon Rate 1.5%, Currency EUR, ISIN number is XS1731858392',
 'ADLER FINANCING SARL: Maturity Date 2029-12-31 00:00

In [11]:
from openai import OpenAI, AsyncOpenAI

# OpenAI-compatible endpoint that serves the LLM used for the final match.
# NOTE(review): the host looks like a compute-node name and may change between
# jobs — confirm before re-running.
openai_api_base = "http://sh04-06n05:12345/v1"
# Placeholder key; presumably the endpoint does not require authentication — TODO confirm.
openai_api_key = "None"

client = OpenAI(base_url=openai_api_base, api_key=openai_api_key)

In [12]:
# Render the retrieved candidates as a bulleted list. Prefixing every entry
# (instead of joining with "\n*") makes sure the first candidate gets a bullet too.
bond_list = "\n".join(f"* {summary}" for summary in top_matches)

# The prompts are sent as role-tagged `messages` to a chat-completions endpoint.
# Such endpoints normally wrap each message in the model's own chat template, so the
# hand-written <|begin_of_text|> / <|start_header_id|> / <|end_of_text|> markers were
# redundant and put an end-of-text marker inside the prompt; they are removed here.
# NOTE(review): confirm the serving endpoint applies the chat template for this model.
sys_prompt = """
You are assisting me in finding the matching ISIN number from a list of bonds:
{}
Return the ISIN number in JSON format.
""".format(bond_list)

# <|start_bond_id|> / <|end_bond_id|> are kept as plain-text delimiters around the
# query; they do not appear to be tokenizer special tokens — TODO confirm.
user_prompt = """
Find the ISIN number for the bond below:
<|start_bond_id|>
{}
<|end_bond_id|>
""".format(bond_name)

In [54]:
# Ask the served model to pick the matching ISIN from the retrieved candidates.
# NOTE(review): the saved output of this cell discusses an ABERTIS bond while
# bond_name above is 'ADLER Group 1.875%', and the execution count is out of
# sequence — the output looks stale; re-run from a fresh kernel.
llm_model_path = "/scratch/groups/maggiori/raw_model_weights/Llama-3.3-70B-Instruct-AWQ"
chat_messages = [
    {"role": "system", "content": sys_prompt},
    {"role": "user", "content": user_prompt},
]

chat_response = client.chat.completions.create(
    model=llm_model_path,
    messages=chat_messages,
    temperature=0,   # no sampling randomness requested
    max_tokens=512,  # upper bound on the length of the reply
)

# Only the first returned choice is used.
llm_output = chat_response.choices[0].message.content
print("Chat response:\n", llm_output)

Chat response:
 The ISIN number for the bond "ABERTIS INFRSESTRUCTURAS FINANCE BV VAR PERPETUAL, ISSUE Date: 2023q4" is not found in the provided list.

However, I can suggest that the bond might be "ISIN XS2644410214: Abertis Infraestructuras SA, SUB Senior Bond/Note, EUR, COUPON 4.125 %, ISSUE Date 2023q3" which is close to the issue date you provided, but it's not an exact match.

If you're looking for a specific ISIN number, I recommend checking the list again or providing more information about the bond.

Here is the response in JSON format:
```json
{
  "ISIN": null,
  "message": "ISIN number not found in the provided list."
}
```
