In [1]:
import os
# When the kernel starts inside a 'ditto_processing' folder, move two levels up so
# the relative 'data/...' paths used by the following cells resolve.
# os.path.basename is used instead of splitting on '/' so the check also works
# where the path separator is not '/'. The guard keeps a re-run of this cell
# from climbing further once the directory has already been changed.
if os.path.basename(os.getcwd()) == 'ditto_processing':
    os.chdir('../../')
os.getcwd()

'/home/claude/development/datainc/datainc_code'

In [2]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm


In [3]:
# CSV with the securities records; every path here is relative to the working
# directory set up in the first cell.
# NOTE(review): this file sits under 'seed_0' while the match files below sit
# under 'seed_44' - confirm that mixing the two seeds is intended.
entity_data_path = 'data/raw/synthetic_data/seed_0/synthetic_securities_dataset_seed_0_size_984942_sorted.csv'


# Pair files for the three splits; later cells read the 'lid', 'rid' and 'label'
# columns from each of them.
train_matches_path = 'data/processed/synthetic_securities/seed_44/train__pre_split__all_matches.csv'
val_matches_path = 'data/processed/synthetic_securities/seed_44/val__pre_split__all_matches.csv'
test_matches_path = 'data/processed/synthetic_securities/seed_44/test__pre_split__all_matches.csv'

# Additional pair file that is exported in the same way as the splits above.
graph_test_candidates_path = 'data/processed/synthetic_securities/full_test_candidates__ditto.csv'

In [4]:
# Columns that go into the Ditto serialisation, in the order they are encoded.
entity_columns = [
    'data_source_id', 'issuer_id', 'external_id', 'name', 'ISIN',
    'CUSIP', 'VALOR', 'SEDOL', 'primary_currency', 'type',
]

# Read the securities CSV, drop the 'inserted' / 'last_modified' columns and keep
# only the columns listed above. The explicit drop is retained on purpose: it
# raises if either of those two columns is missing from the file.
entity_df = (
    pd.read_csv(entity_data_path)
    .drop(columns=['inserted', 'last_modified'])
)[entity_columns]

# Quick sanity check: first three rows and the overall shape.
display(entity_df.head(3))
print(entity_df.shape)

Unnamed: 0,data_source_id,issuer_id,external_id,name,ISIN,CUSIP,VALOR,SEDOL,primary_currency,type
0,1,10000041,97091953,Portsmith SYS Ordinary Share,US30443RBJG1,30443RBJG,122375411.0,STFRDE,USD,Ordinary Share
1,1,10000140,60004107,LAWYEAH! Dividend Rights,ES0486TV0357,,,YNDLPY,Euro,Class A Rights
2,1,10000140,96954230,LAWYEAH! Ordinary Share,ES7021LZL525,QZ95QFC3G,108782287.0,BZDUJG,EUR,Ordinary Share


(984942, 10)


In [5]:
def ditto_encode_record(row):
    """Serialise one record into a single "COL <name> VAL <value>" string.

    Parameters
    ----------
    row : pandas.Series
        One record; the index labels are used as the column names.

    Returns
    -------
    str
        One "COL <name> VAL <value>" fragment per field, joined by single
        spaces. Missing values (as reported by ``pd.isna``) are written as
        '-'. The fields 'id', 'ditto_encoded' and 'Unnamed: 0' are left out.
    """
    fields = list(row.index)
    # Bookkeeping fields that must not end up in the serialised text.
    for skipped in ('id', 'ditto_encoded', 'Unnamed: 0'):
        if skipped in fields:
            fields.remove(skipped)

    def render(field):
        cell = row[field]
        if pd.isna(cell):
            cell = '-'
        return f"COL {field} VAL {cell}"

    return " ".join(map(render, fields))

# Store the serialised form of every row in a new 'ditto_encoded' column.
# This adds a column to entity_df in place; ditto_encode_record skips an
# existing 'ditto_encoded' field, so re-running the cell does not nest encodings.
entity_df['ditto_encoded'] = entity_df.apply(ditto_encode_record, axis=1)

In [6]:
# (key, csv path) for every pair file that gets loaded below.
path_list = [
    ('train', train_matches_path),
    ('val', val_matches_path),
    ('test', test_matches_path),
    ('graph_test_candidates', graph_test_candidates_path),
]

# Read each file into `data` under its key and report its row count.
data = {}
for name, path in path_list:
    frame = pd.read_csv(path)
    data[name] = frame
    print(f"{name} ({len(frame)})")


train (4531795)
val (1510780)
test (1511080)
graph_test_candidates (326296)


In [7]:
def _single_line(text):
    """Replace tab, carriage-return and newline characters with spaces."""
    return text.replace('\t', ' ').replace('\r', ' ').replace('\n', ' ')


def build_ditto_dataset(df, file_path, entities=None):
    """Write record pairs to a tab-separated text file, one pair per line.

    Each output line is ``<left encoding>\\t<right encoding>\\t<label>``, where
    the encodings are looked up in the 'ditto_encoded' column of the entity
    table using the pair's 'lid' and 'rid' values as index labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Pairs to export; must provide the columns 'lid', 'rid' and 'label'.
    file_path : str
        Destination file. An existing file at this path is replaced.
    entities : pandas.DataFrame, optional
        Table holding the 'ditto_encoded' column. Defaults to the
        notebook-level ``entity_df``, which is what the function always used
        before this parameter existed.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If a 'lid' / 'rid' value is not a label of the entity table's index,
        or a required column is missing.
    """
    if entities is None:
        entities = entity_df
    # Loop-invariant: fetch the column once instead of on every lookup.
    encoded = entities['ditto_encoded']

    # Iterate the three columns directly instead of using iterrows(): iterrows
    # builds a Series per row and coerces all columns of a row to one common
    # dtype, so an unrelated float column would turn ids/labels into floats
    # (a label would be written as "1.0" instead of "1").
    pairs = zip(df['lid'], df['rid'], df['label'])

    lines = []
    for lid, rid, label in tqdm(pairs, total=df.shape[0], desc='Encoding Rows for Ditto...'):
        # Tabs and line breaks inside an encoding would break the file layout.
        left_encoded = _single_line(encoded.loc[lid])
        right_encoded = _single_line(encoded.loc[rid])
        lines.append(f"{left_encoded}\t{right_encoded}\t{label}")

    # Only touch an existing output once every line has been built, so a
    # failed lookup above does not destroy the previous export.
    if os.path.exists(file_path):
        os.remove(file_path)
        print(f"Removed existing file at {file_path}")

    # Explicit encoding keeps the output independent of the platform default.
    with open(file_path, 'w', encoding='utf-8') as f:
        for line in tqdm(lines, total=len(lines), desc='Writing to file...'):
            f.write(line)
            # Text mode already translates '\n' to the platform line ending;
            # writing os.linesep here would produce '\r\r\n' on Windows.
            f.write('\n')

    print(f"Saved at: {file_path}")

In [9]:
# All Ditto text files for this dataset share one output folder; create it if
# it does not exist yet.
# NOTE(review): the '../datainc_code/' prefix only points back into the working
# directory while that directory is itself named 'datainc_code' - confirm.
ditto_output_dir = '../datainc_code/data/processed/ditto/finance/synthetic_securities'
os.makedirs(ditto_output_dir, exist_ok=True)

# Exports for the train / val / test pairs are currently disabled.
#build_ditto_dataset(data['train'], f'{ditto_output_dir}/train.txt')
#build_ditto_dataset(data['val'], f'{ditto_output_dir}/val.txt')
#build_ditto_dataset(data['test'], f'{ditto_output_dir}/test.txt')

build_ditto_dataset(data['graph_test_candidates'], f'{ditto_output_dir}/graph_test_candidates.txt')

Encoding Rows for Ditto...:   0%|          | 0/326296 [00:00<?, ?it/s]

Writing to file...:   0%|          | 0/326296 [00:00<?, ?it/s]

Saved at: ../datainc_code/data/processed/ditto/finance/synthetic_securities/graph_test_candidates.txt
