In [1]:
import os
# When the kernel starts inside a folder named `ditto_processing`, move two levels up
# (presumably to the project root that the relative `data/...` paths expect).
# os.path.basename handles the platform's path separator, unlike splitting on '/'.
if os.path.basename(os.getcwd()) == 'ditto_processing':
    os.chdir(os.path.join(os.pardir, os.pardir))
os.getcwd()

'/home/claude/development/datainc/datainc_code'

In [2]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm


In [3]:
# Input locations, all relative to the current working directory.
raw_wdc_dir = 'data/raw/wdc_80pair'

# Entity table (read with pandas and indexed by its `id` column).
entity_data_path = f'{raw_wdc_dir}/wdc_80pair.csv'

# Pair files, one per split.
train_matches_path = f'{raw_wdc_dir}/train.csv'
val_matches_path = f'{raw_wdc_dir}/val.csv'
test_matches_path = f'{raw_wdc_dir}/test.csv'

# Additional candidate pairs, stored under the processed-data tree.
graph_test_candidates_path = 'data/processed/wdc/full_test_candidates.csv'

In [16]:
# Load the entity table and key it by `id` so that ids can be resolved with `.loc`.
# set_index is chained rather than applied with inplace=True, so the frame is built
# in a single expression.
entity_df = pd.read_csv(entity_data_path).set_index('id')

# Each id must identify exactly one row: with duplicates, `.loc[id, column]` returns a
# Series instead of a single value, which would silently corrupt per-id lookups.
assert entity_df.index.is_unique, "duplicate ids found in the entity table"

display(entity_df.head(3))
print(entity_df.shape)

Unnamed: 0_level_0,brand,title,description,price,priceCurrency
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
533407,Western Digital,WD Blue SN550 1TB M.2 SSD,"(Solid, 1TB, Blue, Drive), M.2, SN550, SSD, St...",119.0,EUR
22587878,,TP-Link AC600 Nano Wireless USB Adapter,"High Speed WiFi, Dual Band Wireless, Nano desi...",12.48,GBP
25725051,AMD,AMD Ryzen 5 2600X - 4.25 GHz - 6-core - 12 thr...,"IcecatLive.getDatasheet('#IcecatLive',{'UserNa...",296.39,EUR


(4841, 5)


In [17]:
def ditto_encode_record(row):
    """Serialize one entity row as a string of ``COL <name> VAL <value>`` pairs.

    The labels ``id``, ``ditto_encoded`` and ``Unnamed: 0`` are left out; missing
    values (as judged by ``pd.isna``) are written as ``-``.

    Args:
        row: A pandas Series whose index holds the column names.

    Returns:
        The space-joined pairs, in the order the columns appear in ``row``.
    """
    skipped = {'id', 'ditto_encoded', 'Unnamed: 0'}

    def render(label):
        cell = row[label]
        if pd.isna(cell):
            cell = '-'
        return f"COL {label} VAL {cell}"

    return " ".join(render(label) for label in row.index if label not in skipped)

# Store the serialized form of every entity in a `ditto_encoded` column. The encoder
# skips a column with that name, so re-running this cell does not nest encodings.
entity_df['ditto_encoded'] = entity_df.apply(ditto_encode_record, axis=1)

In [18]:
# (split name, csv path) for every pair table used in this notebook.
path_list = list(zip(
    ('train', 'val', 'test', 'graph_test_candidates'),
    (train_matches_path, val_matches_path, test_matches_path, graph_test_candidates_path),
))

# Read each table into `data[<split name>]` and report its row count.
data = {}
for name, path in path_list:
    pairs_df = pd.read_csv(path)
    data[name] = pairs_df
    print(f"{name} ({len(pairs_df)})")


train (19835)
val (4500)
test (4500)
graph_test_candidates (9166)


In [19]:
def build_ditto_dataset(df, file_path):
    """Write entity pairs to ``file_path`` as tab-separated text, one pair per line.

    Every row of ``df`` becomes one line ``left<TAB>right<TAB>label``, where the two
    sides are the ``ditto_encoded`` strings looked up in the notebook-level
    ``entity_df`` by ``lid`` and ``rid``.

    Args:
        df: DataFrame with ``lid``, ``rid`` and ``label`` columns.
        file_path: Destination text file; an existing file is replaced.

    Raises:
        KeyError: If a ``lid``/``rid`` is not present in the index of ``entity_df``.
    """
    def _single_line(text):
        # Tabs delimit the three fields and line breaks delimit records, so neither
        # may survive inside an encoded entity.
        return text.replace('\t', ' ').replace('\r', ' ').replace('\n', ' ')

    if os.path.exists(file_path):
        os.remove(file_path)
        print(f"Removed existing file at {file_path}")

    lines = []

    # Iterate the three columns directly: iterrows() builds a Series per row and can
    # upcast the ids when the columns have mixed dtypes.
    triples = zip(df['lid'], df['rid'], df['label'])
    for lid, rid, label in tqdm(triples, total=df.shape[0], desc='Encoding Rows for Ditto...'):
        left_encoded = _single_line(entity_df.loc[lid, 'ditto_encoded'])
        right_encoded = _single_line(entity_df.loc[rid, 'ditto_encoded'])

        lines.append(f"{left_encoded}\t{right_encoded}\t{label}")

    # Text mode already translates '\n' to the platform line ending; writing
    # os.linesep here would produce '\r\r\n' on Windows. The encoding is explicit so
    # non-ASCII text does not depend on the locale's default.
    with open(file_path, 'w', encoding='utf-8') as f:
        for line in tqdm(lines, total=len(lines), desc='Writing to file...'):
            f.write(line + '\n')

    print(f"Saved at: {file_path}")

In [20]:
# NOTE(review): `../datainc_code/` points back into the working directory only if that
# directory is itself named `datainc_code` — confirm before running from elsewhere.
ditto_output_dir = '../datainc_code/data/processed/ditto/wdc_products'
os.makedirs(ditto_output_dir, exist_ok=True)

# One output file per loaded pair table, named after its key in `data`.
for split_name in ('train', 'val', 'test', 'graph_test_candidates'):
    build_ditto_dataset(data[split_name], f'{ditto_output_dir}/{split_name}.txt')

Encoding Rows for Ditto...:   0%|          | 0/19835 [00:00<?, ?it/s]

Writing to file...:   0%|          | 0/19835 [00:00<?, ?it/s]

Saved at: ../datainc_code/data/processed/ditto/wdc_products/train.txt


Encoding Rows for Ditto...:   0%|          | 0/4500 [00:00<?, ?it/s]

Writing to file...:   0%|          | 0/4500 [00:00<?, ?it/s]

Saved at: ../datainc_code/data/processed/ditto/wdc_products/val.txt


Encoding Rows for Ditto...:   0%|          | 0/4500 [00:00<?, ?it/s]

Writing to file...:   0%|          | 0/4500 [00:00<?, ?it/s]

Saved at: ../datainc_code/data/processed/ditto/wdc_products/test.txt


Encoding Rows for Ditto...:   0%|          | 0/9166 [00:00<?, ?it/s]

Writing to file...:   0%|          | 0/9166 [00:00<?, ?it/s]

Saved at: ../datainc_code/data/processed/ditto/wdc_products/graph_test_candidates.txt
