In [1]:
import os
if os.getcwd().split('/')[-1] == 'ditto_processing':
    os.chdir('../../')
os.getcwd()

'/home/lehl/development/datainc/datainc_code'

In [2]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

from scipy.sparse import csr_matrix
import swifter

import nltk
from nltk.corpus import stopwords

In [9]:
entity_data_path = 'data/raw/synthetic_data/seed_0/synthetic_companies_dataset_seed_0_size_868254_sorted.csv'

#train_matches_path = 'data/raw/synthetic_data/seed_0/companies/train.csv'
#val_matches_path = 'data/raw/synthetic_data/seed_0/companies/val.csv'
#test_matches_path = 'data/raw/synthetic_data/seed_0/companies/test.csv'

train_matches_path = 'data/processed/synthetic_companies/seed_44/train__pre_split__all_matches.csv'
test_matches_path = 'data/processed/synthetic_companies/seed_44/test__pre_split__all_matches.csv'
val_matches_path = 'data/processed/synthetic_companies/seed_44/valid__pre_split__all_matches.csv'

graph_test_candidates_path = 'data/processed/synthetic_companies/full_test_candidates.csv'

In [4]:
entity_df = pd.read_csv(entity_data_path)
entity_df = entity_df.drop(columns=['inserted', 'last_modified'])

entity_columns = [
    'data_source_id', 'external_id',
    'name', 'city', 'region', 'country_code',
    'short_description'
]
entity_df = entity_df[entity_columns]

display(entity_df[:3])
print(entity_df.shape)

  entity_df = pd.read_csv(entity_data_path)


Unnamed: 0,data_source_id,external_id,name,city,region,country_code,short_description
0,1,10000041,Portsmith SYS,Boise,Idaho,US,Design and manufacture of innovative enterpris...
1,1,10000140,LAWYEAH!,Madrid,Madrid,ESP,LawYeah! provides tools and legal services onl...
2,1,10000426,Reportive Prof. Co.,,,,Reportive Prof. Co..in is News and Media websi...


(868254, 7)


In [5]:
def remove_stopwords(text):
    words = str(text).split()
    filtered_words = [word for word in words if word.lower() not in stopwords.words('english')]
    return ' '.join(filtered_words)

entity_df['short_description'] = entity_df['short_description'] \
    .swifter \
    .allow_dask_on_strings(True) \
    .apply(remove_stopwords)

display(entity_df[:3])

Dask Apply:   0%|          | 0/16 [00:00<?, ?it/s]

Unnamed: 0,data_source_id,external_id,name,city,region,country_code,short_description
0,1,10000041,Portsmith SYS,Boise,Idaho,US,Design manufacture innovative enterprise produ...
1,1,10000140,LAWYEAH!,Madrid,Madrid,ESP,LawYeah! provides tools legal services online ...
2,1,10000426,Reportive Prof. Co.,,,,Reportive Prof. Co..in News Media website cove...


In [6]:
def ditto_encode_record(row):
    accu = []
    col_names = list(row.index)
    if 'id' in col_names:
        col_names.remove('id')
    if 'ditto_encoded' in col_names:
        col_names.remove('ditto_encoded')
    if 'Unnamed: 0' in col_names:
        col_names.remove('Unnamed: 0')

    for col_name in col_names:
        value = '-' if pd.isna(row[col_name]) else row[col_name]
        accu.append(f"COL {col_name} VAL {value}")
    return " ".join(accu)

entity_df['ditto_encoded'] = entity_df.swifter.apply(ditto_encode_record, axis=1)

Pandas Apply:   0%|          | 0/868254 [00:00<?, ?it/s]

In [10]:
data = dict()
path_list = [
    ('train', train_matches_path),
    ('val', val_matches_path),
    ('test', test_matches_path),
    ('graph_test_candidates', graph_test_candidates_path)
]

for name, path in path_list:
    data[name] = pd.read_csv(path)

    if 'Unnamed: 0' in data[name].columns:
        data[name] = data[name].drop(columns=['Unnamed: 0'])
    
    print(f"{name} ({data[name].shape[0]})")
    #display(data[name][:3])


train (4500400)
val (1499495)
test (1499495)
graph_test_candidates (1140674)


In [77]:
def _generate_sparse_match_matrix(df: pd.DataFrame, max_id=None):
    if not max_id:
        max_id = np.max(df[['lid', 'rid']].max().to_numpy())
    data = np.ones(len(df), dtype=np.bool_)
    rows = df['lid']
    cols = df['rid']

    # Instantiated with both row-col and col-row, to include
    # both variants (A,B) and (B,A) as "already taken"
    #
    sparse_data = np.concatenate([data, data])
    sparse_row_ind = np.concatenate([rows, cols])
    sparse_col_ind = np.concatenate([cols, rows])
    sparse_matrix = csr_matrix((sparse_data, (sparse_row_ind, sparse_col_ind)), shape=(max_id + 1, max_id + 1))

    return sparse_matrix, max_id

In [78]:
df_test = data['test']
df_train = data['train']
df_val = data['val']

validation_idxs = np.array(sorted(set(df_val[df_val['label'] == 1]['lid'].tolist() + df_val[df_val['label'] == 1]['rid'].tolist())))

# Calculate the maximum index in the datasets
max_id = entity_df.shape[0] - 1

m1, _ = _generate_sparse_match_matrix(df_train, max_id=max_id)
m2, _ = _generate_sparse_match_matrix(df_val, max_id=max_id)
m3, _ = _generate_sparse_match_matrix(df_test, max_id=max_id)
match_matrix = m1 + m2 + m3
match_matrix

<868254x868254 sparse matrix of type '<class 'numpy.bool_'>'
	with 7199856 stored elements in Compressed Sparse Row format>

In [79]:
num_nonmatches_to_generate = df_val.shape[0] * 4
result = list()

pbar = tqdm(total=num_nonmatches_to_generate + 1,
            desc=f'[{name}] Building random nonmatches for the validation set...')

while len(result) < num_nonmatches_to_generate:
    row_id = np.random.choice(validation_idxs)
    col_id = np.random.randint(0, match_matrix.shape[1] - 1)

    if match_matrix[row_id, col_id] == 0:
        result.append((row_id, col_id))
        result.append((col_id, row_id))
        pbar.update(2)

pbar.close()

df_nonmatches = pd.DataFrame(result, columns=['lid', 'rid'])
df_nonmatches['label'] = 0
df_nonmatches

[test] Building random nonmatches for the validation set...:   0%|          | 0/1199597 [00:00<?, ?it/s]

Unnamed: 0,lid,rid,label
0,685300,473333,0
1,473333,685300,0
2,283485,730027,0
3,730027,283485,0
4,33318,519900,0
...,...,...,...
1199591,213256,389879,0
1199592,379415,235486,0
1199593,235486,379415,0
1199594,219178,384737,0


In [82]:
df_val__with_nonmatches = pd.concat([df_val, df_nonmatches]).reset_index(drop=True)
df_val__with_nonmatches.to_csv('data/processed/synthetic_companies/seed_44/valid__pre_split__all_matches.csv', index=False)
display(df_val__with_nonmatches[:3])


Unnamed: 0,lid,rid,label
0,192336,360983,1
1,192336,419579,1
2,192336,548537,1


In [12]:
def build_ditto_dataset(df, file_path):
    if os.path.exists(file_path):
        os.remove(file_path)
        print(f"Removed existing file at {file_path}")
    
    data = []

    for i, row in tqdm(df.iterrows(), total=df.shape[0], desc='Encoding Rows for Ditto...'):
        lid, rid, label = row['lid'], row['rid'], row['label']

        left_encoded = entity_df.loc[lid, 'ditto_encoded'].replace('\t', ' ').replace('\r', ' ').replace('\n', ' ')
        right_encoded = entity_df.loc[rid, 'ditto_encoded'].replace('\t', ' ').replace('\r', ' ').replace('\n', ' ')

        data.append(f"{left_encoded}\t{right_encoded}\t{label}")

    with open(file_path, 'w') as f:
        for line in tqdm(data, total=len(data), desc='Writing to file...'):
            f.write(line)
            f.write(os.linesep)

    print(f"Saved at: {file_path}")

In [13]:
os.makedirs('../datainc_code/data/processed/ditto/finance/synthetic_companies', exist_ok=True)

#build_ditto_dataset(data['train'], '../datainc_code/data/processed/ditto/finance/synthetic_companies/train.txt')
#build_ditto_dataset(data['val'], '../datainc_code/data/processed/ditto/finance/synthetic_companies/val.txt')
#build_ditto_dataset(data['test'], '../datainc_code/data/processed/ditto/finance/synthetic_companies/test.txt')

build_ditto_dataset(data['graph_test_candidates'], '../datainc_code/data/processed/ditto/finance/synthetic_companies/graph_test_candidates.txt')


Encoding Rows for Ditto...:   0%|          | 0/1140674 [00:00<?, ?it/s]

Writing to file...:   0%|          | 0/1140674 [00:00<?, ?it/s]

Saved at: ../datainc_code/data/processed/ditto/finance/synthetic_companies/graph_test_candidates.txt
