In [54]:
import pandas as pd
import numpy as np
import re
from typing import List, Tuple, Dict, Any, Set, Callable, Hashable

In [55]:
# Record fields that the similarity/distance helpers in this notebook iterate over.
# Columns deliberately left out of the comparison:
#   rec_id, postcode, age, soc_sec_id, blocking_number
column_names = [
    'given_name', 'surname',
    'street_number', 'address_1', 'address_2', 'suburb', 'state',
    'date_of_birth', 'phone_number',
]

In [56]:

def clean_address(address: str) -> str:
    """Strip address-type words and abbreviations from an address string.

    Whole words such as "street" or "улица" and dotted abbreviations such as
    "st." or "ул." are removed case-insensitively. Afterwards every run of
    whitespace is collapsed to a single space and the ends are trimmed.

    Args:
        address: Raw address text.

    Returns:
        The address with the listed words and abbreviations removed.
    """
    # Words and abbreviations to remove; every entry is a regex fragment.
    words_to_remove = [
        "улица", "ул\\. ?", "проспект", "пр-т\\. ?", "пр\\. ?", "бульвар", "б-р\\. ?", "переулок", "пер\\. ?", "набережная", "наб\\. ?",
        "шоссе", "площадь", "пл\\. ?", "дом", "д\\. ?", "квартира", "кв\\. ?", "корпус", "корп\\. ?", "строение", "стр\\. ?", "область",
        "обл\\. ?", "город", "г\\. ?", "поселок", "пос\\. ?", "деревня", "дер\\. ?",
        "street", "st\\. ?", "avenue", "ave\\. ?", "boulevard", "blvd\\. ?", "alley", "al\\. ?", "drive", "dr\\. ?",
        "square", "sq\\. ?", "house", "h\\. ?", "apartment", "apt\\. ?", "building", "bldg\\. ?", "county", "co\\. ?",
        "city", "ct\\. ?", "village", "vil\\. ?", "township", "twp\\. ?", "road", "rd\\. ?"
    ]

    # Build a single alternation from the list.
    # The closing anchor accepts either a word boundary (needed so that plain
    # words do not match inside longer words, e.g. "street" in "streets") or a
    # position right after '.'/' '. A bare trailing \b never matches after an
    # abbreviation's dot when the dot is followed by end-of-string, whitespace
    # or punctuation, which left inputs like "Main st." untouched.
    pattern = r'\b(?:{})(?:\b|(?<=[. ]))'.format('|'.join(words_to_remove))

    # Replace every match with the empty string.
    cleaned_address = re.sub(pattern, '', address, flags=re.IGNORECASE)

    # Collapse redundant whitespace and return the cleaned string.
    return re.sub(r'\s+', ' ', cleaned_address).strip()

In [57]:
def get_key(x: str) -> str:
    """Return the first two '-'-separated fields of ``x`` joined by '-'.

    Raises:
        IndexError: if ``x`` contains no '-'.
    """
    parts = x.split('-')
    return f'{parts[0]}-{parts[1]}'

In [58]:
data = pd.read_csv('test.csv', dtype=str)
# Trim surrounding whitespace column by column. DataFrame.applymap is deprecated
# since pandas 2.1; because dtype=str makes every cell a string or NaN, the
# vectorised .str accessor produces the same values and leaves NaN untouched.
data = data.apply(lambda col: col.str.strip())
# Optional address normalisation through clean_address is currently switched off:
# data['address_1'] = data['address_1'].apply(lambda x:  clean_address(x) if isinstance(x, str) else x )
# data['address_2'] = data['address_2'].apply(lambda x:  clean_address(x) if isinstance(x, str) else x )
# Downstream helpers test fields with `is not None`, so turn NaN into None.
data = data.replace({np.nan:None})
# rec_common_id is the get_key() prefix of rec_id; the evaluation code treats
# records that share it as belonging together.
data['rec_common_id'] = data['rec_id'].map(get_key)
data_list = data.to_dict('records')

In [59]:
# Index the loaded records three ways:
#   data_by_ids  : rec_common_id -> list of records carrying that id
#   expected_res : rec_common_id -> set of rec_id values carrying that id
#   origs        : rec_id -> record, for records whose rec_id contains 'org'
data_by_ids = {}
expected_res = {}
origs = {}
for record in data_list:
    common_id = record['rec_common_id']
    rec_id = record['rec_id']
    data_by_ids.setdefault(common_id, []).append(record)
    expected_res.setdefault(common_id, set()).add(rec_id)
    if 'org' in rec_id:
        origs[rec_id] = record

In [60]:
from distances.levenshtein import levenstein_similarity, levenshtein_distance_memopt
from distances.jaro import jaro_winkler_similarity
from distances.damerau_levenstein import damerau_levenshtein_similarity, damerau_levenshtein_distance_memopt
from distances.jaccard import jaccard_similarity_str

In [61]:
def get_sim_mean(a: Dict[Hashable, Any], b: Dict[Hashable, Any], sim: Callable[[str, str], float]) -> float:
  """Average ``sim`` over the ``column_names`` fields that both records fill in.

  A field contributes only when both records hold a non-empty string for it;
  all other fields are skipped. Returns 0 when no field qualifies.
  """
  scores = []
  for k in column_names:
    if a[k] is None or b[k] is None:
      continue
    if not isinstance(a[k], str) or a[k] == '':
      continue
    if not isinstance(b[k], str) or b[k] == '':
      continue
    scores.append(sim(a[k], b[k]))
  if not scores:
    return 0
  return sum(scores) / len(scores)
def get_lev_sim(a: Dict[Hashable, Any], b: Dict[Hashable, Any]) -> float:
  """Mean per-field similarity of two records, scored with levenstein_similarity."""
  return get_sim_mean(a, b, sim=levenstein_similarity)
def get_dam_lev_sim(a: Dict[Hashable, Any], b: Dict[Hashable, Any]) -> float:
  """Mean per-field similarity of two records, scored with damerau_levenshtein_similarity."""
  return get_sim_mean(a, b, sim=damerau_levenshtein_similarity)
def get_jaro_sim(a: Dict[Hashable, Any], b: Dict[Hashable, Any]) -> float:
  """Mean per-field similarity of two records, scored with jaro_winkler_similarity."""
  return get_sim_mean(a, b, sim=jaro_winkler_similarity)
def get_jaccard_sim(a: Dict[Hashable, Any], b: Dict[Hashable, Any]) -> float:
  """Mean per-field similarity of two records, scored with jaccard_similarity_str."""
  return get_sim_mean(a, b, sim=jaccard_similarity_str)

In [62]:
# test_data = data_by_ids['rec-148']
# main_rec = test_data[-1]
# test_data = test_data[:-1]
# test_data.append(data_by_ids['rec-100'][0])
# for td in test_data:
#   sim = get_lev_sim(main_rec, td)
#   print(f'main_rec_id: {main_rec["rec_id"]}, td_rec_id: {td["rec_id"]}, similarity: {sim}')

In [63]:
# for k in column_names:
#   if main_rec[k] is not None and isinstance(main_rec[k], str):
#     print(f'{k}: {main_rec[k]}')
# for k in column_names:
#   if test_data[2][k] is not None and isinstance(test_data[2][k], str):
#     print(f'{k}: {test_data[2][k]}')

In [64]:
def test_distances(data: List[Dict[Hashable, Any]], expected_res: Dict[str, Set[str]], similarity: Callable[[Dict[Hashable, Any], Dict[Hashable, Any]], float],threshold: float = 0.85) -> Tuple[int, int, int]:
  tp = 0
  fp = 0
  fn = 0
  res = []
  for i in range(len(data)):
    for j in range(i + 1, len(data)):
      sim = similarity(data[i], data[j])
      if sim > threshold:
        res.append((data[i]['rec_id'], data[j]['rec_id'], sim))
  res_by_ids = {}
  for (a_id, b_id, sim) in res:
    a_key = get_key(a_id)
    b_key = get_key(b_id)
    if a_key != b_key:
      print(f'ERROR: {a_id}, {b_id}, {sim}')
      fp += 1
      continue
    if a_key not in res_by_ids:
      res_by_ids[a_key] = set()
    res_by_ids[a_key].add(a_id)
    res_by_ids[a_key].add(b_id)
  for key in expected_res:
    if key not in res_by_ids:
      if len(expected_res[key]) > 1:
        # print(f'ERROR: not found {key}')
        fn += len(expected_res[key])
      continue
    # if len(expected_res[key]) != len(res_by_ids[key]):
    #   print(f'ERROR: not full {key}')
    fn += len(expected_res[key]) - len(res_by_ids[key])
    tp += len(res_by_ids[key])
  return tp, fp, fn

In [65]:
# Score the plain Levenshtein similarity over all record pairs.
(tp, fp, fn) = test_distances(data_list, expected_res, get_lev_sim)
print('LEVENSTEIN')
print(f'tp: {tp}, fp: {fp}, fn: {fn}')
# Guard every ratio: when no pair clears the threshold the denominators are 0
# and the unguarded divisions raise ZeroDivisionError.
precision = tp / (tp + fp) if (tp + fp) else 0.0
recall = tp / (tp + fn) if (tp + fn) else 0.0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
print(f'precision: {precision}')
print(f'recall: {recall}')
print(f'f1: {f1}')

LEVENSTEIN
tp: 464, fp: 0, fn: 77
precision: 1.0
recall: 0.8576709796672828
f1: 0.9233830845771144


In [66]:
# Score the Jaro-Winkler similarity over all record pairs.
(tp, fp, fn) = test_distances(data_list, expected_res, get_jaro_sim)
print('JARO')
print(f'tp: {tp}, fp: {fp}, fn: {fn}')
# Guard every ratio: when no pair clears the threshold the denominators are 0
# and the unguarded divisions raise ZeroDivisionError.
precision = tp / (tp + fp) if (tp + fp) else 0.0
recall = tp / (tp + fn) if (tp + fn) else 0.0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
print(f'precision: {precision}')
print(f'recall: {recall}')
print(f'f1: {f1}')

JARO
tp: 468, fp: 0, fn: 73
precision: 1.0
recall: 0.8650646950092421
f1: 0.9276511397423192


In [67]:
# Score the Damerau-Levenshtein similarity over all record pairs.
(tp, fp, fn) = test_distances(data_list, expected_res, get_dam_lev_sim)
print('DAMERAU LEVENSTEIN')
print(f'tp: {tp}, fp: {fp}, fn: {fn}')
# Guard every ratio: when no pair clears the threshold the denominators are 0
# and the unguarded divisions raise ZeroDivisionError.
precision = tp / (tp + fp) if (tp + fp) else 0.0
recall = tp / (tp + fn) if (tp + fn) else 0.0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
print(f'precision: {precision}')
print(f'recall: {recall}')
print(f'f1: {f1}')

DAMERAU LEVENSTEIN
tp: 469, fp: 0, fn: 72
precision: 1.0
recall: 0.866913123844732
f1: 0.9287128712871286


In [93]:
# Score the Jaccard similarity over all record pairs.
(tp, fp, fn) = test_distances(data_list, expected_res, get_jaccard_sim)
print('JACCARD')
print(f'tp: {tp}, fp: {fp}, fn: {fn}')
# Guard every ratio: when no pair clears the threshold the denominators are 0
# and the unguarded divisions raise ZeroDivisionError.
precision = tp / (tp + fp) if (tp + fp) else 0.0
recall = tp / (tp + fn) if (tp + fn) else 0.0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
print(f'precision: {precision}')
print(f'recall: {recall}')
print(f'f1: {f1}')

JACCARD
tp: 477, fp: 0, fn: 64
precision: 1.0
recall: 0.8817005545286506
f1: 0.93713163064833


In [69]:
from dcs import dcs_plus_plus
from dm_soundex import encode

In [94]:
def dcs_key(r: Dict[Hashable, Any]) -> str:
  """Build the key string for a record from its given name and surname.

  ``given_name`` and ``surname`` are concatenated in that order, with None
  treated as an empty string. When the result is empty, ten zeros are
  returned instead.

  An earlier variant (now disabled) passed each non-empty part through
  ``encode(..., max_length=10, zero_pad=True)`` before concatenating.
  """
  given, family = r['given_name'], r['surname']
  given = '' if given is None else given
  family = '' if family is None else family
  combined = f'{given}{family}'
  return combined if combined != '' else '0' * 10
def is_duplicate(r1: Dict[Hashable, Any], r2: Dict[Hashable, Any]) -> bool:
  """Report whether get_jaccard_sim(r1, r2) is strictly greater than 0.85."""
  score = get_jaccard_sim(r1, r2)
  return score > 0.85

In [71]:
def res_stats(res: List[Tuple[Dict[Hashable, Any], Dict[Hashable, Any]]], expected_res: Dict[str, Set[str]]) -> None:
  """Print tp/fp/fn counts plus precision, recall and F1 for candidate pairs.

  Args:
    res: Pairs of records reported as duplicates. A pair whose two records
      share the same rec_id is ignored.
    expected_res: Mapping from get_key() prefix to the set of rec_ids that
      belong to that group.

  Counting rules:
    * a pair whose get_key() prefixes differ adds one to ``fp``;
    * for each expected group, the rec_ids found among the remaining pairs
      add to ``tp`` and the expected rec_ids not found add to ``fn``;
    * a group with no pairs at all adds its whole size to ``fn``, but only
      when it has more than one member.

  Precision, recall and F1 are reported as 0.0 whenever their denominator is
  zero (for example when ``res`` is empty) instead of raising
  ZeroDivisionError.
  """
  tp = 0
  fp = 0
  fn = 0
  res_by_ids = {}
  for (a, b) in res:
    a_id = a['rec_id']
    b_id = b['rec_id']
    if a_id == b_id:
      # A record paired with itself carries no information.
      continue
    a_key = get_key(a_id)
    b_key = get_key(b_id)
    if a_key != b_key:
      fp += 1
      continue
    if a_key not in res_by_ids:
      res_by_ids[a_key] = set()
    res_by_ids[a_key].add(a_id)
    res_by_ids[a_key].add(b_id)
  for key in expected_res:
    if key not in res_by_ids:
      # A singleton group has nothing to be paired with, so it is not a miss.
      if len(expected_res[key]) > 1:
        fn += len(expected_res[key])
      continue
    fn += len(expected_res[key]) - len(res_by_ids[key])
    tp += len(res_by_ids[key])
  print(f'tp: {tp}, fp: {fp}, fn: {fn}')
  # Each denominator can legitimately be zero; fall back to 0.0 in that case.
  precision = tp / (tp + fp) if (tp + fp) else 0.0
  recall = tp / (tp + fn) if (tp + fn) else 0.0
  f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
  print(f'precision: {precision}')
  print(f'recall: {recall}')
  print(f'f1: {f1}')

In [95]:
# Run dcs_plus_plus over every record, handing it dcs_key and is_duplicate.
# The result is scored with res_stats, which consumes it as pairs of records.
# NOTE(review): w=20 is presumably the window size - confirm against the dcs module.
res = dcs_plus_plus(data_list, dcs_key, is_duplicate=is_duplicate, w=20)

['5bad66a9-849a-4432-b691-c435b78114e4', 'dbb64336-14fe-4d2f-a56d-92143330e5ab', '8add3567-81c1-472b-ad8e-149c8a4072fe', 'acb86e89-71b2-4a06-8e6a-b7d00c7f74bd', '59353a59-b53b-4a50-a8c5-503b95d57e9a', 'ad85e53f-e2ec-426e-bef8-66df721112f3', 'ea24d742-1044-4ff6-9fd7-bab55aebada6', '2c13bc07-680e-444c-ab82-067f4c38e1f4', 'ee46e3e2-3a29-4006-a3be-47bcd87ed0c0', '1c728458-ede3-4323-a3e8-850c2c264f7b']


In [96]:
# Print tp/fp/fn, precision, recall and F1 for the pairs currently held in `res`.
res_stats(res, expected_res)

tp: 405, fp: 0, fn: 136
precision: 1.0
recall: 0.7486136783733827
f1: 0.8562367864693446


In [87]:
def get_lev_dist(a: Dict[Hashable, Any], b: Dict[Hashable, Any]) -> int:
  """Distance between two records via damerau_levenshtein_distance_memopt.

  For each field in ``column_names`` that holds a non-empty string in both
  records, the field values are appended (without a separator) to one string
  per record; the two resulting strings are then compared.
  """
  left_parts = []
  right_parts = []
  for k in column_names:
    if a[k] is None or not isinstance(a[k], str) or a[k] == '':
      continue
    if b[k] is None or not isinstance(b[k], str) or b[k] == '':
      continue
    left_parts.append(a[k])
    right_parts.append(b[k])
  return damerau_levenshtein_distance_memopt(''.join(left_parts), ''.join(right_parts))

In [75]:
from bk_tree import BKTree

In [88]:
# Create a BKTree that uses get_lev_dist as its distance and insert every record.
bkt = BKTree(get_lev_dist)
for record in data_list:
  bkt.insert(record)


In [91]:
# Query the tree once per original record and pair the original with every
# returned record other than itself.
# NOTE(review): 15 is presumably the maximum distance for a hit - confirm in bk_tree.
res: List[Tuple[Dict[Hashable, Any], Dict[Hashable, Any]]] = []
qres = []  # not referenced by the visible cells; kept in case later code relies on it
for org in origs.values():
  q_res = bkt.query(org, 15)
  # Each query result is a 2-tuple whose second item is the matched record.
  res.extend((org, r) for (_, r) in q_res if r['rec_id'] != org['rec_id'])

In [92]:
# Print tp/fp/fn, precision, recall and F1 for the pairs currently held in `res`.
res_stats(res, expected_res)

tp: 513, fp: 7, fn: 28
precision: 0.9865384615384616
recall: 0.9482439926062847
f1: 0.9670122525918945
