In [145]:
import pandas as pd
import numpy as np
import re
from typing import List, Tuple, Dict, Any, Set, Callable, Hashable

In [146]:
# Record fields that get_lev_sim iterates over when comparing two records.
# Commented-out entries are not part of the list, so they never contribute
# to the similarity score.
column_names = [
  # 'rec_id',
  'given_name',
  'surname',
  'street_number',
  'address_1',
  'address_2',
  'suburb',
  # 'postcode',
  'state',
  'date_of_birth',
  # 'age',
  'phone_number',
  # 'soc_sec_id',
  # 'blocking_number'
]

In [118]:

def clean_address(address: str) -> str:
    """Strip generic address designators from an address string.

    Removes Russian and English street/locality words (e.g. "улица",
    "street") and their dotted abbreviations (e.g. "ул.", "st.")
    case-insensitively, then collapses whitespace runs into single spaces
    and trims the result.

    Args:
        address: Raw address text.

    Returns:
        The address without designators and with normalised whitespace.
    """
    # Full words: must be delimited by word boundaries on both sides.
    full_words = [
        "улица", "проспект", "бульвар", "переулок", "набережная", "шоссе", "площадь", "дом", "квартира",
        "корпус", "строение", "область", "город", "поселок", "деревня",
        "street", "avenue", "boulevard", "alley", "drive", "square", "house", "apartment", "building",
        "county", "city", "village", "township", "road",
    ]
    # Abbreviations: only matched together with their trailing dot (plus one
    # optional space).
    abbreviations = [
        "ул", "пр-т", "пр", "б-р", "пер", "наб", "пл", "д", "кв", "корп", "стр", "обл", "г", "пос", "дер",
        "st", "ave", "blvd", "al", "dr", "sq", "h", "apt", "bldg", "co", "ct", "vil", "twp", "rd",
    ]

    # The closing \b is applied to full words only. A dot is a non-word
    # character, so requiring \b after "st." would reject the abbreviation at
    # the end of the string or before punctuation ("Main St." stayed intact).
    pattern = r'\b(?:(?:{words})\b|(?:{abbrs})\. ?)'.format(
        words='|'.join(full_words),
        abbrs='|'.join(abbreviations),
    )

    # Drop every designator that was found.
    cleaned_address = re.sub(pattern, '', address, flags=re.IGNORECASE)

    # Collapse redundant whitespace and trim the ends.
    return re.sub(r'\s+', ' ', cleaned_address).strip()

In [147]:
def get_key(x: str) -> str:
    """Return the first two hyphen-separated parts of ``x`` joined by '-'.

    For example ``'rec-148-dup-0'`` becomes ``'rec-148'``. Raises IndexError
    when ``x`` contains no hyphen.
    """
    parts = x.split('-')
    return f'{parts[0]}-{parts[1]}'

In [148]:
# Load the records and strip surrounding whitespace from every string cell.
# NOTE(review): DataFrame.applymap is deprecated since pandas 2.1 in favour of
# DataFrame.map; it is kept here so the cell still runs on older pandas.
data = pd.read_csv('test.csv')
data = data.applymap(lambda value: value.strip() if isinstance(value, str) else value)
# Optional address normalisation, currently disabled:
# data['address_1'] = data['address_1'].apply(lambda x:  clean_address(x) if isinstance(x, str) else x )
# data['address_2'] = data['address_2'].apply(lambda x:  clean_address(x) if isinstance(x, str) else x )
# data = data.applymap(lambda x:  clean_address(x) if isinstance(x, str) else x )
# Represent missing values as None so downstream isinstance(..., str) checks skip them.
data = data.replace({np.nan: None})
# The group key depends on a single column, so map over that column instead of
# running a row-wise apply over the whole frame.
data['rec_common_id'] = data['rec_id'].map(get_key)
data_list = data.to_dict('records')

In [149]:
# Group the records by their common id:
#   data_by_ids  -> list of full records per group
#   expected_res -> set of rec_id values per group (the expected duplicates)
data_by_ids = {}
expected_res = {}
for record in data_list:
    data_by_ids.setdefault(record['rec_common_id'], []).append(record)
    expected_res.setdefault(record['rec_common_id'], set()).add(record['rec_id'])

In [150]:
from distances.levenshtein import levenstein_similarity

In [151]:
def get_lev_sim(a: Dict[Hashable, Any], b: Dict[Hashable, Any]) -> float:
  """Average Levenshtein similarity of two records over the compared columns.

  Only columns listed in ``column_names`` whose value is a non-empty string
  in both records contribute to the average.

  Args:
    a: First record, keyed by column name.
    b: Second record, keyed by column name.

  Returns:
    The mean of ``levenstein_similarity`` over the comparable columns, or
    0.0 when no column is comparable, so such a pair never exceeds a
    positive threshold.
  """
  scores = [
    levenstein_similarity(a[k], b[k])
    for k in column_names
    # isinstance already rules out None; the truthiness test rules out ''.
    if isinstance(a[k], str) and a[k] and isinstance(b[k], str) and b[k]
  ]
  if not scores:
    # No shared non-empty string field: avoid dividing by zero.
    return 0.0
  return sum(scores) / len(scores)

In [152]:
# Spot check: compare the last record of group 'rec-148' with the other
# records of that group plus the first record of group 'rec-100'.
main_rec = data_by_ids['rec-148'][-1]
test_data = data_by_ids['rec-148'][:-1] + [data_by_ids['rec-100'][0]]
for td in test_data:
  sim = get_lev_sim(main_rec, td)
  print(f'main_rec_id: {main_rec["rec_id"]}, td_rec_id: {td["rec_id"]}, similarity: {sim}')

main_rec_id: rec-148-org, td_rec_id: rec-148-dup-2, similarity: 0.9692307692307693
main_rec_id: rec-148-org, td_rec_id: rec-148-dup-1, similarity: 0.9446153846153846
main_rec_id: rec-148-org, td_rec_id: rec-148-dup-0, similarity: 0.825974025974026
main_rec_id: rec-148-org, td_rec_id: rec-100-dup-0, similarity: 0.188997113997114


In [153]:
# Print the string-valued compared fields of main_rec, then of test_data[2].
for rec in (main_rec, test_data[2]):
  for k in column_names:
    if isinstance(rec[k], str):
      print(f'{k}: {rec[k]}')

given_name: rebecca
surname: rees
address_1: belconnen way
suburb: kedron
phone_number: 07 81960884
given_name: rebwcca
surname: rees
address_1: belconnen way
suburb: kedron
phone_number: 02 13450400


In [126]:
# Score every unordered pair of records and keep those above 0.85 as
# (rec_id, rec_id, similarity) tuples.
res = []
total = len(data_list)
for i, left in enumerate(data_list):
  for j in range(i + 1, total):
    sim = get_lev_sim(left, data_list[j])
    if sim > 0.85:
      res.append((left['rec_id'], data_list[j]['rec_id'], sim))

In [131]:
# Reset the true-positive, false-positive and false-negative counters.
tp = fp = fn = 0

In [132]:
# Collect the matched rec_ids per group key. A pair whose two ids map to
# different keys is reported and counted as one false positive.
res_by_ids = {}
for a_id, b_id, sim in res:
  a_key, b_key = get_key(a_id), get_key(b_id)
  if a_key == b_key:
    res_by_ids.setdefault(a_key, set()).update((a_id, b_id))
  else:
    print(f'ERROR: {a_id}, {b_id}, {sim}')
    fp += 1

In [133]:
# Compare the found groups with the expected ones and update tp / fn.
for key, expected_ids in expected_res.items():
  if key in res_by_ids:
    found_ids = res_by_ids[key]
    if len(found_ids) != len(expected_ids):
      print(f'ERROR: not full {key}')
    fn += len(expected_ids) - len(found_ids)
    tp += len(found_ids)
  elif len(expected_ids) > 1:
    # A group with several records produced no match at all.
    print(f'ERROR: not found {key}')
    fn += len(expected_ids)

ERROR: not full rec-128
ERROR: not full rec-74
ERROR: not full rec-199
ERROR: not full rec-167
ERROR: not full rec-112
ERROR: not full rec-121
ERROR: not full rec-144
ERROR: not full rec-188
ERROR: not full rec-84
ERROR: not full rec-113
ERROR: not full rec-104
ERROR: not full rec-172
ERROR: not full rec-97
ERROR: not full rec-49
ERROR: not full rec-129
ERROR: not full rec-142
ERROR: not full rec-179
ERROR: not full rec-51
ERROR: not found rec-66
ERROR: not full rec-158
ERROR: not full rec-61
ERROR: not full rec-80
ERROR: not full rec-147
ERROR: not full rec-48
ERROR: not full rec-62
ERROR: not found rec-143
ERROR: not full rec-148


In [134]:
# Report precision / recall / F1. Each ratio falls back to 0.0 when its
# denominator is zero (e.g. nothing matched) instead of raising
# ZeroDivisionError.
print(f'tp: {tp}, fp: {fp}, fn: {fn}')
precision = tp / (tp + fp) if (tp + fp) else 0.0
recall = tp / (tp + fn) if (tp + fn) else 0.0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
print(f'precision: {precision}')
print(f'recall: {recall}')
print(f'f1: {f1}')

tp: 500, fp: 0, fn: 31
precision: 1.0
recall: 0.9416195856873822
f1: 0.9699321047526672


In [154]:
def test_distances(data: List[Dict[Hashable, Any]], expected_res: Dict[str, Set[str]], similarity: Callable[[Dict[Hashable, Any], Dict[Hashable, Any]], float],threshold: float = 0.85) -> Tuple[int, int, int]:
  tp = 0
  fp = 0
  fn = 0
  res = []
  for i in range(len(data)):
    for j in range(i + 1, len(data)):
      sim = similarity(data[i], data[j])
      if sim > threshold:
        res.append((data[i]['rec_id'], data[j]['rec_id'], sim))
  res_by_ids = {}
  for (a_id, b_id, sim) in res:
    a_key = get_key(a_id)
    b_key = get_key(b_id)
    if a_key != b_key:
      print(f'ERROR: {a_id}, {b_id}, {sim}')
      fp += 1
      continue
    if a_key not in res_by_ids:
      res_by_ids[a_key] = set()
    res_by_ids[a_key].add(a_id)
    res_by_ids[a_key].add(b_id)
  for key in expected_res:
    if key not in res_by_ids:
      if len(expected_res[key]) > 1:
        print(f'ERROR: not found {key}')
        fn += len(expected_res[key])
      continue
    if len(expected_res[key]) != len(res_by_ids[key]):
      print(f'ERROR: not full {key}')
    fn += len(expected_res[key]) - len(res_by_ids[key])
    tp += len(res_by_ids[key])
  return tp, fp, fn

In [155]:
# Run the full pairwise evaluation with the averaged Levenshtein similarity
# and the default threshold.
tp, fp, fn = test_distances(data_list, expected_res, similarity=get_lev_sim)

ERROR: not full rec-128
ERROR: not full rec-74
ERROR: not full rec-199
ERROR: not full rec-167
ERROR: not full rec-112
ERROR: not full rec-121
ERROR: not full rec-144
ERROR: not full rec-188
ERROR: not full rec-84
ERROR: not full rec-113
ERROR: not full rec-104
ERROR: not full rec-172
ERROR: not full rec-97
ERROR: not full rec-49
ERROR: not full rec-129
ERROR: not full rec-142
ERROR: not full rec-179
ERROR: not full rec-51
ERROR: not found rec-66
ERROR: not full rec-158
ERROR: not full rec-61
ERROR: not full rec-80
ERROR: not full rec-147
ERROR: not full rec-48
ERROR: not full rec-62
ERROR: not found rec-143
ERROR: not full rec-148


In [156]:
# Report precision / recall / F1. Each ratio falls back to 0.0 when its
# denominator is zero (e.g. nothing matched) instead of raising
# ZeroDivisionError.
precision = tp / (tp + fp) if (tp + fp) else 0.0
recall = tp / (tp + fn) if (tp + fn) else 0.0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
print(f'precision: {precision}')
print(f'recall: {recall}')
print(f'f1: {f1}')

precision: 1.0
recall: 0.9416195856873822
f1: 0.9699321047526672
