# Record Linkage

In [13]:
import os
import csv
import re
import pandas as pd
import numpy as np
import dedupe
from unidecode import unidecode

In [14]:
output_file = './Data Matching Output/data_matching_output'
settings_file = 'data_matching_learned_settings'
training_file = 'data_matching_training.json'
sources_path = './Mediated Datasets/'
record_linkage_path = './Record Linkage/'

In [15]:
def get_all_files(path_dir):
    """Return the names of the regular files directly inside ``path_dir``.

    Sub-directories are skipped, and so is the macOS ``.DS_Store`` metadata
    file. The returned entries are bare file names (not joined with
    ``path_dir``), in the order produced by ``os.listdir``.
    """
    res = []
    for name in os.listdir(path_dir):
        # Compare the bare entry name: the joined path (e.g. 'dir/.DS_Store')
        # never equals '.DS_Store', so testing the full path filters nothing.
        if name == '.DS_Store':
            continue
        if os.path.isfile(os.path.join(path_dir, name)):
            res.append(name)
    return res


def pre_process(column):
    """Normalise one CSV cell value before it is handed to the linker.

    The value is transliterated to ASCII with ``unidecode``, punctuation is
    removed or turned into spaces, runs of spaces are collapsed, surrounding
    quotes/whitespace are stripped and the text is lower-cased.

    Returns the cleaned string, or ``None`` when the value is missing
    (``None``) or empty after cleaning.
    """
    # csv.DictReader fills missing trailing fields of a short row with None;
    # treat that as a missing value instead of crashing inside unidecode.
    if column is None:
        return None

    column = unidecode(column)
    column = re.sub('\n', ' ', column)   # line breaks become word separators
    column = re.sub('-', '', column)     # hyphens are dropped (joins the parts)
    column = re.sub('/', ' ', column)    # slashes become word separators
    column = re.sub("'", '', column)     # apostrophes are dropped
    column = re.sub(",", '', column)     # commas are dropped
    column = re.sub(":", ' ', column)    # colons become word separators
    column = re.sub('  +', ' ', column)  # collapse runs of 2+ spaces
    column = column.strip().strip('"').strip("'").lower().strip()
    if not column:
        # Empty values are represented as None
        column = None
    return column


def read_data(filename):
    """Load a UTF-8 CSV file into a dict of cleaned records.

    Each data row becomes one entry whose key is ``filename`` followed by the
    zero-based row index, and whose value maps every column name to the
    result of ``pre_process`` on that cell.
    """
    records = {}

    with open(filename, 'r', encoding='utf-8') as handle:
        for index, raw in enumerate(csv.DictReader(handle)):
            # The record id must stay "<filename><row index>": the output
            # writer rebuilds the same id to look up cluster membership.
            records[filename + str(index)] = {
                key: pre_process(value) for key, value in raw.items()
            }

    return records


def merge_same_entities(df):
    """Collapse rows sharing a 'Cluster ID' into a single row each.

    Rows with a cluster id are grouped by it and, for every column, the
    maximum value of the group is kept (missing values are treated as empty
    strings while grouping and turned back into NaN afterwards). Rows
    without a cluster id are appended unchanged. The 'Cluster ID' and
    'Link Score' columns are dropped from the returned frame.
    """
    in_cluster = df['Cluster ID'].notnull()
    singles = df[~in_cluster].reset_index(drop=True)
    linked = df[in_cluster].reset_index(drop=True)

    # '' compares lower than any non-empty string, so max() prefers real values
    collapsed = linked.fillna('').groupby('Cluster ID').max()
    collapsed = collapsed.replace('', np.nan).reset_index()

    combined = pd.concat([collapsed, singles])
    combined = combined.drop(['Cluster ID', 'Link Score'], axis=1)
    return combined.reset_index(drop=True)

In [16]:
def get_all_files(path_dir):
    """List the regular files found directly in ``path_dir``.

    Returns bare file names (without the directory part), excluding
    sub-directories and the '.DS_Store' entry, in ``os.listdir`` order.
    """
    return [
        entry
        for entry in os.listdir(path_dir)
        if entry != '.DS_Store'
        and os.path.isfile(os.path.join(path_dir, entry))
    ]

# Iterative record linkage: the first source file is the initial "left"
# dataset; every other source is linked against it in turn, and the merged
# result of each iteration becomes the left dataset of the next one.
files = get_all_files(sources_path)
left_file = sources_path + files.pop(0)
print('importing:', left_file)
data_1 = read_data(left_file)

# Create the output folders up front so the run does not fail on a missing
# directory only after clustering has finished.
output_dir = os.path.dirname(output_file)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
os.makedirs(record_linkage_path, exist_ok=True)


for n, file in enumerate(files):
    print('################  START ITERATION ' + str(n+1) + '  ################')
    right_file = sources_path + file
    print('importing:', right_file)
    data_2 = read_data(right_file)

    if os.path.exists(settings_file):
        # Reuse a previously trained model instead of labelling again
        print('reading from', settings_file)
        with open(settings_file, 'rb') as sf:
            linker = dedupe.StaticRecordLink(sf)
    else:
        # NOTE: the dictionary-style variable definitions of dedupe 2.x spell
        # the missing-data flag 'has missing' (with a space). The 'name' field
        # previously used 'has_missing', which was inconsistent with the other
        # two fields and is not the key dedupe looks up.
        fields = [
            {'field': 'name', 'type': 'String', 'has missing': True},
            {'field': 'ceo', 'type': 'String', 'has missing': True},
            {'field': 'country', 'type': 'String', 'has missing': True},
        ]
        linker = dedupe.RecordLink(fields)

        if os.path.exists(training_file):
            # Seed the active learner with the examples labelled earlier
            print('reading labeled examples from ', training_file)
            with open(training_file) as tf:
                linker.prepare_training(data_1,
                                        data_2,
                                        training_file=tf,
                                        sample_size=1000)
        else:
            linker.prepare_training(data_1, data_2, sample_size=1000)

        print('starting active labeling...')

        dedupe.console_label(linker)

        linker.train()

        # Persist both the labelled pairs and the learned settings so later
        # iterations (and later runs) take the StaticRecordLink branch above.
        with open(training_file, 'w') as tf:
            linker.write_training(tf)

        with open(settings_file, 'wb') as sf:
            linker.write_settings(sf)

    print('clustering...')
    linked_records = linker.join(data_1, data_2, 0.0)

    print('# duplicate sets', len(linked_records))

    # Map every linked record id to its cluster id and link score
    cluster_membership = {}
    for cluster_id, (cluster, score) in enumerate(linked_records):
        for record_id in cluster:
            cluster_membership[record_id] = {'Cluster ID': cluster_id,
                                             'Link Score': score}

    iteration_output = output_file + '_' + str(n) + '.csv'

    # Write with an explicit UTF-8 encoding (the file is read back as UTF-8
    # below) and newline='' as required by the csv module for writers.
    with open(iteration_output, 'w', encoding='utf-8', newline='') as f:
        header_unwritten = True

        for fileno, filename in enumerate((left_file, right_file)):
            # Opened the same way as in read_data so the row numbering, and
            # therefore the rebuilt record ids, match the ids given to dedupe.
            with open(filename, 'r', encoding='utf-8') as f_input:
                reader = csv.DictReader(f_input)

                if header_unwritten:
                    # The header comes from the first (left) file only
                    fieldnames = (['Cluster ID', 'Link Score'] +
                                  reader.fieldnames)

                    writer = csv.DictWriter(f, fieldnames=fieldnames)
                    writer.writeheader()

                    header_unwritten = False

                for row_id, row in enumerate(reader):
                    record_id = filename + str(row_id)
                    # Unlinked records get no cluster columns (left empty)
                    cluster_details = cluster_membership.get(record_id, {})
                    row.update(cluster_details)

                    writer.writerow(row)

    # Merge the information of the records that belong to the same company
    df = pd.read_csv(iteration_output, encoding='utf-8', dtype=object)
    result = merge_same_entities(df)
    if n == (len(files)-1):
        left_file = record_linkage_path + 'final_result_merge.csv'
    else:
        left_file = record_linkage_path + 'result_merge' + '_' + str(n) + '.csv'
    result.to_csv(left_file, index=False)
    # The merged result is the left dataset of the next iteration
    data_1 = read_data(left_file)
    print('################  END ITERATION ' + str(n+1) + '  ################')

36
importing: ./Mediated Datasets/valuetoday_fr_cleaned_mediated.csv
################  START ITERATION 1  ################
importing: ./Mediated Datasets/valueToday_iGMM_cleaned_mediated.csv
reading from data_matching_learned_settings
clustering...
# duplicate sets 999
################  END ITERATION 1  ################
