In [1]:
import os
import json
import csv
import re
import logging
import optparse

import dedupe
from unidecode import unidecode

In [2]:
# NOTE(review): output_file is not referenced in the visible cells —
# presumably the intended destination for the linked pairs; confirm.
output_file = 'data_matching_output.json'
# Learned linker settings: loaded when the file exists, otherwise written
# after training.
settings_file = 'data_matching_learned_settings'
# Labeled training pairs: read when the file exists and rewritten after
# each labeling session.
training_file = 'data_matching_training.json'
# Input datasets, one JSON object per line; loaded with readData.
left_file = './Mediated Datasets/disfold_DeBiGa_m.jsonl'
right_file = './Mediated Datasets/CompaniesMarketCap_GioPonSpiz_m.jsonl'

In [3]:
def readData(filename):
    """Read a JSON Lines file into a dictionary of records.

    Every non-blank line is decoded with ``json.loads``. A record is stored
    under the key ``filename + str(i)``, where ``i`` is the zero-based index
    of the line in the file (blank lines are skipped but still advance the
    index).

    Args:
        filename: Path of the UTF-8 encoded JSONL file to read.

    Returns:
        dict mapping each record id (str) to the decoded JSON value.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data_d = {}

    with open(filename, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            # Skip every whitespace-only line, not just an exact newline;
            # a line holding only spaces or tabs would otherwise make
            # json.loads raise.
            if not line.strip():
                continue
            data_d[filename + str(i)] = json.loads(line)

    return data_d

In [4]:
# Load the left and right datasets (in that order) via readData.
print('importing data ...')
data_1, data_2 = [readData(path) for path in (left_file, right_file)]

importing data ...


In [None]:
# Build the record linker: train interactively when no saved settings exist,
# otherwise restore the previously learned settings from disk.
if not os.path.exists(settings_file):
    # Fields compared between the two datasets; 'ceo' is declared as
    # possibly missing.
    fields = [
        {'field': 'name', 'type': 'String'},
        {'field': 'ceo', 'type': 'String', 'has missing': True},
    ]
    linker = dedupe.RecordLink(fields)

    if not os.path.exists(training_file):
        linker.prepare_training(data_1, data_2, sample_size=15)
    else:
        # Seed the training with pairs labeled in an earlier session.
        print('reading labeled examples from ', training_file)
        with open(training_file) as tf:
            linker.prepare_training(
                data_1,
                data_2,
                training_file=tf,
                sample_size=15,
            )

    print('starting active labeling...')

    # Interactive step: prompts for y/n/u/f answers on candidate pairs.
    dedupe.console_label(linker)

    linker.train()

    # Persist both the labeled pairs and the learned settings.
    with open(training_file, 'w') as tf:
        linker.write_training(tf)

    with open(settings_file, 'wb') as sf:
        linker.write_settings(sf)
else:
    print('reading from', settings_file)
    with open(settings_file, 'rb') as sf:
        linker = dedupe.StaticRecordLink(sf)

print('clustering...')
# NOTE(review): 0.0 is passed positionally as the third argument of join —
# presumably the match-score threshold; confirm against the dedupe API.
linked_records = linker.join(data_1, data_2, 0.0)

print('# duplicate sets', len(linked_records))

name : agricultural bank of china ltd
ceo : None

name : agricultural bank of china
ceo : 

0/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished


starting active labeling...
y


name : central japan railway co
ceo : None

name : central japan railway
ceo : 

1/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : m3 inc
ceo : None

name : m3 inc
ceo : 

2/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : smc corp
ceo : None

name : smc corp
ceo : 

3/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : alinma bank
ceo : None

name : alinma bank
ceo : 

4/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : constellation software inc
ceo : None

name : constellation software
ceo : 

5/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : formosa petrochemical corp
ceo : None

name : formosa petrochemical
ceo : 

6/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : evergreen marine corp
ceo : None

name : evergreen marine
ceo : 

7/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : astra international tbk pt
ceo : None

name : astra international
ceo : 

8/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : ecopetrol sa
ceo : None

name : ecopetrol
ceo : 

9/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : prudential plc
ceo : None

name : prudential
ceo : 

10/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : novartis ag
ceo : None

name : novartis
ceo : 

11/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : china railway group ltd
ceo : None

name : china railway group
ceo : 

12/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : china merchants securities co ltd
ceo : None

name : china merchants securities
ceo : 

13/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : shin-etsu chemical co ltd
ceo : None

name : shin-etsu chemical
ceo : 

14/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : evolution gaming group ab
ceo : None

name : evolution gaming
ceo : 

15/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : aviva plc
ceo : None

name : aviva
ceo : 

16/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : vale sa
ceo : None

name : vale
ceo : 

17/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : al rajhi banking and investment corp
ceo : None

name : al rajhi bank
ceo : 

18/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : aeon co ltd
ceo : None

name : aeon
ceo : 

19/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : united microelectronics corporation
ceo : None

name : united
ceo : 

20/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


u


name : the people's insurance company group of china ltd
ceo : None

name : the9
ceo : 

20/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : international paper company
ceo : None

name : intel
ceo : 

20/10 positive, 1/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : evergreen marine corp
ceo : None

name : evergy
ceo : 

20/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : poly developments and holdings group co ltd
ceo : None

name : city developments
ceo : 

20/10 positive, 3/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : amazoncom inc
ceo : andrew r jassy

name : amazon
ceo : 

20/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : ingersoll rand inc
ceo : vicente reynal

name : ing
ceo : 

21/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : sumitomo mitsui financial group inc
ceo : jun ohta

name : gumi
ceo : 

21/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : canadian natural resources limited
ceo : None

name : canon
ceo : 

21/10 positive, 6/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : hennes & mauritz ab
ceo : None

name : hp
ceo : 

21/10 positive, 7/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : longi green energy technology co ltd
ceo : None

name : adani green energy
ceo : 

21/10 positive, 8/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : blackrock inc
ceo : laurence douglas fink

name : blackrock
ceo : 

21/10 positive, 9/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : enterprise products partners lp
ceo : a james teague

name : enterprise products
ceo : 

22/10 positive, 9/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : fifth third bancorp
ceo : gregory d carmichael

name : fifth third bank
ceo : 

23/10 positive, 9/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : schneider electric se
ceo : jean-pascal tricoire

name : schneider electric infrastructure
ceo : 

24/10 positive, 9/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : ameren corporation
ceo : martin j lyons jr

name : amarin corporation
ceo : 

25/10 positive, 9/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : petroleo brasileiro sa - petrobras
ceo : joaquim silva e luna

name : petco
ceo : 

26/10 positive, 9/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : electricite de france sa
ceo : jean-bernard levy

name : electrica
ceo : 

26/10 positive, 10/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : simon property group inc
ceo : david e simon

name : simona
ceo : 

26/10 positive, 11/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : vestas wind systea-s
ceo : henrik andersen

name : vestas wind systems
ceo : 

26/10 positive, 12/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : cummins inc
ceo : n thomas linebarger

name : cummins india
ceo : 

27/10 positive, 12/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


u


name : philip morris international inc
ceo : jacek olczak

name : philip morris cr
ceo : 

27/10 positive, 12/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


f


Finished labeling


clustering...


In [7]:
# Display the linker object's repr to check which linker class is in use.
linker

<dedupe.api.RecordLink at 0x7fd168d4ab90>

In [2]:
def preProcess(column):
    """Normalize a raw text value so that it can be compared reliably.

    The value is transliterated with ``unidecode``; newlines, slashes and
    colons become spaces; hyphens, apostrophes and commas are removed; runs
    of spaces are collapsed; surrounding whitespace and quote characters are
    stripped; and the result is lower-cased.

    Args:
        column: The raw string to clean, or ``None`` for a missing value.

    Returns:
        The cleaned string, or ``None`` when the input is ``None`` or
        nothing is left after cleaning.
    """
    # Missing values have nothing to clean; unidecode expects a string, so
    # return early instead of passing None to it.
    if column is None:
        return None

    column = unidecode(column)
    # Applied in order: (pattern, replacement) pairs for re.sub.
    substitutions = (
        ('\n', ' '),
        ('-', ''),
        ('/', ' '),
        ("'", ''),
        (",", ''),
        (":", ' '),
        ('  +', ' '),
    )
    for pattern, replacement in substitutions:
        column = re.sub(pattern, replacement, column)
    column = column.strip().strip('"').strip("'").lower().strip()
    if not column:
        column = None
    return column


def readData2(filename):
    """Read a CSV file with a header row into a dictionary of records.

    Each data row is stored as a plain ``dict`` (column name -> value) under
    the key ``filename + str(i)``, where ``i`` is the zero-based index of
    the row after the header.

    Args:
        filename: Path of the UTF-8 encoded CSV file to read.

    Returns:
        dict mapping each record id (str) to that row's column/value dict.

    Raises:
        OSError: If the file cannot be opened.
    """
    data_d = {}

    # newline='' lets the csv module handle line endings itself, so line
    # breaks embedded in quoted fields are read back unchanged. The encoding
    # is explicit so the result does not depend on the platform default.
    with open(filename, newline='', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        for i, row in enumerate(reader):
            data_d[filename + str(i)] = dict(row)

    return data_d


# NOTE(review): as the last expression of the cell, this displays the whole
# returned dict; the saved output shows a KeyboardInterrupt, so this run did
# not complete. Consider assigning the result and inspecting a small sample.
readData2('./Datasets/cbinsight/cbinsights_DDD.csv')

KeyboardInterrupt: 