In [163]:
import recordlinkage as rl
import pandas as pd
import os
import random as ran
from IPython.display import clear_output


In [116]:
schema_path = "./csv/schema_final.csv"

In [117]:
schema = pd.read_csv(schema_path, index_col=0)
schema = schema.reset_index(drop=True)

# Preprocessamento

In [118]:
from recordlinkage.preprocessing import *

## Cleaning dei dati ...

In [119]:
schema_cleaned = schema

In [120]:
for i in range(len(schema.index)):
    row = schema.iloc[i]
    row_series = row.squeeze()
    nome = pd.Series(row["name"])
    try:
        nome_cleaned = clean(nome, lowercase=True, replace_by_none='', replace_by_whitespace='', strip_accents='unicode', remove_brackets=False, encoding='utf-8', decode_error='strict')
    except:
        print(nome)
    row_series_cleaned = clean(row_series, lowercase=True, replace_by_none=r'[^ \\.\\-\\_A-Za-z0-9$€£¥₩₽₹฿₪¤¢₺₱]+', replace_by_whitespace=r'[\\-\\_]', strip_accents=None, remove_brackets=False, encoding='utf-8', decode_error='strict')
    schema_cleaned.iloc[i] = row_series_cleaned
    schema_cleaned.iloc[i]["name"] = nome_cleaned[0]
print(schema_cleaned)

                                            name        country market cap  \
0                             berkshire hathaway  united states        NaN   
1                                           icbc          china        NaN   
2       saudi arabian oil company (saudi aramco)   saudi arabia        NaN   
3                                 jpmorgan chase  united states        NaN   
4                        china construction bank          china        NaN   
...                                          ...            ...        ...   
188552                                   gazprom         russia        NaN   
188553                                   rosseti         russia        NaN   
188554                                 nornickel         russia        NaN   
188555                                 severstal         russia        NaN   
188556                                      ozon            usa        NaN   

       founded year employees                        industry s

# Occorrenze token per riga

In [121]:
# value_occurence(schema)

# Blocking

In [135]:
indexer = rl.Index()
indexer.block(left_on='name', right_on='name')
candidate_links = indexer.index(schema_cleaned)
print((candidate_links))

MultiIndex([(  2966,      0),
            ( 12966,      0),
            ( 12966,   2966),
            ( 30538,      0),
            ( 30538,   2966),
            ( 30538,  12966),
            ( 34398,      0),
            ( 34398,   2966),
            ( 34398,  12966),
            ( 34398,  30538),
            ...
            (176795, 176576),
            (176851, 176580),
            (176729, 176591),
            (176697, 176602),
            (176751, 176606),
            (177007, 176616),
            (176863, 176642),
            (176934, 176671),
            (176984, 176796),
            (177004, 176877)],
           length=549188)


# Labeling

In [184]:
n_match = 2
link_idxs = set()
labeled_links = set()
clear = lambda: os.system('clear')
choices = ran.choices(candidate_links, k=n_match)
for choice in choices:
    print("I due record appartengono alla stessa entità?")
    print("y -> sì;  n -> no;  q -> esci")
    print((schema_cleaned.iloc[[choice[0], choice[1]]]))
    valido = False
    stop = False
    while(not valido):
        inp = input()
        if(inp == 'q'):
            stop = True
            break
        if(inp == 'y'):
            valido = True
            link_idxs.add(choice)
            labeled_links.add(choice + (1,))
        elif(inp == 'n'):
            valido = True
            labeled_links.add(choice + (1.))
    if(stop):
        break
    clear_output()

golden_path = "./csv/golden_links.csv"
golden_df = pd.read_csv(golden_path, index_col=False)
new_golden_df = pd.DataFrame(link_idxs, columns=['id_1', 'id_2'])
pd.concat([golden_df, new_golden_df]).to_csv(golden_path)

labeled_path = "./csv/labeled_links.csv"
labeled_df = pd.read_csv(labeled_path, index_col=False)
new_labeled_df = pd.DataFrame(labeled_links, columns=['id_1', 'id_2', 'match'])
labeled_df = pd.concat([labeled_links, new_labeled_df])
new_labeled_df.to_csv(labeled_path)

In [123]:
print(len(candidate_links)/len(schema_cleaned)**2)

1.544670027008048e-05


# Pre-Labeling

In [124]:
['name', 'country', 'market cap', 'founded year', 'employees', 'industry', 'sector',
     'ceo', 'revenue', 'Stock', 'share price', 'city', 'address', 'website']

['name',
 'country',
 'market cap',
 'founded year',
 'employees',
 'industry',
 'sector',
 'ceo',
 'revenue',
 'Stock',
 'share price',
 'city',
 'address',
 'website']

In [125]:
compare_cl = rl.Compare()
compare_cl.string("name", "name", threshold=0.85, label="name")
compare_cl.string("country", "country", label="country")
compare_cl.string("founded year", "founded year", label="founded year")
compare_cl.string("industry", "industry", label="industry")
compare_cl.string("sector", "sector", label="sector")
compare_cl.string("address", "address", label="address")
compare_cl.string("city", "city", label="city")
compare_cl.string("ceo", "ceo", label="ceo")
features = compare_cl.compute(candidate_links, schema, schema)

In [126]:
features

Unnamed: 0,Unnamed: 1,name,country,founded year,industry,sector,address,city,ceo
2966,0,1.0,0.230769,0.750000,0.000000,0.0,0.0,0.0,0.666667
12966,0,1.0,0.000000,0.250000,0.000000,0.0,0.0,0.0,0.480000
12966,2966,1.0,0.000000,0.333333,0.000000,0.0,0.0,0.0,0.560000
30538,0,1.0,1.000000,0.750000,0.173913,0.0,0.0,0.0,0.571429
30538,2966,1.0,0.230769,1.000000,0.000000,0.0,0.0,0.0,0.666667
...,...,...,...,...,...,...,...,...,...
177007,176616,1.0,1.000000,1.000000,1.000000,0.0,0.0,0.0,0.000000
176863,176642,1.0,1.000000,0.500000,1.000000,0.0,0.0,0.0,0.000000
176934,176671,1.0,1.000000,1.000000,1.000000,0.0,0.0,0.0,0.000000
176984,176796,1.0,1.000000,1.000000,1.000000,0.0,0.0,0.0,0.000000


In [127]:
features.describe()

Unnamed: 0,name,country,founded year,industry,sector,address,city,ceo
count,549188.0,549188.0,549188.0,549188.0,549188.0,549188.0,549188.0,549188.0
mean,1.0,0.428896,0.043542,0.032894,0.042351,0.011631,0.004851,0.049363
std,0.0,0.48779,0.195757,0.173933,0.200951,0.088958,0.06945,0.203521
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [128]:
features.sum(axis=1).value_counts().sort_index(ascending=False)

6.000000       315
5.972222         1
5.966102         1
5.961538         1
5.956522         1
             ...  
1.023810         1
1.021898         2
1.019231         3
1.017857         1
1.000000    247199
Length: 3342, dtype: int64

In [129]:
record_linked = features[features.sum(axis=1) > 3]

In [130]:
print(record_linked)

               name  country  founded year  industry  sector  address  city  \
30538  0        1.0      1.0      0.750000  0.173913     0.0      0.0   0.0   
178886 2966     1.0      1.0      0.666667  0.000000     0.0      0.0   0.0   
2995   1        1.0      1.0      1.000000  0.000000     0.0      0.0   0.0   
2965   2        1.0      1.0      1.000000  0.000000     0.0      0.0   0.0   
178881 2        1.0      1.0      0.666667  0.000000     0.0      0.0   0.0   
...             ...      ...           ...       ...     ...      ...   ...   
177007 176616   1.0      1.0      1.000000  1.000000     0.0      0.0   0.0   
176863 176642   1.0      1.0      0.500000  1.000000     0.0      0.0   0.0   
176934 176671   1.0      1.0      1.000000  1.000000     0.0      0.0   0.0   
176984 176796   1.0      1.0      1.000000  1.000000     0.0      0.0   0.0   
177004 176877   1.0      1.0      1.000000  1.000000     0.0      0.0   0.0   

                    ceo  
30538  0       0.571429  

# Classification

In [134]:
ECM = rl.ECMClassifier(binarize=0.6)
record_linked.index.names = (['id1', 'id2'])
print(len(record_linked.index))
result = ECM.fit_predict(record_linked)
print(len(result))

23060
21561


  feature_log_prob_ = np.log(feature_prob_)
