# Record Linkage

In [436]:
import recordlinkage
import pandas as pd
import collections
import time

In [437]:
# Input datasets (JSON Lines files) to be linked; paths are relative to the working directory.
PATH_DS_L = './Mediated Datasets/disfold_Avengers_m.jsonl'
PATH_DS_R = './Mediated Datasets/disfold_fr_m.jsonl'
# NOTE(review): presumably the similarity cut-off (0-1) for fuzzy string matching — confirm where it is consumed.
THRESHOLD = 0.95

In [438]:
def get_features(df_l, df_r, name_threshold=None, attr_threshold=0.85, missing_value=0.0):
    """Build the comparison vectors for every (left, right) record pair.

    A full (cartesian) index is generated, so the result has
    ``len(df_l) * len(df_r)`` rows. ``name`` is always compared; ``ceo`` and
    ``country`` are compared only when the column exists in both frames.
    Every comparison is a thresholded Levenshtein similarity, i.e. each
    feature column holds 1 (agree) or 0 (disagree).

    Parameters
    ----------
    df_l, df_r : pandas.DataFrame
        Datasets to link. Both must contain a ``name`` column.
    name_threshold : float, optional
        Minimum similarity for two names to agree. ``None`` (default) uses the
        notebook-level ``THRESHOLD`` constant.
    attr_threshold : float, default 0.85
        Minimum similarity for the optional attributes (``ceo``, ``country``).
    missing_value : float, default 0.0
        Score assigned to an optional attribute when either side is missing.
        With the default, a pair whose ``ceo`` is empty can never reach the
        "more than one agreement" rule applied downstream; pass ``1.0`` to let
        a missing attribute count as agreement.

    Returns
    -------
    (pandas.DataFrame, int)
        The feature matrix indexed by (left label, right label), and
        ``feature_value``: 1 if at least one optional attribute was compared,
        0 if only ``name`` was.

    Raises
    ------
    KeyError
        If ``name`` is missing from either frame.
    """
    if name_threshold is None:
        # Use the configured constant rather than a duplicated literal.
        name_threshold = THRESHOLD

    columns_l = set(df_l.columns)
    columns_r = set(df_r.columns)
    if 'name' not in columns_l or 'name' not in columns_r:
        raise KeyError("'name' column is required in both dataframes")

    # Indexing step: every left record is paired with every right record.
    indexer = recordlinkage.Index()
    indexer.full()
    candidate_links = indexer.index(df_l, df_r)

    # Comparison step
    compare_cl = recordlinkage.Compare()
    compare_cl.string("name", "name", method="levenshtein", threshold=name_threshold, label="name")

    feature_value = 0
    for attribute in ('ceo', 'country'):
        if attribute in columns_l and attribute in columns_r:
            compare_cl.string(attribute, attribute, method="levenshtein",
                              threshold=attr_threshold, missing_value=missing_value,
                              label=attribute)
            # At least one extra attribute is available, so a match must
            # agree on more than one feature.
            feature_value = 1

    return compare_cl.compute(candidate_links, df_l, df_r), feature_value


def get_pairs(features, feature_value):
    """Classify candidate pairs and return the index labels of the matches.

    A pair is a match when the sum of its feature columns is strictly greater
    than ``feature_value``. The result is a list of the index entries of
    ``features`` (one entry per matching row).
    """
    agreement_count = features.sum(axis=1)
    is_match = agreement_count > feature_value
    matched_names = features.loc[is_match, 'name']
    return matched_names.index.to_list()


def rename_columns(columns_l, columns_r):
    """Return the two label lists with a side suffix appended.

    Every label in ``columns_l`` gets ``_l`` and every label in ``columns_r``
    gets ``_r``. Labels are converted with ``str`` first, so non-string
    column labels (e.g. integers) are accepted as well.

    Returns
    -------
    (list[str], list[str])
        Suffixed left labels and suffixed right labels, in input order.
    """
    suffixed_l = [str(c) + '_l' for c in columns_l]
    suffixed_r = [str(c) + '_r' for c in columns_r]
    return suffixed_l, suffixed_r


def find_duplicated_columns(columns_l, columns_r):
    """List the labels that occur more than once across both label lists.

    The two lists are concatenated and counted together; labels are returned
    in order of first appearance.
    """
    occurrences = collections.Counter(columns_l + columns_r)
    repeated = []
    for label, seen in occurrences.items():
        if seen > 1:
            repeated.append(label)
    return repeated

In [439]:
# Load the left dataset: one JSON object per line (lines=True).
# NOTE(review): dtype=object presumably keeps pandas from coercing columns to numeric types — confirm.
df_l = pd.read_json(PATH_DS_L, encoding='utf-8', lines=True, dtype=object)
df_l

Unnamed: 0,name,country,continent,founded,employees,ceo,market_cap,categories
0,apple,cupertino,ca,"april 1, 1976",100000,timothy d cook,2825000000000,"unitedstates, technology, consumerelectronics,..."
1,danaher,washington,dc,1969,78000,rainer m blair,208030000000,"unitedstates, healthcare, diagnostics&research..."
2,merck,kenilworth,nj,1891,67000,robert m davis jd,210380000000,"unitedstates, healthcare, drugmanufacturersgen..."
3,bhp,melbourne,vic,1885,40110,mike p henry bsc bsc chem,a$281.66 billion,"australia, basicmaterials, otherindustrialmeta..."
4,nike,beaverton,or,"january 25, 1964",73300,john j donahoe ii,213600000000,"unitedstates, consumerdiscretionary, footwear&..."
...,...,...,...,...,...,...,...,...
995,asahi group holdings ltd,tokyo,japan,,30020,atsushi katsuki,2.231 trillion,"japan, consumerstaples, beveragesbrewers, japa..."
996,haitong securities co ltd,shanghai,china,,11282,,cn115.98 billion,"china, financials, capitalmarkets, chinafinanc..."
997,catalent inc,somerset,nj,2007,17300,john r chiminski,18240000000,"unitedstates, healthcare, drugmanufacturersspe..."
998,quanta services inc,houston,tx,1997,43700,earl c austin jr,18250000000,"unitedstates, industrials, engineering&constru..."


In [440]:
# Load the right dataset: one JSON object per line (lines=True).
# NOTE(review): dtype=object presumably keeps pandas from coercing columns to numeric types — confirm.
df_r = pd.read_json(PATH_DS_R, encoding='utf-8', lines=True, dtype=object)
df_r

Unnamed: 0,founded,employees,ceo,name
0,"april 1, 1976",100000,timothy d cook,apple
1,"april 4, 1975",181000,satya nadella,microsoft
2,,,amin h nasser,aramco
3,"october 2, 2015",156500,sundar pichai,alphabet
4,"july 5, 1994",1608000,andrew r jassy,amazon
...,...,...,...,...
894,1930,20300,thomas sinnickson gayner,markel corporation
895,1939,12023,j powell brown cpcu cpcu,brown & brown inc
896,,16196,shingo konomoto,nomura research institute ltd
897,1986,52000,william joseph hornbuckle iv,mgm resorts international


In [441]:
# Score every left/right candidate pair, then keep the pairs whose summed
# comparison score is strictly greater than `feature_value`.
features, feature_value = get_features(df_l, df_r)
pairs = get_pairs(features, feature_value)



In [442]:
# How many candidate pairs reached each total score (sum of the comparison columns), highest score first.
features.sum(axis=1).value_counts().sort_index(ascending=False)

2.0       728
1.0       184
0.0    898088
dtype: int64

In [443]:
# Split the matched (left label, right label) pairs into two row-aligned frames:
# row i of `matching_dataset_left` was linked to row i of `matching_dataset_right`.
# Selecting by a list of labels replaces the per-row Python loop and keeps the
# column layout even when `pairs` is empty.
left = [pair[0] for pair in pairs]
right = [pair[1] for pair in pairs]

matching_dataset_left = df_l.loc[left].reset_index(drop=True)
matching_dataset_right = df_r.loc[right].reset_index(drop=True)

In [444]:
# Left-hand rows of the matched pairs (index reset to 0..n-1).
matching_dataset_left

Unnamed: 0,name,country,continent,founded,employees,ceo,market_cap,categories
0,apple,cupertino,ca,"april 1, 1976",100000,timothy d cook,2825000000000,"unitedstates, technology, consumerelectronics,..."
1,danaher,washington,dc,1969,78000,rainer m blair,208030000000,"unitedstates, healthcare, diagnostics&research..."
2,merck,kenilworth,nj,1891,67000,robert m davis jd,210380000000,"unitedstates, healthcare, drugmanufacturersgen..."
3,bhp,melbourne,vic,1885,40110,mike p henry bsc bsc chem,a$281.66 billion,"australia, basicmaterials, otherindustrialmeta..."
4,nike,beaverton,or,"january 25, 1964",73300,john j donahoe ii,213600000000,"unitedstates, consumerdiscretionary, footwear&..."
...,...,...,...,...,...,...,...,...
723,rede d'or sao luiz sa,saopaulo,sp,,,paulo junqueira moll,r$99.71 billion,"brazil, healthcare, medicalcarefacilities, bra..."
724,teck resources limited,vancouver,bc,,10600,donald r lindsay bsc honours mba,c$26.81 billion,"canada, basicmaterials, otherindustrialmetals&..."
725,the saudi british bank,riyadh,saudiarabia,,4156,anthony william cripps,sr80.45 billion,"saudiarabia, financials, banks, saudiarabiafin..."
726,mitsui fudosan co ltd,tokyo,japan,,23992,masanobu komoda,2.629 trillion,"japan, realestate, realestatediversified, japa..."


In [445]:
# Right-hand rows of the matched pairs (index reset to 0..n-1).
matching_dataset_right

Unnamed: 0,founded,employees,ceo,name
0,"april 1, 1976",100000,timothy d cook,apple
1,1969,78000,rainer m blair,danaher
2,1891,67000,robert m davis jd,merck
3,1885,40110,mike p henry bsc bsc chem,bhp
4,"january 25, 1964",73300,john j donahoe ii,nike
...,...,...,...,...
723,,,paulo junqueira moll,rede d'or sao luiz sa
724,,10600,donald r lindsay bsc honours mba,teck resources limited
725,,4156,anthony william cripps,the saudi british bank
726,,23992,masanobu komoda,mitsui fudosan co ltd


In [446]:
# Unmatched records: rows whose index label never appears in `pairs`.
# Selecting by label (instead of concat + drop_duplicates on row content) keeps
# unmatched rows that happen to be exact duplicates of each other, and does not
# depend on the column names of the matching_dataset_* frames, which are
# renamed in place further down.
matched_l = {pair[0] for pair in pairs}
matched_r = {pair[1] for pair in pairs}
difference_l = df_l.loc[~df_l.index.isin(matched_l)]
difference_r = df_r.loc[~df_r.index.isin(matched_r)]

In [447]:
# Left records that were not linked to any right record.
difference_l

Unnamed: 0,name,country,continent,founded,employees,ceo,market_cap,categories
15,icbc,beijing,china,,439787,,cn1.593 trillion,"china, financials, banks, chinafinancials, chi..."
27,kweichow moutai,renhuai,china,,29031,,cn2.216 trillion,"china, consumerstaples, beverageswineries&dist..."
57,agricultural bank of china,beijing,china,,459000,,cn1.050 trillion,"china, financials, banks, chinafinancials, chi..."
66,catl,ningde,china,,33078,,cn1.181 trillion,"china, industrials, electricalequipment&parts,..."
69,china construction bank,beijing,china,,373814,,cn1.205 trillion,"china, financials, banks, chinafinancials, chi..."
...,...,...,...,...,...,...,...,...
995,asahi group holdings ltd,tokyo,japan,,30020,atsushi katsuki,2.231 trillion,"japan, consumerstaples, beveragesbrewers, japa..."
996,haitong securities co ltd,shanghai,china,,11282,,cn115.98 billion,"china, financials, capitalmarkets, chinafinanc..."
997,catalent inc,somerset,nj,2007,17300,john r chiminski,18240000000,"unitedstates, healthcare, drugmanufacturersspe..."
998,quanta services inc,houston,tx,1997,43700,earl c austin jr,18250000000,"unitedstates, industrials, engineering&constru..."


In [448]:
# Right records that were not linked to any left record.
difference_r

Unnamed: 0,founded,employees,ceo,name
23,,29031,,kweichow moutai
35,,439787,,icbc
51,march 1996,104323,,novartis
58,,373814,,china construction bank
61,,33078,,catl
...,...,...,...,...
877,"as chartered bank of india, australia, and c...",81770,,standard chartered plc
888,,3280,,alinma bank
891,,15198,,guotai junan securities co ltd
892,,27318,,nippon paint holdings co ltd


In [431]:
# Suffix the column labels with `_l` / `_r` so both sides can share one frame without name clashes.
# NOTE: the labels are reassigned in place, so running this cell twice appends the suffix twice.
column_left, column_right = rename_columns(matching_dataset_left.columns.values.tolist(),
                                           matching_dataset_right.columns.values.tolist())
matching_dataset_left.columns = column_left
matching_dataset_right.columns = column_right

In [432]:
# Put each matched left row next to its right counterpart. Both frames were
# re-indexed with reset_index(drop=True), so the axis=1 concat aligns them row by row.
joined_df = pd.concat([matching_dataset_left, matching_dataset_right], axis=1)
joined_df

Unnamed: 0,name_l,country_l,continent_l,founded_l,employees_l,ceo_l,market_cap_l,categories_l,founded_r,employees_r,ceo_r,name_r
0,apple,cupertino,ca,"april 1, 1976",100000,timothy d cook,2825000000000,"unitedstates, technology, consumerelectronics,...","april 1, 1976",100000,timothy d cook,apple
1,danaher,washington,dc,1969,78000,rainer m blair,208030000000,"unitedstates, healthcare, diagnostics&research...",1969,78000,rainer m blair,danaher
2,merck,kenilworth,nj,1891,67000,robert m davis jd,210380000000,"unitedstates, healthcare, drugmanufacturersgen...",1891,67000,robert m davis jd,merck
3,bhp,melbourne,vic,1885,40110,mike p henry bsc bsc chem,a$281.66 billion,"australia, basicmaterials, otherindustrialmeta...",1885,40110,mike p henry bsc bsc chem,bhp
4,nike,beaverton,or,"january 25, 1964",73300,john j donahoe ii,213600000000,"unitedstates, consumerdiscretionary, footwear&...","january 25, 1964",73300,john j donahoe ii,nike
...,...,...,...,...,...,...,...,...,...,...,...,...
647,rede d'or sao luiz sa,saopaulo,sp,,,paulo junqueira moll,r$99.71 billion,"brazil, healthcare, medicalcarefacilities, bra...",,,paulo junqueira moll,rede d'or sao luiz sa
648,teck resources limited,vancouver,bc,,10600,donald r lindsay bsc honours mba,c$26.81 billion,"canada, basicmaterials, otherindustrialmetals&...",,10600,donald r lindsay bsc honours mba,teck resources limited
649,the saudi british bank,riyadh,saudiarabia,,4156,anthony william cripps,sr80.45 billion,"saudiarabia, financials, banks, saudiarabiafin...",,4156,anthony william cripps,the saudi british bank
650,mitsui fudosan co ltd,tokyo,japan,,23992,masanobu komoda,2.629 trillion,"japan, realestate, realestatediversified, japa...",,23992,masanobu komoda,mitsui fudosan co ltd


In [433]:
# Column labels that occur more than once across the two source datasets.
duplicates = find_duplicated_columns(df_l.columns.values.tolist(), df_r.columns.values.tolist())
duplicates

['name', 'founded', 'employees', 'ceo']

In [434]:
# For every attribute listed in `duplicates`, show a random sample of the two
# competing columns and ask which one to discard from the final dataset.
for col in duplicates:
    candidate_labels = [col + '_l', col + '_r']
    if not all(label in joined_df.columns for label in candidate_labels):
        # One side is already gone (e.g. the cell was re-run): nothing left to choose.
        continue
    candidates = joined_df[candidate_labels]
    # Never request more rows than exist; sample() raises ValueError otherwise.
    display(candidates.sample(min(10, len(candidates))))
    time.sleep(1)  # presumably lets the front end render the table before the prompt appears
    idx_drop = None
    while idx_drop not in (0, 1):
        answer = input("Inserisci l'indice (0 o 1) della colonna che vuoi scartare nel dataset finale: ")
        # Ask again on anything other than 0 or 1 instead of crashing mid-loop.
        idx_drop = int(answer) if answer.strip() in ('0', '1') else None
    joined_df.drop(candidate_labels[idx_drop], axis=1, inplace=True)


Unnamed: 0,name_l,name_r
515,delta air lines inc,delta air lines inc
126,hca healthcare inc,hca healthcare inc
304,baxter international inc,baxter international inc
439,amerisourcebergen corporation,amerisourcebergen corporation
231,marvell technology group ltd,marvell technology group ltd
376,z holdings corp,z holdings corp
629,marubeni corp,marubeni corp
501,caixabank sa,caixabank sa
502,on semiconductor corporation,on semiconductor corporation
56,china merchants bank,china merchants bank


Unnamed: 0,founded_l,founded_r
49,1994,1994
471,"august 27, 1906","august 27, 1906"
332,"november 17, 1955","november 17, 1955"
433,17 november 1690,17 november 1690
529,,
305,2000,2000
100,december 2000,december 2000
63,6 april 1999,6 april 1999
339,1996,1996
278,1999,1999


Unnamed: 0,employees_l,employees_r
617,21000,21000
590,1254,1254
624,2349,2349
120,109700,109700
89,54365,54365
15,47792,47792
588,3000,3000
4,73300,73300
146,22604,22604
178,45000,45000


Unnamed: 0,ceo_l,ceo_r
387,bradley william corson bs,bradley william corson bs
537,helena hedblom msc,helena hedblom msc
494,lawrence e kurzius,lawrence e kurzius
436,tobias martinez gimeno,tobias martinez gimeno
96,william r mcdermott,william r mcdermott
6,hans e vestberg,hans e vestberg
324,charles victor magro bsc chem mba,charles victor magro bsc chem mba
533,christopher jerome swift cpa,christopher jerome swift cpa
281,igor y zhilkin,igor y zhilkin
302,carlo messina,carlo messina


In [435]:
# Remove the `_l` / `_r` side suffix from the column labels.
# Only labels that actually end with a suffix are shortened, so re-running the
# cell does not truncate already-clean names (e.g. 'country' -> 'count').
joined_df.columns = [
    col[:-2] if col.endswith(('_l', '_r')) else col
    for col in joined_df.columns.values.tolist()
]
joined_df

Unnamed: 0,country,continent,market_cap,categories,founded,employees,ceo,name
0,cupertino,ca,2825000000000,"unitedstates, technology, consumerelectronics,...","april 1, 1976",100000,timothy d cook,apple
1,washington,dc,208030000000,"unitedstates, healthcare, diagnostics&research...",1969,78000,rainer m blair,danaher
2,kenilworth,nj,210380000000,"unitedstates, healthcare, drugmanufacturersgen...",1891,67000,robert m davis jd,merck
3,melbourne,vic,a$281.66 billion,"australia, basicmaterials, otherindustrialmeta...",1885,40110,mike p henry bsc bsc chem,bhp
4,beaverton,or,213600000000,"unitedstates, consumerdiscretionary, footwear&...","january 25, 1964",73300,john j donahoe ii,nike
...,...,...,...,...,...,...,...,...
647,saopaulo,sp,r$99.71 billion,"brazil, healthcare, medicalcarefacilities, bra...",,,paulo junqueira moll,rede d'or sao luiz sa
648,vancouver,bc,c$26.81 billion,"canada, basicmaterials, otherindustrialmetals&...",,10600,donald r lindsay bsc honours mba,teck resources limited
649,riyadh,saudiarabia,sr80.45 billion,"saudiarabia, financials, banks, saudiarabiafin...",,4156,anthony william cripps,the saudi british bank
650,tokyo,japan,2.629 trillion,"japan, realestate, realestatediversified, japa...",,23992,masanobu komoda,mitsui fudosan co ltd


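**Known issue — records with empty values.** When `ceo` is missing on either side, the pair is not linked even if `name` is identical: a comparison involving a missing value appears to score 0, so the pair cannot reach the required number of agreements. This is why companies such as `icbc`, `kweichow moutai`, `catl` and `china construction bank` show up in both `difference_l` and `difference_r`.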