# Record Linkage

In [368]:
import recordlinkage
import pandas as pd
import collections
import time

In [356]:
# Paths of the two JSON Lines datasets to link: PATH_DS_L is loaded as the
# left frame (df_l) and PATH_DS_R as the right frame (df_r).
PATH_DS_L = './Mediated Datasets/disfold_Avengers_m.jsonl'
PATH_DS_R = './Mediated Datasets/disfold_fr_m.jsonl'
# Threshold handed to every Levenshtein string comparison in get_features.
THRESHOLD = 0.85

In [357]:
def get_features(df_l, df_r):
    """Build the comparison features for every candidate pair of records.

    A full index is used, so each row of ``df_l`` is paired with each row of
    ``df_r``. The ``name`` column is always compared; ``ceo`` and ``country``
    are compared only when the column exists in both frames. Every comparison
    is a Levenshtein string comparison using the module-level ``THRESHOLD``.

    Args:
        df_l: Left DataFrame; must contain a ``name`` column.
        df_r: Right DataFrame; must contain a ``name`` column.

    Returns:
        A ``(features, feature_value)`` tuple: ``features`` is the result of
        ``Compare.compute`` with one column per compared attribute, and
        ``feature_value`` is 1 when at least one optional attribute was
        compared, otherwise 0.
    """
    left_names = set(df_l.columns.values.tolist())
    right_names = set(df_r.columns.values.tolist())

    # Optional attributes take part only when both datasets provide them.
    optional_fields = [
        field for field in ("ceo", "country")
        if field in left_names and field in right_names
    ]

    # Indexing step: full index over both frames.
    pair_indexer = recordlinkage.Index()
    pair_indexer.full()
    candidate_links = pair_indexer.index(df_l, df_r)

    # Comparison step: "name" first, then the shared optional attributes.
    comparer = recordlinkage.Compare()
    for field in ["name"] + optional_fields:
        comparer.string(field, field, method="levenshtein", threshold=THRESHOLD, label=field)

    feature_value = 1 if optional_fields else 0
    return comparer.compute(candidate_links, df_l, df_r), feature_value


def get_pairs(features, feature_value):
    """Classify candidate pairs and return the index labels of the matches.

    A pair is a match when the sum of its comparison columns is strictly
    greater than ``feature_value``.

    Args:
        features: DataFrame of comparison results; must contain a ``name``
            column.
        feature_value: Score a pair has to exceed to be kept.

    Returns:
        List with the index label of every matching row of ``features``.
    """
    # Classification step: keep the rows whose total score beats the cut-off.
    row_scores = features.sum(axis=1)
    matched = features.loc[row_scores > feature_value]
    return matched['name'].index.to_list()


def rename_columns(columns_l, columns_r):
    """Suffix the column names of each side so they stay distinct after a join.

    Args:
        columns_l: Column names of the left dataset.
        columns_r: Column names of the right dataset.

    Returns:
        A ``(left, right)`` tuple of new lists: the left names with ``'_l'``
        appended and the right names with ``'_r'`` appended. The input lists
        are not modified.
    """
    suffixed_l = [c + '_l' for c in columns_l]
    suffixed_r = [c + '_r' for c in columns_r]
    return suffixed_l, suffixed_r


def find_duplicated_columns(columns_l, columns_r):
    """Return the column names that occur more than once across both lists.

    Args:
        columns_l: Column names of the left dataset (list).
        columns_r: Column names of the right dataset (list).

    Returns:
        List of names whose total count over ``columns_l + columns_r`` is
        greater than one, in order of first appearance.
    """
    tally = collections.Counter(columns_l + columns_r)
    return [name for name in tally if tally[name] > 1]

In [358]:
# Load the left dataset from its JSON Lines file (one JSON record per line).
df_l = pd.read_json(PATH_DS_L, encoding='utf-8', lines=True, dtype=object)
df_l

Unnamed: 0,name,country,continent,founded,employees,ceo,market_cap,categories
0,apple,cupertino,ca,"april 1, 1976",100000,timothy d cook,2825000000000,"unitedstates, technology, consumerelectronics,..."
1,danaher,washington,dc,1969,78000,rainer m blair,208030000000,"unitedstates, healthcare, diagnostics&research..."
2,merck,kenilworth,nj,1891,67000,robert m davis jd,210380000000,"unitedstates, healthcare, drugmanufacturersgen..."
3,bhp,melbourne,vic,1885,40110,mike p henry bsc bsc chem,a$281.66 billion,"australia, basicmaterials, otherindustrialmeta..."
4,nike,beaverton,or,"january 25, 1964",73300,john j donahoe ii,213600000000,"unitedstates, consumerdiscretionary, footwear&..."
...,...,...,...,...,...,...,...,...
995,asahi group holdings ltd,tokyo,japan,,30020,atsushi katsuki,2.231 trillion,"japan, consumerstaples, beveragesbrewers, japa..."
996,haitong securities co ltd,shanghai,china,,11282,,cn115.98 billion,"china, financials, capitalmarkets, chinafinanc..."
997,catalent inc,somerset,nj,2007,17300,john r chiminski,18240000000,"unitedstates, healthcare, drugmanufacturersspe..."
998,quanta services inc,houston,tx,1997,43700,earl c austin jr,18250000000,"unitedstates, industrials, engineering&constru..."


In [359]:
# Load the right dataset from its JSON Lines file and preview the first 10 rows.
df_r = pd.read_json(PATH_DS_R, encoding='utf-8', lines=True, dtype=object)
df_r.head(10)

Unnamed: 0,founded,employees,ceo,name
0,"april 1, 1976",100000.0,timothy d cook,apple
1,"april 4, 1975",181000.0,satya nadella,microsoft
2,,,amin h nasser,aramco
3,"october 2, 2015",156500.0,sundar pichai,alphabet
4,"july 5, 1994",1608000.0,andrew r jassy,amazon
5,"july 1, 2003",99290.0,elon r musk,tesla
6,1839,372000.0,warren e buffett,berkshire hathaway
7,"april 5, 1993",22473.0,jen-hsun huang,nvidia
8,"january 4, 2004",71970.0,mark elliot zuckerberg,meta
9,"industrial technology research institute, hs...",54193.0,c c wei,tsmc


In [360]:
# Compare every left/right record pair, then keep the pairs whose summed
# comparison score is strictly greater than feature_value.
features, feature_value = get_features(df_l, df_r)
pairs = get_pairs(features, feature_value)



In [361]:
# Number of candidate pairs per total score (sum of the comparison columns),
# listed from the highest score down.
features.sum(axis=1).value_counts().sort_index(ascending=False)

2.0       652
1.0       275
0.0    898073
dtype: int64

In [362]:
# Each element of `pairs` is a (left_label, right_label) tuple; split it into
# one list of row labels per side.
left_labels = [pair[0] for pair in pairs]
right_labels = [pair[1] for pair in pairs]

# Select all matched rows with a single label-based lookup per side instead of
# collecting one Series per pair. Repeated labels yield repeated rows, and an
# empty `pairs` list still produces frames that keep the original columns.
matching_dataset_left = df_l.loc[left_labels]
matching_dataset_right = df_r.loc[right_labels]

# Suffix the columns with '_l' / '_r' so the two sides stay distinguishable
# once they are placed next to each other.
column_left, column_right = rename_columns(matching_dataset_left.columns.values.tolist(),
                                           matching_dataset_right.columns.values.tolist())
matching_dataset_left.columns = column_left
matching_dataset_right.columns = column_right

# Drop the source row labels so row i of the left frame lines up with row i of
# the right frame.
matching_dataset_left = matching_dataset_left.reset_index(drop=True)
matching_dataset_right = matching_dataset_right.reset_index(drop=True)

In [363]:
# Left-side rows of the matched pairs; every column carries the '_l' suffix.
matching_dataset_left

Unnamed: 0,name_l,country_l,continent_l,founded_l,employees_l,ceo_l,market_cap_l,categories_l
0,apple,cupertino,ca,"april 1, 1976",100000,timothy d cook,2825000000000,"unitedstates, technology, consumerelectronics,..."
1,danaher,washington,dc,1969,78000,rainer m blair,208030000000,"unitedstates, healthcare, diagnostics&research..."
2,merck,kenilworth,nj,1891,67000,robert m davis jd,210380000000,"unitedstates, healthcare, drugmanufacturersgen..."
3,bhp,melbourne,vic,1885,40110,mike p henry bsc bsc chem,a$281.66 billion,"australia, basicmaterials, otherindustrialmeta..."
4,nike,beaverton,or,"january 25, 1964",73300,john j donahoe ii,213600000000,"unitedstates, consumerdiscretionary, footwear&..."
...,...,...,...,...,...,...,...,...
647,rede d'or sao luiz sa,saopaulo,sp,,,paulo junqueira moll,r$99.71 billion,"brazil, healthcare, medicalcarefacilities, bra..."
648,teck resources limited,vancouver,bc,,10600,donald r lindsay bsc honours mba,c$26.81 billion,"canada, basicmaterials, otherindustrialmetals&..."
649,the saudi british bank,riyadh,saudiarabia,,4156,anthony william cripps,sr80.45 billion,"saudiarabia, financials, banks, saudiarabiafin..."
650,mitsui fudosan co ltd,tokyo,japan,,23992,masanobu komoda,2.629 trillion,"japan, realestate, realestatediversified, japa..."


In [364]:
# Right-side rows of the matched pairs; every column carries the '_r' suffix.
matching_dataset_right

Unnamed: 0,founded_r,employees_r,ceo_r,name_r
0,"april 1, 1976",100000,timothy d cook,apple
1,1969,78000,rainer m blair,danaher
2,1891,67000,robert m davis jd,merck
3,1885,40110,mike p henry bsc bsc chem,bhp
4,"january 25, 1964",73300,john j donahoe ii,nike
...,...,...,...,...
647,,,paulo junqueira moll,rede d'or sao luiz sa
648,,10600,donald r lindsay bsc honours mba,teck resources limited
649,,4156,anthony william cripps,the saudi british bank
650,,23992,masanobu komoda,mitsui fudosan co ltd


In [365]:
# Place the two matched frames side by side. Both were reset to a fresh
# 0..n-1 index, so the column-wise concat aligns row i with row i.
joined_df = pd.concat([matching_dataset_left, matching_dataset_right], axis=1)
joined_df

Unnamed: 0,name_l,country_l,continent_l,founded_l,employees_l,ceo_l,market_cap_l,categories_l,founded_r,employees_r,ceo_r,name_r
0,apple,cupertino,ca,"april 1, 1976",100000,timothy d cook,2825000000000,"unitedstates, technology, consumerelectronics,...","april 1, 1976",100000,timothy d cook,apple
1,danaher,washington,dc,1969,78000,rainer m blair,208030000000,"unitedstates, healthcare, diagnostics&research...",1969,78000,rainer m blair,danaher
2,merck,kenilworth,nj,1891,67000,robert m davis jd,210380000000,"unitedstates, healthcare, drugmanufacturersgen...",1891,67000,robert m davis jd,merck
3,bhp,melbourne,vic,1885,40110,mike p henry bsc bsc chem,a$281.66 billion,"australia, basicmaterials, otherindustrialmeta...",1885,40110,mike p henry bsc bsc chem,bhp
4,nike,beaverton,or,"january 25, 1964",73300,john j donahoe ii,213600000000,"unitedstates, consumerdiscretionary, footwear&...","january 25, 1964",73300,john j donahoe ii,nike
...,...,...,...,...,...,...,...,...,...,...,...,...
647,rede d'or sao luiz sa,saopaulo,sp,,,paulo junqueira moll,r$99.71 billion,"brazil, healthcare, medicalcarefacilities, bra...",,,paulo junqueira moll,rede d'or sao luiz sa
648,teck resources limited,vancouver,bc,,10600,donald r lindsay bsc honours mba,c$26.81 billion,"canada, basicmaterials, otherindustrialmetals&...",,10600,donald r lindsay bsc honours mba,teck resources limited
649,the saudi british bank,riyadh,saudiarabia,,4156,anthony william cripps,sr80.45 billion,"saudiarabia, financials, banks, saudiarabiafin...",,4156,anthony william cripps,the saudi british bank
650,mitsui fudosan co ltd,tokyo,japan,,23992,masanobu komoda,2.629 trillion,"japan, realestate, realestatediversified, japa...",,23992,masanobu komoda,mitsui fudosan co ltd


In [366]:
# Attribute names present in both source datasets; each of them appears in
# joined_df as an '<name>_l' / '<name>_r' column pair.
duplicates = find_duplicated_columns(df_l.columns.values.tolist(), df_r.columns.values.tolist())
duplicates

['name', 'founded', 'employees', 'ceo']

In [370]:
# For every attribute present on both sides, show a sample of the two versions
# and ask which one to discard from the final dataset.
for col in duplicates:
    candidate_labels = [col + '_l', col + '_r']

    # Skip attributes that were already resolved, so re-running this cell does
    # not fail with a KeyError on a column dropped by a previous run.
    if not all(label in joined_df.columns for label in candidate_labels):
        continue

    candidates = joined_df[candidate_labels]
    # Show at most 10 rows; capping by the frame length avoids the ValueError
    # that sample() raises when asked for more rows than are available.
    display(candidates.sample(min(10, len(candidates))))
    # NOTE(review): the pause presumably lets the table render before the
    # input prompt appears - confirm it is still needed.
    time.sleep(2)

    # Ask again until the answer is exactly 0 or 1: other integers would drop
    # the wrong column (negative index) or raise an IndexError.
    while True:
        try:
            idx_drop = int(input("Inserisci l'indice (0 o 1) della colonna che vuoi scartare nel dataset finale: "))
        except ValueError:
            idx_drop = None
        if idx_drop in (0, 1):
            break
        print("Valore non valido: inserire 0 oppure 1.")

    joined_df = joined_df.drop(columns=candidate_labels[idx_drop])


Unnamed: 0,name_l,name_r
338,intuitive surgical inc,intuitive surgical inc
589,skyworks solutions inc,skyworks solutions inc
73,totalenergies,totalenergies
393,orange sa,orange sa
626,advanced info service pcl,advanced info service pcl
228,marvell technology inc,marvell technology inc
142,mondelez international inc,mondelez international inc
95,target corporation,target corporation
397,rockwell automation inc,rockwell automation inc
24,home depot,home depot


Unnamed: 0,founded_l,founded_r
540,,
567,,
238,,
133,1848,1848
138,"june 13, 1902","june 13, 1902"
496,1927,1927
585,"cincinnati, ohio, u.s 1950","cincinnati, ohio, u.s 1950"
427,1906,1906
612,"august 29, 1856","august 29, 1856"
634,2000,2000


Unnamed: 0,employees_l,employees_r
100,90096.0,90096.0
553,124089.0,124089.0
576,1212.0,1212.0
558,9095.0,9095.0
42,,
155,14400.0,14400.0
94,292067.0,292067.0
125,32741.0,32741.0
5,113000.0,113000.0
433,81600.0,81600.0


Unnamed: 0,ceo_l,ceo_r
123,francesco milleri,francesco milleri
315,alan s armstrong,alan s armstrong
533,christopher jerome swift cpa,christopher jerome swift cpa
421,urs schaeppi lic oec hsg,urs schaeppi lic oec hsg
88,kevin a lobo,kevin a lobo
167,toshio kagami,toshio kagami
346,kevin m stein phd,kevin m stein phd
18,david a ricks,david a ricks
17,hock e tan,hock e tan
280,brian humphries,brian humphries


In [374]:
# Remove the '_l' / '_r' side suffix from the remaining columns. Only a real
# suffix is stripped, so running this cell again leaves the names untouched
# instead of cutting two more characters off each one.
joined_df.columns = [
    col[:-2] if col.endswith(('_l', '_r')) else col
    for col in joined_df.columns.values.tolist()
]
joined_df

Unnamed: 0,country,continent,market_cap,categories,founded,employees,ceo,name
0,cupertino,ca,2825000000000,"unitedstates, technology, consumerelectronics,...","april 1, 1976",100000,timothy d cook,apple
1,washington,dc,208030000000,"unitedstates, healthcare, diagnostics&research...",1969,78000,rainer m blair,danaher
2,kenilworth,nj,210380000000,"unitedstates, healthcare, drugmanufacturersgen...",1891,67000,robert m davis jd,merck
3,melbourne,vic,a$281.66 billion,"australia, basicmaterials, otherindustrialmeta...",1885,40110,mike p henry bsc bsc chem,bhp
4,beaverton,or,213600000000,"unitedstates, consumerdiscretionary, footwear&...","january 25, 1964",73300,john j donahoe ii,nike
...,...,...,...,...,...,...,...,...
647,saopaulo,sp,r$99.71 billion,"brazil, healthcare, medicalcarefacilities, bra...",,,paulo junqueira moll,rede d'or sao luiz sa
648,vancouver,bc,c$26.81 billion,"canada, basicmaterials, otherindustrialmetals&...",,10600,donald r lindsay bsc honours mba,teck resources limited
649,riyadh,saudiarabia,sr80.45 billion,"saudiarabia, financials, banks, saudiarabiafin...",,4156,anthony william cripps,the saudi british bank
650,tokyo,japan,2.629 trillion,"japan, realestate, realestatediversified, japa...",,23992,masanobu komoda,mitsui fudosan co ltd
