# Record Linkage

In [2]:
import recordlinkage
import pandas as pd
import collections
import time

In [3]:
PATH_DS_L = './Mediated Datasets/cbinsights_DDD_m.jsonl'
PATH_DS_R = './Mediated Datasets/disfold_fr_m.jsonl'

In [4]:
df_l = pd.read_json(PATH_DS_L, encoding='utf-8', lines=True, dtype=object)
df_l

Unnamed: 0,name,country,industry,founded
0,lacework,united states,cybersecurity,2015
1,tipalti,united states,fintech,2010
2,tempus,united states,health,2015
3,anduril,united states,artificial intelligence,2017
4,bolt,estonia,auto & transportation,2013
...,...,...,...,...
1180,spacex,united states,other,2002
1181,fanatics,united states,e-commerce & direct-to-consumer,1995
1182,instacart,united states,"supply chain, logistics, & delivery",2012
1183,databricks,united states,data management & analytics,2013


In [5]:
df_r = pd.read_json(PATH_DS_R, encoding='utf-8', lines=True, dtype=object)
df_r

Unnamed: 0,founded,employees,ceo,name
0,"april 1, 1976",100000,timothy d cook,apple
1,"april 4, 1975",181000,satya nadella,microsoft
2,,,amin h nasser,aramco
3,"october 2, 2015",156500,sundar pichai,alphabet
4,"july 5, 1994",1608000,andrew r jassy,amazon
...,...,...,...,...
894,1930,20300,thomas sinnickson gayner,markel corporation
895,1939,12023,j powell brown cpcu cpcu,brown & brown inc
896,,16196,shingo konomoto,nomura research institute ltd
897,1986,52000,william joseph hornbuckle iv,mgm resorts international


In [6]:
def get_features(df_l, df_r, name_threshold, is_name):
    feature_value = 0
    columns_l = df_l.columns.values.tolist()
    columns_r = df_r.columns.values.tolist()
    # set_index
    indexer = recordlinkage.Index()
    indexer.full()
    candidate_links = indexer.index(df_l, df_r)

    # Comparison step
    compare_cl = recordlinkage.Compare()
    compare_cl.string("name", "name", method="levenshtein", threshold=name_threshold, label="name")

    if not is_name:
        if 'ceo' in columns_l and 'ceo' in columns_r:
            compare_cl.string("ceo", "ceo", method="levenshtein", threshold=0.85, label="ceo")
            feature_value = 1

    # compare_cl.exact("country", "country", label="country")

    return compare_cl.compute(candidate_links, df_l, df_r), feature_value


def get_pairs(features, feature_value):
    # Classification step
    pairs = features[features.sum(axis=1) > feature_value]
    return pairs['name'].keys().to_list()


def rename_columns(columns_l, columns_r):
    columns_l = [c + '_l' for c in columns_l]
    columns_r = [c + '_r' for c in columns_r]
    columns_join = columns_l + columns_r
    return columns_l, columns_r

def remove_suffix(columns):
    return [c[:-2] for c in columns]


def find_duplicated_columns(columns_l, columns_r):
    attr_cleaned = columns_l + columns_r
    duplicates = [item for item, count in collections.Counter(attr_cleaned).items() if count > 1]
    return duplicates

# def join(matching_dataset_left, matching_dataset_right):
#     column_left, column_right = rename_columns(matching_dataset_left.columns.values.tolist(),
#                                            matching_dataset_right.columns.values.tolist())
#     matching_dataset_left.columns = column_left
#     matching_dataset_right.columns = column_right
#     joined_df = pd.concat([matching_dataset_left, matching_dataset_right], axis=1)
#
#     duplicates = find_duplicated_columns(df_l.columns.values.tolist(), df_r.columns.values.tolist())
#     for col in duplicates:
#         candidates = joined_df[[col + '_l', col + '_r']].sample(20)
#         candidate_labels = candidates.columns.values.tolist()
#         display(candidates.sample(10))
#         time.sleep(1)
#         idx_drop = int(input("Inserisci l'indice (0 o 1) della colonna che vuoi scartare nel dataset finale: "))
#         joined_df.drop(candidate_labels[idx_drop], axis=1, inplace=True)
#
#     joined_df.columns = [col[:-2] for col in joined_df.columns.values.tolist()]
#     return joined_df

def join(matching_dataset_left, matching_dataset_right):
    duplicates = find_duplicated_columns(matching_dataset_left.columns.values.tolist(), matching_dataset_right.columns.values.tolist())

    column_left, column_right = rename_columns(matching_dataset_left.columns.values.tolist(),
                                           matching_dataset_right.columns.values.tolist())
    matching_dataset_left.columns = column_left
    matching_dataset_right.columns = column_right
    joined_df = pd.concat([matching_dataset_left, matching_dataset_right], axis=1)

    # duplicates = find_duplicated_columns(matching_dataset_left.columns.values.tolist(), matching_dataset_right.columns.values.tolist())
    for col in duplicates:
        candidates = joined_df[[col + '_l', col + '_r']].sample(20)
        candidate_labels = candidates.columns.values.tolist()
        display(candidates.sample(10))
        time.sleep(1)
        idx_drop = int(input("Inserisci l'indice (0 o 1) della colonna che vuoi scartare nel dataset finale: "))
        joined_df.drop(candidate_labels[idx_drop], axis=1, inplace=True)

    joined_df.columns = [col[:-2] for col in joined_df.columns.values.tolist()]
    return joined_df

features, feature_value = get_features(df_l, df_r, 0.95, False)
pairs = get_pairs(features, feature_value)
features.sum(axis=1).value_counts().sort_index(ascending=False)

left = []
right = []

for elem in pairs:
    left.append(df_l.loc[elem[0]])
    right.append(df_r.loc[elem[1]])

matching_dataset_left = pd.DataFrame(left)
matching_dataset_right = pd.DataFrame(right)
matching_dataset_left = matching_dataset_left.reset_index(drop=True)
matching_dataset_right = matching_dataset_right.reset_index(drop=True)

difference_l = pd.concat([df_l, matching_dataset_left]).drop_duplicates(keep=False)
difference_r = pd.concat([df_r, matching_dataset_right]).drop_duplicates(keep=False)

match_join_df = join(matching_dataset_left, matching_dataset_right)
display(match_join_df)



features_difference, feature_value_difference = get_features(difference_l, difference_r, 0.95, True)
pairs_difference = get_pairs(features_difference, feature_value_difference)
features_difference.sum(axis=1).value_counts().sort_index(ascending=False)

left = []
right = []

for elem in pairs_difference:
    left.append(difference_l.loc[elem[0]])
    right.append(difference_r.loc[elem[1]])

matching_difference_dataset_left = pd.DataFrame(left)
matching_difference_dataset_right = pd.DataFrame(right)
matching_difference_dataset_left = matching_difference_dataset_left.reset_index(drop=True)
matching_difference_dataset_right = matching_difference_dataset_right.reset_index(drop=True)

difference_join_df = join(matching_difference_dataset_left, matching_difference_dataset_right)
display(difference_join_df)

result= pd.concat([match_join_df, difference_join_df])
display(result)

matching_difference_dataset_left.columns = remove_suffix(matching_difference_dataset_left.columns.values.tolist())
matching_difference_dataset_right.columns = remove_suffix(matching_difference_dataset_right.columns.values.tolist())

unique_left_df =pd.concat([difference_l, matching_difference_dataset_left]).drop_duplicates(keep=False)
result = pd.concat([result, unique_left_df])

unique_right_df = pd.concat([difference_r, matching_difference_dataset_right]).drop_duplicates(keep=False)
result = pd.concat([result, unique_right_df])

result





Unnamed: 0,name,country,industry,founded,employees,ceo
0,lacework,united states,cybersecurity,2015,,
1,tipalti,united states,fintech,2010,,
2,tempus,united states,health,2015,,
3,anduril,united states,artificial intelligence,2017,,
4,bolt,estonia,auto & transportation,2013,,
...,...,...,...,...,...,...
894,markel corporation,,,1930,20300,thomas sinnickson gayner
895,brown & brown inc,,,1939,12023,j powell brown cpcu cpcu
896,nomura research institute ltd,,,,16196,shingo konomoto
897,mgm resorts international,,,1986,52000,william joseph hornbuckle iv


Traceback (most recent call last):
  File "_pydevd_bundle/pydevd_cython_darwin_310_64.pyx", line 1035, in _pydevd_bundle.pydevd_cython_darwin_310_64.PyDBFrame.trace_dispatch
  File "/Applications/PyCharm.app/Contents/plugins/python/helpers-pro/jupyter_debug/pydev_jupyter_plugin.py", line 144, in cmd_step_over
    if _is_inside_jupyter_cell(frame, pydb):
  File "/Applications/PyCharm.app/Contents/plugins/python/helpers-pro/jupyter_debug/pydev_jupyter_plugin.py", line 209, in _is_inside_jupyter_cell
    if is_cell_filename(filename):
  File "/Applications/PyCharm.app/Contents/plugins/python/helpers-pro/jupyter_debug/pydev_jupyter_plugin.py", line 220, in is_cell_filename
    ipython_shell = get_ipython()
NameError: name 'get_ipython' is not defined


In [None]:
r

In [29]:
unique_left_df =pd.concat([difference_l, matching_difference_dataset_left]).drop_duplicates(keep=False)
result = pd.concat([result, unique_left_df])

In [30]:
result

Unnamed: 0,name,country,industry,founded
0,lacework,united states,cybersecurity,2015
1,tipalti,united states,fintech,2010
2,tempus,united states,health,2015
3,anduril,united states,artificial intelligence,2017
4,bolt,estonia,auto & transportation,2013
...,...,...,...,...
1180,spacex,united states,other,2002
1181,fanatics,united states,e-commerce & direct-to-consumer,1995
1182,instacart,united states,"supply chain, logistics, & delivery",2012
1183,databricks,united states,data management & analytics,2013


In [31]:
unique_right_df = pd.concat([difference_r, matching_difference_dataset_right]).drop_duplicates(keep=False)
result = pd.concat([result, unique_right_df])

In [32]:
result

Unnamed: 0,name,country,industry,founded,employees,ceo
0,lacework,united states,cybersecurity,2015,,
1,tipalti,united states,fintech,2010,,
2,tempus,united states,health,2015,,
3,anduril,united states,artificial intelligence,2017,,
4,bolt,estonia,auto & transportation,2013,,
...,...,...,...,...,...,...
894,markel corporation,,,1930,20300,thomas sinnickson gayner
895,brown & brown inc,,,1939,12023,j powell brown cpcu cpcu
896,nomura research institute ltd,,,,16196,shingo konomoto
897,mgm resorts international,,,1986,52000,william joseph hornbuckle iv


In [653]:
features, feature_value = get_features(df_l, df_r, 0.95, False)
pairs = get_pairs(features, feature_value)



In [654]:
features.sum(axis=1).value_counts().sort_index(ascending=False)

0.0    1065315
dtype: int64

In [656]:
left = []
right = []

for elem in pairs:
    left.append(df_l.loc[elem[0]])
    right.append(df_r.loc[elem[1]])

matching_dataset_left = pd.DataFrame(left)
matching_dataset_right = pd.DataFrame(right)
matching_dataset_left = matching_dataset_left.reset_index(drop=True)
matching_dataset_right = matching_dataset_right.reset_index(drop=True)

In [657]:
matching_dataset_left

In [3]:
def get_features(df_l, df_r, name_threshold, is_name):
    feature_value = 0
    columns_l = df_l.columns.values.tolist()
    columns_r = df_r.columns.values.tolist()
    # set_index
    indexer = recordlinkage.Index()
    indexer.full()
    candidate_links = indexer.index(df_l, df_r)

    # Comparison step
    compare_cl = recordlinkage.Compare()
    compare_cl.string("name", "name", method="levenshtein", threshold=name_threshold, label="name")

    if not is_name:
        if 'ceo' in columns_l and 'ceo' in columns_r:
            compare_cl.string("ceo", "ceo", method="levenshtein", threshold=0.85, label="ceo")
            feature_value = 1

    # compare_cl.exact("country", "country", label="country")

    return compare_cl.compute(candidate_links, df_l, df_r), feature_value


def get_pairs(features, feature_value):
    # Classification step
    pairs = features[features.sum(axis=1) > feature_value]
    return pairs['name'].keys().to_list()


def rename_columns(columns_l, columns_r):
    columns_l = [c + '_l' for c in columns_l]
    columns_r = [c + '_r' for c in columns_r]
    columns_join = columns_l + columns_r
    return columns_l, columns_r


def find_duplicated_columns(columns_l, columns_r):
    attr_cleaned = columns_l + columns_r
    duplicates = [item for item, count in collections.Counter(attr_cleaned).items() if count > 1]
    return duplicates

def join(matching_dataset_left, matching_dataset_right):
    column_left, column_right = rename_columns(matching_dataset_left.columns.values.tolist(),
                                           matching_dataset_right.columns.values.tolist())
    matching_dataset_left.columns = column_left
    matching_dataset_right.columns = column_right
    joined_df = pd.concat([matching_dataset_left, matching_dataset_right], axis=1)

    duplicates = find_duplicated_columns(df_l.columns.values.tolist(), df_r.columns.values.tolist())
    for col in duplicates:
        candidates = joined_df[[col + '_l', col + '_r']].sample(20)
        candidate_labels = candidates.columns.values.tolist()
        display(candidates.sample(10))
        time.sleep(1)
        idx_drop = int(input("Inserisci l'indice (0 o 1) della colonna che vuoi scartare nel dataset finale: "))
        joined_df.drop(candidate_labels[idx_drop], axis=1, inplace=True)

    joined_df.columns = [col[:-2] for col in joined_df.columns.values.tolist()]
    return joined_df

In [658]:
matching_dataset_right

In [659]:
difference_l = pd.concat([df_l, matching_dataset_left]).drop_duplicates(keep=False)
difference_r = pd.concat([df_r, matching_dataset_right]).drop_duplicates(keep=False)

In [660]:
difference_l

Unnamed: 0,name,country,industry,founded
0,lacework,united states,cybersecurity,2015
1,tipalti,united states,fintech,2010
2,tempus,united states,health,2015
3,anduril,united states,artificial intelligence,2017
4,bolt,estonia,auto & transportation,2013
...,...,...,...,...
1180,spacex,united states,other,2002
1181,fanatics,united states,e-commerce & direct-to-consumer,1995
1182,instacart,united states,"supply chain, logistics, & delivery",2012
1183,databricks,united states,data management & analytics,2013


In [661]:
difference_r

Unnamed: 0,founded,employees,ceo,name
0,"april 1, 1976",100000,timothy d cook,apple
1,"april 4, 1975",181000,satya nadella,microsoft
2,,,amin h nasser,aramco
3,"october 2, 2015",156500,sundar pichai,alphabet
4,"july 5, 1994",1608000,andrew r jassy,amazon
...,...,...,...,...
894,1930,20300,thomas sinnickson gayner,markel corporation
895,1939,12023,j powell brown cpcu cpcu,brown & brown inc
896,,16196,shingo konomoto,nomura research institute ltd
897,1986,52000,william joseph hornbuckle iv,mgm resorts international


In [663]:
match_join_df = join(matching_dataset_left, matching_dataset_right)

KeyError: "None of [Index(['name_l', 'name_r'], dtype='object')] are in the [columns]"

In [644]:
features_difference, feature_value_difference = get_features(difference_l, difference_r, 0.95, True)
pairs_difference = get_pairs(features_difference, feature_value_difference)



In [645]:
features_difference.sum(axis=1).value_counts().sort_index(ascending=False)

1.0      171
0.0    46341
dtype: int64

In [646]:
left = []
right = []

for elem in pairs_difference:
    left.append(difference_l.loc[elem[0]])
    right.append(difference_r.loc[elem[1]])

matching_difference_dataset_left = pd.DataFrame(left)
matching_difference_dataset_right = pd.DataFrame(right)
matching_difference_dataset_left = matching_difference_dataset_left.reset_index(drop=True)
matching_difference_dataset_right = matching_difference_dataset_right.reset_index(drop=True)

In [647]:
difference_join_df = join(matching_dataset_left, matching_dataset_right)
difference_join_df

KeyError: "None of [Index(['name_l', 'name_r'], dtype='object')] are in the [columns]"

In [591]:
result = pd.concat([match_join_df, difference_join_df])
result

Unnamed: 0,country,continent,market_cap,categories,founded,employees,ceo,name
0,cupertino,ca,2825000000000,"unitedstates, technology, consumerelectronics,...","april 1, 1976",100000,timothy d cook,apple
1,washington,dc,208030000000,"unitedstates, healthcare, diagnostics&research...",1969,78000,rainer m blair,danaher
2,kenilworth,nj,210380000000,"unitedstates, healthcare, drugmanufacturersgen...",1891,67000,robert m davis jd,merck
3,melbourne,vic,a$281.66 billion,"australia, basicmaterials, otherindustrialmeta...",1885,40110,mike p henry bsc bsc chem,bhp
4,beaverton,or,213600000000,"unitedstates, consumerdiscretionary, footwear&...","january 25, 1964",73300,john j donahoe ii,nike
...,...,...,...,...,...,...,...,...
166,riyadh,saudiarabia,sr76.56 billion,"saudiarabia, financials, banks, saudiarabiafin...",,3280,,alinma bank
167,london,unitedkingdom,15.84 billion,"unitedkingdom, financials, banks, unitedkingdo...","as chartered bank of india, australia, and chi...",81770,,standard chartered plc
168,,,hk$165.86 billion,"hongkong, consumerstaples, packagedfoods, hong...",,,,china mengniu dairy co ltd
169,,,cn134.96 billion,"china, basicmaterials, otherindustrialmetals&m...",,,,china northern rare earth


In [596]:
unique_l = pd.concat([difference_l, matching_dataset_left]).drop_duplicates(keep=False)
unique_r = pd.concat([difference_r, matching_dataset_right]).drop_duplicates(keep=False)

In [597]:
unique_l

Unnamed: 0,name,country,continent,founded,employees,ceo,market_cap,categories,name_l,country_l,continent_l,founded_l,employees_l,ceo_l,market_cap_l,categories_l
15,icbc,beijing,china,,439787,,cn1.593 trillion,"china, financials, banks, chinafinancials, chi...",,,,,,,,
27,kweichow moutai,renhuai,china,,29031,,cn2.216 trillion,"china, consumerstaples, beverageswineries&dist...",,,,,,,,
57,agricultural bank of china,beijing,china,,459000,,cn1.050 trillion,"china, financials, banks, chinafinancials, chi...",,,,,,,,
66,catl,ningde,china,,33078,,cn1.181 trillion,"china, industrials, electricalequipment&parts,...",,,,,,,,
69,china construction bank,beijing,china,,373814,,cn1.205 trillion,"china, financials, banks, chinafinancials, chi...",,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
166,,,,,,,,,alinma bank,riyadh,saudiarabia,,3280,,sr76.56 billion,"saudiarabia, financials, banks, saudiarabiafin..."
167,,,,,,,,,standard chartered plc,london,unitedkingdom,"as chartered bank of india, australia, and chi...",81770,,15.84 billion,"unitedkingdom, financials, banks, unitedkingdo..."
168,,,,,,,,,china mengniu dairy co ltd,,,,,,hk$165.86 billion,"hongkong, consumerstaples, packagedfoods, hong..."
169,,,,,,,,,china northern rare earth,,,,,,cn134.96 billion,"china, basicmaterials, otherindustrialmetals&m..."


In [601]:
difference_l

Unnamed: 0,name,country,continent,founded,employees,ceo,market_cap,categories
15,icbc,beijing,china,,439787,,cn1.593 trillion,"china, financials, banks, chinafinancials, chi..."
27,kweichow moutai,renhuai,china,,29031,,cn2.216 trillion,"china, consumerstaples, beverageswineries&dist..."
57,agricultural bank of china,beijing,china,,459000,,cn1.050 trillion,"china, financials, banks, chinafinancials, chi..."
66,catl,ningde,china,,33078,,cn1.181 trillion,"china, industrials, electricalequipment&parts,..."
69,china construction bank,beijing,china,,373814,,cn1.205 trillion,"china, financials, banks, chinafinancials, chi..."
...,...,...,...,...,...,...,...,...
995,asahi group holdings ltd,tokyo,japan,,30020,atsushi katsuki,2.231 trillion,"japan, consumerstaples, beveragesbrewers, japa..."
996,haitong securities co ltd,shanghai,china,,11282,,cn115.98 billion,"china, financials, capitalmarkets, chinafinanc..."
997,catalent inc,somerset,nj,2007,17300,john r chiminski,18240000000,"unitedstates, healthcare, drugmanufacturersspe..."
998,quanta services inc,houston,tx,1997,43700,earl c austin jr,18250000000,"unitedstates, industrials, engineering&constru..."


In [602]:
matching_dataset_left

Unnamed: 0,name_l,country_l,continent_l,founded_l,employees_l,ceo_l,market_cap_l,categories_l
0,icbc,beijing,china,,439787,,cn1.593 trillion,"china, financials, banks, chinafinancials, chi..."
1,kweichow moutai,renhuai,china,,29031,,cn2.216 trillion,"china, consumerstaples, beverageswineries&dist..."
2,agricultural bank of china,beijing,china,,459000,,cn1.050 trillion,"china, financials, banks, chinafinancials, chi..."
3,catl,ningde,china,,33078,,cn1.181 trillion,"china, industrials, electricalequipment&parts,..."
4,china construction bank,beijing,china,,373814,,cn1.205 trillion,"china, financials, banks, chinafinancials, chi..."
...,...,...,...,...,...,...,...,...
166,alinma bank,riyadh,saudiarabia,,3280,,sr76.56 billion,"saudiarabia, financials, banks, saudiarabiafin..."
167,standard chartered plc,london,unitedkingdom,"as chartered bank of india, australia, and chi...",81770,,15.84 billion,"unitedkingdom, financials, banks, unitedkingdo..."
168,china mengniu dairy co ltd,,,,,,hk$165.86 billion,"hongkong, consumerstaples, packagedfoods, hong..."
169,china northern rare earth,,,,,,cn134.96 billion,"china, basicmaterials, otherindustrialmetals&m..."


In [462]:
column_left, column_right = rename_columns(matching_dataset_left.columns.values.tolist(),
                                           matching_dataset_right.columns.values.tolist())
matching_dataset_left.columns = column_left
matching_dataset_right.columns = column_right

In [463]:
joined_df = pd.concat([matching_dataset_left, matching_dataset_right], axis=1)
joined_df

Unnamed: 0,name_l,country_l,continent_l,founded_l,employees_l,ceo_l,market_cap_l,categories_l,founded_r,employees_r,ceo_r,name_r
0,apple,cupertino,ca,"april 1, 1976",100000,timothy d cook,2825000000000,"unitedstates, technology, consumerelectronics,...","april 1, 1976",100000,timothy d cook,apple
1,danaher,washington,dc,1969,78000,rainer m blair,208030000000,"unitedstates, healthcare, diagnostics&research...",1969,78000,rainer m blair,danaher
2,merck,kenilworth,nj,1891,67000,robert m davis jd,210380000000,"unitedstates, healthcare, drugmanufacturersgen...",1891,67000,robert m davis jd,merck
3,bhp,melbourne,vic,1885,40110,mike p henry bsc bsc chem,a$281.66 billion,"australia, basicmaterials, otherindustrialmeta...",1885,40110,mike p henry bsc bsc chem,bhp
4,nike,beaverton,or,"january 25, 1964",73300,john j donahoe ii,213600000000,"unitedstates, consumerdiscretionary, footwear&...","january 25, 1964",73300,john j donahoe ii,nike
...,...,...,...,...,...,...,...,...,...,...,...,...
723,rede d'or sao luiz sa,saopaulo,sp,,,paulo junqueira moll,r$99.71 billion,"brazil, healthcare, medicalcarefacilities, bra...",,,paulo junqueira moll,rede d'or sao luiz sa
724,teck resources limited,vancouver,bc,,10600,donald r lindsay bsc honours mba,c$26.81 billion,"canada, basicmaterials, otherindustrialmetals&...",,10600,donald r lindsay bsc honours mba,teck resources limited
725,the saudi british bank,riyadh,saudiarabia,,4156,anthony william cripps,sr80.45 billion,"saudiarabia, financials, banks, saudiarabiafin...",,4156,anthony william cripps,the saudi british bank
726,mitsui fudosan co ltd,tokyo,japan,,23992,masanobu komoda,2.629 trillion,"japan, realestate, realestatediversified, japa...",,23992,masanobu komoda,mitsui fudosan co ltd


In [464]:
duplicates = find_duplicated_columns(df_l.columns.values.tolist(), df_r.columns.values.tolist())
duplicates

['name', 'founded', 'employees', 'ceo']

In [465]:
for col in duplicates:
    candidates = joined_df[[col + '_l', col + '_r']].sample(20)
    candidate_labels = candidates.columns.values.tolist()
    display(candidates.sample(10))
    time.sleep(1)
    idx_drop = int(input("Inserisci l'indice (0 o 1) della colonna che vuoi scartare nel dataset finale: "))
    joined_df.drop(candidate_labels[idx_drop], axis=1, inplace=True)


Unnamed: 0,name_l,name_r
649,zebra technologies corporation,zebra technologies corporation
657,geberit ag,geberit ag
377,simon property group inc,simon property group inc
504,naturgy energy group sa,naturgy energy group sa
565,caixabank sa,caixabank sa
257,marvell technology group ltd,marvell technology group ltd
479,oneok inc,oneok inc
609,mid-america apartment communities inc,mid-america apartment communities inc
562,sartorius,sartorius
587,albemarle corporation,albemarle corporation


Unnamed: 0,founded_l,founded_r
361,1883.0,1883.0
471,,
662,,
460,1995.0,1995.0
242,,
511,,
424,1969.0,1969.0
478,1989.0,1989.0
627,,
655,1946.0,1946.0


Unnamed: 0,employees_l,employees_r
311,47099,47099
459,52450,52450
608,316078,316078
151,27605,27605
717,2742,2742
219,50000,50000
365,77958,77958
272,47000,47000
541,26000,26000
560,22700,22700


IndexError: list index out of range

In [435]:
joined_df.columns = [col[:-2] for col in joined_df.columns.values.tolist()]
joined_df

Unnamed: 0,country,continent,market_cap,categories,founded,employees,ceo,name
0,cupertino,ca,2825000000000,"unitedstates, technology, consumerelectronics,...","april 1, 1976",100000,timothy d cook,apple
1,washington,dc,208030000000,"unitedstates, healthcare, diagnostics&research...",1969,78000,rainer m blair,danaher
2,kenilworth,nj,210380000000,"unitedstates, healthcare, drugmanufacturersgen...",1891,67000,robert m davis jd,merck
3,melbourne,vic,a$281.66 billion,"australia, basicmaterials, otherindustrialmeta...",1885,40110,mike p henry bsc bsc chem,bhp
4,beaverton,or,213600000000,"unitedstates, consumerdiscretionary, footwear&...","january 25, 1964",73300,john j donahoe ii,nike
...,...,...,...,...,...,...,...,...
647,saopaulo,sp,r$99.71 billion,"brazil, healthcare, medicalcarefacilities, bra...",,,paulo junqueira moll,rede d'or sao luiz sa
648,vancouver,bc,c$26.81 billion,"canada, basicmaterials, otherindustrialmetals&...",,10600,donald r lindsay bsc honours mba,teck resources limited
649,riyadh,saudiarabia,sr80.45 billion,"saudiarabia, financials, banks, saudiarabiafin...",,4156,anthony william cripps,the saudi british bank
650,tokyo,japan,2.629 trillion,"japan, realestate, realestatediversified, japa...",,23992,masanobu komoda,mitsui fudosan co ltd


problema con record che hanno valori vuoti

In [506]:
df_r.loc[df_r['name'] == 'abb ltd']

Unnamed: 0,founded,employees,ceo,name
229,1988,104400,,abb ltd


In [507]:
df_l.loc[df_l['name'] == 'abb ltd']

Unnamed: 0,name,country,continent,founded,employees,ceo,market_cap,categories
220,abb ltd,zurich,switzerland,1988,104400,,chf62.20 billion,"switzerland, industrials, electricalequipment&..."


In [510]:
df_r['name'][229] == df_l['name'][220]

True

In [512]:
df_r['ceo'][229] == df_l['ceo'][220]

True

In [533]:
empty_l = difference_l.loc[difference_l['ceo'] == '']
empty_l

Unnamed: 0,name,country,continent,founded,employees,ceo,market_cap,categories
15,icbc,beijing,china,,439787,,cn1.593 trillion,"china, financials, banks, chinafinancials, chi..."
27,kweichow moutai,renhuai,china,,29031,,cn2.216 trillion,"china, consumerstaples, beverageswineries&dist..."
57,agricultural bank of china,beijing,china,,459000,,cn1.050 trillion,"china, financials, banks, chinafinancials, chi..."
66,catl,ningde,china,,33078,,cn1.181 trillion,"china, industrials, electricalequipment&parts,..."
69,china construction bank,beijing,china,,373814,,cn1.205 trillion,"china, financials, banks, chinafinancials, chi..."
...,...,...,...,...,...,...,...,...
976,astra international tbk pt,jakarta,indonesia,,123894,,17.11 billion,"indonesia, consumerdiscretionary, autoparts, i..."
984,nice ltd,raaanana,israel,1986,6800,,594.2 million,"israel, technology, softwareapplication, israe..."
987,zhejiang huayou cobalt co ltd,,,,,,cn122.16 billion,"china, basicmaterials, otherindustrialmetals&m..."
993,restaurant brands international inc,toronto,on,"december 15, 2014",5700,,c$22.78 billion,"canada, consumerdiscretionary, restaurants, ca..."


In [534]:
empty_r = difference_r.loc[difference_r['ceo'] == '']
empty_r

Unnamed: 0,founded,employees,ceo,name
23,,29031,,kweichow moutai
35,,439787,,icbc
51,march 1996,104323,,novartis
58,,373814,,china construction bank
61,,33078,,catl
...,...,...,...,...
877,"as chartered bank of india, australia, and c...",81770,,standard chartered plc
888,,3280,,alinma bank
891,,15198,,guotai junan securities co ltd
892,,27318,,nippon paint holdings co ltd


In [504]:
difference_l['name'][220] == difference_r['name'][220]

KeyError: 220

In [540]:
features_empty, feature_value_empty = get_features(difference_l, difference_r, 0.95, True)
pairs_empty = get_pairs(features_empty, feature_value_empty)



In [541]:
feature_value_empty

0

In [543]:
difference_l['name'][15]

'icbc'

In [544]:
difference_r['name'][35]

'icbc'

In [542]:
features_empty

Unnamed: 0,Unnamed: 1,name
15,23,0.0
15,35,1.0
15,51,0.0
15,58,0.0
15,61,0.0
...,...,...
999,877,0.0
999,888,0.0
999,891,0.0
999,892,0.0


In [545]:
features_empty.sum(axis=1).value_counts().sort_index(ascending=False)

1.0      171
0.0    46341
dtype: int64