# Record Linkage

In [1]:
import recordlinkage
import pandas as pd
import numpy as np

In [2]:
# JSON Lines file for dataset A (read into df_a).
PATH_DS_A = './Mediated Datasets/cbinsights_DDD_m.jsonl'
# JSON Lines file for dataset B (read into df_b).
PATH_DS_B = './Mediated Datasets/companiesMarketCap_Avengers_m.jsonl'
# Threshold handed to the Jaro-Winkler name comparison in get_features.
THRESHOLD = 0.95

In [3]:
def get_features(df_a, df_b):
    """Score every candidate pair of records from ``df_a`` and ``df_b``.

    Two comparison features are computed per pair:

    * ``name``    -- Jaro-Winkler comparison of the ``name`` columns, with
      ``threshold=THRESHOLD``.
    * ``country`` -- exact comparison of the ``country`` columns.

    Candidate pairs come from a full index, i.e. every row of ``df_a`` is
    paired with every row of ``df_b``.

    Returns the result of ``recordlinkage.Compare.compute`` for those pairs.
    """
    # Comparison rules (independent of the data, so they are declared first).
    comparer = recordlinkage.Compare()
    comparer.string("name", "name", method="jarowinkler", threshold=THRESHOLD, label="name")
    comparer.exact("country", "country", label="country")

    # Indexing step: enumerate all (df_a row, df_b row) candidate pairs.
    pair_indexer = recordlinkage.Index()
    pair_indexer.full()
    all_pairs = pair_indexer.index(df_a, df_b)

    return comparer.compute(all_pairs, df_a, df_b)


def get_pairs(features, min_score=1.9):
    """Classify candidate pairs and return the index labels of the matches.

    A row of ``features`` counts as a match when the sum of its feature
    columns is strictly greater than ``min_score``.

    Parameters
    ----------
    features : pandas.DataFrame
        One row per candidate pair, one column per comparison feature.
    min_score : float, default 1.9
        Exclusive lower bound on the row-wise score sum. The default keeps
        the notebook's original cut-off.

    Returns
    -------
    list
        Index labels of the matching rows (tuples when ``features`` has a
        MultiIndex), in their original order.
    """
    # Classification step
    is_match = features.sum(axis=1) > min_score
    # Take the labels from the frame's own index instead of going through the
    # 'name' column, so the result does not depend on how features are labelled.
    return features[is_match].index.to_list()

def rename_columns(columns_a, columns_b):
    """Suffix two lists of column names so they can coexist in one frame.

    Each name in ``columns_a`` gets ``'_a'`` appended and each name in
    ``columns_b`` gets ``'_b'`` appended.

    Returns a 3-tuple: the suffixed A names, the suffixed B names, and the
    two lists concatenated (A first).
    """
    def _with_suffix(names, suffix):
        # Append the same suffix to every name, preserving order.
        return [name + suffix for name in names]

    suffixed_a = _with_suffix(columns_a, '_a')
    suffixed_b = _with_suffix(columns_b, '_b')
    return suffixed_a, suffixed_b, suffixed_a + suffixed_b

In [4]:
# Load dataset A from its JSON Lines file and preview the first rows.
df_a = pd.read_json(
    PATH_DS_A,
    lines=True,
    dtype=object,
    encoding='utf-8',
)
df_a.head(10)

Unnamed: 0,name,country,industry,founded
0,lacework,united states,cybersecurity,2015
1,tipalti,united states,fintech,2010
2,tempus,united states,health,2015
3,anduril,united states,artificial intelligence,2017
4,bolt,estonia,auto & transportation,2013
5,bytedance,china,artificial intelligence,2012
6,niantic,united states,mobile & telecommunications,2015
7,oyo roo,india,travel,2012
8,kavak,mexico,e-commerce & direct-to-consumer,2014
9,personio,germany,internet software & services,2015


In [5]:
# Load dataset B from its JSON Lines file and preview the first rows.
df_b = pd.read_json(
    PATH_DS_B,
    lines=True,
    dtype=object,
    encoding='utf-8',
)
df_b.head(10)

Unnamed: 0,name,market_cap,country,share_price,categories
0,apple,2351000000000,usa,147,"tech, software, dow jones, tech hardware, elec..."
1,bristol-myers squibb,172490000000,usa,81,"pharmaceuticals, biotech"
2,wells fargo,175050000000,usa,45,"banks, financial services"
3,nike,175540000000,usa,112,"sports goods, footwear, dow jones, clothing"
4,walt disney,177080000000,usa,99,"entertainment, dow jones"
5,abbott laboratories,188460000000,usa,108,medical devices
6,accenture,189250000000,ireland,300,professional services
7,t-mobile us,189620000000,usa,152,telecommunication
8,novartis,194410000000,switzerland,89,"pharmaceuticals, biotech"
9,toyota,199920000000,japan,146,"automakers, manufacturing"


In [6]:
# Score every (df_a row, df_b row) candidate pair and keep the ones classified as matches.
features = get_features(df_a, df_b)
pairs = get_pairs(features)
print(pairs)

# NOTE: from here on df_a and df_b are renamed and modified in place, so this cell
# cannot be re-run on its own -- re-run the two loading cells first.
columns_a, columns_b, columns_join = rename_columns(df_a.columns.values.tolist(), df_b.columns.values.tolist())

df_a.columns = columns_a
df_b.columns = columns_b
for c in columns_b:
    # Create the B-side columns with object dtype: they receive strings below, and
    # writing a string into a float (NaN) column is an incompatible-dtype assignment.
    df_a[c] = pd.Series(np.nan, index=df_a.index, dtype=object)

# Copy the B-side values of each matched pair onto the corresponding A row.
idx_b_to_drop = []
for idx_a, idx_b in pairs:
    idx_b_to_drop.append(idx_b)
    for c in columns_b:
        # pairs holds index labels, so look rows up by label (.loc) on both frames.
        df_a.loc[idx_a, c] = str(df_b.loc[idx_b, c])

# Remove all matched B rows in a single call. Labels are de-duplicated first so that
# a B row matched by several A rows is dropped once instead of raising KeyError.
df_b = df_b.drop(index=list(dict.fromkeys(idx_b_to_drop)))

# Matched + unmatched A rows first, then the B rows that found no partner.
joined_df = pd.concat([df_a, df_b], ignore_index=True, sort=False)
joined_df

[(308, 5836)]


Unnamed: 0,name_a,country_a,industry_a,founded_a,name_b,market_cap_b,country_b,share_price_b,categories_b
0,lacework,united states,cybersecurity,2015,,,,,
1,tipalti,united states,fintech,2010,,,,,
2,tempus,united states,health,2015,,,,,
3,anduril,united states,artificial intelligence,2017,,,,,
4,bolt,estonia,auto & transportation,2013,,,,,
...,...,...,...,...,...,...,...,...,...
7076,,,,,neogen,4240000000,usa,19,
7077,,,,,pnm resources,4240000000,usa,49,
7078,,,,,kinnevik,4250000000,sweden,15,investment
7079,,,,,topgolf callaway brands,4250000000,usa,23,sports goods


In [7]:
# Feature rows whose total score exceeds 1.9 (same cut-off as get_pairs), kept for inspection.
matches = features.loc[features.sum(axis=1).gt(1.9)]
matches

Unnamed: 0,Unnamed: 1,name,country
308,5836,1.0,1


In [8]:
# How many candidate pairs reach each total score (row-wise sum of the feature columns), highest score first.
features.sum(axis=1).value_counts().sort_index(ascending=False)

2.0          1
1.0      90933
0.0    6897011
dtype: int64

In [9]:
# Summary statistics of each feature column over all candidate pairs.
features.describe()

Unnamed: 0,name,country
count,6987945.0,6987945.0
mean,2.146554e-06,0.01301098
std,0.001465111,0.1133212
min,0.0,0.0
25%,0.0,0.0
50%,0.0,0.0
75%,0.0,0.0
max,1.0,1.0
