In [8]:
import pickle

# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# only load these files if they come from a trusted source.
# Context managers make sure the file handles are closed after loading.
with open("site1_names.pkl", "rb") as f:
    all_names1 = pickle.load(f)
with open("site2_names.pkl", "rb") as f:
    all_names2 = pickle.load(f)

# A list has only unique entries when turning it into a set keeps its length.
# Each site is checked against its own list.
print("site1 names uniques:", len(set(all_names1)) == len(all_names1))
print("site2 names uniques:", len(set(all_names2)) == len(all_names2))

site1 names uniques: True
site2 names uniques: True


In [2]:
import re
import numpy as np
from itertools import product
from scipy.optimize import linear_sum_assignment
from nltk.metrics import edit_distance
from multiprocessing import Pool, cpu_count

N = 200   # number of leading assigned pairs that are kept downstream
M = 1000  # number of leading names per site used to build the cost matrix

# The assignment is solved over a larger set (the first M names of each site)
# than the N names that are kept, to reduce the number of unmatchable names
# among the first N.
names1, names2 = all_names1[:M], all_names2[:M]

def col(name):
    """Return the edit distances from `name` to every entry of `names2`.

    The result is a NumPy array with one distance per element of the
    module-level `names2`, in the same order. Mapping this function over
    `names1` yields one row of the cost matrix per name.
    """
    distances = []
    for candidate in names2:
        distances.append(edit_distance(name, candidate))
    return np.array(distances)

# Build the cost matrix in parallel: row i holds the edit distances from
# names1[i] to every name in names2 (one call to col() per row).
n_workers = cpu_count()
print("Computing cost matrix using %d workers..." % n_workers, end="")
# map() blocks until every row is computed, so it is safe to let the context
# manager shut the worker processes down on exit instead of leaking them.
with Pool(n_workers) as p:
    costs = np.array(p.map(col, names1))
print("done")

In [3]:
print("Computing optimal assigment...", end="")
# Minimum-total-cost one-to-one pairing between rows (names1) and columns
# (names2) of the cost matrix.
id_n1, id_n2 = linear_sum_assignment(costs)
# Only the costs of the first N assigned pairs are kept and reported.
kept_pairs = (id_n1[:N], id_n2[:N])
sol_costs = costs[kept_pairs]
print("Done: cost of solution = %d" % sol_costs.sum())

In [5]:
def cost_threshold(id1, id2, max_ratio=0.5):
    """Tell whether the pair (names1[id1], names2[id2]) is a close enough match.

    A pair is accepted when its edit distance is strictly smaller than
    `max_ratio` times the length of the shorter of the two names.

    Parameters:
        id1: index into `names1` (row of the cost matrix).
        id2: index into `names2` (column of the cost matrix).
        max_ratio: maximum allowed distance as a fraction of the shorter
            name's length (defaults to 0.5).

    Returns:
        A boolean-like value: True when the pair passes the threshold.
    """
    shortest = min(len(names1[id1]), len(names2[id2]))
    # Read the cost by (row, column) from the full matrix. Indexing the
    # positional `sol_costs` array with a row index is only correct while the
    # assigned row indices happen to equal their positions in the solution.
    return costs[id1, id2] < shortest * max_ratio

# Map each site 1 name to its assigned site 2 name, keeping only those of the
# first N assigned pairs that pass cost_threshold().
name_map = {
    names1[id1]: names2[id2]
    for id1, id2 in zip(id_n1[:N], id_n2[:N])
    if cost_threshold(id1, id2)
}

# Use a context manager so the file is flushed and closed once written.
with open("name_map.pkl", "wb") as f:
    pickle.dump(name_map, f)

print("Name mapping of size %d." % len(name_map))

In [6]:
import pandas as pd
from IPython.display import display, HTML

s1 = pd.read_pickle("site1.pkl")
s2 = pd.read_pickle("site2.pkl")

# Pick the site 2 rows chosen by the first N assignments, in assignment order,
# and renumber them 0..len-1 so that they line up with site 1 by index.
s2 = s2.loc[id_n2[:N]].reset_index(drop=True)

# Index-aligned join; site 2 columns whose names clash get the "_2" suffix.
# Rows with any missing value are then dropped
# (presumably the site 1 rows left without a site 2 partner - TODO confirm).
s = s1.join(s2, rsuffix="_2").dropna()
s = s.drop(columns=["url", "url_2"])

# Harmonise the country spelling, then keep only rows where both sites agree
# on the country.
s = s.replace("Russian Federation", "Russia")
same_country = s["country"] == s["country_2"]
s = s[same_country].drop(columns="country_2")

# Show the pairs whose names are not strictly identical, to eyeball the
# quality of the matching.
names_differ = s["name"] != s["name_2"]
display(s.loc[names_differ, ["name", "name_2"]])

s = s.drop(columns="name_2")
s.head()

In [7]:
# Persist the merged table `s` to disk as a pickle file.
s.to_pickle("merged.pkl")

M=200 : 1119

M=300 : 969

M=400 : 875

M=500 : 860

M=800 : 822

M=1000: 819