In [1]:
import pickle

# NOTE(security): pickle.load can execute arbitrary code; only load files that
# were produced by a trusted source.
with open("site1_names.pkl", "rb") as names_file:
    all_names1 = pickle.load(names_file)
with open("site2_names.pkl", "rb") as names_file:
    all_names2 = pickle.load(names_file)

# Sanity check: each list should contain no duplicated name. Each site is
# checked against its own list.
print("site1 names uniques:", len(set(all_names1)) == len(all_names1))
print("site2 names uniques:", len(set(all_names2)) == len(all_names2))

site1 names uniques: True
site2 names uniques: True


In [2]:
import re
import numpy as np
from itertools import product
from scipy.optimize import linear_sum_assignment
from nltk.metrics import edit_distance
from multiprocessing import Pool, cpu_count

# N: number of leading names we actually want a mapping for.
# M: number of leading names of each site fed to the assignment problem.
N = 200
M = 1000

# The mapping is computed on larger sets (M >= N) so that fewer of the first N
# names end up without a suitable counterpart.
names1, names2 = all_names1[:M], all_names2[:M]

def col(name):
    """Return the edit distances between `name` and every entry of `names2`.

    The result is a 1-D array with one distance per element of `names2`, in
    the same order. It is mapped over `names1` to build the cost matrix, so
    each returned array becomes one row of that matrix.
    """
    distances = []
    for candidate in names2:
        distances.append(edit_distance(name, candidate))
    return np.array(distances)

n_workers = cpu_count()
print("Computing cost matrix using %d workers..." % n_workers, end="")
# Use the pool as a context manager so the worker processes are always torn
# down, even if the computation raises. `map` blocks until every row is done,
# so all results are available before the pool is terminated on exit.
with Pool(n_workers) as p:
    # costs[i, j] = edit distance between names1[i] and names2[j]
    costs = np.array(p.map(col, names1))
print("done")

Computing cost matrix using 8 workers...done


In [3]:
print("Computing optimal assigment...", end="")
# id_n1[k] (row in costs / index in names1) is paired with id_n2[k]
# (column in costs / index in names2).
id_n1, id_n2 = linear_sum_assignment(costs)

# Only the first N pairs of the assignment are kept for the final mapping.
first_rows, first_cols = id_n1[:N], id_n2[:N]
sol_costs = costs[first_rows, first_cols]
total_cost = sol_costs.sum()
print("Done: cost of solution = %d" % total_cost)

Computing optimal assigment...Done: cost of solution = 819


In [5]:
def cost_threshold(id1, id2, max_ratio=0.5):
    """Tell whether the pair (names1[id1], names2[id2]) is close enough to keep.

    A pair is accepted when its edit distance is strictly smaller than
    `max_ratio` times the length of the shorter of the two names.

    Parameters
    ----------
    id1 : int
        Index into `names1` (row of `costs`).
    id2 : int
        Index into `names2` (column of `costs`).
    max_ratio : float, optional
        Maximum tolerated distance as a fraction of the shorter name's length.

    Returns
    -------
    bool
        True when the pair passes the threshold.
    """
    # Read the distance straight from the cost matrix using both indices.
    # Indexing the per-pair solution costs by `id1` is only correct when the
    # k-th assigned row happens to be row k, which is not guaranteed in general.
    shortest = min(len(names1[id1]), len(names2[id2]))
    return costs[id1, id2] < shortest * max_ratio

# Map each of the first N assigned site-1 names to its site-2 counterpart,
# keeping only the pairs that pass the distance threshold.
name_map = {
    names1[id1]: names2[id2]
    for id1, id2 in zip(id_n1[:N], id_n2[:N])
    if cost_threshold(id1, id2)
}

# Write through a context manager so the file is flushed and closed.
with open("name_map.pkl", "wb") as map_file:
    pickle.dump(name_map, map_file)

print("Name mapping of size %d." % len(name_map))

Name mapping of size 177.


In [6]:
import pandas as pd
from IPython.display import display, HTML

s1 = pd.read_pickle("site1.pkl")
s2 = pd.read_pickle("site2.pkl")

# Pick the site-2 rows assigned to the first N site-1 rows and renumber them
# 0..N-1 so that the index-based join below pairs row k with its match.
# NOTE(review): assumes s2's index labels correspond to positions in
# all_names2 and that s1 is ordered like all_names1 - confirm.
s2 = s2.loc[id_n2[:N]].reset_index(drop=True)

s = (
    s1.join(s2, rsuffix="_2")
    .dropna()  # drop rows that have a missing value in any column
    .drop(["url", "url_2"], axis=1)
    .replace("Russian Federation", "Russia")
)

# Remove pairs whose countries disagree.
same_country = s["country"] == s["country_2"]
s = s[same_country].drop("country_2", axis=1)

# Show the pairs whose names are not strictly identical, to judge the quality
# of the matching.
names_differ = s["name"] != s["name_2"]
display(s[["name", "name_2"]][names_differ])

s = s.drop("name_2", axis=1)
s.head()

Unnamed: 0,name,name_2
0,Massachusetts Institute of Technology (MIT),Massachusetts Institute of Technology
3,California Institute of Technology (Caltech),California Institute of Technology
6,UCL (University College London),University College London
9,ETH Zurich - Swiss Federal Institute of Techno...,ETH Zurich – Swiss Federal Institute of Techno...
10,"Nanyang Technological University, Singapore (NTU)","Nanyang Technological University, Singapore"
11,Ecole Polytechnique Fédérale de Lausanne (EPFL),École Polytechnique Fédérale de Lausanne
14,National University of Singapore (NUS),National University of Singapore
19,The Australian National University,Australian National University
22,The University of Edinburgh,University of Edinburgh
23,King's College London,King’s College London


Unnamed: 0,name,rank,country,region,fac_c_inter,fac_c_total,stu_c_inter,stu_c_total,rank_2,pc_intl_students(%),student_staff_ratio
0,Massachusetts Institute of Technology (MIT),1,United States,North America,1679.0,2982.0,3717.0,11067.0,5.0,34.0,8.7
1,Stanford University,2,United States,North America,2042.0,4285.0,3611.0,15878.0,3.0,22.0,7.5
2,Harvard University,3,United States,North America,1311.0,4350.0,5266.0,22429.0,6.0,26.0,8.9
3,California Institute of Technology (Caltech),4,United States,North America,350.0,953.0,647.0,2255.0,3.0,27.0,6.5
4,University of Cambridge,5,United Kingdom,Europe,2278.0,5490.0,6699.0,18770.0,2.0,35.0,10.9


In [7]:
# Persist the merged and filtered frame `s` to disk as a pickle.
s.to_pickle("merged.pkl")

Cost of the solution (sum over the first N matched pairs) for different values of M:

N = 200, M=200 : 1119

N = 200, M=300 : 969

N = 200, M=400 : 875

N = 200, M=500 : 860

N = 200, M=800 : 822

N = 200, M=1000: 819