In [None]:
import requests as rq
import pandas as pd
import bs4
import re
import multiprocessing as mp
import numpy as np
import matplotlib.pyplot as plt

from itertools import product
from scipy.optimize import linear_sum_assignment
from nltk.metrics import edit_distance
from IPython.display import display, HTML

In [None]:
# Number of top-ranked entries kept from each ranking (used as data[:N]).
N = 200
# Number of entries per ranking whose names are extracted (used as data[:M]).
M = 400

# Site 1: base address and location of its ranking data file.
SITE1 = "https://www.topuniversities.com"
URL1 = "".join([SITE1, "/sites/default/files/qs-rankings-data/357051.txt"])

# Site 2: base address and location of its ranking data file.
SITE2 = "https://www.timeshighereducation.com"
URL2 = "".join([
    SITE2,
    "/sites/default/files/the_data_rankings/world_university_rankings_2018_limit0_369a9045a203e176392b9fb8f8c1cb2a.json",
])

In [None]:
def _fetch_ranking(url, timeout=30):
    """Download one ranking file and return the value of its "data" entry.

    A timeout keeps the cell from hanging forever on a stalled connection, and
    raise_for_status() reports HTTP errors directly instead of failing later
    while decoding the response body.
    """
    resp = rq.get(url, timeout=timeout)
    resp.raise_for_status()
    return resp.json().get("data")


data1, data2 = (_fetch_ranking(url) for url in (URL1, URL2))

# Show which fields each ranking provides for an entry.
display(sorted(data1[0]))
display(sorted(data2[0]))

# The name is stored under 'title' on site 1 and under 'name' on site 2.
names1 = [u.get('title') for u in data1[:M]]
names2 = [u.get('name') for u in data2[:M]]
print("Extracted %d and %d names" % (len(names1), len(names2)))

In [None]:
# Download the detail page of each of the first N entries of site 1.
# The session is used as a context manager so its pooled connections are
# released once all pages are fetched, and every request gets a timeout so a
# stalled connection cannot block the cell forever.
with rq.Session() as s:
    reqs = [(req_id, s.prepare_request(rq.Request('GET', SITE1+entry["url"])))
            for req_id, entry in enumerate(data1[:N])]
    print(len(reqs), "requests to be sent.")

    resps = [(req_id, s.send(req, timeout=30)) for req_id, req in reqs]

# Keep only the page text of successful responses; req_id is the index of the
# entry in data1, so results can be merged back later.
done = [(req_id, resp.text) for req_id, resp in resps if resp.status_code == 200]
failed = [(req_id, resp) for req_id, resp in resps if resp.status_code != 200]

print("%d done, %d failed." % (len(done), len(failed)))

# Matches every character that is not a decimal digit.
non_digit = re.compile('[^0-9]')

# Maps the "+"-joined CSS classes of a div to the column name used for the
# count found inside it.
class_to_labels = {"total+faculty": "fac_c_total",
                   "inter+faculty": "fac_c_inter",
                   "total+student": "stu_c_total",
                   "total+inter": "stu_c_inter"}


def resp_to_counts(req):
    """Extract the labelled counts from one downloaded detail page.

    Args:
        req: ``(req_id, html)`` pair, where ``html`` is the page source as text.

    Returns:
        ``(req_id, counts)`` where ``counts`` is a set of ``(label, value)``
        pairs and every label is a value of ``class_to_labels``. The set is
        empty when the page has no "view-academic-data-profile" block, and
        numbers without a recognised label or without digits are skipped, so a
        single malformed page cannot abort a whole batch of pages.
    """
    req_id, resp = req
    page = bs4.BeautifulSoup(resp, "html.parser")
    top = page.body.find("div", class_="view-academic-data-profile") if page.body else None
    if top is None:
        return req_id, set()

    def get_label(div):
        # Walk up from the number div until an ancestor's classes identify the
        # count, stopping at the profile block itself.
        while div is not None and div != top:
            # Tags without a class attribute yield None; treat that as "no classes".
            label = class_to_labels.get("+".join(div.get("class") or []))
            if label:
                return label
            div = div.parent
        return None

    fac_counts = set()
    for div in top.find_all("div", class_="number"):
        label = get_label(div)
        # get_text() also works when the div has nested children, where
        # div.string would be None.
        digits = re.sub(non_digit, '', div.get_text())
        if label is None or not digits:
            continue
        fac_counts.add((label, int(digits)))
    return req_id, fac_counts

# Parse the downloaded pages in parallel (mp.Pool runs worker processes) and
# merge the extracted counts into the matching data1 entries.
n_workers = mp.cpu_count()
print("Parsing responses using up to %d threads..." % n_workers, end="")
with mp.Pool(n_workers) as pool:
    parsed = pool.map(resp_to_counts, done)
for req_id, counts in parsed:
    data1[req_id].update(counts)
print("done")

In [None]:
# Columns taken directly from the site-1 ranking data, and columns added from
# the scraped detail pages.
basecol = ["title", "rank_display", "country", "region"]
addedcol = ["fac_c_inter", "fac_c_total", "stu_c_inter", "stu_c_total"]

uni_s1 = (
    pd.DataFrame(data1[:N], columns=basecol + addedcol)
    .rename(columns={"title": "name", "rank_display": "rank"})
)
# Convert the rank to a numerical type by keeping its first run of digits.
# The pattern is a raw string: "\d" in a normal string is an invalid escape
# sequence and triggers a warning on recent Python versions.
uni_s1["rank"] = uni_s1["rank"].str.extract(r'(\d+)', expand=False).astype(int)
uni_s1.head()

In [None]:
# Columns taken from the site-2 ranking data.
basecol = ["name", "rank", "location", "stats_pc_intl_students", "stats_student_staff_ratio"]

uni_s2 = (
    pd.DataFrame(data2[:N], columns=basecol)
    .rename(columns={"location": "country",
                     "stats_pc_intl_students": "pc_intl_students",
                     "stats_student_staff_ratio": "student_staff_ratio"})
)
# Raw-string patterns: "\d" in a normal string is an invalid escape sequence
# and triggers a warning on recent Python versions.
# Rank: keep the first run of digits and convert it to an integer.
uni_s2["rank"] = uni_s2["rank"].str.extract(r'(\d+)', expand=False).astype(int)
# International students: first run of digits divided by 100
# (presumably the source text is a percentage such as "34%" -- TODO confirm).
uni_s2["pc_intl_students"] = uni_s2["pc_intl_students"].str.extract(r'(\d+)', expand=False).astype(float) / 100
uni_s2.head()

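Cost of the optimal name matching (sum of edit distances, as printed by the matching cell below) for different values of M:

M=200 : 1119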

M=300 : 969

M=400 : 875

M=500 : 860

M=800 : 822

M=1000: 819

In [None]:
def col(name):
    """Return the edit distances between ``name`` and every entry of ``names2``.

    The result is a 1-D array with one distance per name in ``names2``, in
    the same order.
    """
    distances = [edit_distance(name, other) for other in names2]
    return np.array(distances)

n_workers = mp.cpu_count()
print("Computing cost matrix using %d workers..." % n_workers, end="")

# costs[i, j] is the edit distance between names1[i] and names2[j].
# The pool is used as a context manager so its worker processes are shut down
# as soon as the matrix is built instead of lingering for the rest of the session.
with mp.Pool(n_workers) as p:
    costs = np.array(p.map(col, names1))
print("done")

print("Computing optimal assigment...", end="")
# One-to-one assignment of names1 rows to names2 columns with minimal total cost.
id_n1, id_n2 = linear_sum_assignment(costs)
# Only the pairs for the first N rows contribute to the reported cost.
sol_costs = costs[id_n1[:N], id_n2[:N]]
print("Done: cost of solution = %d" % sol_costs.sum())


In [None]:
# Put next to each site-1 university (row i) its assigned site-2 university
# (label id_n2[i]). Assigned labels can point past the rows held in uni_s2:
# reindex() yields an all-NaN row for those, whereas .loc[] with missing
# labels raises a KeyError on current pandas versions.
matched_s2 = uni_s2.reindex(id_n2[:N]).reset_index(drop=True)
uni_m = uni_s1.join(matched_s2, rsuffix="_2")
# Remove rows with missing values, e.g. universities whose match is not in uni_s2.
uni_m = uni_m.dropna()

# Use the same country spelling on both sides before comparing them.
uni_m = uni_m.replace("Russian Federation", "Russia")
uni_m = uni_m[uni_m["country"] == uni_m["country_2"]].drop("country_2", axis=1)  # Remove unmatching countries


print("Merged dataset is of size: %s" % len(uni_m))
# Pairs whose names differ between the two sites: shows a good quality of matching.
display(uni_m.loc[uni_m["name"] != uni_m["name_2"], ["name", "name_2"]])

uni_m = uni_m.drop("name_2", axis=1)
uni_m.head()

`uni_s1` and `uni_s2` are the dataframes corresponding to the rankings from the first and second websites respectively. For the first website, we use the total number of students, the number of faculty members and the number of international students to compute the staff/student ratio and the proportion of international students of each university. The second website already provides these values, so we only invert its student/staff ratio to make it comparable. We then sort the dataframes according to each ratio to find the best universities with respect to each.

In [None]:
# Ratio computations for s1 only, because they are already available for s2.
# Column arithmetic replaces the row-by-row apply(): same values, computed at once.
uni_s1['staff_student_ratio'] = uni_s1['fac_c_total'] / uni_s1['stu_c_total']
uni_s1['pc_intl_students'] = uni_s1['stu_c_inter'] / uni_s1['stu_c_total']

# Transform the s2 ratio so that it is comparable with the data for s1:
# the student_staff_ratio column is inverted into a staff/student ratio.
uni_s2['staff_student_ratio'] = 1 / uni_s2['student_staff_ratio'].astype(float)


# Top entries of each ranking with respect to each ratio (highest first).
for ranking in (uni_s1, uni_s2):
    for ratio in ('staff_student_ratio', 'pc_intl_students'):
        display(ranking.sort_values(ratio, ascending=False).head())

According to website 1, the two best universities with respect to the staff/student ratio are Caltech and Yale. The two best universities with respect to the proportion of international students are the London School of Economics and Political Science (LSE) and Ecole Polytechnique Fédérale de Lausanne (EPFL).

According to website 2, the two best universities with respect to the staff/student ratio are Vanderbilt University and the University of Copenhagen. The two best universities with respect to the proportion of international students are the London School of Economics and Political Science (LSE) and the University of Luxembourg.

We now aggregate our results by country and region by grouping the data and computing the mean of the ratios for each group.

In [None]:
# Per-group aggregates of both ratios (pivot_table uses its default aggregation).
ratio_columns = ["staff_student_ratio", "pc_intl_students"]

uni_s1_countries = uni_s1.pivot_table(values=ratio_columns, index="country")
uni_s1_regions = uni_s1.pivot_table(values=ratio_columns, index="region")

uni_s2_countries = uni_s2.pivot_table(values=ratio_columns, index="country")

def title(s):
    """Show ``s`` as a level-1 HTML heading in the cell output."""
    heading = HTML("<H1>%s</H1>" % s)
    return display(heading)

# Sort settings for the last two tables. These names were used without ever
# being assigned, which raises a NameError on a fresh kernel. The values match
# the heading of that section and the descending order of the other tables.
sort_by = "pc_intl_students"
ascending = False

title("(c) Best countries according to faculty member to student ratio:")
display(uni_s1_countries[["staff_student_ratio"]].sort_values("staff_student_ratio", ascending=False).head())
title("(d) Best countries according to international students ratio:")
display(uni_s1_countries[["pc_intl_students"]].sort_values("pc_intl_students", ascending=False).head())

title("(d) Best regions according to international students ratio:")
display(uni_s1_regions.sort_values(sort_by, ascending=ascending).head())
display(uni_s2_countries.sort_values(sort_by, ascending=ascending).head())

In [None]:
def plot(title, df, figsize=(15, 5), legend=None, **kwargs):
    """Draw ``df`` as a bar chart, display the figure and close it.

    Args:
        title: Title of the chart.
        df: Data to plot; its index provides the x tick labels and its index
            name the x axis label.
        figsize: Size of the figure passed to ``plt.subplots``.
        legend: Optional legend labels; when given (and non-empty) they
            replace the legend drawn by the plot call.
        **kwargs: Extra keyword arguments forwarded to ``df.plot``.
    """
    label_size = 12
    fig, axes = plt.subplots(figsize=figsize)
    df.plot(ax=axes, kind='bar', title=title, legend=True, fontsize=label_size, **kwargs)

    # Axis labelling.
    axes.set_xlabel(df.index.name, fontsize=label_size)
    axes.set_ylabel("ratio", fontsize=label_size)
    axes.set_xticklabels(df.index)
    axes.tick_params(axis='x', which='major', pad=15)

    if legend:
        axes.legend(legend)

    # Show the figure explicitly, then close it.
    display(fig)
    plt.close(fig)

# Regions ordered from the highest to the lowest proportion of international
# students. The sort direction is spelled out because `ascending` is not
# defined anywhere before this cell. The columns are selected explicitly so
# that their order always matches the legend labels.
plot("University mean ratios by region according to site 1's ranking",
     uni_s1_regions[["pc_intl_students", "staff_student_ratio"]].sort_values("pc_intl_students", ascending=False),
     legend=['International Students ratio', 'Faculty/Student ratio'])

From the plot above, we notice that, according to site 1, the best region in terms of proportion of international students is Oceania, followed by Europe and then North America. In terms of faculty-to-student ratio, the best region is North America, followed by Asia and Europe.

In [None]:
# Countries ordered from the highest to the lowest value. The sort direction
# is spelled out because `ascending` is not defined anywhere before this cell.
plot("University Faculty/Student mean ratio by country according to site 1's ranking",
     uni_s1_countries["staff_student_ratio"].sort_values(ascending=False))

plot("University mean proportion of International Students by country according to site 1's ranking",
     uni_s1_countries["pc_intl_students"].sort_values(ascending=False))

According to the rankings of the first website, the best countries in terms of proportion of international students are the United Kingdom and Australia, while the best countries in terms of faculty-to-student ratio are Russia and Denmark.

In [None]:
# Same two charts for site 2. The titles now name the plotted quantity and the
# right site (both were copied from the site-1 international-students chart).
# The sort direction is spelled out because `ascending` is not defined
# anywhere before this cell.
plot("University Faculty/Student mean ratio by country according to site 2's ranking",
     uni_s2_countries["staff_student_ratio"].sort_values(ascending=False))

plot("University mean proportion of International Students by country according to site 2's ranking",
     uni_s2_countries["pc_intl_students"].sort_values(ascending=False))

According to the rankings of the second website, the best countries in terms of proportion of international students are Luxembourg and the United Kingdom, while the best countries in terms of faculty-to-student ratio are Denmark and Italy.