In [None]:
import requests as rq
import pandas as pd
import bs4
import re
import multiprocessing as mp
from IPython.display import display, HTML

In [None]:
# Number of leading ranking entries that the later cells slice off (data[:N])
# for page downloads and for building the data frames.
N = 5

# First source: topuniversities.com, whose ranking feed lives under
# "qs-rankings-data".
SITE1 = "https://www.topuniversities.com"
URL1 = "".join((SITE1, "/sites/default/files/qs-rankings-data/357051.txt"))

# Second source: timeshighereducation.com, 2018 world university rankings feed.
SITE2 = "https://www.timeshighereducation.com"
URL2 = "".join((SITE2, "/sites/default/files/the_data_rankings/world_university_rankings_2018_limit0_369a9045a203e176392b9fb8f8c1cb2a.json"))

In [None]:
def _fetch_ranking_data(url, timeout=30):
    """Download one ranking feed and return the value stored under its "data" key.

    A timeout is passed so a stalled server cannot hang the notebook forever,
    and HTTP error statuses raise immediately instead of surfacing later as a
    confusing JSON decoding error.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    response = rq.get(url, timeout=timeout)
    response.raise_for_status()
    return response.json().get("data")


# One list of per-university records for each ranking source.
data1, data2 = (_fetch_ranking_data(url) for url in (URL1, URL2))

# Show which fields each source provides (inspected on the first record).
display(sorted(data1[0]))
display(sorted(data2[0]))

names1, names2 = ([u.get("title") for u in data] for data in (data1, data2))
print("Extracted %d and %d names" % (len(names1), len(names2)))

In [None]:
# Download the detail page of each of the first N entries of data1 over one
# shared HTTP session. Every request is tagged with its index in data1 so the
# parsed result can later be merged back into the matching record.
# NOTE: the module is imported as `rq`; the bare name `requests` is undefined
# on a fresh kernel, so the session must be created through the alias.
s = rq.Session()
reqs = [
    (req_id, s.prepare_request(rq.Request('GET', SITE1 + entry["url"])))
    for req_id, entry in enumerate(data1[:N])
]
print(len(reqs), "requests to be sent.")

resps = [(req_id, s.send(req)) for req_id, req in reqs]

# Split on the HTTP status: successful responses keep only their body text,
# failed ones keep the whole response object for inspection.
done = [(req_id, resp.text) for req_id, resp in resps if resp.status_code == 200]
failed = [(req_id, resp) for req_id, resp in resps if resp.status_code != 200]

print("%d done, %d failed." % (len(done), len(failed)))

# Matches any single character that is not one of the digits 0-9; substituting
# it away leaves only the digits of a formatted number.
non_digit = re.compile('[^0-9]')

# Maps the "+"-joined class list of a div to the column name under which the
# number found below that div is stored.
class_to_labels = dict([
    ("total+faculty", "fac_c_total"),
    ("inter+faculty", "fac_c_inter"),
    ("total+student", "stu_c_total"),
    ("total+inter", "stu_c_inter"),
])


def resp_to_counts(req):
    """Extract the labelled head counts from one downloaded profile page.

    Parameters
    ----------
    req : tuple
        ``(req_id, html)`` pair; ``html`` is the page markup as text.

    Returns
    -------
    tuple
        ``(req_id, counts)`` where ``counts`` is a dict mapping a label from
        ``class_to_labels`` to the integer found in the matching "number" div.
        The dict is empty when the page has no "view-academic-data-profile"
        section. It can be passed straight to ``dict.update``.
    """
    req_id, resp = req
    page = bs4.BeautifulSoup(resp, "html.parser")
    # page.body is None for markup without a <body>; search the whole document then.
    top = (page.body or page).find("div", class_="view-academic-data-profile")
    if top is None:
        # Page without the data section: report nothing instead of crashing.
        return req_id, {}

    def get_label(div):
        """Return the label of the nearest ancestor-or-self with a known class list."""
        # Walk upwards but never past `top`. Identity is used because tag
        # equality compares content rather than position in the tree.
        while div is not None and div is not top:
            # Tags without a class attribute yield None from .get("class").
            label = class_to_labels.get("+".join(div.get("class") or []))
            if label:
                return label
            div = div.parent
        return None

    counts = {}
    for div in top.find_all("div", class_="number"):
        label = get_label(div)
        # get_text() also works when the div wraps its number in nested markup,
        # where .string would be None.
        digits = non_digit.sub('', div.get_text())
        if label is None or not digits:
            # Unlabelled number or no digits at all: nothing usable to record.
            continue
        counts[label] = int(digits)
    return req_id, counts

# Parse every successfully downloaded page in a pool of worker processes (one
# per CPU reported by multiprocessing) and merge the extracted counts into the
# data1 record that has the same index as the request.
n_workers = mp.cpu_count()
print("Parsing responses using up to %d threads..." % n_workers, end="")
with mp.Pool(n_workers) as p:
    parsed = p.map(resp_to_counts, done)
    for req_id, counts in parsed:
        data1[req_id].update(counts)
print("done")

In [None]:
# Fields taken as-is from the records of the first ranking source.
basecol = ["title", "rank_display", "country", "region"]
# Head-count fields merged into the records from the parsed profile pages
# (these are the values of class_to_labels).
addedcol = ["fac_c_inter", "fac_c_total", "stu_c_inter", "stu_c_total"]

# Build the frame from the first N records; fields missing from a record
# become NaN. rename() returns a new frame, so no in-place mutation is needed.
uni_s1 = (
    pd.DataFrame(data1[:N], columns=basecol + addedcol)
    .rename(columns={"title": "name", "rank_display": "rank"})
)
# Convert the rank to a numerical type: keep the first run of digits of the
# display string. The pattern is a raw string so that \d is not treated as an
# (invalid) string escape sequence.
uni_s1["rank"] = uni_s1["rank"].str.extract(r'(\d+)', expand=False).astype(int)
uni_s1.head()

In [None]:
# Fields kept from the records of the second ranking source.
basecol = ["name", "rank", "location", "stats_pc_intl_students", "stats_student_staff_ratio", "url"]

# Build the frame from the first N records and shorten the column names.
# rename() returns a new frame, so no in-place mutation is needed.
uni_s2 = (
    pd.DataFrame(data2[:N], columns=basecol)
    .rename(columns={
        "location": "country",
        "stats_pc_intl_students": "pc_intl_students",
        "stats_student_staff_ratio": "student_staff_ratio",
    })
)
# Raw-string patterns so that \d is not treated as an (invalid) string escape.
# Rank: keep the first run of digits of the string and store it as an integer.
uni_s2["rank"] = uni_s2["rank"].str.extract(r'(\d+)', expand=False).astype(int)
# International-student share: first run of digits, divided by 100 to give a
# fraction. Only that first run is captured, so any decimal part is dropped.
uni_s2["pc_intl_students"] = uni_s2["pc_intl_students"].str.extract(r'(\d+)', expand=False).astype(float) / 100
uni_s2.head()