In [None]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

The following URL returns a JSON file with the university ranking already parsed:

In [None]:
# Root of the website; relative page links found in the ranking data are
# appended to it later in the notebook.
SITE = "https://www.topuniversities.com"
# Path of the pre-parsed ranking file (its body is read as JSON further down).
RANKING_PATH = "/sites/default/files/qs-rankings-data/357051.txt"
URL = SITE + RANKING_PATH

In [None]:
# Number of entries, taken from the start of the ranking list, kept for the
# rest of the notebook.
N = 10
# requests applies no timeout by default, so an unresponsive server would hang
# this cell forever; 30 seconds is a generous upper bound for one small file.
r = requests.get(URL, timeout=30)
# Fail here on an HTTP error instead of on a confusing JSON decode error below.
r.raise_for_status()
all_data = r.json()
print(list(all_data.keys()))
# Index directly: a payload without "data" then raises a clear KeyError rather
# than the TypeError produced by slicing the None returned by .get().
data = all_data["data"][:N]
print(list(data[0].keys()))

In [None]:
# Seconds to wait on each university page before giving up on it.
PAGE_TIMEOUT = 30

# One session for all pages so the underlying connections can be reused.
s = requests.Session()
# req_id is the entry's position in `data`; it is carried along so that each
# response can be matched back to its entry afterwards.
reqs = [(req_id, s.prepare_request(requests.Request('GET', SITE+entry["url"]))) for req_id, entry in enumerate(data)]
print(len(reqs), "requests to be sent.")

# Session.send() has no default timeout; without one a single stalled server
# would block the whole cell indefinitely.
resps = [(req_id, s.send(req, timeout=PAGE_TIMEOUT)) for req_id, req in reqs]

# Only plain 200 responses are parsed later; everything else is kept, with its
# response object, for inspection.
done = [(req_id, resp.text) for req_id, resp in resps if resp.status_code == 200]
failed = [(req_id, resp) for req_id, resp in resps if resp.status_code != 200]

print("%d done, %d failed." % (len(done), len(failed)))

In [None]:
import re
# Matches any single character that is not an ASCII digit; used to strip
# separators and other decoration from the numbers scraped off the pages.
non_digit = re.compile('[^0-9]')

# Names given, in page order, to the counts scraped for each university.
# Presumably faculty ("fac") and student ("stu") counts, total and
# international -- TODO confirm against the page layout.
NEW_COLS = (
    "fac_c_total",
    "fac_c_inter",
    "stu_c_total",
    "stu_c_inter",
)

from multiprocessing import Pool, cpu_count

def resp_to_counts(req):
    """Extract the numeric counts from one university page.

    Parameters
    ----------
    req : tuple
        ``(req_id, html)`` pair, where ``html`` is the page markup as text.

    Returns
    -------
    tuple
        ``(req_id, counts)``. ``counts`` maps the names in ``NEW_COLS`` to
        integers, pairing them positionally with the ``div.number`` elements
        found inside the ``view-academic-data-profile`` block. It is empty
        when the page has no such block.

    Raises
    ------
    ValueError
        If a ``number`` element contains no digit at all.
    """
    req_id, html = req
    page = BeautifulSoup(html, "html.parser")
    # html.parser does not invent a <body> for fragments, so fall back to the
    # document root instead of failing on `None.find`.
    root = page.body if page.body is not None else page
    profile = root.find("div", class_="view-academic-data-profile")
    if profile is None:
        # No academic profile on this page: report no counts rather than crash,
        # leaving the corresponding columns empty for this entry.
        return req_id, {}
    # get_text() also copes with nested markup inside the element, where
    # `.string` would be None. Everything but digits is stripped before int().
    counts = [
        int(non_digit.sub('', cell.get_text()))
        for cell in profile.find_all("div", class_="number")
    ]
    # NOTE(review): zip pairs names and numbers purely by position and stops at
    # the shorter sequence -- confirm that pages always list the counts in
    # NEW_COLS order.
    return req_id, dict(zip(NEW_COLS, counts))

# Pool starts worker *processes* (not threads). There is no point in starting
# more of them than there are pages to parse, and Pool rejects a size of 0.
# NOTE(review): with the "spawn" start method the workers must be able to
# import resp_to_counts; a function defined in a notebook cell may not be
# importable there -- confirm on the target platform.
n_workers = max(1, min(cpu_count(), len(done)))
print("Parsing responses using up to %d processes..." % n_workers)
with Pool(n_workers) as p:
    for req_id, counts in p.map(resp_to_counts, done):
        # req_id is the entry's index in `data`; merge the scraped counts into it.
        data[req_id].update(counts)

In [None]:
# Keep only the columns of interest, in display order, and give the two
# ranking fields shorter names. rename() returns a new frame, so no in-place
# mutation is needed.
uni = (
    pd.DataFrame(data, columns=["title", "rank_display", "country", "region"] + list(NEW_COLS) + ["url"])
    .rename(columns={"title": "name", "rank_display": "rank"})
)
# Convert the rank to a numerical type: keep the first run of digits found in
# the displayed rank. The pattern is a raw string so that `\d` is not treated
# as an (invalid) string escape.
uni["rank"] = uni["rank"].str.extract(r'(\d+)', expand=False).astype(int)
uni.head()

In [None]:
# Display summary statistics for the `uni` frame built above.
uni.describe()