In [None]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

The following URL returns a JSON file with the university ranking already parsed:

In [None]:
# Root of the website; the relative profile links found in the ranking
# entries are appended to it when the individual pages are requested.
SITE = "https://www.topuniversities.com"
# Location of the ranking data, which is served as JSON.
URL = f"{SITE}/sites/default/files/qs-rankings-data/357051.txt"

In [None]:
# Number of ranking entries (taken from the start of the list) whose
# profile pages are fetched and parsed further down.
N = 200

# Download the ranking. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() turns an HTTP error response
# into an explicit exception instead of a confusing JSON decoding failure.
r = requests.get(URL, timeout=30)
r.raise_for_status()
all_data = r.json()
print(list(all_data.keys()))

# Keep only the first N entries for the detailed scrape.
data = all_data.get("data")[:N]
print(list(data[0].keys()))

# The titles are collected from the complete ranking, not just the first N entries.
names = [u.get("title") for u in all_data.get("data")]
print("Extracted %d names" % len(names))

In [None]:
# Upper bound, in seconds, on how long a server may stay silent before the
# download of one page is abandoned with an exception.
REQUEST_TIMEOUT = 30

# Using the session as a context manager releases its pooled connections as
# soon as every page has been fetched.
with requests.Session() as s:
    # Pair each prepared GET request with the index of its entry in `data`,
    # so the parsed results can be written back to the right record later.
    reqs = [(req_id, s.prepare_request(requests.Request('GET', SITE+entry["url"]))) for req_id, entry in enumerate(data)]
    print(len(reqs), "requests to be sent.")

    # The pages are fetched one after another; without a timeout a single
    # unresponsive server would block the whole run indefinitely.
    resps = [(req_id, s.send(req, timeout=REQUEST_TIMEOUT)) for req_id, req in reqs]

# Successful pages keep only their text; the others keep the whole response
# object so the failure can be inspected.
done = [(req_id, resp.text) for req_id, resp in resps if resp.status_code == 200]
failed = [(req_id, resp) for req_id, resp in resps if resp.status_code != 200]

print("%d done, %d failed." % (len(done), len(failed)))

In [None]:
import re
from multiprocessing import Pool, cpu_count

# Matches any single character that is not an ASCII digit; used to strip
# everything but the digits from a displayed number before converting it.
non_digit = re.compile('[^0-9]')

# Maps the "+"-joined CSS class names of a statistics <div> to the key under
# which the corresponding count is stored in each university record.
class_to_labels = {
    "total+faculty": "fac_c_total",
    "inter+faculty": "fac_c_inter",
    "total+student": "stu_c_total",
    "total+inter": "stu_c_inter",
}


def resp_to_counts(req):
    """Extract the labelled counts from one university profile page.

    Parameters
    ----------
    req : tuple
        ``(req_id, html)`` pair, where ``html`` is the markup of the page as
        text and ``req_id`` is passed through unchanged.

    Returns
    -------
    tuple
        ``(req_id, counts)``, where ``counts`` is a set of ``(label, value)``
        pairs. Each label is a value of ``class_to_labels`` and each value is
        an ``int``. The set is empty when the page has no
        ``view-academic-data-profile`` section. Numbers that cannot be
        labelled or contain no digits are skipped.
    """
    req_id, resp = req
    page = BeautifulSoup(resp, "html.parser")
    body = page.body
    top = body.find("div", class_="view-academic-data-profile") if body is not None else None
    if top is None:
        # An unexpected page layout must not abort the parsing of every other page.
        return req_id, set()

    def get_label(div):
        """Return the label of the closest known ancestor-or-self below ``top``, else None."""
        node = div
        # Identity check on purpose: ``==`` on bs4 tags compares name,
        # attributes and contents, not whether it is the same node.
        while node is not None and node is not top:
            # Elements without a class attribute yield None; treat as "no classes".
            label = class_to_labels.get("+".join(node.get("class") or []))
            if label:
                return label
            node = node.parent
        return None

    fac_counts = set()
    for div in top.find_all("div", class_="number"):
        label = get_label(div)
        text = div.string  # None when the div does not hold exactly one string
        if label is None or text is None:
            continue
        digits = non_digit.sub('', text)
        if digits:
            fac_counts.add((label, int(digits)))
    return req_id, fac_counts

# multiprocessing.Pool starts worker processes (not threads); one per CPU is used here.
n_workers = cpu_count()
print("Parsing responses using up to %d processes..." % n_workers)
with Pool(n_workers) as p:
    # Every result carries its req_id, which is the index of the matching
    # record in `data`; the parsed counts are merged into that record.
    for req_id, counts in p.map(resp_to_counts, done):
        data[req_id].update(counts)

In [None]:
# Columns taken directly from the ranking entries.
basecol = ["title", "rank_display", "country", "region"]
# Columns filled in from the parsed profile pages.
addedcol = ["fac_c_inter", "fac_c_total", "stu_c_inter", "stu_c_total"]

# Build the table and rename in one chained expression, so re-running the
# cell always starts from `data` instead of mutating an existing frame.
uni = (
    pd.DataFrame(data, columns=basecol + addedcol + ["url"])
    .rename(columns={"title": "name", "rank_display": "rank"})
)
# Convert the rank to a numerical type: keep the first run of digits in the
# displayed rank. The pattern is a raw string because "\d" is an invalid
# escape sequence in a normal string literal.
uni["rank"] = uni["rank"].str.extract(r'(\d+)', expand=False).astype(int)
uni.head()

In [None]:

import pickle

# Persist the table of universities.
uni.to_pickle("site1.pkl")
# Persist the list of names; the context manager guarantees the file is
# flushed and closed even if pickling fails.
with open('site1_names.pkl', 'wb') as names_file:
    pickle.dump(names, names_file)