In [2]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

The following URL returns a JSON file with the university ranking already parsed:

In [3]:
# Base address of the site; relative page paths are appended to it.
SITE = "https://www.topuniversities.com"
# Location of the pre-parsed ranking data (served as JSON).
URL = "%s/sites/default/files/qs-rankings-data/357051.txt" % SITE

In [4]:
# Number of entries from the top of the ranking to keep.
N = 10
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors here instead of as a confusing JSON
# decoding failure on an error page.
r = requests.get(URL, timeout=30)
r.raise_for_status()
all_data = r.json()
print(list(all_data.keys()))
# Index directly so that a missing "data" key fails with a clear KeyError
# rather than a TypeError from slicing None.
data = all_data["data"][:N]
print(list(data[0].keys()))

['data']
['guide', 'nid', 'score', 'core_id', 'url', 'country', 'cc', 'title', 'stars', 'rank_display', 'logo', 'region']


In [5]:
# Download the profile page of every selected entry. Each request is tagged
# with its index in `data` so the response can be matched back later.
# The session is used as a context manager so that its pooled connections
# are released once all pages have been fetched.
with requests.Session() as s:
    reqs = [(req_id, s.prepare_request(requests.Request('GET', SITE+entry["url"]))) for req_id, entry in enumerate(data)]
    print(len(reqs), "requests to be sent.")

    # requests applies no timeout by default, so pass one explicitly to avoid
    # blocking indefinitely on an unresponsive server.
    resps = [(req_id, s.send(req, timeout=30)) for req_id, req in reqs]

# Split by HTTP status: keep the page text for successes and the full response
# object for failures so they can be inspected.
done = [(req_id, resp.text) for req_id, resp in resps if resp.status_code == 200]
failed = [(req_id, resp) for req_id, resp in resps if resp.status_code != 200]

print("%d done, %d failed." % (len(done), len(failed)))

10 requests to be sent.
10 done, 0 failed.


In [6]:
import re
# Matches any single character that is not an ASCII digit; used to strip
# everything but the digits from scraped number strings.
non_digit = re.compile('[^0-9]')

# Keys under which the scraped counts are stored on each record.
# NOTE(review): presumably total / international faculty and student counts,
# in the order they appear on the page - confirm against the page layout.
NEW_COLS = (
    "fac_c_total",
    "fac_c_inter",
    "stu_c_total",
    "stu_c_inter",
)

from multiprocessing import Pool, cpu_count

def resp_to_counts(req):
    """Extract the numeric counts from one downloaded profile page.

    Parameters
    ----------
    req : tuple
        A ``(req_id, html)`` pair, where ``html`` is the page source as text.

    Returns
    -------
    tuple
        ``(req_id, counts)`` where ``counts`` maps the names in ``NEW_COLS``
        to the integers found on the page, in document order. If the page
        holds fewer numbers than ``NEW_COLS`` has names, the trailing names
        are left out (``zip`` stops at the shorter input).

    Raises
    ------
    ValueError
        If the page has no ``view-academic-data-profile`` block, or if one of
        its ``number`` elements contains no digits.
    """
    req_id, html = req
    page = BeautifulSoup(html, "html.parser")
    # Search the whole document: html.parser does not synthesise a <body>, so
    # going through page.body would fail on pages without that tag.
    div = page.find("div", class_="view-academic-data-profile")
    if div is None:
        raise ValueError(
            "request %r: no 'view-academic-data-profile' block found in the page" % (req_id,)
        )
    # get_text() is used instead of .string because .string is None as soon
    # as the element wraps its number in nested markup.
    counts = [
        int(non_digit.sub('', tot.get_text()))
        for tot in div.find_all("div", class_="number")
    ]
    return req_id, dict(zip(NEW_COLS, counts))

# NOTE: multiprocessing.Pool starts worker *processes* (not threads); the
# pool is sized to the number of CPUs reported by cpu_count().
n_workers = cpu_count()
print("Parsing responses using up to %d threads..." % n_workers)
with Pool(n_workers) as pool:
    # Each result carries its request id, which is also the index of the
    # matching record in `data`.
    parsed = pool.map(resp_to_counts, done)
    for req_id, counts in parsed:
        data[req_id].update(counts)

Parsing responses using up to 4 threads...


In [7]:
# Build the table from the scraped records, keeping only the columns of
# interest. A method chain is used instead of inplace=True so the frame is
# produced in a single expression.
uni = (
    pd.DataFrame(data, columns=["title", "rank_display", "country", "region"]+list(NEW_COLS)+["url"])
    .rename(columns={"title": "name", "rank_display": "rank"})
    # Convert the rank to a numerical type. The pattern is a raw string so
    # that \d reaches the regex engine without an invalid-escape warning.
    .assign(rank=lambda df: df["rank"].str.extract(r'(\d+)', expand=False).astype(int))
)
uni.head()

Unnamed: 0,name,rank,country,region,fac_c_total,fac_c_inter,stu_c_total,stu_c_inter,url
0,Massachusetts Institute of Technology (MIT),1,United States,North America,2982,1679,11067,3717,/universities/massachusetts-institute-technolo...
1,Stanford University,2,United States,North America,4285,2042,15878,3611,/universities/stanford-university
2,Harvard University,3,United States,North America,4350,1311,22429,5266,/universities/harvard-university
3,California Institute of Technology (Caltech),4,United States,North America,953,350,2255,647,/universities/california-institute-technology-...
4,University of Cambridge,5,United Kingdom,Europe,5490,2278,18770,6699,/universities/university-cambridge


In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
uni.describe()

Unnamed: 0,rank,fac_c_total,fac_c_inter,stu_c_total,stu_c_inter
count,10.0,10.0,10.0,10.0,10.0
mean,5.5,4001.1,1777.0,17066.1,6183.5
std,3.02765,1841.976622,816.462151,7547.545745,3906.429238
min,1.0,953.0,350.0,2255.0,647.0
25%,3.25,2603.25,1403.0,14137.25,3637.5
50%,5.5,4107.5,1964.0,17430.0,5982.5
75%,7.75,5205.0,2226.25,19791.25,7510.5
max,10.0,6750.0,2964.0,31080.0,14854.0
