# University Ranking Tracker
This the primary notebook for scraping ranking data from QS, ARWU and TIMES world university and subject rankings.
<br>
**Author: Elliott Zhu**

## Introduction

This note book creates streamlined process to easily access and scrape university ranking data across three major ranking
systems: Times Higher Education, ARWU and QS. It fully incorporates the dynamic web loading structure: Asynchronous JavaScript
 and XML (AJAX) to extract and compose the published data on corresponding websites. It further provides the accurate computed ranking
 positions given the published ranking methodologies.


## Define Utility Function
We use these functions to clean up html tags and get html tables.

In [1]:
import json
import requests
import urllib
from datetime import datetime
from html.parser import HTMLParser
from io import StringIO

import numpy as np
import pandas as pd
from IPython.display import display  # used to print out pretty pandas dataframes
from bs4 import BeautifulSoup
from tqdm import tqdm

def tableDataText(table):
    """Parses a html segment started with tag <table> followed
    by multiple <tr> (table rows) and inner <td> (table data) tags.
    It returns a list of rows with inner columns.
    Accepts only one <th> (table header/data) in the first row.
    """
    def rowgetDataText(tr, coltag='td'): # td (data) or th (header)
        return [td.get_text(strip=True) for td in tr.find_all(coltag)]
    rows = []
    trs = table.find_all('tr')
    headerow = rowgetDataText(trs[0], 'th')
    if headerow: # if there is a header row include first
        rows.append(headerow)
        trs = trs[1:]
    for tr in trs: # for every table row
        rows.append(rowgetDataText(tr, 'td') ) # data row
    return rows

class MLStripper(HTMLParser):
    def __init__(self):
        super().__init__()
        self.reset()
        self.strict = False
        self.convert_charrefs= True
        self.text = StringIO()
    def handle_data(self, d):
        self.text.write(d)
    def get_data(self):
        return self.text.getvalue()

def strip_tags(html):
    try:
        s = MLStripper()
        s.feed(html)
        return s.get_data()
    except:
        return ''

In [None]:
#Load indicator weightings
weightings = np.load('weightings.npy', allow_pickle=True).tolist()
print('\n === Times Higher Education Subjects (Top 5)  ===')
display(weightings['subject_times'].head(5))

In [None]:
print('\n === Times Higher Education Weightings (Top 5) ===')
display(weightings['weightings_times'].head(5))

In [None]:
print('\n === ARWU Subjects (Top 5) ===')
display(weightings['subject_arwu'].head(5))

In [None]:
print('\n === ARWU Weightings (Top 5) ===')
display(weightings['weightings_arwu'].head(5))

In [None]:
print('\n === QS Weightings (Top 5) ===')
display(weightings['weightings_qs'].head(5))

# Times Higher Education

The AJAX logic of [Times Higher Education](http://timeshighereducation.com) is simple and consistent across both the world university ranking and subject
rankings. The current logic works for data ranging from 2011 to 2021. Apply the indicator weighting according to
[Times Higher Education Methodology](https://www.timeshighereducation.com/world-university-rankings/world-university-rankings-2020-methodology)

In [None]:
#Define times scraping function
def fetch_TIMES(rank=None, year=None, weightings=weightings['weightings_times'],page =weightings['subject_times'] ):
        if 'Subject' in rank:
            page = page[page.Subject == rank].Page.reset_index(drop=True)[0]
            url1 = "https://www.timeshighereducation.com/world-university-rankings/" + str(
                year) + "/subject-ranking/" + page
        else:
            url1 = "https://www.timeshighereducation.com/world-university-rankings/" + str(year) + "/world-ranking"
        r = requests.get(url1, headers={'User-Agent': 'Mozilla/5.0'})
        soup = BeautifulSoup(r.content, "lxml")
        scripts = soup.find_all('script')
        for script in scripts:
            try:
                if 'jQuery.extend' in script.contents[0]:
                    jsonStr = script.contents[0].split('jQuery.extend(Drupal.settings,')[1]
                    jsonStr = jsonStr.rsplit(');', 1)[0]
                    jsonObj = json.loads(jsonStr)
                    url1 = jsonObj['the_data_rankings']['#datatable-1']['ajax']['url']
            except:
                pass

        req = urllib.request.Request(url1, headers={'User-Agent': 'Mozilla/5.0'})
        with urllib.request.urlopen(req) as url:
            data = json.loads(url.read().decode())
        df = pd.DataFrame(data['data'])
        try:
            df = df[['rank', 'name', 'scores_overall',
                     'scores_teaching', 'scores_research', 'scores_citations',
                     'scores_industry_income', 'scores_international_outlook',
                     'location', 'stats_number_students', 'stats_student_staff_ratio',
                     'stats_pc_intl_students', 'stats_female_male_ratio']]
            df.columns = ['Rank', 'University', 'TotalScore',
                          'Teaching', 'Research', 'Citation',
                          'Industry Income', 'International Outlook',
                          'Country', 'Student Count', 'Faculty/Student',
                          'International Student%', 'Female/Male']
        except:
            df = df[['rank', 'name', 'scores_overall',
                     'scores_teaching', 'scores_research', 'scores_citations',
                     'scores_industry_income', 'scores_international_outlook',
                     'location']]
            df.columns = ['Rank', 'University', 'TotalScore',
                          'Teaching', 'Research', 'Citation',
                          'Industry Income', 'International Outlook',
                          'Country']

        weightings = weightings[weightings.Subjects == rank].reset_index(drop=True)

        df = df.replace('-', 0, regex=True)

        df[['Teaching', 'Research', 'Citation', 'Industry Income', 'International Outlook']] = \
            df[['Teaching', 'Research', 'Citation', 'Industry Income', 'International Outlook']].astype('float32')

        df['Calculated Score'] = np.sum(
            np.multiply(df[['Teaching', 'Research', 'Citation', 'Industry Income', 'International Outlook']],
                        weightings[['Teaching', 'Research', 'Citation', 'Industry Income', 'International Outlook']]),
            1)

        df['Calculated Rank'] = df['Calculated Score'].rank(method='min', ascending=False)
        df['Subject'] = rank.replace('.', ' ')
        try:
            df['Subject'] = df['Subject'].str.replace('Subject', '')
        except:
            pass

        df['National Rank'] = df.groupby('Country')['Calculated Score'].rank(method='min', ascending=False)
        df['Top 100'] = df['Calculated Rank'] <= 100
        df['Year'] = year

        return df

def TIMES_rank(years=[2021], ranks=['World.University.Rankings','Classics.Ancient.History']):

    df_TIMES = pd.DataFrame()
    for year in tqdm(years):
        for rank in ranks:
            try:
                df_TIMES = pd.concat([df_TIMES,fetch_TIMES(rank=rank, year=year)], axis=0, ignore_index=True)
            except:
                print(year,rank)
    df_TIMES['Schema'] = 'TIMES'

    return df_TIMES

years = [2021]
ranks = ['World.University.Rankings']
df_TIMES = TIMES_rank(years, ranks)
print('\n === Times Higher Education ===')
display(df_TIMES.head(5))

In [None]:
print('\n === Times Higher Education Law ===')

years = [2020]
ranks = ['Subject.Law']
df_TIMES = TIMES_rank(years, ranks)
display(df_TIMES.head(5))


## ARWU World University Rankings
We extract [ARWU](http://www.shanghairanking.com) or the Shanghai Ranking via the table tag, as all required data are available these tags.
The tricky part would be to apply the weightings to each subject and adjustment to non-comprehensive university such as LSE.
Please refer to the [methodology](http://www.shanghairanking.com/ARWU-Methodology-2020.html).

In [34]:
#Define ARWU subject and worling rank scraping function
def fetch_ARWU_subject(ARWU_subject, subject= "Mathematics", year = "current"):
    """Scrape ARWU Subject Page
    """
    if (year == "current"):
        url1 = "http://www.shanghairanking.com/Shanghairanking-Subject-Rankings/"
    else:
        url1 = "http://www.shanghairanking.com/Shanghairanking-Subject"+ "-Rankings-"+ str(year)+"/"
    subject_page = ARWU_subject[ARWU_subject.Subject==subject].Page.reset_index(drop=True)[0]
    url_subject = url1+subject_page
    r = requests.get(url_subject)
    soup = BeautifulSoup(r.content, "html.parser")
    table = soup.find(lambda tag: tag.name == 'table' and tag.has_attr('id') and tag['id'] == "UniversityRanking")
    list_table = tableDataText(table)
    df = pd.DataFrame(list_table[1:])
    if  year == "current":
        df = df.drop(columns = 3)
    df.columns = ["Rank", "University", "Country", "TotalScore", "PUB",
                        "CNCI", "IC", "TOP", "AWARD"]
    # Retrieve countries
    countries = soup.find(lambda tag: tag.name == 'table' and tag.has_attr('id') and tag['id'] == "UniversityRanking")
    countries = [td['title'] for td in countries.find_all('img')]
    df.Country = countries

    weightings = pd.read_csv('arwuweighting.csv')
    weightings = weightings[weightings.Ranktype==subject].reset_index(drop=True)
    df = df.replace('NA', 0, regex=True)
    df[["PUB", "CNCI", "IC", "TOP", "AWARD"]] =  df[["PUB","CNCI", "IC", "TOP", "AWARD"]].astype('float32')
    df['Calculated Score'] = np.sum(np.multiply(df[["PUB","CNCI", "IC", "TOP", "AWARD"]],
                             weightings[["PUB",  "Normalized Citation",  "IC",  "Number Top10",  "Award"]]/100),1)

    df['Calculated Rank'] = df['Calculated Score'].rank(method ='min',ascending = False)
    df['Subject'] = subject
    df['National Rank'] = df.groupby('Country')['Calculated Score'].rank(method ='min',ascending = False)
    df['Top 100'] = df['Calculated Rank'] <= 100
    if year == "current":
        df['Year'] = datetime.now().date().year
    else:
        df['Year'] = year
    return df

def fetch_ARWU(year = None):
    """Scrape ARWU World University Ranking Page
    """
    url1 = "http://www.shanghairanking.com/ARWU"+ str(year) + ".html"

    r = requests.get(url1)
    soup = BeautifulSoup(r.content, "html.parser")
    table = soup.find(lambda tag: tag.name == 'table' and tag.has_attr('id') and tag['id'] == "UniversityRanking")
    list_table = tableDataText(table)
    df = pd.DataFrame(list_table[1:])
    df.columns = ["Rank", "University", "Country", "National Rank",
		"TotalScore", "Alumni", "Award", "HiCi", "N&S", "PUB",
		"PCP"]
    # Retrieve countries
    countries = soup.find(lambda tag: tag.name == 'table' and tag.has_attr('id') and tag['id'] == "UniversityRanking")
    countries = [td['src'] for td in countries.find_all('img')]
    df.Country = countries
    df.Country  = df.Country.str.replace('image/flag/','')
    df.Country  = df.Country.str.replace('.png','')


    df = df.replace('NA', 0, regex=True)
    df = df.replace('', -0.01, regex=True)
    df[[ "Alumni", "Award", "HiCi", "N&S", "PUB","PCP"]] =  df[[ "Alumni", "Award", "HiCi", "N&S", "PUB","PCP"]].astype('float32')

    df['Calculated Score'] = 1
    df['Calculated Score'][df["N&S"] == -0.01] = np.sum(np.multiply(df[df["N&S"] == -0.01][["Alumni", "Award", "HiCi", "PUB","PCP"]],[0.1, 0.2, 0.2, 0.2, 0.1]),1)*1.25
    df['Calculated Score'][df["N&S"] != -0.01] = np.sum(np.multiply(df[df["N&S"] != -0.01][["Alumni", "Award", "HiCi","N&S", "PUB","PCP"]],[0.1, 0.2, 0.2, 0.2,0.2, 0.1]),1)
    df['Calculated Score'] = df['Calculated Score'] * 100/np.max(df['Calculated Score'])

    df['Calculated Rank'] = df['Calculated Score'].rank(method ='min',ascending = False)
    df['Subject'] = 'World'
    df['National Rank'] = df.groupby('Country')['Calculated Score'].rank(method ='min',ascending = False)
    df['Top 100'] = df['Calculated Rank'] <= 100
    df['Year'] = year

    return df

def ARWU_rank(years=[2020], first_x_subjects = 2):
   '''first_x_subjects: get results for the first n subjects in ARWU subject list'''
   df_ARWU =  pd.DataFrame()
   for year in tqdm(years):
       try:
           df_ARWU = df_ARWU.append(fetch_ARWU(year = year))
       except:
           pass

   df_ARWU['Schema'] = 'ARWU'

   ARWU_subject =  weightings['subject_arwu']
   df_ARWU_subject = pd.DataFrame()
   for year in years:
       if year == datetime.now().year:
           year = "current"
       for subject in tqdm(ARWU_subject.Subject[0:first_x_subjects]):
           try:
               df_ARWU_subject = df_ARWU_subject.append(fetch_ARWU_subject(ARWU_subject, subject= subject, year = year))
           except:
               pass

   df_ARWU_subject['Schema'] = 'ARWU'
   df_ARWU_subject.columns = ['Rank', 'University', 'Country', 'TotalScore', 'PUB', 'CNCI', 'IC',
           'TOP', 'Award', 'Calculated Score', 'Calculated Rank', 'Subject',
           'National Rank', 'Top 100', 'Year', 'Schema']


   return df_ARWU, df_ARWU_subject

In [35]:
df_ARWU, df_ARWU_subject = ARWU_rank(years=[2020])

print('\n === ARWU ===')
display(df_ARWU.head(5))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
100%|██████████| 1/1 [00:04<00:00,  4.88s/it]
100%|██████████| 2/2 [00:10<00:00,  5.26s/it]


Unnamed: 0,Rank,University,Country,National Rank,TotalScore,Alumni,Award,HiCi,N&S,PUB,PCP,Calculated Score,Calculated Rank,Subject,Top 100,Year,Schema
0,1,Harvard University,USA,1.0,100.0,100.0,100.0,100.0,100.0,100.0,79.300003,100.0,1.0,World,True,2020,ARWU
1,2,Stanford University,USA,2.0,74.2,43.799999,86.599998,71.099998,79.5,77.099998,53.799999,74.155007,2.0,World,True,2020,ARWU
2,3,University of Cambridge,UK,1.0,70.6,79.5,98.199997,50.0,57.099998,72.0,57.5,70.621872,3.0,World,True,2020,ARWU
3,4,Massachusetts Institute of Technology (MIT),USA,3.0,69.6,71.400002,85.099998,51.400002,69.800003,63.700001,69.699997,69.549679,4.0,World,True,2020,ARWU
4,5,"University of California, Berkeley",USA,4.0,65.8,64.900002,76.699997,53.299999,68.5,63.400002,56.0,65.832737,5.0,World,True,2020,ARWU
5,6,Princeton University,USA,5.0,61.1,59.0,97.900002,41.400002,49.900002,44.700001,71.800003,61.125295,6.0,World,True,2020,ARWU
6,7,Columbia University,USA,6.0,58.6,59.5,65.800003,48.0,54.799999,72.300003,32.900002,58.633719,7.0,World,True,2020,ARWU
7,8,California Institute of Technology,USA,7.0,57.7,50.700001,69.099998,36.400002,57.900002,43.900002,100.0,57.724906,8.0,World,True,2020,ARWU
8,9,University of Oxford,UK,2.0,57.2,48.900002,54.299999,46.400002,53.900002,78.900002,43.700001,57.142858,9.0,World,True,2020,ARWU
9,10,University of Chicago,USA,8.0,54.6,58.700001,88.199997,35.700001,41.299999,51.599998,43.0,54.661492,10.0,World,True,2020,ARWU


Unnamed: 0,Rank,University,Country,TotalScore,PUB,CNCI,IC,TOP,Award,Calculated Score,Calculated Rank,Subject,National Rank,Top 100,Year,Schema
0,1,Paris-Saclay University,France,362.9,87.0,73.599998,72.699997,87.800003,100.0,362.940001,1.0,Mathematics,1.0,True,2020,ARWU
1,2,Princeton University,United States,354.3,71.400002,88.900002,64.400002,100.0,81.199997,354.38,2.0,Mathematics,1.0,True,2020,ARWU
2,3,Sorbonne University,France,308.3,100.0,72.0,72.599998,95.599998,26.1,308.219999,3.0,Mathematics,2.0,True,2020,ARWU
3,4,Stanford University,United States,301.6,66.099998,90.800003,63.299999,89.400002,42.599998,301.560001,4.0,Mathematics,2.0,True,2020,ARWU
4,5,University of Cambridge,United Kingdom,301.4,63.200001,88.800003,77.099998,73.699997,60.299999,301.42,5.0,Mathematics,1.0,True,2020,ARWU
5,6,Massachusetts Institute of Technology (MIT),United States,293.5,80.0,85.199997,64.0,89.400002,26.1,293.499999,6.0,Mathematics,3.0,True,2020,ARWU
6,7,University of Oxford,United Kingdom,292.3,78.199997,75.599998,76.0,75.599998,47.700001,292.299995,7.0,Mathematics,2.0,True,2020,ARWU
7,8,New York University,United States,288.4,62.0,94.0,68.199997,58.599998,60.299999,288.539997,8.0,Mathematics,4.0,True,2020,ARWU
8,9,ETH Zurich,Switzerland,271.9,71.300003,78.400002,81.300003,63.200001,42.599998,271.760004,9.0,Mathematics,1.0,True,2020,ARWU
9,10,PSL University,France,269.8,72.199997,83.199997,72.800003,69.699997,30.200001,269.859992,10.0,Mathematics,3.0,True,2020,ARWU


In [37]:
print('\n === ARWU Subject Rankings ===')

display(df_ARWU_subject.head(5))



 === ARWU Subject Rankings ===


Unnamed: 0,Rank,University,Country,TotalScore,PUB,CNCI,IC,TOP,Award,Calculated Score,Calculated Rank,Subject,National Rank,Top 100,Year,Schema
0,1,Paris-Saclay University,France,362.9,87.0,73.599998,72.699997,87.800003,100.0,362.940001,1.0,Mathematics,1.0,True,2020,ARWU
1,2,Princeton University,United States,354.3,71.400002,88.900002,64.400002,100.0,81.199997,354.38,2.0,Mathematics,1.0,True,2020,ARWU
2,3,Sorbonne University,France,308.3,100.0,72.0,72.599998,95.599998,26.1,308.219999,3.0,Mathematics,2.0,True,2020,ARWU
3,4,Stanford University,United States,301.6,66.099998,90.800003,63.299999,89.400002,42.599998,301.560001,4.0,Mathematics,2.0,True,2020,ARWU
4,5,University of Cambridge,United Kingdom,301.4,63.200001,88.800003,77.099998,73.699997,60.299999,301.42,5.0,Mathematics,1.0,True,2020,ARWU


## QS World University Rankings
[QS](https://www.topuniversities.com) adopted a similar as the Times Higher Education. We therefore use ajax based extraction to
extract data for both world and subject rankings. The weightings for the calculated rank are extracted from
[QS intelligence unit](http://www.iu.qs.com).

In [None]:
#Define QS subject and world rank scraping function

def fetch_QS(rank= None, year = None, weightings = weightings['weightings_qs']):
    """Scrape QS Subject Page
    """
    with requests.Session() as session:
        if "world" in rank:
            url = "https://www.topuniversities.com/university-rankings/world-university-rankings/"+ str(year)
        else:
            url = "https://www.topuniversities.com/university-rankings/university-subject-rankings/"+ str(year) +"/"+ rank

        session.get(url)
        r = session.post(url,headers={"Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
                                  'X-Requested-With': 'XMLHttpRequest'})
        soup = BeautifulSoup(r.content, "lxml")
        scripts = soup.find_all('script')
        for script in scripts:
            try:
                if "jQuery.extend" in script.contents[0] :
                    jsonStr = script.contents[0].split('jQuery.extend(Drupal.settings,')[1]
                    jsonStr = jsonStr.rsplit(');', 1)[0]
                    jsonObj = json.loads(jsonStr)
                    url2 = jsonObj['qs_rankings_datatables']['rank_indicators_url']
            except:
                pass

    req2 = urllib.request.Request(url2, headers={'User-Agent': 'Mozilla/5.0'})
    with urllib.request.urlopen(req2) as url:
        indicator_data = json.loads(url.read().decode())

    columns  = pd.DataFrame(indicator_data['columns'])
    rank_data =  pd.DataFrame(indicator_data['data'])

    for index,row in rank_data.iterrows():
        rank_data.iloc[index] = [strip_tags(item) for item in row]
    if "world" in rank:
        rank_data.columns = ['region', 'Country', 'Rank', 'overall_rank_dis', 'University',
           'TotalScore', 'stars', 'Academic', '3791742_rank_d', '3791742_rank',
           'Employer', '3791741_rank_d', '3791741_rank', 'Faculty Student',
           '3791740_rank_d', '3791740_rank', 'Citation', '3791737_rank_d',
           '3791737_rank', 'International Faculty', '3791739_rank_d', '3791739_rank', 'International Student',
           '3791738_rank_d', '3791738_rank', '', '_rank_d', '_rank']
        rank_data = rank_data[['Country', 'Rank',  'University', 'TotalScore',  'Academic',  'Employer','Faculty Student',
        'Citation', 'International Faculty', 'International Student']]
    else:
        try:
            rank_data.columns = ['region', 'Country', 'Rank', 'overall_rank_dis', 'University',
                                 'TotalScore', 'stars', 'Academic', '4280115_rank_d', '4280115_rank',
                                 'Employer', '4280112_rank_d', '4280112_rank', 'h-index',
                                 '4280113_rank_d', '4280113_rank', 'Citation', '4280114_rank_d',
                                 '4280114_rank']
        except:
            if rank == 'performing-arts':
                rank_data.columns = ['region', 'Country', 'Rank', 'overall_rank_dis', 'University',
                                     'TotalScore', 'stars', 'Academic', '4280115_rank_d', '4280115_rank',
                                     'Employer', '4280112_rank_d', '4280112_rank']
                rank_data['h-index'] = 0
                rank_data['Citation'] = 0
            elif rank == 'classics-ancient-history':
                rank_data.columns = ['region', 'Country', 'Rank', 'overall_rank_dis', 'University',
                                     'TotalScore', 'stars', 'Academic', '4280115_rank_d', '4280115_rank',
                                     'Employer', '4280112_rank_d', '4280112_rank']
                rank_data['h-index'] = 0
                rank_data['Citation'] = 0
            elif rank == 'english-language-literature':
                rank_data.columns = ['region', 'Country', 'Rank', 'overall_rank_dis', 'University',
                                     'TotalScore', 'stars', 'Academic', '4280115_rank_d', '4280115_rank',
                                     'Employer', '4280112_rank_d', '4280112_rank', 'Citation', '4280114_rank_d',
                                     '4280114_rank']
                rank_data['h-index'] = 0
            elif rank == 'hospitality':
                rank_data.columns = ['region', 'Country', 'Rank', 'overall_rank_dis', 'University',
                                     'TotalScore', 'stars', 'Academic', '4280115_rank_d', '4280115_rank',
                                     'Employer', '4280112_rank_d', '4280112_rank', 'Citation', '4280114_rank_d',
                                     '4280114_rank']
                rank_data['h-index'] = 0
            else:
                pass

        rank_data = rank_data[['Country', 'Rank', 'University', 'TotalScore', 'Academic', 'Employer', 'h-index', 'Citation']]

    weightings.Subject = weightings.Subject.str.lower()
    weightings.Subject = weightings.Subject.str.replace('.','-')
    weightings = weightings[weightings.Subject==rank].reset_index(drop=True)
    df = rank_data.copy()

    df = df.replace('', 0, regex=True)

    if "world" in rank:
        df[[ 'Academic',  'Employer','Faculty Student',
        'Citation', 'International Faculty', 'International Student']] = \
            df[[ 'Academic',  'Employer','Faculty Student',
        'Citation', 'International Faculty', 'International Student']].astype('float32')

        df['Calculated Score'] = np.sum(np.multiply(df[['Academic',  'Employer','Faculty Student', 'Citation', 'International Faculty', 'International Student']],
                                 weightings[['Academic',  'Employer','Faculty Student', 'Citation', 'International Faculty', 'International Student']]),1)
        df['Calculated Score'] = 100 / np.max(df['Calculated Score']) * df['Calculated Score']
    else:
        df[['Academic', 'Employer', 'h-index', 'Citation']] =  df[['Academic', 'Employer', 'h-index', 'Citation']].astype('float32')
        df['Calculated Score'] = np.sum(np.multiply(df[['Academic', 'Employer', 'h-index', 'Citation']],
                                                    weightings[['Academic', 'Employer', 'h-index', 'Citation']]), 1)


    df['Calculated Rank'] = df['Calculated Score'].rank(method ='min',ascending = False)
    df['Subject'] =  rank.replace('.',' ')

    df['National Rank'] = df.groupby('Country')['Calculated Score'].rank(method ='min',ascending = False)
    df['Top 100'] = df['Calculated Rank'] <= 100
    df['Year'] = year-1

    return df

def QS_rank(years=None, ranks=None):

    df_QS = pd.DataFrame()
    for year in tqdm(years):
        for rank in ranks:
            try:
                df_QS = df_QS.append(fetch_QS(rank=rank, year=year))
            except:
                print(year, rank)
    df_QS['Schema'] = 'QS'
    return df_QS

In [None]:
years = [2020]
df_QS = QS_rank(years = years, ranks = ranks)
ranks=['world-university-rankings','classics-ancient-history']


In [None]:
print('\n === QS World and Subject Rankings ===')
display(df_QS[df_QS.Subject== 'world-university-rankings'].head(5))

display(df_QS[df_QS.Subject== 'classics-ancient-history'].head(5))


