# [csrankings](https://csrankings.org/) scraper

### Author : [Maryam Saeidmehr](https://maryamsaeedmehr.github.io)
### License: MIT
### Dec. 2022

---

In [1]:
!pip install BeautifulSoup4 --quiet
!pip install -U selenium --quiet

In [1]:
from bs4 import BeautifulSoup as BS
from selenium.webdriver.common.by import By
import numpy as np
import pandas as pd
import re
import time
from selenium import webdriver

In [3]:
def _csr_url(areas, region):
    """Build a csrankings.org index URL from an '&'-joined area list and a region code."""
    return "https://csrankings.org/#/index?" + areas + "&" + region


# '&'-joined area slugs of the grouped ("general") views; the same lists are
# used for both regions.
_AREAS_AI = "ai&vision&mlmining&nlp&inforet"
_AREAS_SYS = "arch&comm&sec&mod&hpc&mobile&metrics&ops&plan&soft&da&bed"
_AREAS_THE = "act&crypt&log"
_AREAS_OTH = "graph&chi&robotics&bio&visualization&ecom"

# ---- region code "us" ----
USA_all = _csr_url("all", "us")
# general fields
USA_ai  = _csr_url(_AREAS_AI, "us")
USA_sys = _csr_url(_AREAS_SYS, "us")
USA_the = _csr_url(_AREAS_THE, "us")
USA_oth = _csr_url(_AREAS_OTH, "us")
# specific fields
USA_vis = _csr_url("vision", "us")
USA_rob = _csr_url("robotics", "us")
USA_nlp = _csr_url("nlp", "us")
USA_bio = _csr_url("bio", "us")


# ---- region code "ca" ----
CND_all = _csr_url("all", "ca")
# general fields
CND_ai  = _csr_url(_AREAS_AI, "ca")
CND_sys = _csr_url(_AREAS_SYS, "ca")
CND_the = _csr_url(_AREAS_THE, "ca")
CND_oth = _csr_url(_AREAS_OTH, "ca")
# specific fields
CND_vis = _csr_url("vision", "ca")
CND_rob = _csr_url("robotics", "ca")
CND_nlp = _csr_url("nlp", "ca")
CND_bio = _csr_url("bio", "ca")


In [4]:
class bcolors:
    """ANSI escape sequences used to colour and style the printed status messages."""
    # Common prefix of every sequence below (ESC followed by '[').
    _ESC = '\x1b['

    # Colours
    HEADER = _ESC + '95m'
    OKBLUE = _ESC + '94m'
    OKCYAN = _ESC + '96m'
    OKGREEN = _ESC + '92m'
    WARNING = _ESC + '93m'
    FAIL = _ESC + '91m'

    # Reset (ends any colour/style started above) and text attributes
    ENDC = _ESC + '0m'
    BOLD = _ESC + '1m'
    UNDERLINE = _ESC + '4m'
    
def _make_chrome_options():
    """Return the ChromeOptions shared by every scraping session in this notebook."""
    opts = webdriver.ChromeOptions()
    # Flags are added in this fixed order: no visible browser window, no Chrome
    # sandbox, and no use of /dev/shm.
    for flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
        opts.add_argument(flag)
    return opts


chrome_options = _make_chrome_options()

In [5]:
def scrape_university_list(url, country_name):
    """
    Scrape the institution ranking shown at a csrankings url and save it as CSV.

    The page is opened in Chrome (using the module-level ``chrome_options``), the
    ranking list is scrolled down step by step, and the rank / institution pairs
    found in the element with id "ranking" are written to
    ``Institutions_<country_name>.csv`` in the current working directory.

    @params:url: csranking url with defined filters
    @params:country_name: string used in the log messages and as the CSV
                          file-name suffix; e.g. 'USA_all', 'CND_ai', etc.

    @return: None (the result is the CSV file with columns 'rank', 'institution')
    @raises:ValueError: if the page has no element with id "ranking", or if the
                        number of ranks differs from the number of institutions
    """
    print(bcolors.HEADER+"INFO! start to fetch data from "+url+bcolors.ENDC)
    # The driver path is no longer passed positionally: Selenium >= 4.10 removed
    # that argument (the first positional parameter is now `options`), and older
    # releases default to looking up "chromedriver" on PATH anyway.
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(url)
        time.sleep(2)  # fixed wait before the page is inspected
        scroll_pause_time = 1
        table_height = 700
        # NOTE(review): index 27 presumably matches the ranking list in the page
        # layout at the time of writing -- confirm if the site changes.
        scroll_target = "document.getElementsByClassName('table-responsive')[27]"
        list_height = driver.execute_script("return {target}.scrollHeight;".format(target=scroll_target))
        i = 1

        # Scroll one `table_height` step at a time (at least once) until the
        # next step would go past the height measured above.
        while True:
            driver.execute_script("{target}.scrollTo(0, {table_height}*{i});".format(target=scroll_target, table_height=table_height, i=i))
            i += 1
            time.sleep(scroll_pause_time)
            if i*table_height > list_height:
                break

        page_source = driver.page_source
    finally:
        # Always close the browser, even when loading or scrolling fails.
        driver.quit()

    soup = BS(page_source, "html.parser")
    print(bcolors.OKGREEN+"SUCCESS! fetching data completed successfully"+bcolors.ENDC)

    print(bcolors.HEADER+"INFO! start to scrape institutions of country_field ("+country_name+")"+bcolors.ENDC)
    table_content = soup.find(id="ranking")
    if table_content is None:
        raise ValueError("no element with id 'ranking' found at " + url)

    # Institution names: children of the <span> tags that have an onclick
    # handler starting with "csr" and no id attribute.
    university_td = table_content.find_all("span", {"onclick": re.compile(r"csr.*"), "id": False})
    university_list = np.array([node for span in university_td for node in span.contents], dtype=str)

    # Ranks: text of the element two steps before each "...-widget" hovertip
    # span that has no title attribute.
    span_hover_td = table_content.find_all("span", {"class": "hovertip", "id": re.compile(r".*-widget"), "title": False})
    rank = np.array([td.find_previous().find_previous().get_text(strip=True) for td in span_hover_td], dtype=int)

    if rank.shape[0] != university_list.shape[0]:
        raise ValueError(
            "scraped {} ranks but {} institution names for {}".format(
                rank.shape[0], university_list.shape[0], country_name))

    rank_uni_df = pd.DataFrame({'rank': rank, 'institution': university_list})
    rank_uni_df.to_csv('Institutions_{}.csv'.format(country_name), index=False)
    print(bcolors.OKGREEN+"SUCCESS! Institutions_"+country_name+".csv file is created successfully"+bcolors.ENDC)

In [6]:
# Scrape every US view in turn; each call writes Institutions_<name>.csv.
for _name, _url in (
    ('USA_all', USA_all),
    # general fields
    ('USA_ai', USA_ai),
    ('USA_sys', USA_sys),
    ('USA_the', USA_the),
    ('USA_oth', USA_oth),
    # specific fields
    ('USA_vis', USA_vis),
    ('USA_rob', USA_rob),
    ('USA_nlp', USA_nlp),
    ('USA_bio', USA_bio),
):
    scrape_university_list(url=_url, country_name=_name)

[95mINFO! start to fetch data from https://csrankings.org/#/index?all&us[0m
[92mSUCCESS! fetching data completed successfully[0m
[95mINFO! start to scrape institutions of country_field (USA_all)[0m
[92mSUCCESS! Institutions_USA_all.csv file is created successfully[0m
[95mINFO! start to fetch data from https://csrankings.org/#/index?ai&vision&mlmining&nlp&inforet&us[0m
[92mSUCCESS! fetching data completed successfully[0m
[95mINFO! start to scrape institutions of country_field (USA_ai)[0m
[92mSUCCESS! Institutions_USA_ai.csv file is created successfully[0m
[95mINFO! start to fetch data from https://csrankings.org/#/index?arch&comm&sec&mod&hpc&mobile&metrics&ops&plan&soft&da&bed&us[0m
[92mSUCCESS! fetching data completed successfully[0m
[95mINFO! start to scrape institutions of country_field (USA_sys)[0m
[92mSUCCESS! Institutions_USA_sys.csv file is created successfully[0m
[95mINFO! start to fetch data from https://csrankings.org/#/index?act&crypt&log&us[0m
[92m

In [7]:
# Scrape every Canadian view in turn; each call writes Institutions_<name>.csv.
for _name, _url in (
    ('CND_all', CND_all),
    # general fields
    ('CND_ai', CND_ai),
    ('CND_sys', CND_sys),
    ('CND_the', CND_the),
    ('CND_oth', CND_oth),
    # specific fields
    ('CND_vis', CND_vis),
    ('CND_rob', CND_rob),
    ('CND_nlp', CND_nlp),
    ('CND_bio', CND_bio),
):
    scrape_university_list(url=_url, country_name=_name)

[95mINFO! start to fetch data from https://csrankings.org/#/index?all&ca[0m
[92mSUCCESS! fetching data completed successfully[0m
[95mINFO! start to scrape institutions of country_field (CND_all)[0m
[92mSUCCESS! Institutions_CND_all.csv file is created successfully[0m
[95mINFO! start to fetch data from https://csrankings.org/#/index?ai&vision&mlmining&nlp&inforet&ca[0m
[92mSUCCESS! fetching data completed successfully[0m
[95mINFO! start to scrape institutions of country_field (CND_ai)[0m
[92mSUCCESS! Institutions_CND_ai.csv file is created successfully[0m
[95mINFO! start to fetch data from https://csrankings.org/#/index?arch&comm&sec&mod&hpc&mobile&metrics&ops&plan&soft&da&bed&ca[0m
[92mSUCCESS! fetching data completed successfully[0m
[95mINFO! start to scrape institutions of country_field (CND_sys)[0m
[92mSUCCESS! Institutions_CND_sys.csv file is created successfully[0m
[95mINFO! start to fetch data from https://csrankings.org/#/index?act&crypt&log&ca[0m
[92m

In [2]:
def _read_institutions(tag):
    """Read Institutions_<tag>.csv from the working directory into a DataFrame."""
    return pd.read_csv('Institutions_{}.csv'.format(tag))


# US tables: overall view, grouped fields, then single fields.
us_all = _read_institutions('USA_all')
us_ai  = _read_institutions('USA_ai')
us_sys = _read_institutions('USA_sys')
us_the = _read_institutions('USA_the')
us_oth = _read_institutions('USA_oth')
us_vis = _read_institutions('USA_vis')
us_rob = _read_institutions('USA_rob')
us_nlp = _read_institutions('USA_nlp')
us_bio = _read_institutions('USA_bio')

# Canadian tables, in the same order.
ca_all = _read_institutions('CND_all')
ca_ai  = _read_institutions('CND_ai')
ca_sys = _read_institutions('CND_sys')
ca_the = _read_institutions('CND_the')
ca_oth = _read_institutions('CND_oth')
ca_vis = _read_institutions('CND_vis')
ca_rob = _read_institutions('CND_rob')
ca_nlp = _read_institutions('CND_nlp')
ca_bio = _read_institutions('CND_bio')

In [3]:
# Join the per-field US tables onto the overall table, one outer join on
# 'institution' per field. The left-hand 'rank' column keeps its name (empty
# suffix); the incoming 'rank' column gets the field suffix.
us_all_ai = us_all.merge(us_ai, how="outer", on='institution', suffixes=("", "_AI"))
us_all_ai_sys = us_all_ai.merge(us_sys, how="outer", on='institution', suffixes=("", "_Systems"))
us_all_ai_sys_the = us_all_ai_sys.merge(us_the, how="outer", on='institution', suffixes=("", "_Theory"))
us_all_ai_sys_the_oth = us_all_ai_sys_the.merge(
    us_oth, how="outer", on='institution', suffixes=("", "_Interdisciplinary_Areas"))
us_all_ai_sys_the_oth_vis = us_all_ai_sys_the_oth.merge(
    us_vis, how="outer", on='institution', suffixes=("", "_Computer_Vision"))
us_all_ai_sys_the_oth_vis_rob = us_all_ai_sys_the_oth_vis.merge(
    us_rob, how="outer", on='institution', suffixes=("", "_Robotics"))
us_all_ai_sys_the_oth_vis_rob_nlp = us_all_ai_sys_the_oth_vis_rob.merge(
    us_nlp, how="outer", on='institution', suffixes=("", "_Natural_language_processing"))
# NOTE(review): "Computaional" is misspelled, but the suffix becomes a column
# name in the exported CSV, so it is kept as-is here.
us_all_ai_sys_the_oth_vis_rob_nlp_bio = us_all_ai_sys_the_oth_vis_rob_nlp.merge(
    us_bio, how="outer", on='institution', suffixes=("", "_Computaional_bio_&_bioinformatics"))
us_all_ai_sys_the_oth_vis_rob_nlp_bio.head()

Unnamed: 0,rank,institution,rank_AI,rank_Systems,rank_Theory,rank_Interdisciplinary_Areas,rank_Computer_Vision,rank_Robotics,rank_Natural_language_processing,rank_Computaional_bio_&_bioinformatics
0,1,Carnegie Mellon University,1.0,2.0,1.0,1.0,1.0,2.0,2.0,1.0
1,2,Univ. of Illinois at Urbana-Champaign,2.0,1.0,4.0,10.0,5.0,7.0,11.0,5.0
2,3,Massachusetts Institute of Technology,8.0,5.0,2.0,5.0,6.0,3.0,19.0,10.0
3,4,Univ. of California - San Diego,6.0,4.0,9.0,8.0,3.0,9.0,12.0,3.0
4,5,University of Michigan,5.0,3.0,13.0,12.0,12.0,18.0,8.0,


In [4]:
# Join the per-field Canadian tables onto the overall table, one outer join on
# 'institution' per field. The left-hand 'rank' column keeps its name (empty
# suffix); the incoming 'rank' column gets the field suffix.
ca_all_ai = ca_all.merge(ca_ai, how="outer", on='institution', suffixes=("", "_AI"))
ca_all_ai_sys = ca_all_ai.merge(ca_sys, how="outer", on='institution', suffixes=("", "_Systems"))
ca_all_ai_sys_the = ca_all_ai_sys.merge(ca_the, how="outer", on='institution', suffixes=("", "_Theory"))
ca_all_ai_sys_the_oth = ca_all_ai_sys_the.merge(
    ca_oth, how="outer", on='institution', suffixes=("", "_Interdisciplinary_Areas"))
ca_all_ai_sys_the_oth_vis = ca_all_ai_sys_the_oth.merge(
    ca_vis, how="outer", on='institution', suffixes=("", "_Computer_Vision"))
ca_all_ai_sys_the_oth_vis_rob = ca_all_ai_sys_the_oth_vis.merge(
    ca_rob, how="outer", on='institution', suffixes=("", "_Robotics"))
ca_all_ai_sys_the_oth_vis_rob_nlp = ca_all_ai_sys_the_oth_vis_rob.merge(
    ca_nlp, how="outer", on='institution', suffixes=("", "_Natural_language_processing"))
# NOTE(review): "Computaional" is misspelled, but the suffix becomes a column
# name in the exported CSV, so it is kept as-is here.
ca_all_ai_sys_the_oth_vis_rob_nlp_bio = ca_all_ai_sys_the_oth_vis_rob_nlp.merge(
    ca_bio, how="outer", on='institution', suffixes=("", "_Computaional_bio_&_bioinformatics"))
ca_all_ai_sys_the_oth_vis_rob_nlp_bio.head()

Unnamed: 0,rank,institution,rank_AI,rank_Systems,rank_Theory,rank_Interdisciplinary_Areas,rank_Computer_Vision,rank_Robotics,rank_Natural_language_processing,rank_Computaional_bio_&_bioinformatics
0,1,University of Toronto,1.0,2.0,2.0,1.0,1.0,1.0,2.0,2.0
1,2,University of Waterloo,2.0,1.0,1.0,4.0,5.0,8.0,4.0,7.0
2,3,University of British Columbia,4.0,3.0,6.0,2.0,4.0,7.0,5.0,
3,4,Simon Fraser University,6.0,4.0,4.0,3.0,2.0,4.0,7.0,5.0
4,5,McGill University,7.0,5.0,3.0,5.0,14.0,2.0,1.0,1.0


In [5]:
# Export the combined tables, one CSV per country, without the index column.
us_all_ai_sys_the_oth_vis_rob_nlp_bio.to_csv('us_csranking.csv', index=False)

ca_all_ai_sys_the_oth_vis_rob_nlp_bio.to_csv('ca_csranking.csv', index=False)