In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import time

In [2]:
url = "https://cwur.org/2021-22.php"

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
}

response = requests.get(url, headers=headers)
print(response.text)
# Gets an http request to the provided URL.

<!DOCTYPE html>
<html lang="en">
<head>

<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1">
<!-- The above 3 meta tags *must* come first in the head; any other head content must come *after* these tags -->

<meta name="description" content="Discover the world's top universities and best colleges for 2021-2022. Explore the Global 2000 List by the Center for World University Rankings (CWUR).">

<meta name="keywords" content="ranking, rankings, university, universities, college, colleges, 2022, 2021, 2020, 2019, 2018, 2017, 2016, 2015, 2014, 2013, 2012, world, top, best, global, Ranking universitario mundial, Classement mondial des universités , Weltweites Universitätsranking, Zentrum für weltweite Universitätsrankings , דירוג האוניברסיטאות העולמי, המרכז לדירוג האוניברסיטאות העולמי, 세계 대학순위, が世界の大学トップ, 世界大學排名中心, 세계대학랭킹센터,世界大学ランキングセンター, Ranking mundial universitário, Рейтинг университе

In [3]:
html = response.content
soup = BeautifulSoup(html, 'html.parser')
countries = soup.find_all("td") 
countries
# Looks for all the tags named as "td".

[<td>1</td>,
 <td><a href="2021-22/Harvard-University.php">Harvard University</a></td>,
 <td><a href="2021-22/country/usa.php">USA</a></td>,
 <td>1</td>,
 <td>1</td>,
 <td>1</td>,
 <td>1</td>,
 <td>1</td>,
 <td>100</td>,
 <td>2</td>,
 <td><a href="2021-22/Massachusetts-Institute-of-Technology.php">Massachusetts Institute of Technology</a></td>,
 <td><a href="2021-22/country/usa.php">USA</a></td>,
 <td>2</td>,
 <td>4</td>,
 <td>12</td>,
 <td>2</td>,
 <td>8</td>,
 <td>96.7</td>,
 <td>3</td>,
 <td><a href="2021-22/Stanford-University.php">Stanford University</a></td>,
 <td><a href="2021-22/country/usa.php">USA</a></td>,
 <td>3</td>,
 <td>10</td>,
 <td>4</td>,
 <td>3</td>,
 <td>2</td>,
 <td>95.1</td>,
 <td>4</td>,
 <td><a href="2021-22/University-of-Cambridge.php">University of Cambridge</a></td>,
 <td><a href="2021-22/country/united-kingdom.php">United Kingdom</a></td>,
 <td>1</td>,
 <td>3</td>,
 <td>25</td>,
 <td>4</td>,
 <td>10</td>,
 <td>94.1</td>,
 <td>5</td>,
 <td><a href="2021-22/Un

In [279]:
def get_info_uni (soup):
    countries = soup.find_all("td") 
    return [i.getText().strip() for i in countries]
# Defines a function that allows to obtain only the info in a class related to attributes following the "td" tag.

In [280]:
uni_countries = get_info_uni(soup)
uni_countries
# Prints a list of information after calling the defined get_info_soup.

['1',
 'Harvard University',
 'USA',
 '1',
 '1',
 '1',
 '1',
 '1',
 '100',
 '2',
 'Massachusetts Institute of Technology',
 'USA',
 '2',
 '4',
 '12',
 '2',
 '8',
 '96.7',
 '3',
 'Stanford University',
 'USA',
 '3',
 '10',
 '4',
 '3',
 '2',
 '95.1',
 '4',
 'University of Cambridge',
 'United Kingdom',
 '1',
 '3',
 '25',
 '4',
 '10',
 '94.1',
 '5',
 'University of Oxford',
 'United Kingdom',
 '2',
 '7',
 '27',
 '9',
 '4',
 '93.3',
 '6',
 'Princeton University',
 'USA',
 '4',
 '5',
 '15',
 '7',
 '70',
 '92.6',
 '7',
 'Columbia University',
 'USA',
 '5',
 '11',
 '14',
 '10',
 '15',
 '92.0',
 '8',
 'University of Chicago',
 'USA',
 '6',
 '8',
 '16',
 '27',
 '22',
 '91.5',
 '9',
 'University of Pennsylvania',
 'USA',
 '7',
 '14',
 '10',
 '42',
 '12',
 '91.1',
 '10',
 'Yale University',
 'USA',
 '8',
 '6',
 '36',
 '13',
 '20',
 '90.7',
 '11',
 'California Institute of Technology',
 'USA',
 '9',
 '2',
 '111',
 '8',
 '75',
 '90.4',
 '12',
 'University of California, Berkeley',
 'USA',
 '10',
 '

In [281]:
list_of_dicts = []
def create_custom_dict(sublist, keys):
    return {keys[i]: sublist[i] for i in range(len(sublist))}
# Defines a function to create a dictionary.

In [282]:
key_names = ["World Rank", "Institution", "Country", "National Rank", "Quality of Education Rank", "Alumni Employment Rank", "Quality of Faculty Rank", "Research Performance Rank", "Score"]
# Specificies the info to be added to the dictionary in the form of a list. 

In [283]:
for i in range(0, len(uni_countries), 9):
    sublist = uni_countries[i:i+9]
    custom_dict = create_custom_dict(sublist, key_names)
    list_of_dicts.append(custom_dict)   
# Loops to create dictionaries every 9 items.

print(list_of_dicts)
# Prints the list of dictionaries.

[{'World Rank': '1', 'Institution': 'Harvard University', 'Country': 'USA', 'National Rank': '1', 'Quality of Education Rank': '1', 'Alumni Employment Rank': '1', 'Quality of Faculty Rank': '1', 'Research Performance Rank': '1', 'Score': '100'}, {'World Rank': '2', 'Institution': 'Massachusetts Institute of Technology', 'Country': 'USA', 'National Rank': '2', 'Quality of Education Rank': '4', 'Alumni Employment Rank': '12', 'Quality of Faculty Rank': '2', 'Research Performance Rank': '8', 'Score': '96.7'}, {'World Rank': '3', 'Institution': 'Stanford University', 'Country': 'USA', 'National Rank': '3', 'Quality of Education Rank': '10', 'Alumni Employment Rank': '4', 'Quality of Faculty Rank': '3', 'Research Performance Rank': '2', 'Score': '95.1'}, {'World Rank': '4', 'Institution': 'University of Cambridge', 'Country': 'United Kingdom', 'National Rank': '1', 'Quality of Education Rank': '3', 'Alumni Employment Rank': '25', 'Quality of Faculty Rank': '4', 'Research Performance Rank': 

In [284]:
univer_countries = pd.DataFrame(list_of_dicts)
univer_countries
# Transforms the data into a dataframe.

Unnamed: 0,World Rank,Institution,Country,National Rank,Quality of Education Rank,Alumni Employment Rank,Quality of Faculty Rank,Research Performance Rank,Score
0,1,Harvard University,USA,1,1,1,1,1,100
1,2,Massachusetts Institute of Technology,USA,2,4,12,2,8,96.7
2,3,Stanford University,USA,3,10,4,3,2,95.1
3,4,University of Cambridge,United Kingdom,1,3,25,4,10,94.1
4,5,University of Oxford,United Kingdom,2,7,27,9,4,93.3
...,...,...,...,...,...,...,...,...,...
1995,1996,Santa Catarina State University,Brazil,56,-,936,-,1938,65.8
1996,1997,Yancheng Institute of Technology,China,275,-,-,-,1920,65.7
1997,1998,Xi'an University of Science and Technology,China,276,-,994,-,1937,65.7
1998,1999,CEU San Pablo University,Spain,53,-,568,-,1992,65.7


In [285]:
value_counts = univer_countries['Country'].value_counts()
value_counts
# Checks the number of times a value in the column "Country" is repeated.

Country
USA                347
China              277
Japan              124
United Kingdom      95
France              79
                  ... 
Malawi               1
Kuwait               1
Cameroon             1
Azerbaijan           1
North Macedonia      1
Name: count, Length: 95, dtype: int64

In [286]:
univer_countries = univer_countries.merge(value_counts, left_on='Country', right_index=True, how='left', suffixes=('', 'Count of countries'))
univer_countries
# Adds a new column with data obtained in "value_counts".

Unnamed: 0,World Rank,Institution,Country,National Rank,Quality of Education Rank,Alumni Employment Rank,Quality of Faculty Rank,Research Performance Rank,Score,count
0,1,Harvard University,USA,1,1,1,1,1,100,347
1,2,Massachusetts Institute of Technology,USA,2,4,12,2,8,96.7,347
2,3,Stanford University,USA,3,10,4,3,2,95.1,347
3,4,University of Cambridge,United Kingdom,1,3,25,4,10,94.1,95
4,5,University of Oxford,United Kingdom,2,7,27,9,4,93.3,95
...,...,...,...,...,...,...,...,...,...,...
1995,1996,Santa Catarina State University,Brazil,56,-,936,-,1938,65.8,56
1996,1997,Yancheng Institute of Technology,China,275,-,-,-,1920,65.7,277
1997,1998,Xi'an University of Science and Technology,China,276,-,994,-,1937,65.7,277
1998,1999,CEU San Pablo University,Spain,53,-,568,-,1992,65.7,53


In [287]:
univer_countries = univer_countries.rename(columns={'count': 'Count of Top Universities'})
univer_countries
# Renames the column as "Count of Top Universities".

Unnamed: 0,World Rank,Institution,Country,National Rank,Quality of Education Rank,Alumni Employment Rank,Quality of Faculty Rank,Research Performance Rank,Score,Count of Top Universities
0,1,Harvard University,USA,1,1,1,1,1,100,347
1,2,Massachusetts Institute of Technology,USA,2,4,12,2,8,96.7,347
2,3,Stanford University,USA,3,10,4,3,2,95.1,347
3,4,University of Cambridge,United Kingdom,1,3,25,4,10,94.1,95
4,5,University of Oxford,United Kingdom,2,7,27,9,4,93.3,95
...,...,...,...,...,...,...,...,...,...,...
1995,1996,Santa Catarina State University,Brazil,56,-,936,-,1938,65.8,56
1996,1997,Yancheng Institute of Technology,China,275,-,-,-,1920,65.7,277
1997,1998,Xi'an University of Science and Technology,China,276,-,994,-,1937,65.7,277
1998,1999,CEU San Pablo University,Spain,53,-,568,-,1992,65.7,53


In [288]:
columns_to_drop = ['World Rank', 'Institution', 'National Rank', 'Quality of Education Rank', 'Alumni Employment Rank', 'Quality of Faculty Rank', 'Research Performance Rank', 'Score']
univer_countries = univer_countries.drop(columns=columns_to_drop)
univer_countries
# Drops columns to leave only columns "Country" and "Count of Top Universities".

Unnamed: 0,Country,Count of Top Universities
0,USA,347
1,USA,347
2,USA,347
3,United Kingdom,95
4,United Kingdom,95
...,...,...
1995,Brazil,56
1996,China,277
1997,China,277
1998,Spain,53


In [308]:
univer_countries_ = univer_countries[~univer_countries['Country'].duplicated(keep='first')]
univer_countries_
# Deletes duplicated values.

Unnamed: 0,Country,Count of Top Universities
0,USA,347
3,United Kingdom,95
12,Japan,124
20,France,79
23,Canada,42
...,...,...
1851,Northern Cyprus,1
1871,Tanzania,1
1906,Senegal,1
1921,Kazakhstan,1


In [309]:
univer_countries_.to_csv('univer_countries.csv', index=False)
# Exports to a csv file.