### Imports

In [151]:
# Import libraries
import requests
import json
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import math
from difflib import SequenceMatcher

pd.options.mode.chained_assignment = None
idx = pd.IndexSlice

---

### Scraping the data

#### Top universities

In [14]:
top_universities_url = 'https://www.topuniversities.com'
data_url = 'https://www.topuniversities.com/sites/default/files/qs-rankings-data/357051.txt'

ranking = requests.get(data_url).text
ranking = json.loads(ranking)['data']
tu_ranking_dataframe = pd.io.json.json_normalize(ranking)
tu_ranking_dataframe.head()

Unnamed: 0,cc,core_id,country,guide,logo,nid,rank_display,region,score,stars,title,url
0,US,410,United States,"<a href=""/where-to-study/north-america/united-...","<img src=""https://www.topuniversities.com/site...",294850,1,North America,100.0,6,Massachusetts Institute of Technology (MIT),/universities/massachusetts-institute-technolo...
1,US,573,United States,"<a href=""/where-to-study/north-america/united-...","<img src=""https://www.topuniversities.com/site...",297282,2,North America,98.7,5,Stanford University,/universities/stanford-university
2,US,253,United States,"<a href=""/where-to-study/north-america/united-...","<img src=""https://www.topuniversities.com/site...",294270,3,North America,98.4,5,Harvard University,/universities/harvard-university
3,US,94,United States,"<a href=""/where-to-study/north-america/united-...","<img src=""https://www.topuniversities.com/site...",294562,4,North America,97.7,5,California Institute of Technology (Caltech),/universities/california-institute-technology-...
4,GB,95,United Kingdom,"<a href=""/where-to-study/europe/united-kingdom...","<img src=""https://www.topuniversities.com/site...",294561,5,Europe,95.6,5,University of Cambridge,/universities/university-cambridge


In [15]:
fields_to_keep = ['title', 'rank_display', 'country', 'region', 'url']
tu_ranking_dataframe = tu_ranking_dataframe[fields_to_keep].set_index('rank_display')

# Keep the 200 first university
tu_ranking_dataframe = tu_ranking_dataframe[:200]
tu_ranking_dataframe.head()

Unnamed: 0_level_0,title,country,region,url
rank_display,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Massachusetts Institute of Technology (MIT),United States,North America,/universities/massachusetts-institute-technolo...
2,Stanford University,United States,North America,/universities/stanford-university
3,Harvard University,United States,North America,/universities/harvard-university
4,California Institute of Technology (Caltech),United States,North America,/universities/california-institute-technology-...
5,University of Cambridge,United Kingdom,Europe,/universities/university-cambridge


In [17]:
# Inspect html of universities link (here mit)
mit_page = requests.get(top_universities_url + tu_ranking_dataframe.loc[:, 'url'][0]).text
mit_page = BeautifulSoup(mit_page, 'html.parser')

In [18]:
def get_total_faculty(uni_page):
    try:
        faculty_html = uni_page.find('div', class_='faculty-main')
        total_faculty_html = faculty_html.find('div', class_='total faculty')
        total_faculty_html = total_faculty_html.find('div', class_='number')
        total_faculty_num = int(total_faculty_html.contents[0].strip().replace(',', ''))
    except: 
        total_faculty_num = np.NaN
    return total_faculty_num

def get_inter_faculty(uni_page):
    try:
        faculty_html = uni_page.find('div', class_='faculty-main')
        inter_faculty_html = faculty_html.find('div', class_='inter faculty')
        inter_faculty_html = inter_faculty_html.find('div', class_='number')
        inter_faculty_num = int(inter_faculty_html.contents[0].strip().replace(',', ''))  
    except: 
        inter_faculty_num = np.NaN
    return inter_faculty_num

def get_total_student(uni_page):
    try:
        total_student_html = uni_page.find('div', class_='students-main')
        total_student_html = total_student_html.find('div', class_='number')
        total_student_num = int(total_student_html.contents[0].strip().replace(',', ''))
    except:
        total_student_num = np.NaN
    return total_student_num

def get_inter_student(uni_page):
    try:
        inter_student_html = uni_page.find('div', class_='int-students-main')
        inter_student_html = inter_student_html.find('div', class_='number')
        inter_student_num = int(inter_student_html.contents[0].strip().replace(',', ''))
    except:
        inter_student_num = np.NaN
    return inter_student_num

In [19]:
# Fetch all html pages
tu_ranking_dataframe.loc[:, 'html'] = tu_ranking_dataframe.loc[:, 'url'].map(lambda url: BeautifulSoup(requests.get(top_universities_url + url).text, 'html.parser'))


In [20]:
tu_ranking_dataframe.loc[:, 'Total faculty member'] = tu_ranking_dataframe.loc[:, 'html'].map(lambda html: get_total_faculty(html))
tu_ranking_dataframe.loc[:, 'International faculty member'] = tu_ranking_dataframe.loc[:, 'html'].map(lambda html: get_inter_faculty(html))
tu_ranking_dataframe.loc[:, 'Total student'] = tu_ranking_dataframe.loc[:, 'html'].map(lambda html: get_total_student(html))
tu_ranking_dataframe.loc[:, 'International student'] = tu_ranking_dataframe.loc[:, 'html'].map(lambda html: get_inter_student(html))

In [21]:
tu_ranking_dataframe.drop(['url', 'html'], axis=1, inplace=True)
tu_ranking_dataframe.head()

Unnamed: 0_level_0,title,country,region,Total faculty member,International faculty member,Total student,International student
rank_display,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Massachusetts Institute of Technology (MIT),United States,North America,2982.0,1679.0,11067.0,3717.0
2,Stanford University,United States,North America,4285.0,2042.0,15878.0,3611.0
3,Harvard University,United States,North America,4350.0,1311.0,22429.0,5266.0
4,California Institute of Technology (Caltech),United States,North America,953.0,350.0,2255.0,647.0
5,University of Cambridge,United Kingdom,Europe,5490.0,2278.0,18770.0,6699.0


In [22]:
# As we can see on the output below, 2 universities have missing values. 
tu_ranking_dataframe[tu_ranking_dataframe.isnull().any(axis=1)]

Unnamed: 0_level_0,title,country,region,Total faculty member,International faculty member,Total student,International student
rank_display,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
52,New York University (NYU),United States,North America,,,,
190,Indian Institute of Science (IISc) Bangalore,India,Asia,423.0,,4071.0,47.0


In [23]:
# A quick google search let us find the correct values
columns_to_replace = ['Total faculty member', 'International faculty member', 'Total student', 'International student']
tu_ranking_dataframe[51:52].loc[:, columns_to_replace] = [15286, 604, 58547, 15000]
tu_ranking_dataframe[189:190].loc[:, columns_to_replace] = [423, 0, 4071, 47]
tu_ranking_dataframe = tu_ranking_dataframe.reset_index()

---

#### Times

In [24]:
times_url = 'http://www.timeshighereducation.com'
times_data_url = 'https://www.timeshighereducation.com/sites/default/files/the_data_rankings/world_university_rankings_2018_limit0_369a9045a203e176392b9fb8f8c1cb2a.json'

ranking = requests.get(times_data_url).text
ranking = json.loads(ranking)['data']
times_ranking_dataframe = pd.io.json.json_normalize(ranking)
times_ranking_dataframe.head()

Unnamed: 0,aliases,location,member_level,name,nid,rank,rank_order,record_type,scores_citations,scores_citations_rank,...,scores_research,scores_research_rank,scores_teaching,scores_teaching_rank,stats_female_male_ratio,stats_number_students,stats_pc_intl_students,stats_student_staff_ratio,subjects_offered,url
0,University of Oxford,United Kingdom,0,University of Oxford,468,1,10,master_account,99.1,15,...,99.5,1,86.7,5,46 : 54,20409,38%,11.2,"Archaeology,Art, Performing Arts & Design,Biol...",/world-university-rankings/university-oxford
1,University of Cambridge,United Kingdom,0,University of Cambridge,470,2,20,master_account,97.5,29,...,97.8,3,87.8,3,45 : 55,18389,35%,10.9,"Archaeology,Architecture,Art, Performing Arts ...",/world-university-rankings/university-cambridge
2,California Institute of Technology caltech,United States,0,California Institute of Technology,128779,=3,30,private,99.5,10,...,97.5,4,90.3,1,31 : 69,2209,27%,6.5,"Architecture,Biological Sciences,Business & Ma...",/world-university-rankings/california-institut...
3,Stanford University,United States,11,Stanford University,467,=3,40,private,99.9,4,...,96.7,5,89.1,2,42 : 58,15845,22%,7.5,"Archaeology,Architecture,Art, Performing Arts ...",/world-university-rankings/stanford-university
4,Massachusetts Institute of Technology,United States,0,Massachusetts Institute of Technology,471,5,50,private,100.0,1,...,91.9,9,87.3,4,37 : 63,11177,34%,8.7,"Architecture,Art, Performing Arts & Design,Bio...",/world-university-rankings/massachusetts-insti...


In [25]:
times_ranking_dataframe.columns

Index(['aliases', 'location', 'member_level', 'name', 'nid', 'rank',
       'rank_order', 'record_type', 'scores_citations',
       'scores_citations_rank', 'scores_industry_income',
       'scores_industry_income_rank', 'scores_international_outlook',
       'scores_international_outlook_rank', 'scores_overall',
       'scores_overall_rank', 'scores_research', 'scores_research_rank',
       'scores_teaching', 'scores_teaching_rank', 'stats_female_male_ratio',
       'stats_number_students', 'stats_pc_intl_students',
       'stats_student_staff_ratio', 'subjects_offered', 'url'],
      dtype='object')

In [26]:
fields_to_keep = ['name', 'rank', 'location', 'url']

times_ranking_dataframe = times_ranking_dataframe[fields_to_keep].set_index('rank')

# Keep the 200 first university
times_ranking_dataframe = times_ranking_dataframe[:200]
times_ranking_dataframe.head()

Unnamed: 0_level_0,name,location,url
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,University of Oxford,United Kingdom,/world-university-rankings/university-oxford
2,University of Cambridge,United Kingdom,/world-university-rankings/university-cambridge
=3,California Institute of Technology,United States,/world-university-rankings/california-institut...
=3,Stanford University,United States,/world-university-rankings/stanford-university
5,Massachusetts Institute of Technology,United States,/world-university-rankings/massachusetts-insti...


In [27]:
# For all universities fetch its information page
times_ranking_dataframe.loc[:, 'html'] = times_ranking_dataframe.loc[:, 'url'].map(lambda url: BeautifulSoup(requests.get(times_url + url).text, 'html.parser')) 

In [28]:
def get_total_student(uni_page):
    try:
        raw_value = uni_page.find('div', class_='keystats number_students').previousSibling.contents[0]
        value = int(raw_value.strip().replace(',', ''))
    except:
        value = np.NaN
    return value

def get_ratio_inter_student(uni_page):
    try:
        raw_value = uni_page.find('div', class_='keystats pc_intl_students').previousSibling.contents[0]
        value = float(raw_value.strip().replace('%', '')) / 100 
    except:
        value = np.NaN
    return value

def get_ratio_faculty_member(uni_page):
    try:
        raw_value = uni_page.find('div', class_='keystats student_staff_ratio').previousSibling.contents[0]
        value = 1 / float(raw_value.strip().replace('%', '')) 
    except:
        value = np.NaN
    return value

def get_all_values(uni_page):
    total_student = get_total_student(uni_page)
    inter_student = math.ceil(get_ratio_inter_student(uni_page) * total_student)
    total_faculty_member = math.ceil(get_ratio_faculty_member(uni_page) * total_student)
    return [total_student, inter_student, total_faculty_member]

In [29]:
times_ranking_dataframe.loc[:, 'Total student'] = times_ranking_dataframe.loc[:, 'html'].map(lambda html: get_total_student(html))
times_ranking_dataframe.loc[:, 'International student'] = times_ranking_dataframe.loc[:, 'html'].map(lambda html: get_ratio_inter_student(html)) * times_ranking_dataframe.loc[:, 'Total student']
times_ranking_dataframe.loc[:, 'Faculty member'] = times_ranking_dataframe.loc[:, 'html'].map(lambda html: get_ratio_faculty_member(html)) * times_ranking_dataframe.loc[:, 'Total student']

columns_to_keep = ['rank', 'name', 'location', 'Total student', 'International student', 'Faculty member']
times_ranking_dataframe = times_ranking_dataframe.reset_index().loc[:, columns_to_keep] 


---

In [81]:
tu_ranking_dataframe.head()

Unnamed: 0,rank_display,title,country,region,Total faculty member,International faculty member,Total student,International student
0,1,Massachusetts Institute of Technology (MIT),United States,North America,2982.0,1679.0,11067.0,3717.0
1,2,Stanford University,United States,North America,4285.0,2042.0,15878.0,3611.0
2,3,Harvard University,United States,North America,4350.0,1311.0,22429.0,5266.0
3,4,California Institute of Technology (Caltech),United States,North America,953.0,350.0,2255.0,647.0
4,5,University of Cambridge,United Kingdom,Europe,5490.0,2278.0,18770.0,6699.0


In [82]:
times_ranking_dataframe.head()

Unnamed: 0,rank,name,location,Total student,International student,Faculty member
0,1,University of Oxford,United Kingdom,20409,7755.42,1822.232143
1,2,University of Cambridge,United Kingdom,18389,6436.15,1687.06422
2,=3,California Institute of Technology,United States,2209,596.43,339.846154
3,=3,Stanford University,United States,15845,3485.9,2112.666667
4,5,Massachusetts Institute of Technology,United States,11177,3800.18,1284.712644


In [382]:
# setting the school name as an index, since this will determine the merging of 2 dataframes
tu_ranking_dataframe.set_index('title', inplace = True)
times_ranking_dataframe.set_index('name', inplace = True)

In [383]:
# this function will determine the similarities between universities' names
def similarity_str(a, b):
    return (SequenceMatcher(None, a, b).ratio())

# this dataframe will conatin similarity values between uni names in tu and times dataframes
similarity_matrix = pd.DataFrame(columns=times_ranking_dataframe.index, index=tu_ranking_dataframe.index)

# fill in the inter-index similarity values
for tu_index_str in tu_ranking_dataframe.index:
    for times_index_str in times_ranking_dataframe.index:
        similarity_matrix.loc[tu_index_str][times_index_str] = similarity_str(tu_index_str, times_index_str)
        
similarity_matrix.head()

name,University of Oxford,University of Cambridge,California Institute of Technology,Stanford University,Massachusetts Institute of Technology,Harvard University,Princeton University,Imperial College London,University of Chicago,ETH Zurich – Swiss Federal Institute of Technology Zurich,...,University of Massachusetts,University of Auckland,Northeastern University,Lomonosov Moscow State University,Tilburg University,Paris-Sorbonne University – Paris 4,"Royal Holloway, University of London","University of California, Riverside",University of Gothenburg,National Taiwan University
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Massachusetts Institute of Technology (MIT),0.253968,0.242424,0.675325,0.16129,0.925,0.163934,0.222222,0.212121,0.28125,0.56,...,0.371429,0.276923,0.181818,0.184211,0.131148,0.153846,0.253165,0.205128,0.298507,0.231884
Stanford University,0.512821,0.47619,0.377358,1.0,0.214286,0.756757,0.666667,0.142857,0.5,0.210526,...,0.434783,0.487805,0.619048,0.538462,0.648649,0.518519,0.472727,0.37037,0.465116,0.622222
Harvard University,0.526316,0.487805,0.307692,0.756757,0.290909,1.0,0.631579,0.146341,0.512821,0.24,...,0.444444,0.5,0.634146,0.470588,0.666667,0.528302,0.481481,0.377358,0.47619,0.590909
California Institute of Technology (Caltech),0.3125,0.358209,0.871795,0.31746,0.641975,0.258065,0.25,0.238806,0.338462,0.574257,...,0.338028,0.333333,0.238806,0.155844,0.258065,0.303797,0.35,0.35443,0.352941,0.257143
University of Cambridge,0.744186,1.0,0.350877,0.47619,0.3,0.487805,0.465116,0.304348,0.772727,0.3,...,0.64,0.711111,0.434783,0.357143,0.487805,0.517241,0.508475,0.689655,0.723404,0.408163


In [384]:
# Chosing the merging choice depending on similarity value
decision_data = similarity_matrix.idxmax(axis=1);
decision_values = similarity_matrix.max(axis=1);

# magic constant 0.87, similarity string limit above which we consider unis the same
decision_data[decision_values < 0.871] = 'unknown' 
decision_data[decision_values == 1.0] = 'equal' 

similarity_matrix['decision'] = decision_data;
similarity_matrix[['decision']].head()

name,decision
title,Unnamed: 1_level_1
Massachusetts Institute of Technology (MIT),Massachusetts Institute of Technology
Stanford University,equal
Harvard University,equal
California Institute of Technology (Caltech),California Institute of Technology
University of Cambridge,equal


In [385]:
corr_names = similarity_matrix[['decision']].query('decision!="unknown"').query('decision!="equal"')['decision'];

# replacing names in TU with the names in TIMES (the decision)
rename_dictionary = dict(zip(corr_names.index, corr_names.values))
tu_ranking_dataframe_temp.rename(rename_dictionary, inplace = True)
    
tu_ranking_dataframe_temp.head()

Unnamed: 0_level_0,rank_display,country,region,Total faculty member,International faculty member,Total student,International student
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Massachusetts Institute of Technology,1,United States,North America,2982.0,1679.0,11067.0,3717.0
Stanford University,2,United States,North America,4285.0,2042.0,15878.0,3611.0
Harvard University,3,United States,North America,4350.0,1311.0,22429.0,5266.0
California Institute of Technology,4,United States,North America,953.0,350.0,2255.0,647.0
University of Cambridge,5,United Kingdom,Europe,5490.0,2278.0,18770.0,6699.0


In [386]:
#adding subcolumns before the merge to separate 2 datasets
tu_ranking_dataframe = pd.concat([tu_ranking_dataframe_temp], axis=1, keys=['TU'])
times_ranking_dataframe = pd.concat([times_ranking_dataframe_temp], axis=1, keys=['TIMES'])

In [431]:
#merging the dataframes
merged_dataframes = times_ranking_dataframe.join(tu_ranking_dataframe)

In [447]:
merged_dataframes.sort_values([('TIMES', 'rank') ], ascending = False, inplace=True)

In [448]:
merged_dataframes

Unnamed: 0_level_0,TIMES,TIMES,TIMES,TIMES,TIMES,TU,TU,TU,TU,TU,TU,TU
Unnamed: 0_level_1,rank,location,Total student,International student,Faculty member,rank_display,country,region,Total faculty member,International faculty member,Total student,International student
University of Basel,=95,Switzerland,12729,3436.83,719.152542,=149,Switzerland,Europe,1057.0,684.0,12852.0,3441.0
Korea Advanced Institute of Science and Technology (KAIST),=95,South Korea,9464,851.76,892.830189,,,,,,,
Rice University,=86,United States,6441,1803.48,715.666667,89,United States,North America,969.0,316.0,6610.0,1847.0
Uppsala University,=86,Sweden,25174,2769.14,1455.144509,112,Sweden,Europe,2567.0,584.0,25371.0,5198.0
University of Groningen,=83,Netherlands,26197,4453.49,1087.012448,113,Netherlands,Europe,3351.0,1137.0,27549.0,10780.0
Michigan State University,=83,United States,44556,7574.52,2620.941176,=149,United States,North America,2884.0,653.0,44951.0,7427.0
Monash University,=80,Australia,46846,13116.88,1390.089021,60,Australia,Oceania,3204.0,1679.0,57433.0,20578.0
University of Glasgow,=80,United Kingdom,23389,8420.04,1518.766234,65,United Kingdom,Europe,2775.0,956.0,23815.0,8149.0
Seoul National University,=74,South Korea,26470,2911.70,2100.793651,=36,South Korea,Asia,3930.0,433.0,28064.0,2331.0
Kyoto University,=74,Japan,22481,1798.48,2584.022989,=36,Japan,Asia,4060.0,293.0,22974.0,1990.0
