# EDA of QS World University Rankings 2017 - 2022<br>
The dataset is <a href="https://www.kaggle.com/datasets/padhmam/qs-world-university-rankings-2017-2022">QS World University Rankings (2017 - 2022)</a>, posted on Kaggle by user Padhma Muniraj.

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [5]:
# Load the QS World University Rankings 2017-2022 (V2) CSV from the Kaggle input directory.
qs = pd.read_csv(
    filepath_or_buffer='../input/qs-world-university-rankings-2017-2022/qs-world-university-rankings-2017-to-2022-V2.csv'
)

In [6]:
# Preview the first five rows of the raw frame.
qs.head(n=5)

In [7]:
# Column names, dtypes and non-null counts of the freshly loaded frame.
qs.info()

Drop the variables `link` and `logo` because we won't need them for the analysis.

In [8]:
# Remove the two columns the analysis does not use.
# Rebinding `qs` (rather than inplace=True) together with errors='ignore'
# keeps this cell re-runnable: executing it a second time no longer raises
# KeyError for columns that are already gone.
qs = qs.drop(columns=['link', 'logo'], errors='ignore')
qs

Check to see the number of missing values in each variable

In [9]:
# Number of missing entries per column.
qs.isna().sum().round()

Check to see the percentage of missing values in each variable

In [10]:
# Percentage of missing entries per column.
# The divisor is the row count of `qs` itself (the frame loaded above); the
# previous `qsDF` name was never defined in this notebook and raised NameError.
# One decimal place is kept so values such as 56.5% can be read off directly.
(qs.isnull().sum() * 100 / len(qs)).round(1)

Visualizing the missing values

In [12]:
# Long-format table of null flags: one row per (column, cell) with a boolean "missing".
missing_long = qs.isnull().melt(value_name="missing")

# Stacked, normalised bars: share of missing vs. present values for every column.
sns.displot(data=missing_long, y='variable', hue='missing', multiple='fill', aspect=2)
plt.title('Missing Data')

The variable `score` has 56.5% missing values, so it is appropriate to remove it from the dataset.

In [13]:
# Drop the mostly-empty `score` column.
# Rebinding `qs` with errors='ignore' (instead of inplace=True) makes the cell
# safe to execute more than once; a repeated run no longer raises KeyError.
qs = qs.drop(columns=['score'], errors='ignore')
qs

In [14]:
# Re-inspect dtypes and non-null counts after the column drops.
qs.info()

To build the correlation matrix, we first need to convert the quantitative variables that are stored as text into numeric types.

In [15]:
import re

In [18]:
# Convert the text-encoded count columns to floats by stripping every
# non-word character (this removes both '.' and ',' separators).
# regex=True is passed explicitly: since pandas 2.0 `Series.str.replace`
# treats the pattern as a literal string by default, which would leave the
# separators in place and make `astype(float)` fail.
for col in ('faculty_count', 'international_students'):
    if pd.api.types.is_numeric_dtype(qs[col]):
        # Already converted (cell re-run): the .str accessor would raise on floats.
        continue
    qs[col] = qs[col].str.replace(r'\W+', '', regex=True).astype(float)

In [19]:
# Confirm that faculty_count and international_students are now float columns.
qs.info()

In [20]:
# Correlation heatmap of the numeric columns only.
# `qs` still contains string columns; since pandas 2.0 `DataFrame.corr()` no
# longer drops them silently and raises instead, so the numeric subset is
# selected explicitly (this also works on older pandas versions).
numeric_corr = qs.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, cmap='inferno', annot=True)
plt.title('qs Correlation Matrix')


There seems to be a weak positive association between variables international_students and faculty_count; and a negligible negative association between variables faculty_count and student_faculty_ratio.

In [26]:
sns.set_style('dark')

# Scatter plot with a fitted regression line: faculty count against international students.
sns.lmplot(
    data=qs,
    x='international_students',
    y='faculty_count',
    height=8,
    aspect=2,
)

# lmplot is figure-level; decorate the axes it just made current.
reg_ax = plt.gca()
reg_ax.set_title(
    'Linear Regression Plot(No.of Students vs No.of Faculties)',
    fontsize=18,
    color="red",
)
reg_ax.set_xlabel('No of International Students')
reg_ax.set_ylabel('No. of Faculties')

In [38]:
# Per-year row counts, ascending (the chart below counts the rows itself).
year_df = qs['year'].value_counts().sort_values()

fig, ax = plt.subplots(figsize=(10,4), dpi=100)

# Keep only the bottom spine and hide the y axis; bar labels carry the counts.
for side in ('top', 'right', 'left'):
    ax.spines[side].set_visible(False)
ax.tick_params(bottom=False)
ax.yaxis.set_visible(False)

# One bar per ranking year, drawn on the axes created above.
sns.countplot(data=qs, x='year', ax=ax);

# Write each bar's count on top of it.
ax.bar_label(ax.containers[0])

ax.set_xlabel('Year', fontsize=14, color='#ff4800');
fig.suptitle('Number of universities ranked over the years', fontsize=14, color='#ff4800');

In [33]:
# Rows per university type, computed once and reused by both panels.
type_df = qs['type'].value_counts()
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10,5))

# Left panel: share of each type. The first (largest) wedge stays in place and
# every other wedge is pulled out slightly; sizing `explode` from the data
# avoids a length mismatch if the number of types is not exactly two.
explode = [0.1 if i else 0 for i in range(len(type_df))]
ax1.pie(type_df.values, labels=type_df.index, explode=explode, autopct='%1.1f%%')
ax1.axis('equal')

# Right panel: absolute counts as bars, with a minimal frame.
ax2.bar(type_df.index, type_df.values)
for side in ('top', 'right', 'left'):
    ax2.spines[side].set_visible(False)
ax2.tick_params(axis='both', which='both', labelsize=10, left=False, bottom=False)
ax2.get_yaxis().set_visible(False)
# Title the bar panel explicitly rather than relying on the implicit current axes.
ax2.set_title("University Types", fontsize=15, color='#ff4800');

# Write each bar's count on top of it.
ax2.bar_label(ax2.containers[0])

fig.tight_layout()
fig.subplots_adjust(wspace=0.8)

In [39]:
sns.set_style('dark')
plt.figure(figsize=(10,12))

# Mean number of international students per university type, split by year.
# The data source is `qs`; the previous `qsDF` name was never defined in this
# notebook and raised NameError on a fresh kernel.
sns.barplot(
    data=qs, x="type", y='international_students', hue='year', palette='viridis')
plt.legend(bbox_to_anchor=(1, 1), loc='upper right')
plt.title('Number of international students in private and public universities from 2017-2022', fontsize=20,
          color="purple")
plt.ylabel('Number of International Students')
plt.xlabel('Type of University')

There was a higher number of international students in public universities compared to private universities across all years.

New dataframe that contains info on the top 100 universities

First, let's convert the variable rank_display to a float

In [40]:
# Convert rank_display to a float using the leading number of each entry.
# Simply deleting '-' turned banded ranks such as "501-510" into 501510;
# taking the first run of digits maps a band to its lower bound (501) instead.
# Casting to str first keeps the cell re-runnable once the column is already
# float; entries without digits (missing values) become NaN.
qs['rank_display'] = (
    qs['rank_display']
    .astype(str)
    .str.extract(r'(\d+)', expand=False)
    .astype(float)
)

In [41]:
# Keep only rows whose numeric rank is 100 or better.
in_top_100 = qs['rank_display'].le(100)
newdf = qs.loc[in_top_100]
newdf


Distribution of the top 100 universities across the globe

Year 2021

In [47]:
# Top-100 rows for the 2021 ranking only.
newdf2021 = newdf.loc[newdf['year'].eq(2021)]
newdf2021

In [48]:
import plotly.express as px

In [49]:
# World map of the 2021 top-100 universities: countries matched by name,
# coloured by university, with one animation frame per rank_display value.
px.choropleth(
    data_frame=newdf2021,
    locations='country',
    locationmode='country names',
    color='university',
    hover_data=['country','city','type','research_output'],
    animation_frame='rank_display',
    projection='natural earth',
    labels={
        'rank_display':'University Ranking',
        'type':'Type of University',
        'country':'Country',
        'city':'City',
        'research_output':'Research Output',
        'university':'University',
    },
    title='Top 100 World University Rankings 2021',
)

Year 2022

In [50]:
# Top-100 rows for the 2022 ranking only.
newdf2022 = newdf.loc[newdf['year'].eq(2022)]
newdf2022

In [51]:
# World map of the 2022 top-100 universities: countries matched by name,
# coloured by university, with one animation frame per rank_display value.
px.choropleth(
    data_frame=newdf2022,
    locations='country',
    locationmode='country names',
    color='university',
    hover_data=['country','city','type', 'research_output', 'rank_display'],
    animation_frame='rank_display',
    projection='natural earth',
    labels={
        'rank_display':'University Ranking',
        'type':'Type of University',
        'country':'Country',
        'city':'City',
        'research_output':'Research Output',
        'university':'University',
    },
    title='Top 100 World University Rankings 2022',
)

In [63]:
# Custom 18-colour palette shared by the country charts.
long_palette = [
    "#FA6E4F", "#F2CF59", "#FB8E7E", "#C5D7C0", "#8EC9BB", "#F8CA9D",
    '#F69EAF', '#8F8CBC', '#7C5396', '#EA6382', '#6BEAF3', '#5A9DE2',
    '#DDAD64', '#EA876B', '#B98174', '#357866', '#625586', '#647B99',
]
custom_palette1 = sns.color_palette(long_palette)
uni_df = qs['university'].value_counts()

fig, ax = plt.subplots(figsize=(10,20), dpi=150)

# Horizontal bars, one per country, ordered from most to fewest rows.
country_order = qs['country'].value_counts().index
sns.countplot(data=qs, y='country', order=country_order, palette=custom_palette1, ax=ax);
ax.set_xlabel('Number of universities', fontsize=12, color='#ff4800')
ax.set_ylabel('Country', fontsize=12, color='#ff4800')
ax.set_title("Distribution of universities across countries", fontsize=14, color='#ff4800');

# plt.savefig('countrywise.png')

Top 100 World Universities 2017-2022: Preferred Countries for International students

In [64]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [69]:
# Side-by-side pies: international students per country among the top-100
# universities, 2022 on the left and 2021 on the right.
fig = make_subplots(1, 2, specs=[[{'type':'Pie'}, {'type':'Pie'}]],
                    subplot_titles=['2022', '2021'])

# Left pie: labels AND values both come from the 2022 frame. Previously the
# 2022 labels were paired with the 2021 values, so the slices did not belong
# to the countries they were labelled with.
fig.add_trace(go.Pie(labels=newdf2022['country'], values=newdf2022['international_students'],
                     hoverinfo='label+value+percent', textinfo='none'), 1, 1)
# Right pie: 2021 labels with 2021 values.
fig.add_trace(go.Pie(labels=newdf2021['country'], values=newdf2021['international_students'],
                     hoverinfo='label+value+percent', textinfo='none'), 1, 2)

fig.update_layout(title_text='Top 100 World Universities 2022-2021: Preferred Countries for International students')
fig.show()

In [77]:
# Ten countries with the largest summed international_students value,
# as a one-column frame indexed by country (largest first).
intstu_country = (
    qs.groupby(['country'], sort=False)['international_students']
    .sum()
    .sort_values(ascending=False)
    .iloc[:10]
    .to_frame()
)


In [79]:
fig, ax = plt.subplots(figsize=(10,4), dpi=100)

# Keep only the left spine and hide the x axis; bar labels carry the values.
for side in ('top', 'right', 'bottom'):
    ax.spines[side].set_visible(False)
ax.tick_params(left=False)
ax.xaxis.set_visible(False)

# Horizontal bars for the ten countries held in `intstu_country`.
sns.barplot(
    data=intstu_country,
    x='international_students',
    y=intstu_country.index,
    palette=custom_palette1,
    ax=ax,
);

# Print each bar's value as an integer at its end.
ax.bar_label(ax.containers[0], fmt='%d')

ax.set_ylabel('Country', fontsize=13, color='#ff4800');
fig.suptitle('Country of choice for International Students from 2017 - 2022', fontsize=14);

In [84]:
# Rows ranked 10th or better in any year.
topuni = qs.loc[qs['rank_display'].le(10)]

# One line per university tracing its rank across the years.
fig, ax = plt.subplots(figsize=(10,4), dpi=150)
sns.lineplot(
    data=topuni,
    x="year",
    y="rank_display",
    hue="university",
    style="university",
    markers=True,
    dashes=False,
    ax=ax,
)
# Legend goes below the chart.
ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.2))
# Flip the y axis so that rank 1 sits at the top.
ax.invert_yaxis()
# Put a tick at every rank value that occurs.
plt.yticks(topuni['rank_display'])

ax.set_title("Top Universities Ranking Over the Years", fontsize=15);
