In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file under the Kaggle input directory, one full
# path per line, so the available datasets are visible in the cell output.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import plotly.express as px

In [None]:
# Read the rankings CSV from the Kaggle input directory into a DataFrame.
rankings_csv = "../input/world-university-ranking-2022-2023/WORLD UNIVERSITY RANKINGS.csv"
df = pd.read_csv(rankings_csv)

### Below is a snapshot of the data

In [None]:
# Preview the first few rows of the freshly loaded DataFrame.
df.head()

In [None]:
# (number of rows, number of columns) of the loaded DataFrame.
df.shape

The data has 2000 data points with 9 features described below:

* World Rank: Global ranking of the University
* Institution: Name of the Institution or the University
* Location: Country where the Institution is located
* National Rank: Rank of the University in the Country where it's located
* Education Rank: Global rank metric derived from the number of university's alumni who have won prestigious academic distinctions.
* Employability Rank: Global rank metric derived from the professional success of university’s alumni, and measured by the number of a university's alumni who have held top positions at major companies.
* Faculty Rank: Global rank metric derived from the number of faculty members who have won prestigious academic distinctions.
* Research Rank: Global rank of the university based on the research output, high quality publications, citation and research influence.
* Score: A metric generated using all the other features, representing the overall performance of a University.

In [None]:
# Summary statistics of the raw frame, before any cleaning.
# NOTE(review): columns that contain the '-' placeholder are presumably read
# as text and therefore left out of this summary - confirm against the output.
df.describe()

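We can see that some of the features are missing from the summary above. Let's bring them in by cleaning the data: replacing the '-' placeholders with NaN and converting the rank and score columns to numbers.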

In [None]:
# Columns that should be numeric; the raw file marks missing values with '-'.
numeric_columns = ['World Rank', 'National Rank', 'Education Rank', 'Employability Rank',
                   'Faculty Rank', 'Research Rank', 'Score']

# Turn every '-' cell into NaN, then cast the rank/score columns to float so
# they take part in numeric summaries and plots.
df = (
    df.replace('-', np.nan)
      .astype(dict.fromkeys(numeric_columns, float))
)

In [None]:
# Summary statistics again, now that the rank and score columns are floats.
df.describe()

Above we can see the basic statistics of the numerical features in the data.

### Let's see the top 10 countries with the most universities in the dataset:

In [None]:
# value_counts() sorts locations by frequency, so the first ten entries are
# the ten countries with the most universities in the dataset.
top_location_counts = df['Location'].value_counts().head(10)
top_location_counts.plot.bar()
plt.show()

### Distribution of world ranking of the universities in top 10 countries (with most universities).

In [None]:
# World Rank spread per country, limited to the ten countries that contribute
# the most universities (value_counts() orders locations by frequency).
countries = df['Location'].value_counts().head(10).index
dd = df.loc[df['Location'].isin(countries)]
fig = px.box(data_frame=dd, x="Location", y="World Rank")
fig.show()

From the above plot, it is observable that: 
* UK, USA, France and China have most variance in University rankings, meaning they have universities with a wide variety of ranks.
* Turkey and Japan have the least variance, with a median around 1300, meaning their universities are mostly ranked around 1300 globally.
* Germany has the lowest median global university ranking, meaning Germany has a lot of highly ranked universities.

### Distribution of National ranking of the universities in top 10 countries (with most universities).

In [None]:
# National Rank spread per country for the ten best-represented countries.
countries = df['Location'].value_counts().head(10).index
dd = df.loc[df['Location'].isin(countries)]
fig = px.box(data_frame=dd, x="Location", y="National Rank")
fig.show()

From the above plot, it is observable that: 
* USA and China have most variance in University national rankings, half of them have a ranking between 75 and 350.
* The other countries have universities with competing national rankings, most of them have a ranking less than 100.

In [None]:
# Education Rank spread per country for the ten best-represented countries.
countries = df['Location'].value_counts().head(10).index
dd = df.loc[df['Location'].isin(countries)]
fig = px.box(data_frame=dd, x="Location", y="Education Rank")
fig.show()

From the above plot, it is observable that: 
* USA, Germany, and UK have universities with the best education ranking based on the median ranks.
* India, China, and Turkey have the worst education ranking (still in top 10) based on the median ranks.

In [None]:
# Employability Rank spread per country for the ten best-represented countries.
countries = df['Location'].value_counts().head(10).index
dd = df.loc[df['Location'].isin(countries)]
fig = px.box(data_frame=dd, x="Location", y="Employability Rank")
fig.show()

From the above plot, it is observable that: 
* USA, France, and India have universities with the best employability based on the median ranks.
* Turkey, Italy, and China have universities with not so good employability based on the median ranks.

In [None]:
# Faculty Rank spread per country for the ten best-represented countries.
countries = df['Location'].value_counts().head(10).index
dd = df.loc[df['Location'].isin(countries)]
fig = px.box(data_frame=dd, x="Location", y="Faculty Rank")
fig.show()

From the above plot, it is observable that: 
* USA, UK, Japan, France and India have universities with the best faculty based on the median ranks.

Notice the skewness in the box-plot for France and India, most of the universities in these countries have a high faculty rank.
* Italy and China have universities with not so good faculty rankings based on the median ranks.

Notice the negative skewness in the box-plot of Italy: most of the universities in Italy have a low faculty ranking.

In [None]:
# Research Rank spread per country for the ten best-represented countries.
countries = df['Location'].value_counts().head(10).index
dd = df.loc[df['Location'].isin(countries)]
fig = px.box(data_frame=dd, x="Location", y="Research Rank")
fig.show()

From the above plot, it is observable that: 
* Germany, Italy, and USA top in the research rank based on the median ranks.
* Turkey, India, and Japan have universities with not so good research ranks based on the median ranks.

### Correlation between different variables

In [None]:
# Pairwise Pearson correlation between the numeric features, computed only on
# rows that have no missing value in any column.
dd = df.copy().dropna()

# Restrict to numeric columns first: the text columns (Institution, Location)
# cannot be correlated and make DataFrame.corr raise on pandas >= 2.0.
dg = dd.select_dtypes(include='number').corr()

# Plot the correlation matrix itself. Calling .corr() a second time on `dg`
# would correlate the columns of the correlation matrix with each other,
# which is not the feature-to-feature correlation the heat-map is meant to show.
correlation = dg

fig = px.imshow(correlation, text_auto=True)
fig.show()

In the correlation map, we can see that the National Rank is very loosely correlated with all the other features. This could be because national ranks are not distinct: multiple countries can each have a university nationally ranked 3rd, whereas the global ranking (for example) is unique.

### The other feature that is most loosely related to the rest is "Employability Rank". Let's dig deeper:

I'll plot a correlation between World rank and the Employability for each country and check which ones have the highest correlation.

In [None]:
# Per-country correlation between World Rank and Employability Rank.
# Both columns are already float after the cleaning cell, so no cast is needed.
dx = df[['World Rank', 'Employability Rank', 'Location']].dropna()

# Keep only countries with more than 10 complete rows so each correlation is
# estimated from a reasonable number of universities.
location_counts = dx['Location'].value_counts()
countries = location_counts.index[location_counts > 10]
dx = dx[dx['Location'].isin(countries)]

# groupby().corr() yields one 2x2 matrix per country, indexed by
# (Location, column name). Select the "World Rank" row of each matrix by
# label rather than by position, then read its "Employability Rank" entry.
per_country_corr = dx.groupby('Location').corr()
is_world_rank_row = per_country_corr.index.get_level_values(1) == 'World Rank'
world_vs_employability = per_country_corr.loc[is_world_rank_row, 'Employability Rank'].dropna()

corr_label = 'Correlation between Employability and World Rank'
dy = pd.DataFrame({
    'Location': world_vs_employability.index.get_level_values(0),
    corr_label: world_vs_employability.to_numpy(),
}).sort_values(by=corr_label, ascending=False)

fig = px.bar(dy, x='Location', y=corr_label)
fig.show()

From the above plot, we can see that Sweden, Turkey and Australia have the highest positive correlation between Employability and World Rank, whereas USA, China, and Italy have the lowest positive correlation.

### Next I would like to plot a PCA to see if any pattern exists in the Data.

In [None]:
from sklearn.preprocessing import StandardScaler

# Numeric columns that feed the PCA.
features = ['World Rank', 'National Rank', 'Education Rank', 'Employability Rank',
            'Faculty Rank', 'Research Rank', 'Score']

# Work on complete rows only, then rescale every feature with StandardScaler
# (zero mean, unit variance) so no single column dominates the components.
dd = df.copy().dropna()
scaler = StandardScaler()
dd[features] = scaler.fit_transform(dd[features])
dd.head()

In [None]:
# Split the standardised frame into the numeric PCA inputs and the two
# descriptive text columns.
dd_numerical = dd.loc[:, features]
dd_categorical = dd.loc[:, ['Institution', 'Location']]

In [None]:
from sklearn.decomposition import PCA

# Project the standardised features onto their first two principal components.
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(dd_numerical)

# Carry over the index of the rows that went into the PCA. Without it the
# frame gets a fresh 0..n-1 index, while the rows surviving dropna() keep
# their original labels, so attaching the metadata would align on the wrong
# labels and pair components with the wrong universities (or with NaN).
principalDf = pd.DataFrame(
    data=principalComponents,
    columns=['principal component 1', 'principal component 2'],
    index=dd_numerical.index,
)

# Attach the original (unscaled) columns for colouring and hover text; the
# join matches rows on the shared index labels.
all_features = dd.columns
principalDf = principalDf.join(df[all_features])

principalDf.head()

In [None]:
# Scatter of the two principal components, coloured by country; every
# original column is added to the hover tooltip.
fig = px.scatter(
    data_frame=principalDf,
    x="principal component 1",
    y="principal component 2",
    color='Location',
    hover_data=all_features,
)
fig.show()

The above plot is simply a 2-dimensional visual representation of all the features combined. One observable trend is that the universities are arranged from left to right according to their rankings: universities ranked highly in all the fields are on the left, whereas universities ranked poorly in all the fields are on the right.