[Reference](https://prateekjoshi.medium.com/create-stunning-visualizations-using-altair-8a10c8882fdb)

In [1]:
!pip install opendatasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting opendatasets
  Downloading opendatasets-0.1.22-py3-none-any.whl (15 kB)
Installing collected packages: opendatasets
Successfully installed opendatasets-0.1.22


In [2]:
import os

import opendatasets as od

# Fetch the dataset straight from Kaggle into ./data/raw.
# The download helper prompts for Kaggle API credentials (see kaggle.json).
dataset_url = "https://www.kaggle.com/datasets/mylesoneill/world-university-rankings"
od.download(dataset_url, './data/raw')

# Folder the archive is extracted into; list it to confirm which files arrived.
raw_path = './data/raw/world-university-rankings/'
os.listdir(raw_path)

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username: sangyunkang
Your Kaggle Key: ··········
Downloading world-university-rankings.zip to ./data/raw/world-university-rankings


100%|██████████| 1.41M/1.41M [00:00<00:00, 120MB/s]







['shanghaiData.csv',
 'timesData.csv',
 'educational_attainment_supplementary_data.csv',
 'school_and_country_table.csv',
 'education_expenditure_supplementary_data.csv',
 'cwurData.csv']

In [4]:
import pandas as pd
import altair as alt

In [11]:
# Change the kernel's working directory to the downloaded dataset folder so the
# following cells can refer to the CSV files by bare file name.
# NOTE(review): the path is relative, so re-running this cell from the new working
# directory will not resolve; run it once per kernel session.
%cd data/raw/world-university-rankings

/content/data/raw/world-university-rankings


In [12]:
# List the files in the current working directory (shell command).
!ls

cwurData.csv				       school_and_country_table.csv
educational_attainment_supplementary_data.csv  shanghaiData.csv
education_expenditure_supplementary_data.csv   timesData.csv


In [13]:
# Read the rankings table used throughout the rest of the notebook
# (file name is resolved against the current working directory).
times_csv = "timesData.csv"
df = pd.read_csv(times_csv)

# Preview the first rows to check the columns parsed as expected.
df.head()

Unnamed: 0,world_rank,university_name,country,teaching,international,research,citations,income,total_score,num_students,student_staff_ratio,international_students,female_male_ratio,year
0,1,Harvard University,United States of America,99.7,72.4,98.7,98.8,34.5,96.1,20152,8.9,25%,,2011
1,2,California Institute of Technology,United States of America,97.7,54.6,98.0,99.9,83.7,96.0,2243,6.9,27%,33 : 67,2011
2,3,Massachusetts Institute of Technology,United States of America,97.8,82.3,91.4,99.9,87.5,95.6,11074,9.0,33%,37 : 63,2011
3,4,Stanford University,United States of America,98.3,29.5,98.1,99.2,64.3,94.3,15596,7.8,22%,42 : 58,2011
4,5,Princeton University,United States of America,90.9,70.3,95.4,99.9,-,94.2,7929,8.4,27%,45 : 55,2011


In [14]:
# Number of rows recorded for each ranking year, most frequent first.
year_counts = df['year'].value_counts()
year_counts

2016    800
2012    402
2015    401
2013    400
2014    400
2011    200
Name: year, dtype: int64

In [15]:
# Keep only the rows from the most recent ranking year. The index is rebuilt
# (old labels dropped) so the resulting frame is labelled from 0 upwards.
latest_year = df['year'].max()
is_latest = df['year'] == latest_year
df_latest = df[is_latest].reset_index(drop=True)

# Visualizations using Altair
## 1. Histogram

In [16]:
# Histogram of research scores for the first 100 rows of the latest-year table.
# `.head(100)` replaces `.loc[:100]`: label-based slicing includes the end label,
# so the old slice selected 101 rows (labels 0..100) despite the "Top 100" title.
# Assumes df_latest is ordered by rank, as the original slice did.
alt.Chart(df_latest.head(100)).mark_bar().encode(
    # bin the continuous score into at most 30 buckets
    x = alt.X('research', bin = alt.BinParams(maxbins = 30),
              title="Research Score of Top 100 Universities"),

    y = 'count()' # aggregate function: number of universities per bin
)

## 2. Bar Chart


In [17]:
# Bar chart: how many of the first 100 rows belong to each country.
# `.head(100)` replaces `.loc[:100]`, which is end-inclusive and picked 101 rows.
alt.Chart(df_latest.head(100)).mark_bar().encode(
    x = 'country',
    y = 'count()'  # aggregate: rows per country
)

In [18]:
# Same per-country count as a bar chart, with bars ordered by descending height.
# `.head(100)` replaces `.loc[:100]`, which is end-inclusive and picked 101 rows.
alt.Chart(df_latest.head(100)).mark_bar().encode(
    x = alt.X('country', sort = '-y'), # sort the bars by the y value, descending
    y = alt.Y('count()', title = "Count of Universities")
)

In [19]:
# Sorted per-country bar chart with a hover tooltip and pan/zoom enabled.
# `.head(100)` replaces `.loc[:100]`, which is end-inclusive and picked 101 rows.
alt.Chart(df_latest.head(100)).mark_bar().encode(
    x = alt.X('country', sort = '-y'),
    y = alt.Y('count()', title = "Count of Universities"),
    tooltip = 'count()' # information to display on mouse hover
).interactive()

## 3. Grouped Bar Chart


In [20]:
# Mean citation score per year, drawn as one bar per year.
bars = (
    alt.Chart(df)
    .mark_bar()
    .encode(
        alt.X('year:O'),  # ':O' -> treat year as an ordinal category
        alt.Y('mean(citations):Q', title='Mean Citation'),
        alt.Color('year:N'),  # one colour per year
    )
)

# Repeat the bar chart once per country, laid out as side-by-side columns.
country_facets = alt.layer(bars).facet(
    column='country'
)
country_facets

## 4. Scatter Plot


In [21]:
# Scatter plot of teaching score against citation score for the first 100 rows.
# Fixes relative to the earlier version:
#   * the x channel is built with alt.X (it was mistakenly built with alt.Y);
#   * `.head(100)` replaces `.loc[:100]`, which is end-inclusive and picked 101 rows.
alt.Chart(df_latest.head(100)).mark_circle(size=100).encode(
    x=alt.X('teaching', title = "Teaching Score"),
    y=alt.Y('citations', title = "Citation Score"),
    tooltip=['teaching', 'citations']  # values shown on hover
).interactive()

In [22]:
# Teaching vs. citation scatter plot for the first 100 rows, coloured by country.
# Fixes relative to the earlier version:
#   * the x channel is built with alt.X (it was mistakenly built with alt.Y);
#   * `.head(100)` replaces `.loc[:100]`, which is end-inclusive and picked 101 rows.
alt.Chart(df_latest.head(100)).mark_circle(size=100).encode(
    x=alt.X('teaching', title = "Teaching Score"),
    y=alt.Y('citations', title = "Citation Score"),
    color = 'country',
    tooltip=['teaching', 'citations']  # values shown on hover
).interactive()

## 5. Scatter Matrix


In [23]:
# Pairwise scatter plots of the three score columns for the whole latest-year table.
score_columns = ['teaching', 'research', 'citations']

# Template panel: x and y fields are filled in by the repeat operator below.
panel = alt.Chart(df_latest).mark_circle().encode(
    x=alt.X(alt.repeat("column"), type='quantitative'),
    y=alt.Y(alt.repeat("row"), type='quantitative'),
).properties(width=200, height=200)

# Rows follow score_columns; columns use the reversed order.
panel.repeat(
    row=score_columns,
    column=score_columns[::-1],
).interactive()

## 6. Line Chart


In [24]:
# Smoothed line of citation score against rank for the first 100 rows.
# `.head(100)` replaces `.loc[:100,:]`: label slicing is end-inclusive, so the old
# expression selected 101 rows (labels 0..100).
# NOTE(review): world_rank is encoded as quantitative (':Q'); confirm the column
# holds plain numbers for these rows.
alt.Chart(df_latest.head(100)).mark_line(interpolate='basis').encode(
    x = alt.X('world_rank:Q', title = "University Rank"),
    y = alt.Y('citations:Q', title = "Citation Score")
)

## 7. Hybrid Chart


In [25]:
# Layered chart: a binned 2-D heatmap of teaching vs. research scores with the
# individual universities drawn on top as small black dots.
# The subset is taken once with `.head(100)`; the earlier `.loc[:100,:]` slices were
# end-inclusive and therefore selected 101 rows each.
top_100 = df_latest.head(100)

# Background: count of universities per (teaching bin, research bin) cell.
heatmap = alt.Chart(top_100).mark_rect().encode(
    alt.X('teaching:Q', bin=alt.BinParams(maxbins = 20), title = "Teaching Score"),
    alt.Y('research:Q', bin=alt.BinParams(maxbins = 20), title = "Research Score"),
    alt.Color('count():Q', scale=alt.Scale(scheme='greenblue'))
)
# Foreground: one dot per university at its exact scores.
points = alt.Chart(top_100).mark_circle(color='black',size=10).encode(
    x='teaching:Q',
    y='research:Q',
)
# combine charts (the `+` operator layers them on shared axes)
heatmap + points