In [2]:
# Import Altair, the charting library used for every figure in this notebook
import altair as alt
# Print the installed Altair version so the rendered output records it
print(alt.__version__)

# Turn off the max-rows check on Altair's data transformers.
# As the last expression of the cell, the call's return value is echoed as the cell output.
alt.data_transformers.disable_max_rows()

5.0.1


DataTransformerRegistry.enable('default')

### **Data Preparation**

In [3]:
import pandas as pd

# Read the 'Canada by Citizenship' sheet, skipping the first 20 rows above the
# table and the last 2 rows below it.
df = pd.read_excel(
    'Canada.xlsx',
    sheet_name='Canada by Citizenship',
    skiprows=20,
    skipfooter=2,
)
display(df.head())

Unnamed: 0,Type,Coverage,OdName,AREA,AreaName,REG,RegName,DEV,DevName,1980,...,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013
0,Immigrants,Foreigners,Afghanistan,935,Asia,5501,Southern Asia,902,Developing regions,16,...,2978,3436,3009,2652,2111,1746,1758,2203,2635,2004
1,Immigrants,Foreigners,Albania,908,Europe,925,Southern Europe,901,Developed regions,1,...,1450,1223,856,702,560,716,561,539,620,603
2,Immigrants,Foreigners,Algeria,903,Africa,912,Northern Africa,902,Developing regions,80,...,3616,3626,4807,3623,4005,5393,4752,4325,3774,4331
3,Immigrants,Foreigners,American Samoa,909,Oceania,957,Polynesia,902,Developing regions,0,...,0,0,1,0,0,0,0,0,0,0
4,Immigrants,Foreigners,Andorra,908,Europe,925,Southern Europe,901,Developed regions,0,...,0,0,1,1,0,0,0,0,1,1


In [4]:
# Sanity check: show every row that is an exact duplicate of an earlier row
# (an empty frame means the data has no duplicates)
df.loc[df.duplicated()]

Unnamed: 0,Type,Coverage,OdName,AREA,AreaName,REG,RegName,DEV,DevName,1980,...,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013


In [5]:
# Drop the columns that are not needed for the analysis.
# The result is reassigned instead of using inplace=True, and errors='ignore'
# skips labels that are no longer present, so re-running this cell does not
# raise KeyError once the columns have already been removed.
cols = ['Type', 'Coverage', 'AREA', 'REG', 'DEV', 'DevName', 'RegName']
df = df.drop(columns=cols, errors='ignore')

In [16]:
# Reshape from wide format (one column per year) to long format (one row per
# country and year). The former column labels land in 'variable' and the counts
# in 'value'; the labels are cast to str in the same chain.
df_melt = (
    df.melt(id_vars=['OdName', 'AreaName'], value_vars=df.columns[2:])
      .assign(variable=lambda long_df: long_df['variable'].astype(str))
)

# show
df_melt

Unnamed: 0,OdName,AreaName,variable,value
0,Afghanistan,Asia,1980,16
1,Albania,Europe,1980,1
2,Algeria,Africa,1980,80
3,American Samoa,Oceania,1980,0
4,Andorra,Europe,1980,0
...,...,...,...,...
6625,Viet Nam,Asia,2013,2112
6626,Western Sahara,Africa,2013,0
6627,Yemen,Asia,2013,217
6628,Zambia,Africa,2013,59


**1. Visualize the trend of total immigration to Canada (all countries combined) for the years 1980 to 2013**

In [15]:
# Total number of immigrants per year (1980 to 2013), summed over all countries
source = df_melt.groupby('variable', as_index=False)['value'].sum()

source.head()

Unnamed: 0,variable,value
0,1980,99137
1,1981,110563
2,1982,104271
3,1983,75550
4,1984,73417


In [9]:
# Area chart of total yearly immigration: black outline on top, filled with a
# white-to-dark-green linear gradient.
alt.Chart(source).mark_area(
    line={'color': 'black'},
    color=alt.Gradient(
        gradient='linear',
        stops=[alt.GradientStop(color='white', offset=0),
               alt.GradientStop(color='darkgreen', offset=1)],
        x1=1, y1=1
    )
).encode(
    # 'variable' holds the year as a string; ':T' makes Altair treat it as a date
    alt.X('variable:T', title='Year'),
    alt.Y('value:Q', title='Total Immigrants'),
    tooltip=[
        alt.Tooltip('variable:T', format='%Y', title='year'),
        # '.4s' = SI-prefixed number with 4 significant digits
        alt.Tooltip('value:Q', title='Total', format='.4s'),
    ]
).properties(
    title=alt.Title(
        "Annual Immigration Totals Surge to Nearly 190k People",
        # The data starts in 1980 (not 1981), so the subtitle states that range
        subtitle='Immigration Trends from 1980 to 2013',
        anchor='start',
        font='Calibri',
        fontSize=18,
        offset=20,
    ),
    width=700, height=250
)

**2. Compare the trend of top 5 countries that contributed the most to immigration to Canada from 1980 to 2013.**

In [17]:
# Names of the five countries with the highest total number of immigrants
top5 = (
    df_melt.groupby('OdName')['value']
    .sum()
    .nlargest(5)
    .index.to_list()
)

# Keep only the rows belonging to those five countries
source = df_melt.loc[df_melt['OdName'].isin(top5)]

# show
source.head()

Unnamed: 0,OdName,AreaName,variable,value
36,China,Asia,1980,5123
79,India,Asia,1980,8880
130,Pakistan,Asia,1980,978
136,Philippines,Asia,1980,6051
183,United Kingdom of Great Britain and Northern I...,Europe,1980,22045


In [18]:
# Streamgraph: stacked areas centred around the middle of the plot, one layer
# per country, with the y axis hidden.
alt.Chart(source).mark_area().encode(
    # 'variable' holds the year as a string; ':T' makes Altair treat it as a date
    alt.X('variable:T', title='year').axis(format='%Y', domain=False, tickSize=0),
    alt.Y('value:Q').stack('center').axis(None),
    alt.Color('OdName:N').scale(scheme='redblue', reverse=True).legend(title='Country'),
    tooltip=[
        alt.Tooltip('OdName:N', title='country'),
        alt.Tooltip('variable:T', format='%Y', title='year'),
        # Tooltip label spelling corrected (was 'imigrant')
        alt.Tooltip('value:Q', format='.2s', title='immigrants'),
    ]
).properties(
    title=alt.Title(
        "India and China are Consistently The Largest Immigrant Contributors ",
        subtitle='Immigrant Dynamics by Top 5 Countries',
        anchor='start',
        font='Calibri',
        fontSize=18,
        offset=20,
    ),
    width=700, height=250
)

**3. Explore the proportion (percentage) of new immigrants grouped by continents in the year 2013.**

In [19]:
# Select the rows for the year 2013 (years are stored as strings)
mask = df_melt['variable'].eq('2013')

# Per continent: total immigrants, the share of the 2013 total ('pct'),
# and a shorter display label for Latin America
source = (
    df_melt.loc[mask]
    .groupby('AreaName', as_index=False)['value'].sum()
    .assign(pct=lambda totals: totals['value'] / totals['value'].sum())
    .replace({'Latin America and the Caribbean': 'Latin America'})
)

# show
source.head()

Unnamed: 0,AreaName,value,pct
0,Africa,38543,0.14966
1,Asia,155075,0.602146
2,Europe,28691,0.111405
3,Latin America,24950,0.096879
4,Northern America,8503,0.033017


In [20]:
# Colour scale that highlights Asia in orange and paints every other continent grey
color_scale = alt.Scale(
    domain=list(source['AreaName']),
    range=source['AreaName']
        .map(lambda name: '#DF8142' if name == 'Asia' else 'grey')
        .tolist(),
)

# Horizontal bar chart of each continent's share, largest share at the top
alt.Chart(source).mark_bar().encode(
    alt.X('pct', title='Percentage of Immigrant', axis=alt.Axis(format='%')),
    alt.Y('AreaName', title=None, sort='-x'),
    alt.Color('AreaName', scale=color_scale, legend=None),
    alt.Tooltip('pct', format='.2%'),
).properties(
    width=500,
    height=250,
    title=alt.Title(
        "60% of Immigrants in Canada come from Asia",
        subtitle='Immigrant Percentage (%) by Continent',
        anchor='start',
        font='Calibri',
        fontSize=18,
        offset=20,
    ),
)