In [1]:
import pandas as pd
import altair as alt
import numpy as np
from altair import datum

# Turn off Altair's max-rows check so the full GDP table (8,699 raw rows) can be passed to charts.
alt.data_transformers.disable_max_rows()
# Enable the 'latimes' theme for the charts rendered in this notebook.
alt.themes.enable('latimes')

ThemeRegistry.enable('latimes')

# DATA 551 Exploratory Data Analysis
# Gross Domestic Product by Province and Territories

#### Contributors:
Vicens Paneque Fernandez, 85493799\
Bohan Gao, 25611161

## 1. Describe the dataset

The ***'Canada Gross domestic product (GDP) at basic prices, by industry, provinces and territories'*** dataset, available from Canada's national statistical office, Statistics Canada, at https://www150.statcan.gc.ca/t1/tbl1/en/cv.action?pid=3610040201, provides Annual Provincial and Territorial Gross Domestic Product (GDP) at basic prices (from 1997 to 2019), by North American Industry Classification aggregates, in chained (2012) and current dollars (dollars x 1,000,000).

**Source:** Statistics Canada. Table 36-10-0402-01  Gross domestic product (GDP) at basic prices, by industry, provinces and territories (x 1,000,000)\
**DOI:** https://doi.org/10.25318/3610040201-eng

* Statistics Canada provides an Open Licence issued on behalf of Her Majesty the Queen in Right of Canada, as represented by the Minister for Statistics Canada ("Statistics Canada").

## 2. Load the dataset

In [2]:
# Skip the first four lines of the export so the column names are read from the fifth.
gdp = pd.read_csv(
    'GDP.csv',
    skiprows=[0, 1, 2, 3],
)

## 3. Explore the dataset

In [3]:
# Preview the first rows of the raw table to see how the export is laid out.
gdp.head()

Unnamed: 0,Geography,North American Industry Classification System (NAICS) 7 8,Reference period,Chained (2012) dollars 9
0,,,,Dollars
1,Newfoundland and Labrador,Goods-producing industries [T002] 10,1997.0,6353.5
2,,,1998.0,7175.6
3,,,1999.0,8293.9
4,,,2000.0,9305.3


In [4]:
# Column dtypes and non-null counts: shows which columns need filling and type conversion.
gdp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8699 entries, 0 to 8698
Data columns (total 4 columns):
 #   Column                                                     Non-Null Count  Dtype  
---  ------                                                     --------------  -----  
 0   Geography                                                  40 non-null     object 
 1   North American Industry Classification System (NAICS) 7 8  398 non-null    object 
 2   Reference period                                           8671 non-null   float64
 3   Chained (2012) dollars 9                                   8672 non-null   object 
dtypes: float64(1), object(3)
memory usage: 272.0+ KB


In [5]:
# Summary statistics for the numeric columns of the raw table.
gdp.describe()

Unnamed: 0,Reference period
count,8671.0
mean,2008.0
std,6.633632
min,1997.0
25%,2002.0
50%,2008.0
75%,2014.0
max,2019.0


In [6]:
# (rows, columns) of the raw table before any cleaning.
gdp.shape

(8699, 4)

## 4. Initial thoughts

* The dataset includes numerous null values that need to be handled in order to continue with the analysis.
* The data types of the columns also need to be treated before continuing with the analysis.
* By having a breakdown of the different industries and geographies across Canada, it is possible to understand the evolution of the principal components of GDP. This breakdown could also be used to further model the relationship between attributes and make predictions.
* The dataset has information from 1997 to 2019, therefore we won't be able to investigate the economic impacts of the Covid-19 pandemic.

## 5. Wrangling

In [7]:
# Clean the raw export into a tidy table with the columns
# Geography / Industry / Year / GDP.
# NOTE: this cell rebinds `gdp` and renames its columns, so re-run the load
# cell before re-running this one.
industry_col = 'North American Industry Classification System (NAICS) 7 8'
year_col = 'Reference period'
gdp_col = 'Chained (2012) dollars 9'

# Keep only observation rows. The units row under the header and the trailing
# footnote rows have no reference period (see the non-null counts from
# gdp.info()), so filtering on that column replaces the positional drop of
# row 0 and of the last 27 rows. reset_index(drop=True) also yields a fresh
# frame, which avoids SettingWithCopyWarning on the assignments below.
gdp = gdp.dropna(subset=[year_col]).reset_index(drop=True)

# Geography and industry are only written on the first row of each group,
# so carry them forward onto the rows that follow.
label_cols = ['Geography', industry_col]
gdp[label_cols] = gdp[label_cols].ffill()

# Keep the text before the first '[' of the industry label and trim the
# whitespace that the split leaves behind.
industry_names = gdp[industry_col].str.split('[').str[0].str.strip()

# GDP is read as text. Remove thousands separators in this column only, so
# commas that are part of industry names are preserved, and map '..' entries
# to 0 as the original cleaning did.
gdp_values = gdp[gdp_col].replace(',', '', regex=True).replace('..', '0')

# Indicating the correct data types
gdp['Geography'] = gdp['Geography'].astype('string')
gdp[industry_col] = industry_names.astype('string')
gdp[year_col] = pd.to_numeric(gdp[year_col])
gdp[gdp_col] = pd.to_numeric(gdp_values)

# Renaming the columns
gdp = gdp.rename(columns={industry_col: "Industry", year_col: "Year", gdp_col: "GDP"})

gdp.head()

Unnamed: 0,Geography,Industry,Year,GDP
0,Newfoundland and Labrador,Goods-producing industries,1997.0,6353.5
1,Newfoundland and Labrador,Goods-producing industries,1998.0,7175.6
2,Newfoundland and Labrador,Goods-producing industries,1999.0,8293.9
3,Newfoundland and Labrador,Goods-producing industries,2000.0,9305.3
4,Newfoundland and Labrador,Goods-producing industries,2001.0,9596.3


## 6. Research questions

From a Province or Territory perspective in a given year:

1. What is the GDP?
2. What has been its trend?
3. What are its main components (industry contributions)?
4. How does it compare to other geographies?

## 7. Data analysis and visualizations

In [8]:
# Dropdown menu items: the selections read by the chart cells below.
# They are hard-coded here; edit the values and re-run the charts to change the view.

year = 2019
Geography = 'British Columbia'

In [9]:
# Headline figure: total GDP of the selected geography in the selected year,
# rendered as a single large text mark.
# NOTE(review): sum(GDP) adds every Industry row. If the table mixes aggregate
# sectors (e.g. 'Goods-producing industries') with their sub-industries, the
# total is double-counted -- confirm against the source table.
bar_x = alt.X(
    'sum(GDP):Q',
    title='GDP (dollars x 1,000,000)',
    axis=alt.Axis(grid=False, ticks=False, labels=False, labelFontSize=10),
)
bar_y = alt.Y(
    'Geography:O',
    sort='-x',
    title=None,
    axis=alt.Axis(grid=False, labelFontSize=20),
)
bar_tooltip = [alt.Tooltip('sum(GDP):Q', format='$,.2f', title='Total GDP $')]

bar = alt.Chart(gdp, title="Total GDP ")
bar = bar.mark_bar(size=80)
# Transforms are applied in the order they are added: aggregate, then filter.
bar = bar.transform_aggregate(groupby=['Geography', 'Year'], GDP='sum(GDP)')
bar = bar.encode(x=bar_x, y=bar_y, tooltip=bar_tooltip)
bar = bar.transform_filter(alt.FieldEqualPredicate(field='Geography', equal=Geography))
bar = bar.transform_filter(alt.FieldEqualPredicate(field='Year', equal=year))
bar = bar.properties(height=200, width=400)

# Swap the bar mark for a text mark that prints the formatted total.
total_gdp = (
    bar
    .mark_text(dx=-175, color='darkblue', size=60)
    .encode(text=alt.Text('sum(GDP):Q', format='$,.2f'))
    .configure_view(strokeOpacity=0)
)

total_gdp

In [10]:
# GDP trend for the selected geography, from 1997 up to the selected year.
historical_gdp = (
    alt.Chart(gdp, title="GDP Historical Evolution")
    .mark_line(point=alt.OverlayMarkDef(filled=False, fill='darkblue'), size=5)
    .encode(
        x=alt.X(
            'Year',
            # Year is stored as a number, not a date, so the axis needs a d3
            # number format. 'd' prints plain integers (1997, not 1,997);
            # the previous 'Y' is not a number-format type.
            axis=alt.Axis(grid=False, ticks=False, format='d', labelFontSize=10),
            # Keep the x domain on the observed years explicitly instead of
            # relying on the Vega-Lite default for `zero`.
            scale=alt.Scale(zero=False),
            title='Year',
        ),
        y=alt.Y(
            'sum(GDP):Q',
            axis=alt.Axis(grid=False, ticks=False, format='$,f', labelFontSize=10),
            title='GDP (dollars x 1,000,000)',
        ),
        tooltip=[
            alt.Tooltip('Year'),
            alt.Tooltip('sum(GDP):Q', format='$,.2f', title='Total GDP $'),
        ],
    )
    .transform_filter(alt.FieldEqualPredicate(field='Geography', equal=Geography))
    .transform_filter(alt.FieldRangePredicate('Year', [1997, year]))
    .configure_view(strokeOpacity=0)
    .properties(height=200, width=400)
)

historical_gdp

In [11]:
# Horizontal bars: GDP of each industry for the selected geography and year,
# largest first, with bar colour also mapped to the GDP total.
industry_x = alt.X(
    'sum(GDP):Q',
    title='GDP (dollars x 1,000,000)',
    axis=alt.Axis(format='$,f', labelFontSize=10),
)
industry_y = alt.Y('Industry:O', sort='-x')
industry_color = alt.Color(
    'sum(GDP)',
    title='Total GDP',
    scale=alt.Scale(scheme='lighttealblue'),
)
industry_tooltip = [
    alt.Tooltip('Industry'),
    alt.Tooltip('sum(GDP):Q', format='$,.2f', title='Total GDP $'),
]

industry_gdp = (
    alt.Chart(gdp, title="GDP Industry Contribution")
    .mark_bar()
    .encode(x=industry_x, y=industry_y, color=industry_color, tooltip=industry_tooltip)
    .transform_filter(alt.FieldEqualPredicate(field='Geography', equal=Geography))
    .transform_filter(alt.FieldEqualPredicate(field='Year', equal=year))
    .configure_view(strokeOpacity=0)
)

industry_gdp

In [12]:
# Vertical bars: total GDP of every geography in the selected year, sorted
# from largest to smallest.
geo_y = alt.Y(
    'sum(GDP):Q',
    title='GDP (dollars x 1,000,000)',
    axis=alt.Axis(format='$,f', labelFontSize=10),
)
geo_x = alt.X(
    'Geography:O',
    sort='-y',
    title=None,
    axis=alt.Axis(labelFontSize=10, labelAngle=-90),
)
geo_color = alt.Color(
    'sum(GDP)',
    title='Total GDP',
    scale=alt.Scale(scheme='lighttealblue'),
)
geo_tooltip = [
    alt.Tooltip('Geography'),
    alt.Tooltip('sum(GDP):Q', format='$,.2f', title='Total GDP $'),
]

geo_gdp = (
    alt.Chart(gdp, title="GDP Province Contribution")
    .mark_bar()
    .encode(y=geo_y, x=geo_x, color=geo_color, tooltip=geo_tooltip)
    .transform_filter(alt.FieldEqualPredicate(field='Year', equal=year))
    .configure_view(strokeOpacity=0)
    .properties(height=400, width=800)
)

geo_gdp

In [13]:
# One coloured cell per geography for the selected year; colour encodes the
# GDP total and the cells are ordered by that colour value, descending.
heatmap_x = alt.X(
    'Geography:O',
    sort='-color',
    title=None,
    axis=alt.Axis(labelFontSize=10, labelAngle=-90),
)
heatmap_color = alt.Color(
    'sum(GDP)',
    title='Total GDP',
    scale=alt.Scale(scheme='lighttealblue'),
)
heatmap_tooltip = [
    alt.Tooltip('Geography'),
    alt.Tooltip('sum(GDP):Q', format='$,.2f', title='Total GDP $'),
]

heatmap = (
    alt.Chart(gdp)
    .mark_rect()
    .encode(x=heatmap_x, color=heatmap_color, tooltip=heatmap_tooltip)
    .transform_filter(alt.FieldEqualPredicate(field='Year', equal=year))
    .properties(height=400, width=800)
)

heatmap

## 8. Summary and conclusions

The visualizations provided display the total GDP and its historical evolution. They allow users to compare across different regions and to explore different aspects of the data by selecting a geography of interest, which displays information on its biggest industries, and by filtering by year.

## 9. Follow-up research questions 

From an industry perspective in a given year:

1. What is the GDP?
2. What has been its trend?
3. What are its main components (geography contributions)?
4. How does it compare to other industries?