# Update Raw Data

## Libraries, options, paths, and functions

In [11]:
# import libraries
import pandas as pd
import numpy as np
from stats_can import StatsCan
import h5py

# Display options: show floats with a thousands separator and two decimals.
pd.options.display.float_format = '{:,.2f}'.format

# Statistics Canada client used by get_raw_data_ca; it is configured with the
# local folder below as its data_folder.
sc = StatsCan(data_folder="./stats_can_data")

# define functions
def get_raw_data_ca(mev):
    """Load the Statistics Canada table registered for ``mev``.

    The table id is looked up in the module-level ``mev_names_ca`` frame
    (``table`` column, indexed by the variable key) and loaded through the
    module-level ``sc`` client. Column names of the loaded table are
    lower-cased and spaces are replaced by underscores.

    Parameters
    ----------
    mev : str
        Key of the variable in ``mev_names_ca`` (e.g. ``'pop'``).

    Returns
    -------
    The object returned by ``sc.table_to_df`` with normalised column names.

    Raises
    ------
    KeyError
        If ``mev`` is not a key of ``mev_names_ca``.
    """
    table_id = mev_names_ca['table'][mev]
    raw_table = sc.table_to_df(table_id)
    # Normalise headers so columns can be referenced as plain identifiers.
    raw_table.columns = raw_table.columns.str.lower().str.replace(' ', '_')
    return raw_table


## Define variables

In [12]:
# Variable registry: one row per variable key ("mev"), holding the
# Statistics Canada table id and a human-readable description.
mev_records = [
    ('pop', '17-10-0009-01', 'Population'),
    ('gdp', '36-10-0104-01', 'Gross Domestic Product'),
]
mev_names_ca = pd.DataFrame(
    mev_records,
    columns=['mev', 'table', 'mev_description'],
).set_index('mev')

display(mev_names_ca)

Unnamed: 0_level_0,table,mev_description
mev,Unnamed: 1_level_1,Unnamed: 2_level_1
pop,17-10-0009-01,Population
gdp,36-10-0104-01,Gross Domestic Product


## Get raw data

In [13]:
# Fetch one raw table per registered variable key, in the order listed.
pop_raw, gdp_raw = (get_raw_data_ca(mev_key) for mev_key in ('pop', 'gdp'))

Downloading and loading table_17100009


17100009-eng.zip: 100%|██████████| 28.8k/28.8k [00:00<00:00, 1.09MB/s]


Downloading and loading table_36100104


36100104-eng.zip: 100%|██████████| 628k/628k [00:00<00:00, 2.49MB/s]


## Pre-process raw data

In [14]:
# pop

# Keep only the rows whose geo is "Canada", rename the value column to
# `pop_ca`, and cast it to integer.
pop_ca = (
    pop_raw.loc[pop_raw['geo'] == "Canada", ['ref_date', 'geo', 'value']]
    .rename(columns={'value': 'pop_ca'})
    .astype({'pop_ca': int})
)

display(pop_ca.tail())
# display(pop_ca.info())

Unnamed: 0,ref_date,geo,pop_ca
3825,2022-04-01,Canada,38644920
3839,2022-07-01,Canada,38929902
3853,2022-10-01,Canada,39292355
3867,2023-01-01,Canada,39566248
3881,2023-04-01,Canada,39858480


In [15]:
# gdp

# One boolean mask per dimension of the GDP table; a row is kept only when
# all four match.
is_canada = gdp_raw['geo'] == "Canada"
is_chained_2012 = gdp_raw['prices'] == "Chained (2012) dollars"
is_saar = gdp_raw['seasonal_adjustment'] == "Seasonally adjusted at annual rates"
is_gdp_market = gdp_raw['estimates'] == "Gross domestic product at market prices"

# Select the matching rows, keep the date/geo/value columns, rename the value
# column to `gdp_ca`, and cast it to integer.
gdp_ca = (
    gdp_raw.loc[
        is_canada & is_chained_2012 & is_saar & is_gdp_market,
        ['ref_date', 'geo', 'value'],
    ]
    .rename(columns={'value': 'gdp_ca'})
    .astype({'gdp_ca': int})
)

display(gdp_ca.tail())
# display(gdp_ca.info())

Unnamed: 0,ref_date,geo,gdp_ca
44247,2022-01-01,Canada,2155250
44399,2022-04-01,Canada,2174582
44551,2022-07-01,Canada,2186724
44703,2022-10-01,Canada,2185910
44855,2023-01-01,Canada,2202921


## Combine and save processed data

In [16]:
# create combined dataset

# Inner join on (ref_date, geo): only dates present in both the population
# and the GDP series are kept.
raw_mevs_ca = pop_ca.merge(gdp_ca[['ref_date','geo','gdp_ca']], on=['ref_date','geo'], how='inner')
display(raw_mevs_ca.tail())

# NOTE(review): these metadata values look like placeholders rather than
# properties of this dataset — confirm what should be stored here.
metadata = {'scale':0.1,'offset':15}

# Persist the combined frame under the key 'raw_mevs_ca' and attach the
# metadata to its storer. The context manager closes the HDF5 file even if
# a write raises, so the file handle is never left open.
with pd.HDFStore('./stats_can_data/raw_mevs_ca.hdf5') as save:
    save.put('raw_mevs_ca', raw_mevs_ca)
    save.get_storer('raw_mevs_ca').attrs.metadata = metadata



Unnamed: 0,ref_date,geo,pop_ca,gdp_ca
244,2022-01-01,Canada,38516138,2155250
245,2022-04-01,Canada,38644920,2174582
246,2022-07-01,Canada,38929902,2186724
247,2022-10-01,Canada,39292355,2185910
248,2023-01-01,Canada,39566248,2202921
