<a href="https://colab.research.google.com/github/FleaBusyBeeBergs/dtsa5506-pipeline/blob/main/dtsa5506-pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# uncomment to install 

# !pip install pandas requests pandasdmx
# !pip install requests-cache 

In [8]:
import zipfile

import pandas as pd
import os


import requests

import pandasdmx # sdmx api library
from pandasdmx import Request
from requests_cache import install_cache
import matplotlib.pyplot as plt
import xml.etree.ElementTree as ET

In [3]:
# # enable caching
# install_cache("statcan_cache", expire_after = 3600)  # cache expires after 1 hour
# 
# # init sdmx client for statistics canada
# client = Request('STATCAN')

## The Data

The data come from the Statistics Canada website and are found in the following tables:

* wages, table 14-10-0223-01
* raw materials index, table 18-10-0268-01
* taxes, table 11-10-0058-01
* productivity, table 36-10-0206-01
* consumer price index, table 18-10-0004-01

In [26]:
# Reference table of the Statistics Canada series used in this notebook:
# one row per series, giving its short name, table id and vector id.
# `description` is a placeholder column that is still empty.
tables = dict(
    name=['wage', 'raw', 'tax', 'productivity', 'cpi'],
    tableid=[14100223, 18100268, 11100058, 36100206, 18100004],
    vector=['v79311153', 'v1230998135', 'v122807833', 'v1409153', 'v41690973'],
    description=[''] * 5,
)
table_df = pd.DataFrame(data=tables)
table_df

Unnamed: 0,name,tableid,vector,description
0,wage,14100223,v79311153,
1,raw,18100268,v1230998135,
2,tax,11100058,v122807833,
3,productivity,36100206,v1409153,
4,cpi,18100004,v41690973,


## EDA

In [21]:
# Load the wages table (14-10-0223-01) and keep only the rows whose
# COORDINATE is '1.2.2' (noted in the original cell as vector v79311153).

wage_df = pd.read_csv('data/14100223.csv')
is_wage_series = wage_df['COORDINATE'] == '1.2.2'
wage_filter = wage_df[is_wage_series]

In [22]:
# Start the combined frame with the wage series: keep the date and value
# columns, rename them, and normalise the date to a 'YYYY-MM' string.

df = (
    wage_filter[['REF_DATE', 'VALUE']]
    .rename(columns={'REF_DATE': 'date', 'VALUE': 'wage'})
    .assign(date=lambda frame: pd.to_datetime(frame['date']).dt.strftime('%Y-%m'))
)
df.head()

Unnamed: 0,date,wage
28,2001-01,657.14
784,2001-02,653.59
1540,2001-03,655.14
2296,2001-04,652.44
3052,2001-05,652.28


In [24]:
# Load the raw materials price index table (18-10-0268-01).
# NOTE(review): COORDINATE appears to be parsed as a float in this table, so
# distinct coordinates such as 1.2 and 1.20 would compare equal; VECTOR looks
# unique per series -- confirm before filtering on COORDINATE.
rawmat = pd.read_csv('data/18100268.csv')


Unnamed: 0,REF_DATE,GEO,DGUID,North American Product Classification System (NAPCS),UOM,UOM_ID,SCALAR_FACTOR,SCALAR_ID,VECTOR,COORDINATE,VALUE,STATUS,SYMBOL,TERMINATED,DECIMALS
0,1981-01,Canada,2021A000011124,"Total, Raw materials price indexes (RMPI)","Index, 202001=100",403,units,0,v1230998135,1.1,40.6,,,,1
1,1981-01,Canada,2021A000011124,Crude energy products [M51],"Index, 202001=100",403,units,0,v1230998136,1.2,31.6,,,,1
2,1981-01,Canada,2021A000011124,Conventional crude oil [14111],"Index, 202001=100",403,units,0,v1230998138,1.4,32.0,,,,1
3,1981-01,Canada,2021A000011124,Natural gas [142],"Index, 202001=100",403,units,0,v1230998141,1.7,79.7,,,,1
4,1981-01,Canada,2021A000011124,Hard coal [14411],"Index, 202001=100",403,units,0,v1230998143,1.9,64.4,,,,1
5,1981-01,Canada,2021A000011124,"Total, excluding crude energy products","Index, 202001=100",403,units,0,v1230998148,1.14,47.6,,,,1
6,1981-01,Canada,2021A000011124,Crop products [M11],"Index, 202001=100",403,units,0,v1230998149,1.15,64.1,,,,1
7,1981-01,Canada,2021A000011124,Wheat [112],"Index, 202001=100",403,units,0,v1230998150,1.16,72.1,,,,1
8,1981-01,Canada,2021A000011124,Canola (including rapeseed) [113],"Index, 202001=100",403,units,0,v1230998151,1.17,60.4,,,,1
9,1981-01,Canada,2021A000011124,Fresh potatoes [11421],"Index, 202001=100",403,units,0,v1230998154,1.2,42.5,,,,1


In [None]:
# Build the monthly raw-materials price index (RMPI) series from 2001-01 on.
#
# The series is selected by VECTOR rather than COORDINATE. COORDINATE is read
# from the CSV as a float, so the coordinates "1.1" and "1.10" both compare
# equal to 1.10 and a coordinate filter can mix two different series.
# The vector id is unique per series.
RMPI_VECTOR = 'v1230998135'  # "Total, Raw materials price indexes (RMPI)"

# REF_DATE is a zero-padded 'YYYY-MM' string, so a string comparison orders
# the months correctly.
is_rmpi = rawmat['VECTOR'] == RMPI_VECTOR
is_recent = rawmat['REF_DATE'] >= '2001-01'

filter_mat = (
    rawmat.loc[is_rmpi & is_recent, ['REF_DATE', 'VALUE']]
    .rename(columns={'REF_DATE': 'date', 'VALUE': 'rmpi'})
    .assign(date=lambda frame: pd.to_datetime(frame['date']).dt.strftime('%Y-%m'))
)
filter_mat.head()

In [25]:
# Load the taxes table (11-10-0058-01) and preview the first 30 rows.
# NOTE(review): the saved output echoes this read_csv line, which suggests
# pandas emitted a warning here (possibly about mixed column dtypes) --
# confirm, and consider passing explicit dtypes if so.
tax = pd.read_csv('data/11100058.csv')
tax.head(30)

  tax = pd.read_csv('data/11100058.csv')


Unnamed: 0,REF_DATE,GEO,DGUID,Income percentiles,Family type,Tax and transfer type,Statistics,UOM,UOM_ID,SCALAR_FACTOR,SCALAR_ID,VECTOR,COORDINATE,VALUE,STATUS,SYMBOL,TERMINATED,DECIMALS
0,1992,Canada,2016A000011124,All taxfilers,Census families and persons not in census fami...,Federal and provincial income tax and federal ...,Total number of filers,Families,138,units,0,v122807830,1.1.1.1.1,11366920.0,,,,0
1,1992,Canada,2016A000011124,All taxfilers,Census families and persons not in census fami...,Federal and provincial income tax and federal ...,Aggregate taxes / transfers,Dollars,81,units,0,v122807831,1.1.1.1.2,101230200000.0,,,,0
2,1992,Canada,2016A000011124,All taxfilers,Census families and persons not in census fami...,Federal and provincial income tax and federal ...,Aggregate modified total income,Dollars,81,units,0,v122807832,1.1.1.1.3,483085600000.0,,,,0
3,1992,Canada,2016A000011124,All taxfilers,Census families and persons not in census fami...,Federal and provincial income tax and federal ...,Mean effective rate,Percent,239,units,0,v122807833,1.1.1.1.4,14.6,,,,1
4,1992,Canada,2016A000011124,All taxfilers,Census families and persons not in census fami...,Federal and provincial income tax and federal ...,5th percentile rate,Percent,239,units,0,v122807834,1.1.1.1.5,0.0,,,,1
5,1992,Canada,2016A000011124,All taxfilers,Census families and persons not in census fami...,Federal and provincial income tax and federal ...,25th percentile rate,Percent,239,units,0,v122807835,1.1.1.1.6,3.1,,,,1
6,1992,Canada,2016A000011124,All taxfilers,Census families and persons not in census fami...,Federal and provincial income tax and federal ...,50th percentile rate,Percent,239,units,0,v122807836,1.1.1.1.7,16.1,,,,1
7,1992,Canada,2016A000011124,All taxfilers,Census families and persons not in census fami...,Federal and provincial income tax and federal ...,75th percentile rate,Percent,239,units,0,v122807837,1.1.1.1.8,23.5,,,,1
8,1992,Canada,2016A000011124,All taxfilers,Census families and persons not in census fami...,Federal and provincial income tax and federal ...,95th percentile rate,Percent,239,units,0,v122807838,1.1.1.1.9,30.0,,,,1
9,1992,Canada,2016A000011124,All taxfilers,Census families and persons not in census fami...,Federal and provincial income tax and federal ...,Rate of 0,Families,138,units,0,v122807839,1.1.1.1.11,1860085.0,,,,0


In [26]:
# Keep the mean effective tax rate series (coordinate 1.1.1.1.4) from 2001 on.
is_mean_rate = tax['COORDINATE'] == '1.1.1.1.4'
is_recent_year = tax['REF_DATE'] >= 2001
filter_tax = tax[is_mean_rate & is_recent_year]

# Write into data/, which is where the following cell reads tax_rate.csv from.
# Writing to the working directory left that read pointing at a missing or
# stale file on a fresh run.
filter_tax.to_csv('data/tax_rate.csv', index = False)

In [49]:
# Reload the saved tax-rate extract and reduce it to a two-column series.
# `date` is left exactly as read from REF_DATE; the datetime conversion used
# for the monthly series is intentionally not applied here.
tax_df = (
    pd.read_csv('data/tax_rate.csv')
    .loc[:, ['REF_DATE', 'VALUE']]
    .rename(columns={'REF_DATE': 'date', 'VALUE': 'tax'})
)
tax_df

Unnamed: 0,date,tax
0,2001,14.0
1,2002,13.8
2,2003,13.8
3,2004,13.8
4,2005,13.6
5,2006,13.5
6,2007,13.0
7,2008,13.0
8,2009,12.2
9,2010,12.2


In [52]:
# Load the productivity table (36-10-0206-01) and preview the first 30 rows.
prod = pd.read_csv('data/36100206.csv')
prod.head(30)

Unnamed: 0,REF_DATE,GEO,DGUID,Sector,Labour productivity measures and related measures,UOM,UOM_ID,SCALAR_FACTOR,SCALAR_ID,VECTOR,COORDINATE,VALUE,STATUS,SYMBOL,TERMINATED,DECIMALS
0,1981-01,Canada,2016A000011124,Business sector,Real gross domestic product (GDP),"Index, 2017=100",373,units,0,v1409154,1.1.2,41.748,,,,3
1,1981-01,Canada,2016A000011124,Business sector,Total number of jobs,"Index, 2017=100",373,units,0,v1409157,1.1.5,61.891,,,,3
2,1981-01,Canada,2016A000011124,Business sector,Average hours worked,"Index, 2017=100",373,units,0,v1409156,1.1.4,107.475,,,,3
3,1981-01,Canada,2016A000011124,Business sector,Hours worked,"Index, 2017=100",373,units,0,v1409155,1.1.3,66.518,,,,3
4,1981-01,Canada,2016A000011124,Business sector,Labour productivity,"Index, 2017=100",373,units,0,v1409153,1.1.1,62.762,,,,3
5,1981-01,Canada,2016A000011124,Business sector,Total compensation per hour worked,"Index, 2017=100",373,units,0,v1409158,1.1.6,26.575,,,,3
6,1981-01,Canada,2016A000011124,Business sector,Unit labour cost,"Index, 2017=100",373,units,0,v1409159,1.1.7,42.343,,,,3
7,1981-01,Canada,2016A000011124,Business sector,Unit labour cost in United States dollars,"Index, 2017=100",373,units,0,v29506127,1.1.14,46.045,,,,3
8,1981-01,Canada,2016A000011124,Business sector,Unit non-labour payments,"Index, 2017=100",373,units,0,v20805659,1.1.8,39.378,,,,3
9,1981-01,Canada,2016A000011124,Business sector,Implicit price deflator,"Index, 2017=100",373,units,0,v20805660,1.1.9,40.926,,,,3


In [12]:
# Request the last five observations of vector v41690973 (the 'cpi' entry in
# the table list) from the Statistics Canada SDMX REST endpoint.
url = 'https://www150.statcan.gc.ca/t1/wds/sdmx/statcan/rest/vector/v41690973?lastNObservations=5&detail=full'

# The timeout keeps the cell from hanging indefinitely on a stalled connection.
# raise_for_status() surfaces HTTP errors here instead of as a confusing XML
# parse failure in a later cell.
response = requests.get(url, timeout=30)
response.raise_for_status()
response

<Response [200]>

In [13]:
# Parse the response body into an ElementTree
root = ET.fromstring(response.content)

# List the direct children of the root: tag and text of each
for top_level in root:
    print(top_level.tag, top_level.text)

{http://www.sdmx.org/resources/sdmxml/schemas/v2_1/message}Header None
{http://www.sdmx.org/resources/sdmxml/schemas/v2_1/message}DataSet None


In [14]:
# Re-parse the response so this cell can run independently of the previous one
root = ET.fromstring(response.content)

# Prefix mapping for the SDMX "message" namespace used by Header and DataSet
namespace = {'ns': 'http://www.sdmx.org/resources/sdmxml/schemas/v2_1/message'}

# Locate both top-level sections; either lookup yields None when absent
header = root.find('ns:Header', namespace)
dataset = root.find('ns:DataSet', namespace)

# Print the direct children of each section that was found, Header first
for label, section in (('Header', header), ('DataSet', dataset)):
    if section is None:
        continue
    for elem in section:
        print(f"{label} Element: {elem.tag}, Value: {elem.text}")


Header Element: {http://www.sdmx.org/resources/sdmxml/schemas/v2_1/message}ID, Value: DS4085707048327
Header Element: {http://www.sdmx.org/resources/sdmxml/schemas/v2_1/message}Test, Value: false
Header Element: {http://www.sdmx.org/resources/sdmxml/schemas/v2_1/message}Prepared, Value: 2025-01-14T20:12:31
Header Element: {http://www.sdmx.org/resources/sdmxml/schemas/v2_1/message}Sender, Value: None
Header Element: {http://www.sdmx.org/resources/sdmxml/schemas/v2_1/message}Structure, Value: None
DataSet Element: {http://www.sdmx.org/resources/sdmxml/schemas/v2_1/data/generic}Series, Value: None


In [15]:
def print_elements(element, level=0):
    """Print the tag and text of ``element`` and all of its descendants.

    Elements are visited in document order (parent before children). Each
    line is indented by two spaces per nesting depth, starting at ``level``
    for ``element`` itself.
    """
    # Explicit stack instead of recursion; children are pushed in reverse so
    # that they are popped, and therefore printed, in their original order.
    pending = [(element, level)]
    while pending:
        node, depth = pending.pop()
        padding = "  " * depth
        print(f"{padding}Tag: {node.tag}, Text: {node.text}")
        pending.extend((child, depth + 1) for child in reversed(list(node)))

# Dump the whole parsed response: one line per element, indented by depth
print_elements(root)


Tag: {http://www.sdmx.org/resources/sdmxml/schemas/v2_1/message}GenericData, Text: None
  Tag: {http://www.sdmx.org/resources/sdmxml/schemas/v2_1/message}Header, Text: None
    Tag: {http://www.sdmx.org/resources/sdmxml/schemas/v2_1/message}ID, Text: DS4085707048327
    Tag: {http://www.sdmx.org/resources/sdmxml/schemas/v2_1/message}Test, Text: false
    Tag: {http://www.sdmx.org/resources/sdmxml/schemas/v2_1/message}Prepared, Text: 2025-01-14T20:12:31
    Tag: {http://www.sdmx.org/resources/sdmxml/schemas/v2_1/message}Sender, Text: None
    Tag: {http://www.sdmx.org/resources/sdmxml/schemas/v2_1/message}Structure, Text: None
      Tag: {http://www.sdmx.org/resources/sdmxml/schemas/v2_1/common}StructureUsage, Text: None
        Tag: Ref, Text: None
  Tag: {http://www.sdmx.org/resources/sdmxml/schemas/v2_1/message}DataSet, Text: None
    Tag: {http://www.sdmx.org/resources/sdmxml/schemas/v2_1/data/generic}Series, Text: None
      Tag: {http://www.sdmx.org/resources/sdmxml/schemas/v2_1/d

In [16]:
# Save the raw bytes of the response body to response_output.xml
with open("response_output.xml", "wb") as xml_file:
    xml_file.write(response.content)

In [19]:
# Namespace prefixes for the SDMX 2.1 elements that appear in the response
namespaces = {
    "message": "http://www.sdmx.org/resources/sdmxml/schemas/v2_1/message",
    "generic": "http://www.sdmx.org/resources/sdmxml/schemas/v2_1/data/generic",
    "common": "http://www.sdmx.org/resources/sdmxml/schemas/v2_1/common",
}

# Parse the XML body of the HTTP response
root = ET.fromstring(response.content)

# Report on the first Series element only. Every section below is read from
# `series`, so the whole report is skipped when it is missing. Guarding only
# the SeriesKey section would let the later sections fail with AttributeError.
series = root.find(".//generic:Series", namespaces)
if series is None:
    print("No generic:Series element found in the response.")
else:
    # id/value pairs listed under SeriesKey
    print("Series Key:")
    series_key = series.find(".//generic:SeriesKey", namespaces)
    if series_key is not None:
        for value in series_key.findall(".//generic:Value", namespaces):
            print(f"  {value.attrib['id']}: {value.attrib['value']}")

    # id/value pairs listed under Attributes
    print("\nAttributes:")
    attributes = series.find(".//generic:Attributes", namespaces)
    if attributes is not None:
        for attr in attributes.findall(".//generic:Value", namespaces):
            print(f"  {attr.attrib['id']}: {attr.attrib['value']}")

    # One line per observation: ObsDimension value, then ObsValue value
    print("\nObservations:")
    observations = series.findall(".//generic:Obs", namespaces)
    for obs in observations:
        obs_dim = obs.find(".//generic:ObsDimension", namespaces)
        obs_val = obs.find(".//generic:ObsValue", namespaces)
        if obs_dim is None or obs_val is None:
            # Skip an observation that lacks either half of the pair
            continue
        print(f"  {obs_dim.attrib['value']}: {obs_val.attrib['value']}")


Series Key:
  Geography: 2
  Products_and_product_groups: 2

Attributes:
  VECTOR_ID: 41690973
  SCALAR_FACTOR: 0
  NB_DECIMAL: 1
  DGUID: 2016A000011124
  UOM: 17

Observations:
  2024-07: 162.1
  2024-08: 161.8
  2024-09: 161.1
  2024-10: 161.8
  2024-11: 161.8
