<a href="https://colab.research.google.com/github/FleaBusyBeeBergs/dtsa5506-pipeline/blob/main/dtsa5506-pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# uncomment to install 

# !pip install pandas requests pandasdmx
# !pip install requests-cache 

In [8]:
import zipfile
import pandas as pd
import os
import requests
# import pandasdmx # sdmx api library
# from pandasdmx import Request
# from requests_cache import install_cache
import matplotlib.pyplot as plt
import xml.etree.ElementTree as ET

## The Data

The data used comes from the statistics canada data cube (herein referred to as the 'Cube') and is found in the following tables:

* wages, table 14-10-0223-01
* raw materials index, table 18-10-0268-01
* taxes, table 11-10-0058-01
* productivity, table 36-10-0206-01
* consumer price index, table 18-10-004-01

## ETL

There is a very large amount of data in the Cube. Pulling an entire table then paring it down to a few variables is inefficient - create an ETL pipeline.

In [83]:
# create df of tables
tables = {
    'name': ['wage', 'raw', 'tax', 'productivity', 'cpi'],
    'tableid': [14100223, 18100268, 11100058, 36100206, 18100004],
    'vector': ['v79311153', 'v1230998135', 'v122807833', 'v1409153', 'v41690973'],
    'description': ['', '', '', '', ''],
    'frequency': ['monthly', 'monthly', 'annual', 'quarterly', 'monthly'],
    'start': ['YYYY-MM', 'YYYY-MM', 'YYYY', 'YYYY-Qx', 'YYYY-MM'],
    'end': ['YYYY-MM', 'YYYY-MM', 'YYYY', 'YYYY-Qx', 'YYYY-MM'],
    'url': ['', '', '', '', '']
         }
table_df = pd.DataFrame(tables)
print(table_df)

# base_url = 'https://www150.statcan.gc.ca/t1/wds/sdmx/statcan/rest/vector/'

           name   tableid       vector description  frequency    start  \
0          wage  14100223    v79311153                monthly  YYYY-MM   
1           raw  18100268  v1230998135                monthly  YYYY-MM   
2           tax  11100058   v122807833                 annual     YYYY   
3  productivity  36100206     v1409153              quarterly  YYYY-Qx   
4           cpi  18100004    v41690973                monthly  YYYY-MM   

       end url  
0  YYYY-MM      
1  YYYY-MM      
2     YYYY      
3  YYYY-Qx      
4  YYYY-MM      


In [100]:
url = 'https://www150.statcan.gc.ca/t1/wds/sdmx/statcan/rest/vector/v41690973?&detail=full'

response = requests.get(url)
response

<Response [200]>

In [101]:
# Parse XML
root = ET.fromstring(response.content)

# Print all elements
for child in root:
    print(child.tag, child.text)

{http://www.sdmx.org/resources/sdmxml/schemas/v2_1/message}Header None
{http://www.sdmx.org/resources/sdmxml/schemas/v2_1/message}DataSet None


In [102]:
# Parse the XML response
root = ET.fromstring(response.content)

# Define the namespace
namespace = {'ns': 'http://www.sdmx.org/resources/sdmxml/schemas/v2_1/message'}

# Print the contents of the Header
header = root.find('ns:Header', namespace)
if header is not None:
    for elem in header:
        print(f"Header Element: {elem.tag}, Value: {elem.text}")

# Print the contents of the DataSet
dataset = root.find('ns:DataSet', namespace)
if dataset is not None:
    for elem in dataset:
        print(f"DataSet Element: {elem.tag}, Value: {elem.text}")


Header Element: {http://www.sdmx.org/resources/sdmxml/schemas/v2_1/message}ID, Value: DS8698764349075
Header Element: {http://www.sdmx.org/resources/sdmxml/schemas/v2_1/message}Test, Value: false
Header Element: {http://www.sdmx.org/resources/sdmxml/schemas/v2_1/message}Prepared, Value: 2025-01-14T22:35:58
Header Element: {http://www.sdmx.org/resources/sdmxml/schemas/v2_1/message}Sender, Value: None
Header Element: {http://www.sdmx.org/resources/sdmxml/schemas/v2_1/message}Structure, Value: None
DataSet Element: {http://www.sdmx.org/resources/sdmxml/schemas/v2_1/data/generic}Series, Value: None


In [103]:
def print_elements(element, level=0):
    indent = "  " * level
    print(f"{indent}Tag: {element.tag}, Text: {element.text}")
    for child in element:
        print_elements(child, level + 1)

# Explore the root element
#print_elements(root)


In [104]:
with open("response_output.xml", "wb") as file:
    file.write(response.content)

In [105]:
# Define the namespace mappings
namespaces = {
    "message": "http://www.sdmx.org/resources/sdmxml/schemas/v2_1/message",
    "generic": "http://www.sdmx.org/resources/sdmxml/schemas/v2_1/data/generic",
    "common": "http://www.sdmx.org/resources/sdmxml/schemas/v2_1/common",
}

# Parse the XML content
root = ET.fromstring(response.content)  # Replace 'xml_content' with your XML string or response.content

# Extract SeriesKey
series = root.find(".//generic:Series", namespaces)
if series is not None:
    series_key = series.find(".//generic:SeriesKey", namespaces)
    print("Series Key:")
    for value in series_key.findall(".//generic:Value", namespaces):
        print(f"  {value.attrib['id']}: {value.attrib['value']}")

# Extract Attributes
attributes = series.find(".//generic:Attributes", namespaces)
print("\nAttributes:")
for attr in attributes.findall(".//generic:Value", namespaces):
    print(f"  {attr.attrib['id']}: {attr.attrib['value']}")

# Extract Observations
print("\nObservations:")
observations = series.findall(".//generic:Obs", namespaces)
for obs in observations:
    obs_dim = obs.find(".//generic:ObsDimension", namespaces)
    obs_val = obs.find(".//generic:ObsValue", namespaces)
    #print(f"  {obs_dim.attrib['value']}: {obs_val.attrib['value']}")


Series Key:
  Geography: 2
  Products_and_product_groups: 2

Attributes:
  VECTOR_ID: 41690973
  SCALAR_FACTOR: 0
  NB_DECIMAL: 1
  DGUID: 2016A000011124
  UOM: 17

Observations:


In [106]:
# Collect observation data
data = []
for obs in observations:
    obs_dim = obs.find(".//generic:ObsDimension", namespaces)
    obs_val = obs.find(".//generic:ObsValue", namespaces)
    data.append({"Date": obs_dim.attrib["value"], "Value": float(obs_val.attrib["value"])})

# Create DataFrame
df = pd.DataFrame(data)
print(df.info())
df.tail(95)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1331 entries, 0 to 1330
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Date    1331 non-null   object 
 1   Value   1331 non-null   float64
dtypes: float64(1), object(1)
memory usage: 20.9+ KB
None


Unnamed: 0,Date,Value
1236,2017-01,129.5
1237,2017-02,129.7
1238,2017-03,129.9
1239,2017-04,130.4
1240,2017-05,130.5
...,...,...
1326,2024-07,162.1
1327,2024-08,161.8
1328,2024-09,161.1
1329,2024-10,161.8


In [124]:
cpi_df = df.loc[1044:1319,]

cpi_df.head


<bound method NDFrame.head of          Date  Value
1044  2001-01   96.3
1045  2001-02   96.8
1046  2001-03   97.1
1047  2001-04   97.8
1048  2001-05   98.6
...       ...    ...
1315  2023-08  158.7
1316  2023-09  158.5
1317  2023-10  158.6
1318  2023-11  158.8
1319  2023-12  158.3

[276 rows x 2 columns]>

In [125]:
cpi_df.to_csv('cpi.csv', index = False)

## EDA