<a href="https://colab.research.google.com/github/FleaBusyBeeBergs/dtsa5506-pipeline/blob/main/dtsa5506-pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# uncomment to install 

# !pip install pandas requests pandasdmx
# !pip install requests-cache 

In [1]:
import zipfile
import pandas as pd
import os
import requests
# import pandasdmx # sdmx api library
# from pandasdmx import Request
# from requests_cache import install_cache
import matplotlib.pyplot as plt
import xml.etree.ElementTree as ET

## The Data

The data comes from the Statistics Canada data cube (hereafter the 'Cube') and is found in the following tables:

* wages, table 14-10-0223-01
* raw materials index, table 18-10-0268-01
* taxes, table 11-10-0058-01
* productivity, table 36-10-0206-01
* consumer price index, table 18-10-0004-01

## ETL

The Cube holds a very large amount of data. Pulling an entire table and then paring it down to a few variables is inefficient, so this notebook builds an ETL pipeline that requests only the series it needs.

In [None]:
# Catalogue of the series used in this notebook, written one row per series.
# 'start'/'end' currently hold format strings such as 'YYYY-MM', and
# 'description'/'url' are empty strings.
series_columns = ('name', 'tableid', 'vector', 'description',
                  'frequency', 'start', 'end', 'url')
series_rows = [
    ('wage', 14100223, 'v79311153', '', 'monthly', 'YYYY-MM', 'YYYY-MM', ''),
    ('raw', 18100268, 'v1230998135', '', 'monthly', 'YYYY-MM', 'YYYY-MM', ''),
    ('tax', 11100058, 'v122807833', '', 'annual', 'YYYY', 'YYYY', ''),
    ('productivity', 36100206, 'v1409153', '', 'quarterly', 'YYYY-Qx', 'YYYY-Qx', ''),
    ('cpi', 18100004, 'v41690973', '', 'monthly', 'YYYY-MM', 'YYYY-MM', ''),
]

# Pivot the row-wise records into a column -> list-of-values mapping,
# which is the shape pd.DataFrame receives below.
tables = {
    column: list(values)
    for column, values in zip(series_columns, zip(*series_rows))
}
table_df = pd.DataFrame(tables)
print(table_df)

# base_url = 'https://www150.statcan.gc.ca/t1/wds/sdmx/statcan/rest/vector/'

In [None]:
# Request the full-detail SDMX payload for a single vector. The id in the URL
# (v41690973) is the one listed for 'cpi' in the tables cell.
url = 'https://www150.statcan.gc.ca/t1/wds/sdmx/statcan/rest/vector/v41690973?&detail=full'

# The timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=30)
# Surface HTTP errors here instead of as a confusing XML parse failure in a
# later cell.
response.raise_for_status()
response

In [None]:
# Build an element tree from the raw response bytes.
root = ET.fromstring(response.content)

# Show the tag and text of each direct child of the document root.
for node in root:
    print(node.tag, node.text)

In [None]:
# Re-parse the response body into an element tree.
root = ET.fromstring(response.content)

# Prefix map that resolves the 'ns:' prefix used in the find() paths below.
namespace = {'ns': 'http://www.sdmx.org/resources/sdmxml/schemas/v2_1/message'}

# Look up both top-level sections first; find() gives None when one is absent.
header = root.find('ns:Header', namespace)
dataset = root.find('ns:DataSet', namespace)

# List the direct children of the Header (nothing is printed if it is missing).
for elem in (header if header is not None else ()):
    print(f"Header Element: {elem.tag}, Value: {elem.text}")

# List the direct children of the DataSet (nothing is printed if it is missing).
for elem in (dataset if dataset is not None else ()):
    print(f"DataSet Element: {elem.tag}, Value: {elem.text}")


In [None]:
def print_elements(element, level=0):
    """Print the tag and text of `element` and all of its descendants.

    Elements are visited depth-first in document order. Each line is
    indented by two spaces per nesting level.

    Args:
        element: the element to start from; it must expose `.tag` and
            `.text` and be iterable over its children.
        level: nesting depth of `element`, used only for indentation.
    """
    print("  " * level + f"Tag: {element.tag}, Text: {element.text}")
    child_level = level + 1
    for sub_element in element:
        print_elements(sub_element, child_level)

# Explore the root element
#print_elements(root)


In [None]:
# Persist the raw response bytes to response_output.xml in the working directory.
with open("response_output.xml", mode="wb") as xml_out:
    xml_out.write(response.content)

In [None]:
# Prefix -> URI map for the SDMX 2.1 namespaces used in the paths below.
namespaces = {
    "message": "http://www.sdmx.org/resources/sdmxml/schemas/v2_1/message",
    "generic": "http://www.sdmx.org/resources/sdmxml/schemas/v2_1/data/generic",
    "common": "http://www.sdmx.org/resources/sdmxml/schemas/v2_1/common",
}

# Parse the XML body of the HTTP response.
root = ET.fromstring(response.content)

# Locate the first Series element; every section below is read from it.
series = root.find(".//generic:Series", namespaces)
if series is None:
    # Fail with a clear message rather than an AttributeError on None further
    # down (the Attributes and Observations lookups both dereference `series`).
    raise ValueError("No generic:Series element found in the SDMX response")

# Extract SeriesKey
print("Series Key:")
series_key = series.find(".//generic:SeriesKey", namespaces)
if series_key is not None:
    for value in series_key.findall(".//generic:Value", namespaces):
        print(f"  {value.attrib['id']}: {value.attrib['value']}")

# Extract Attributes
print("\nAttributes:")
attributes = series.find(".//generic:Attributes", namespaces)
if attributes is not None:
    for attr in attributes.findall(".//generic:Value", namespaces):
        print(f"  {attr.attrib['id']}: {attr.attrib['value']}")

# Extract Observations
# Only the count is reported here; `observations` itself is kept for the
# cell that builds the DataFrame.
observations = series.findall(".//generic:Obs", namespaces)
print("\nObservations:")
print(f"  {len(observations)} found")


In [None]:
# Collect observation data: one {"Date", "Value"} record per Obs element.
# The date is read from ObsDimension and the numeric value from ObsValue.
data = [
    {
        "Date": obs.find(".//generic:ObsDimension", namespaces).attrib["value"],
        "Value": float(obs.find(".//generic:ObsValue", namespaces).attrib["value"]),
    }
    for obs in observations
]

# Create DataFrame
df = pd.DataFrame(data)
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() would emit an extra "None" line.
df.info()
df.tail(95)

In [None]:
# Keep the rows labelled 1044 through 1319. `.loc` slices by label and
# includes both endpoints.
# NOTE(review): these labels are hard-coded and presumably correspond to the
# date window wanted for the analysis — confirm against the 'Date' column.
cpi_df = df.loc[1044:1319]

# head must be called; without parentheses the cell shows the bound method's
# repr instead of the first rows.
cpi_df.head()


In [None]:
# Write the CPI slice to cpi.csv in the working directory, without the index column.
# NOTE(review): the cleaning cell further down reads 'data/cpi.csv', not 'cpi.csv' —
# confirm the file is moved or copied into data/ before that cell runs.
cpi_df.to_csv('cpi.csv', index = False)

### Cleaning and wrangling

In [None]:
# Load each series from data/ and rename its generic 'Value' column to the
# series name, so the value columns stay distinct once the frames are merged.
wage = pd.read_csv('data/wage.csv').rename(columns={'Value': 'wage'})
tax = pd.read_csv('data/tax.csv').rename(columns={'Value': 'tax'})
rawmat = pd.read_csv('data/rawmat.csv').rename(columns={'Value': 'rawmat'})
prod = pd.read_csv('data/prod.csv').rename(columns={'Value': 'prod'})
cpi = pd.read_csv('data/cpi.csv').rename(columns={'Value': 'cpi'})

In [None]:
# Outer-join wage, rawmat and cpi on 'Date'. An outer join keeps every date
# that appears in any of the three frames.
df = (
    wage
    .merge(rawmat, on='Date', how='outer')
    .merge(cpi, on='Date', how='outer')
)
df

In [None]:
# Convert quarter labels to timestamps by mapping each quarter to its first month.
# Assumes 'Date' strings start with a 4-digit year and end with 'Q1'..'Q4'
# (e.g. 'YYYY-Qx') — TODO confirm against data/prod.csv.
quarter_first_month = {'Q1': '01', 'Q2': '04', 'Q3': '07', 'Q4': '10'}
year_part = prod['Date'].str[:4]
month_part = prod['Date'].str[-2:].replace(quarter_first_month)
prod['Date'] = pd.to_datetime(year_part + '-' + month_part)


In [None]:
# Format the datetime 'Date' column back to 'YYYY-MM' strings.
# NOTE(review): presumably so it matches the 'Date' key format of the frame it
# is merged into next — confirm the monthly CSVs use 'YYYY-MM'.
prod['Date'] = prod['Date'].dt.strftime('%Y-%m')

In [None]:
# Outer-join prod onto the combined frame by 'Date', keeping dates found in
# either frame.
df = df.merge(prod, on='Date', how='outer')


In [None]:
# Forward-fill 'prod': each missing entry takes the last non-null value above it.
# NOTE(review): presumably carries each quarterly figure across the following
# monthly rows — this relies on the rows being in chronological order; confirm.
df['prod'] = df['prod'].ffill()

In [None]:
# Parse the tax 'Date' column into datetimes.
# NOTE(review): tax is not merged into df in the cells shown here, and df's
# 'Date' key holds strings — confirm how tax is meant to be joined.
tax['Date'] = pd.to_datetime(tax['Date'])

In [None]:
# Display the tax frame to inspect the converted 'Date' column.
tax

In [None]:
# Display the merged frame after the prod merge and forward-fill.
df

## Train-test-split

## EDA

* Create for loop to visualize series, line plot (interactive?)
* Correlation grid
* 

## Preprocessor
Define the preprocessing steps so that they are applied identically to both the training and testing sets.