In [1]:
import os
from datetime import datetime
import requests
import pandas as pd
import zipfile
import io
import gc

def read_table_csv_from_zip(zip_bytes):
    """
    Load the data CSV contained in a full-table ZIP archive.

    Args:
        zip_bytes: Raw bytes of the ZIP archive.

    Returns:
        pandas.DataFrame parsed from the first data CSV in the archive.

    Raises:
        ValueError: If the archive contains no data CSV.
        zipfile.BadZipFile: If ``zip_bytes`` is not a valid ZIP archive.
    """
    with zipfile.ZipFile(io.BytesIO(zip_bytes)) as archive:
        members = archive.namelist()
        # Do not rely on member order: the archive may bundle a companion
        # "*_MetaData.csv" next to the data table, so skip it explicitly.
        data_members = [
            name for name in members
            if name.lower().endswith(".csv")
            and not name.lower().endswith("_metadata.csv")
        ]
        if not data_members:
            raise ValueError(f"No data CSV found in archive (members: {members})")
        # Close the member handle deterministically once parsing is done.
        with archive.open(data_members[0]) as member:
            # low_memory=False avoids mixed-dtype warnings on wide tables.
            return pd.read_csv(member, low_memory=False)


def fetch_statcan_data(product_id, *, timeout=60, verify=True):
    """
    Retrieve a full Statistics Canada table as a DataFrame.

    Calls the getFullTableDownloadCSV endpoint (English version, hence the
    trailing '/en') to obtain the download URL of a zipped CSV, downloads
    the archive, and parses the data CSV it contains.

    Args:
        product_id: Table/product identifier, e.g. "14100287".
        timeout: Seconds passed to ``requests.get`` for each request, so a
            stalled connection fails instead of hanging forever.
        verify: Passed to ``requests.get``. ``True`` uses the default CA
            bundle; a path to a CA bundle file can be given when the default
            store cannot validate the server certificate.

    Returns:
        pandas.DataFrame with the table contents.

    Raises:
        requests.HTTPError: If either HTTP request returns an error status.
        ValueError: If the service does not report SUCCESS, returns no
            download URL, or the archive contains no data CSV.
    """
    url = f"https://www150.statcan.gc.ca/t1/wds/rest/getFullTableDownloadCSV/{product_id}/en"
    response = requests.get(url, timeout=timeout, verify=verify)
    response.raise_for_status()
    data = response.json()
    if data.get("status") != "SUCCESS":
        raise ValueError(f"Error retrieving data for product id {product_id}: {data}")
    download_url = data.get("object")
    if not download_url:
        raise ValueError(f"No download URL returned for product id {product_id}: {data}")
    print(f"Downloading CSV from: {download_url}")

    # Download the ZIP file containing the CSV
    zip_response = requests.get(download_url, timeout=timeout, verify=verify)
    zip_response.raise_for_status()

    return read_table_csv_from_zip(zip_response.content)

# --------------------------------------------------------------------------
# Customize these product IDs with the ones from your cubes CSV (full table download)
# NOTE(review): the three IDs are consecutive and share the same "1410"
# prefix; confirm that they really point at the GDP, employment and CPI
# tables before trusting the output.
# --------------------------------------------------------------------------
product_id_gdp = "14100288"
product_id_emp = "14100287"
product_id_cpi = "14100289"


def prepare_canada_series(raw, value_name):
    """
    Reduce a raw table to one row per date for GEO == 'Canada'.

    Args:
        raw: DataFrame with at least 'REF_DATE', 'GEO' and 'VALUE' columns.
        value_name: Name given to the output value column.

    Returns:
        DataFrame with columns 'date' (datetime64, unparseable dates become
        NaT) and ``value_name`` (float, downcast where possible), holding
        the mean VALUE per REF_DATE.
    """
    canada = raw.loc[raw['GEO'] == 'Canada', ['REF_DATE', 'VALUE']]
    tidy = pd.DataFrame({
        'date': canada['REF_DATE'],
        # Coerce to numbers BEFORE averaging: if VALUE was parsed as text,
        # taking the mean first would fail or give garbage.
        value_name: pd.to_numeric(canada['VALUE'], errors='coerce'),
    })
    # Group by date to avoid duplicates blowing up merges.
    # NOTE(review): this averages every Canada row sharing a date; if the
    # table holds several distinct series per date, filter to the wanted
    # one first.
    tidy = tidy.groupby('date', as_index=False)[value_name].mean()
    # Downcast only after aggregating so the mean is computed at full precision.
    tidy[value_name] = pd.to_numeric(tidy[value_name], errors='coerce', downcast='float')
    tidy['date'] = pd.to_datetime(tidy['date'], errors='coerce')
    return tidy


def load_canada_series(product_id, label, value_name):
    """
    Download one table and prepare its Canada series, reporting progress.

    Args:
        product_id: Product identifier passed to ``fetch_statcan_data``.
        label: Human-readable dataset name used in the printed messages.
        value_name: Name given to the output value column.

    Returns:
        The prepared DataFrame, or ``None`` when retrieval or preparation
        failed (the error is printed, not raised).
    """
    try:
        raw = fetch_statcan_data(product_id)
        print(f"{label} data shape (raw): {raw.shape}")
        tidy = prepare_canada_series(raw, value_name)
        print(f"{label} data shape (filtered & grouped): {tidy.shape}")
        return tidy
    except Exception as e:
        # Best effort by design: one failing table must not stop the others.
        print(f"Error retrieving {label} data (product {product_id}): {e}")
        return None


# --------------------------------------------------------------------------
# Retrieve each dataset
# --------------------------------------------------------------------------
# Each name is always (re)bound, to None on failure, so a stale DataFrame
# from an earlier notebook run can never be merged silently.
df_gdp = load_canada_series(product_id_gdp, "GDP", "GDP")

df_emp = load_canada_series(product_id_emp, "Employment", "Employment")

df_cpi = load_canada_series(product_id_cpi, "CPI", "CPI")

# --------------------------------------------------------------------------
# Merge the datasets
# --------------------------------------------------------------------------
def merge_on_date(gdp, employment, cpi):
    """
    Inner-join the three per-date frames on 'date', sorted chronologically.

    Args:
        gdp: DataFrame with a 'date' column and the GDP values.
        employment: DataFrame with a 'date' column and the employment values.
        cpi: DataFrame with a 'date' column and the CPI values.

    Returns:
        A new DataFrame containing only dates present in all three inputs,
        sorted by 'date' with a fresh 0..n-1 index.

    Raises:
        ValueError: If any input is ``None`` (i.e. was not loaded).
    """
    inputs = {"GDP": gdp, "Employment": employment, "CPI": cpi}
    missing = [label for label, frame in inputs.items() if frame is None]
    if missing:
        raise ValueError(f"dataset(s) not loaded: {', '.join(missing)}")

    # Inner joins keep only dates common to every table, which avoids
    # massive row expansions.
    print("Merging GDP and Employment...")
    partial = pd.merge(gdp, employment, on='date', how='inner')
    print(f"Shape after GDP+Employment merge: {partial.shape}")

    print("Merging with CPI...")
    merged = pd.merge(partial, cpi, on='date', how='inner')
    print(f"Final shape after merging CPI: {merged.shape}")

    return merged.sort_values(by='date').reset_index(drop=True)


try:
    try:
        loaded = (df_gdp, df_emp, df_cpi)
    except NameError as err:
        # A retrieval step failed before binding its variable; report that
        # plainly instead of surfacing a bare NameError.
        raise ValueError(f"a dataset was never retrieved ({err})") from err

    df_merged = merge_on_date(*loaded)

    # Create the 'macroeconomics' folder if it does not exist
    output_folder = "macroeconomics"
    os.makedirs(output_folder, exist_ok=True)

    # Today's date in YYYYMMDD format
    today = datetime.today().strftime('%Y%m%d')

    # Build the file name with the date inserted before the extension
    output_file = os.path.join(output_folder, f"canada_macro_data_{today}.csv")
    df_merged.to_csv(output_file, index=False, encoding='utf-8')
    print(f"Merged data saved to '{output_file}'")

    # Cleanup
    del df_merged
    gc.collect()
except Exception as e:
    print(f"Error merging datasets: {e}")

Error retrieving GDP data (product 14100288): HTTPSConnectionPool(host='www150.statcan.gc.ca', port=443): Max retries exceeded with url: /t1/wds/rest/getFullTableDownloadCSV/14100288/en (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1017)')))
Error retrieving Employment data (product 14100287): HTTPSConnectionPool(host='www150.statcan.gc.ca', port=443): Max retries exceeded with url: /t1/wds/rest/getFullTableDownloadCSV/14100287/en (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1017)')))
Error retrieving CPI data (product 14100289): HTTPSConnectionPool(host='www150.statcan.gc.ca', port=443): Max retries exceeded with url: /t1/wds/rest/getFullTableDownloadCSV/14100289/en (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify fa