The goal for this part of the project is to automate downloading all available data from https://climate.weather.gc.ca/

We aim to download the daily data. We first need the list of all available weather stations. The station IDs are available from the "Get More Data" folder:
https://collaboration.cmc.ec.gc.ca/cmc/climate/Get_More_Data_Plus_de_donnees/
We can get the "Station Inventory EN.csv" file from there, which contains all the station ids.

In [None]:
import pandas as pd

# Load the station inventory into a DataFrame.
# skiprows=3 makes pandas ignore the first three lines of the file, so the
# fourth line is used as the column header (presumably the skipped lines are
# a preamble above the header -- confirm against the downloaded file).
stations = pd.read_csv("Station Inventory EN.csv", skiprows=3)

Now that we have the station inventory, we can download the data for each station.
Note that we need three columns from the station inventory: "Station ID", "First Year", and "Last Year".

In [6]:
import time
import pathlib
import requests
import pandas as pd
from urllib.parse import urlencode

BASE = "https://climate.weather.gc.ca/climate_data/bulk_data_e.html"


def build_daily_url(station_id: int, year: int) -> str:
    """Return the bulk-download URL for one station and one year of daily data.

    Both arguments are passed through ``int()`` before being placed in the
    query string, so integral floats such as ``14.0`` are accepted.
    Month and Day are always sent as "1".
    """
    query = urlencode(
        [
            ("format", "csv"),
            ("stationID", str(int(station_id))),
            ("Year", str(int(year))),
            ("Month", "1"),
            ("Day", "1"),
            ("timeframe", "2"),  # 2 = daily
            ("submit", "Download Data"),
        ]
    )
    return BASE + "?" + query

def download_csv(url: str, out_path: pathlib.Path, timeout=60) -> None:
    """Download a single CSV from ``url`` and save it to ``out_path``.

    Args:
        url: Address to fetch (one fixed station id and year).
        out_path: Destination file; missing parent directories are created.
        timeout: Timeout in seconds handed to ``requests.get``.

    Raises:
        requests.RequestException: On timeout, connection failure, or an HTTP
            error status (raised by ``raise_for_status``).
        ValueError: If the start of the body looks like an HTML page instead
            of CSV data.
    """
    headers = {"User-Agent": "course-datasci-downloader/1.0"}

    # Stop if timeout or other error occurs
    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()

    # Check if the response is a CSV file. Only the first bytes are decoded:
    # r.text would decode the whole body just to inspect its beginning.
    head = r.content[:200].decode("utf-8", errors="replace").lower()
    if "<html" in head or "before your query can be processed" in head:
        raise ValueError("Not a CSV response")

    # Save via a temporary sibling file and rename it into place, so an
    # interrupted run never leaves a truncated file at out_path (a partial
    # file there would be mistaken for a finished download by any caller
    # that skips existing files).
    out_path.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = out_path.with_name(out_path.name + ".part")
    tmp_path.write_bytes(r.content)
    tmp_path.replace(out_path)

def bulk_download_daily(stations_df: pd.DataFrame, out_dir="data/daily", sleep_sec=0.5):
    """Download the daily CSV of every station/year listed in ``stations_df``.

    ``stations_df`` must have the columns "Station ID", "First Year" and
    "Last Year". Rows where any of the three is missing or non-numeric are
    dropped. Each file is saved as ``<out_dir>/station<id>/daily_<year>.csv``;
    files that already exist are skipped, which makes the call resumable.
    A failed download is reported with a "FAIL" line and does not stop the
    loop. ``sleep_sec`` is the pause after every attempted download.
    """
    root = pathlib.Path(out_dir)
    needed = ["Station ID", "First Year", "Last Year"]

    # Work on a copy so the caller's frame is left untouched
    inventory = stations_df.copy()
    for column in needed:
        inventory[column] = pd.to_numeric(inventory[column], errors="coerce").astype("Int64")
    inventory = inventory.dropna(subset=needed)

    # One iteration per station, inner loop over its inclusive year span
    triples = zip(inventory["Station ID"], inventory["First Year"], inventory["Last Year"])
    for raw_id, raw_first, raw_last in triples:
        sid = int(raw_id)
        first_year = int(raw_first)
        last_year = int(raw_last)

        for year in range(first_year, last_year + 1):
            target = root / f"station{sid}" / f"daily_{year}.csv"
            if target.exists():
                # Already downloaded on an earlier run
                continue

            url = build_daily_url(sid, year)
            try:
                download_csv(url, target)
                print(f"OK  station={sid} year={year}")
            except Exception as e:
                print(f"FAIL station={sid} year={year} err={e}")
            # Be polite and avoid overwhelming the server
            time.sleep(sleep_sec)


In [7]:
# Test with a small number of stations: only the first five rows of the
# inventory are passed on, the files go to a separate folder, and the pause
# between requests is one second.
testing_df = stations.head(5)
bulk_download_daily(testing_df, out_dir="data/daily_test", sleep_sec=1)

OK  station=14 year=1984
OK  station=14 year=1985
OK  station=14 year=1986
OK  station=14 year=1987
OK  station=14 year=1988
OK  station=14 year=1989
OK  station=14 year=1990
OK  station=14 year=1991
OK  station=14 year=1992
OK  station=14 year=1993
OK  station=14 year=1994
OK  station=14 year=1995
OK  station=14 year=1996
OK  station=15 year=1971
OK  station=15 year=1972
OK  station=15 year=1973
OK  station=15 year=1974
OK  station=15 year=1975
OK  station=15 year=1976
OK  station=15 year=1977
OK  station=15 year=1978
OK  station=15 year=1979
OK  station=15 year=1980
OK  station=15 year=1981
OK  station=15 year=1982
OK  station=15 year=1983
OK  station=15 year=1984
OK  station=15 year=1985
OK  station=15 year=1986
OK  station=15 year=1987
OK  station=15 year=1988
OK  station=15 year=1989
OK  station=15 year=1990
OK  station=15 year=1991
OK  station=15 year=1992
OK  station=15 year=1993
OK  station=15 year=1994
OK  station=15 year=1995
OK  station=16 year=1961
OK  station=16 year=1962


In [None]:
# Download all stations (uncomment to run)
# bulk_download_daily(stations, out_dir="data/daily", sleep_sec=1.5)

OK  station=14 year=1984
OK  station=14 year=1985
OK  station=14 year=1986
OK  station=14 year=1987
OK  station=14 year=1988
OK  station=14 year=1989
OK  station=14 year=1990
OK  station=14 year=1991
OK  station=14 year=1992
OK  station=14 year=1993
OK  station=14 year=1994
OK  station=14 year=1995
OK  station=14 year=1996
OK  station=15 year=1971
OK  station=15 year=1972
OK  station=15 year=1973
OK  station=15 year=1974
OK  station=15 year=1975
OK  station=15 year=1976
OK  station=15 year=1977
OK  station=15 year=1978
OK  station=15 year=1979
OK  station=15 year=1980
OK  station=15 year=1981
OK  station=15 year=1982
OK  station=15 year=1983
OK  station=15 year=1984
OK  station=15 year=1985
OK  station=15 year=1986
OK  station=15 year=1987
OK  station=15 year=1988
OK  station=15 year=1989
OK  station=15 year=1990
OK  station=15 year=1991
OK  station=15 year=1992
OK  station=15 year=1993
OK  station=15 year=1994
OK  station=15 year=1995
OK  station=16 year=1961
OK  station=16 year=1962


KeyboardInterrupt: 

We have used the single-download function to get the data for the first 87 stations, but this method is too slow. We now limit the date range to 2000-2026 to speed up the process.

In [None]:
# Work on a copy of the full inventory. To resume after the stations that
# were already fetched, slice first, e.g. stations[86:] (station 94, row #87).
stations_new = stations.copy()

# Limit the year range to 2000-2026: any start year before 2000 becomes 2000
stations_new["First Year"] = stations_new["First Year"].clip(lower=2000)

# Keep only the stations whose record reaches 2000 or later
ends_in_range = stations_new["Last Year"] >= 2000
stations_new = stations_new.loc[ends_in_range]

bulk_download_daily(stations_new, out_dir="data/daily_limited", sleep_sec=0.1)

OK  station=78 year=2000
OK  station=78 year=2001
OK  station=78 year=2002
OK  station=78 year=2003
OK  station=78 year=2004
OK  station=78 year=2005
OK  station=78 year=2006
OK  station=78 year=2007
OK  station=78 year=2008
OK  station=78 year=2009
OK  station=78 year=2010
OK  station=78 year=2011
OK  station=78 year=2012
OK  station=78 year=2013
OK  station=78 year=2014
OK  station=78 year=2015
OK  station=78 year=2016
OK  station=78 year=2017
OK  station=78 year=2018
OK  station=78 year=2019
OK  station=78 year=2020
OK  station=78 year=2021
OK  station=78 year=2022
OK  station=78 year=2023
OK  station=78 year=2024
OK  station=78 year=2025
OK  station=78 year=2026
OK  station=95 year=2000
OK  station=95 year=2001
OK  station=95 year=2002
OK  station=95 year=2003
OK  station=95 year=2004
OK  station=95 year=2005
OK  station=95 year=2006
OK  station=95 year=2007
OK  station=95 year=2008
OK  station=95 year=2009
OK  station=95 year=2010
OK  station=95 year=2011
OK  station=95 year=2012


All of the data have been downloaded and stored in the "data/daily_limited" folder. We now merge all the data into a single csv file for further analysis.

In [17]:
from pathlib import Path
import csv
import re

# Folder that is searched recursively for the downloaded daily_*.csv files
INPUT_ROOT = Path("data/daily_limited")
# Merged CSV produced by this cell (relative path, resolved against the
# current working directory)
OUTPUT_FILE = Path("all_daily.csv")

def extract_station_id(path: Path) -> int:
    """Return the station id encoded in the folder that contains ``path``.

    For example, "data/daily/station123/daily_2020.csv" gives 123.

    The parent directories are examined one by one from the innermost
    outwards, so the "station<digits>" folder closest to the file wins.
    Searching the whole parent path as one string would instead pick up the
    first match anywhere in it, e.g. an ancestor folder named
    "backup_station7".

    Raises:
        ValueError: If no parent directory name contains "station<digits>".
    """
    for part in reversed(path.parent.parts):
        m = re.search(r"station(\d+)", part)
        if m:
            return int(m.group(1))
    raise ValueError(f"Cannot parse station id from path: {path}")

# Merge every daily_*.csv under INPUT_ROOT into OUTPUT_FILE.
# The header is taken from the first non-empty file and written once; the
# rows of all later files are appended as-is, so all inputs are assumed to
# share that column layout.

# Find all daily_*.csv files (recursive search, sorted by path)
files = sorted(INPUT_ROOT.rglob("daily_*.csv"))
if not files:
    raise RuntimeError(f"No files found under {INPUT_ROOT}")

written_header = False
need_add_station_id = False

# Open the output file for writing
with OUTPUT_FILE.open("w", newline="", encoding="utf-8") as fout:
    writer = csv.writer(fout)

    # Loop over each input file
    for idx, f in enumerate(files, 1):
        sid = extract_station_id(f)

        # "utf-8-sig" removes a leading BOM while decoding, i.e. before the
        # csv module parses the line. Stripping it afterwards is too late:
        # a BOM in front of a quoted first field makes csv.reader treat the
        # quotes as literal characters, corrupting that field.
        with f.open("r", newline="", encoding="utf-8-sig") as fin:
            reader = csv.reader(fin)

            # Read the header; None marks an empty file, which is skipped
            header = next(reader, None)

            if header is not None:
                # Write the header if not written (only once)
                if not written_header:
                    need_add_station_id = ("Station ID" not in header)
                    if need_add_station_id:
                        header = ["Station ID"] + header
                    writer.writerow(header)
                    written_header = True

                # Write the data rows
                for row in reader:
                    if not row:  # skip empty rows
                        continue
                    if need_add_station_id:  # add station id to each row
                        row = [sid] + row
                    writer.writerow(row)

        # Print progress every 2000 files; kept outside the empty-file skip
        # so a milestone is reported even when that file had no content
        if idx % 2000 == 0:
            print(f"Processed {idx}/{len(files)} files...")

print(f"Done. Output: {OUTPUT_FILE.resolve()}")

Processed 2000/51708 files...
Processed 4000/51708 files...
Processed 6000/51708 files...
Processed 8000/51708 files...
Processed 10000/51708 files...
Processed 12000/51708 files...
Processed 14000/51708 files...
Processed 16000/51708 files...
Processed 18000/51708 files...
Processed 20000/51708 files...
Processed 22000/51708 files...
Processed 24000/51708 files...
Processed 26000/51708 files...
Processed 28000/51708 files...
Processed 30000/51708 files...
Processed 32000/51708 files...
Processed 34000/51708 files...
Processed 36000/51708 files...
Processed 38000/51708 files...
Processed 40000/51708 files...
Processed 42000/51708 files...
Processed 44000/51708 files...
Processed 46000/51708 files...
Processed 48000/51708 files...
Processed 50000/51708 files...
Done. Output: C:\Users\bchen\OneDrive - University of Waterloo\University of Waterloo\M2\STAT 946\Case 1\all_daily.csv
