pip install cdsapi  

This script walks through a single Copernicus data pull step by step so you can see what data is available and which transformations are applied at each stage. Note: the data produced here should not actually be imported into the database.

In [None]:
import os, sys, time, random, zipfile, calendar
import cdsapi  # type: ignore
from dotenv import load_dotenv
import sqlalchemy as sq
import geopandas as gpd  # type: ignore
import xarray as xr  # type: ignore
import pandas as pd
import numpy as np

sys.path.append("../")
from Shared.DataService import DataService  # type: ignore


# Load variables via python-dotenv, then read the Postgres connection settings
# from the environment (each one is None when its variable is not set).
load_dotenv()
PG_DB, PG_ADDR, PG_PORT, PG_USER, PG_PW = (
    os.getenv(variable)
    for variable in (
        "POSTGRES_DB",
        "POSTGRES_ADDR",
        "POSTGRES_PORT",
        "POSTGRES_USER",
        "POSTGRES_PW",
    )
)

In [None]:
# --- Request throttling (all delays are in seconds) ---
NUM_WORKERS = 12
REQ_DELAY = 120  # base delay required to bypass pulling limits
MIN_DELAY = 60  # smallest random jitter (1 minute); REQ_DELAY + MIN_DELAY = 3 minutes
MAX_DELAY = 180  # largest random jitter (3 minutes); REQ_DELAY + MAX_DELAY = 5 minutes

# Destination table for the pulled data
TABLE = "copernicus_satelite_data"

# --- Time span to request (both bounds inclusive) ---
MIN_MONTH = 1
MAX_MONTH = 12

MIN_YEAR = 1995
MAX_YEAR = 2023

# The request is built from strings, so years and months are stored as text
years = list(map(str, range(MIN_YEAR, MAX_YEAR + 1)))
months = list(map(str, range(MIN_MONTH, MAX_MONTH + 1)))

# --- Variables requested from the dataset ---
ATTRS = [
    "2m_dewpoint_temperature",
    "2m_temperature",
    "evaporation_from_bare_soil",
    "skin_reservoir_content",
    "skin_temperature",
    "snowmelt",
    "soil_temperature_level_1",
    "soil_temperature_level_2",
    "soil_temperature_level_3",
    "soil_temperature_level_4",
    "surface_net_solar_radiation",
    "surface_pressure",
    "volumetric_soil_water_layer_1",
    "volumetric_soil_water_layer_2",
    "volumetric_soil_water_layer_3",
    "volumetric_soil_water_layer_4",
    "leaf_area_index_high_vegetation",
    "leaf_area_index_low_vegetation",
]

# Two samples per day: what is typically considered the coldest and the
# warmest hour of the day
HOURS = ["04:00", "15:00"]

# Bounding box of the request
# NOTE(review): presumably ordered [north, west, south, east] - confirm against the CDS API
AREA = [53, -115, 52, -114]

In [None]:
# Stop early unless every Postgres setting was found in the environment
if not (
    PG_DB is not None
    and PG_ADDR is not None
    and PG_PORT is not None
    and PG_USER is not None
    and PG_PW is not None
):
    raise ValueError("Environment variables not set")

# DataService handles connections to the database; the port is read from the
# environment as text, so it is converted to an int here
db = DataService(PG_DB, PG_ADDR, int(PG_PORT), PG_USER, PG_PW)
conn = db.connect()

# Fetch the agriculture regions (id + geometry) as a GeoDataFrame in EPSG:3347
query = sq.text("select cr_num, geometry FROM public.census_ag_regions")
agRegions = gpd.GeoDataFrame.from_postgis(
    sql=query, con=conn, geom_col="geometry", crs="EPSG:3347"
)

# The regions are in memory now, so the database resources can be released
db.cleanup()

In [None]:
jobArgs = []  # One argument tuple per (year, month) request
count = 0  # Running job number; also keeps every output file name unique

# Each entry is (agRegions, delay, year, month, days, outputFile)
for year in years:
    for month in months:
        # Stagger the jobs: the first job of every group of NUM_WORKERS starts
        # immediately (delay 0); the others wait REQ_DELAY per position in the
        # group plus a random jitter between MIN_DELAY and MAX_DELAY seconds
        slot = count % NUM_WORKERS
        jitter = random.randint(MIN_DELAY, MAX_DELAY)
        delay = (slot != 0) * (REQ_DELAY * slot + jitter)

        # monthrange returns (weekday of the first day, number of days in the month)
        numDays = calendar.monthrange(int(year), int(month))[1]
        days = list(map(str, range(1, numDays + 1)))

        outputFile = f"copernicus{count}"
        jobArgs.append((agRegions, delay, year, month, days, outputFile))
        count += 1

In [None]:
# Walk through the first queued job only
agRegions, delay, year, month, days, outputFile = jobArgs[0]

# Stop early unless every Postgres setting was found in the environment
if not (
    PG_DB is not None
    and PG_ADDR is not None
    and PG_PORT is not None
    and PG_USER is not None
    and PG_PW is not None
):
    raise ValueError("Environment variables not set")

db = DataService(PG_DB, PG_ADDR, int(PG_PORT), PG_USER, PG_PW)

# Wait for this job's staggered start before contacting the data service
time.sleep(delay)

print(f"Starting to pull data for {year}/{month}")
conn = db.connect()
c = cdsapi.Client()

# One request covers every day of the month at the configured hours and area
request = {
    "format": "netcdf.zip",
    "variable": ATTRS,
    "year": year,
    "month": month,
    "day": days,
    "time": HOURS,
    "area": AREA,
}
# The result is written to a zip archive named after the job
c.retrieve("reanalysis-era5-land", request, f"{outputFile}.netcdf.zip")

In [None]:
# Unpack the downloaded archive; only its first member is extracted because a
# single file is expected inside
with zipfile.ZipFile(f"{outputFile}.netcdf.zip", mode="r") as zip_ref:
    zipinfos = zip_ref.infolist()  # metadata for every archive member

    for zipinfo in zipinfos[:1]:
        # Overriding the member's filename makes extract() write the file
        # under the job's name instead of the name stored in the archive
        zipinfo.filename = outputFile
        zip_ref.extract(zipinfo)

In [None]:
# Load the extracted netCDF file and flatten it into a dataframe.
# The context manager closes the dataset even if the conversion raises, so the
# file handle cannot leak; reset_index() turns the index levels into ordinary
# columns.
with xr.open_dataset(outputFile) as dataset:
    df = dataset.to_dataframe().reset_index()

In [None]:
# Placeholder columns for the date parts
for date_part in ("year", "month", "day", "hour"):
    df[date_part] = None

# Database-facing names, applied to the dataframe's columns strictly by position
# NOTE(review): assumes the dataset always yields its columns in this order - confirm
column_names = (
    "lon",
    "lat",
    "datetime",
    "dewpoint_temperature",
    "temperature",
    "evaporation_from_bare_soil",
    "skin_reservoir_content",
    "skin_temperature",
    "snowmelt",
    "soil_temperature_level_1",
    "soil_temperature_level_2",
    "soil_temperature_level_3",
    "soil_temperature_level_4",
    "surface_net_solar_radiation",
    "surface_pressure",
    "volumetric_soil_water_layer_1",
    "volumetric_soil_water_layer_2",
    "volumetric_soil_water_layer_3",
    "volumetric_soil_water_layer_4",
    "leaf_area_index_high_vegetation",
    "leaf_area_index_low_vegetation",
)

# Rename one column at a time so the names match the database when posted
for position, new_name in enumerate(column_names):
    df.rename(columns={df.columns[position]: new_name}, inplace=True)

# Everything except the timestamp is cast to float
numeric_columns = [name for name in column_names if name != "datetime"]
df[numeric_columns] = df[numeric_columns].astype(float)

# Swap NaN for None (presumably so missing values are stored as NULL - confirm)
df = df.replace(np.nan, None)

In [None]:
# Notebook cell output: show the dataframe so the cleaned columns can be inspected
df

In [None]:
# Build a point geometry for every row from its lon/lat pair (EPSG:4326)
df = gpd.GeoDataFrame(
    df, crs="EPSG:4326", geometry=gpd.points_from_xy(df.lon, df.lat)
)

# Reproject the points to EPSG:3347 so they match the agriculture regions.
# to_crs() returns the reprojected frame; with inplace=True it returns None,
# so assigning that result would replace df with None and break the join below.
df = df.to_crs(crs="EPSG:3347")  # type: ignore

# Keep only the points that fall within an agriculture region and attach that
# region's columns to them
df = gpd.sjoin(df, agRegions, how="inner", predicate="within")

In [None]:
# Remove the columns left over from the spatial join, then store the region
# number as an integer
df.drop(["geometry", "index_right"], axis="columns", inplace=True)
df["cr_num"] = df["cr_num"].astype(int)

In [None]:
# Split every row's timestamp into year / month / day / hour columns.
# This is done column-wise through the .dt accessor rather than looping over
# range(len(df)) with df.at: after the inner spatial join the index labels are
# no longer guaranteed to be 0..n-1 (rows may have been dropped or duplicated),
# so positional numbers used as labels can raise KeyError or hit the wrong rows.
timestamps = pd.to_datetime(df["datetime"])
df["year"] = timestamps.dt.year
df["month"] = timestamps.dt.month
df["day"] = timestamps.dt.day
df["hour"] = timestamps.dt.hour

In [None]:
# Append the rows to the target table; the dataframe index is not written
df.to_sql(name=TABLE, con=conn, schema="public", if_exists="append", index=False)

In [None]:
# Report completion for the processed year/month
print(f"[SUCCESS] data was pulled for {year}/{month}")
# Release the database resources held by the DataService helper
# (presumably closes the open connection - confirm against Shared.DataService)
db.cleanup()