pip install cdsapi  

In [65]:
import os, sys, time, cdsapi, random, zipfile, calendar
from dotenv import load_dotenv
import sqlalchemy as sq 
import geopandas as gpd
import xarray as xr
import pandas as pd
import numpy as np

sys.path.append('../')
from DataService import DataService


# load_dotenv() runs first so the lookups below can see whatever it loads
# (presumably from a local .env file - confirm where the file lives).
load_dotenv()

# Database connection settings; os.getenv yields None for any variable that is not set.
PG_DB, PG_ADDR, PG_PORT, PG_USER, PG_PW = map(
    os.getenv,
    ('POSTGRES_DB', 'POSTGRES_ADDR', 'POSTGRES_PORT', 'POSTGRES_USER', 'POSTGRES_PW'),
)

In [52]:
# ---- Request pacing -------------------------------------------------------
# When the job list is built, each job's delay is
#   REQ_DELAY * (job number % NUM_WORKERS) + a random value in [MIN_DELAY, MAX_DELAY]
# (and 0 for every job whose number is a multiple of NUM_WORKERS).
NUM_WORKERS = 12
REQ_DELAY = 120     # seconds (2 minutes) - base stagger, multiplied by the job's slot number
MIN_DELAY = 60      # seconds (1 minute)  - lower bound of the random extra wait added to the stagger
MAX_DELAY = 180     # seconds (3 minutes) - upper bound of the random extra wait added to the stagger

# Name of the table the rows are appended to (spelling kept exactly as-is).
TABLE = 'copernicus_satelite_data'

# ---- Time span to pull ----------------------------------------------------
MIN_MONTH = 1
MAX_MONTH = 12

MIN_YEAR = 1995
MAX_YEAR = 2023

# Both ranges are inclusive and stored as strings, which is how they are sent in the request.
years = list(map(str, range(MIN_YEAR, MAX_YEAR + 1)))
months = list(map(str, range(MIN_MONTH, MAX_MONTH + 1)))

# ---- What to pull ---------------------------------------------------------
# The variables requested from the dataset.
ATTRS = [
    '2m_dewpoint_temperature',
    '2m_temperature',
    'evaporation_from_bare_soil',
    'skin_reservoir_content',
    'skin_temperature',
    'snowmelt',
    'soil_temperature_level_1',
    'soil_temperature_level_2',
    'soil_temperature_level_3',
    'soil_temperature_level_4',
    'surface_net_solar_radiation',
    'surface_pressure',
    'volumetric_soil_water_layer_1',
    'volumetric_soil_water_layer_2',
    'volumetric_soil_water_layer_3',
    'volumetric_soil_water_layer_4',
    'leaf_area_index_high_vegetation',
    'leaf_area_index_low_vegetation',
]

# Two snapshots per day, intended as the coldest and warmest hours of the day.
# NOTE(review): if the service interprets these as UTC they will not line up with
# local early-morning / mid-afternoon for this area - confirm.
HOURS = ['04:00', '15:00']

# Bounding box of the request; presumably [north, west, south, east] in degrees - confirm.
AREA = [53, -115, 52, -114]

In [66]:
# Load the agriculture-region polygons (cr_num + geometry) into a GeoDataFrame.
db = DataService(PG_DB, PG_ADDR, PG_PORT, PG_USER, PG_PW)       # Handles connections to the database
conn = db.connect()                                             # Connect to the database

try:
    query = sq.text('select cr_num, geometry FROM public.census_ag_regions')
    agRegions = gpd.GeoDataFrame.from_postgis(query, conn, crs='EPSG:3347', geom_col='geometry')
finally:
    # Always run the cleanup, even when the query fails, so a failed read
    # does not leave the connection open in the kernel.
    db.cleanup()

In [75]:
# Builds one argument tuple per (year, month):
#   (agRegions, delay, year, month, days, outputFile)
jobArgs = []    # Holds the argument tuples for pooled workers
count = 0       # Running job number, also used to make each output file name unique

for year in years:
    for month in months:
        # monthrange returns (weekday of the 1st, number of days); only the latter is needed
        _, numDays = calendar.monthrange(int(year), int(month))

        slot = count % NUM_WORKERS
        jitter = random.randint(MIN_DELAY, MAX_DELAY)   # drawn for every job, even when the delay ends up 0
        delay = REQ_DELAY * slot + jitter if slot != 0 else 0

        days = list(map(str, range(1, numDays + 1)))
        outputFile = f'copernicus{count}'
        count += 1

        jobArgs.append((agRegions, delay, year, month, days, outputFile))

In [15]:
# Take the first prepared job and unpack its six fields.
agRegions, delay, year, month, days, outputFile = jobArgs[0]

db = DataService(PG_DB, PG_ADDR, PG_PORT, PG_USER, PG_PW)
time.sleep(delay)   # wait out this job's delay before contacting the service

print(f'Starting to pull data for {year}/{month}')
conn = db.connect()
c = cdsapi.Client()

# Request body: every day of the chosen month, at the configured hours,
# for the configured variables and bounding box, delivered as a zipped netcdf.
request = {
    'format': 'netcdf.zip',
    'variable': ATTRS,
    'year': year,
    'month': month,
    'day': days,
    'time': HOURS,
    'area': AREA,
}

# Kept as the last expression so the cell still displays the returned result.
c.retrieve('reanalysis-era5-land', request, f'{outputFile}.netcdf.zip')

Starting to pull data for 1995/1


2023-06-16 23:45:27,843 INFO Welcome to the CDS
2023-06-16 23:45:27,844 INFO Sending request to https://cds.climate.copernicus.eu/api/v2/resources/reanalysis-era5-land
2023-06-16 23:45:28,075 INFO Request is queued
2023-06-16 23:45:29,262 INFO Request is running
2023-06-16 23:48:21,174 INFO Request is completed
2023-06-16 23:48:21,174 INFO Downloading https://download-0015-clone.copernicus-climate.eu/cache-compute-0015/cache/data0/adaptor.mars.internal-1686977289.4974449-13284-17-72b552f4-0780-4ad8-9055-6b3453f4649d.zip to copernicus1.netcdf.zip (197.1K)
2023-06-16 23:48:24,457 INFO Download rate 60.1K/s 


Result(content_length=201860,content_type=application/zip,location=https://download-0015-clone.copernicus-climate.eu/cache-compute-0015/cache/data0/adaptor.mars.internal-1686977289.4974449-13284-17-72b552f4-0780-4ad8-9055-6b3453f4649d.zip)

In [16]:
# Extract the first member of the downloaded archive (only one is expected),
# renaming it to outputFile on the way out.
with zipfile.ZipFile(f'{outputFile}.netcdf.zip', 'r') as zip_ref:
    for zipinfo in zip_ref.infolist()[:1]:      # at most one member is handled
        zipinfo.filename = outputFile           # name the extracted file will be written under
        zip_ref.extract(zipinfo)                # writes the file to the current directory

In [25]:
# Load the extracted netcdf file and flatten it into a dataframe.
# The context manager closes the dataset even if the conversion raises,
# so the file handle is never left open in the kernel.
with xr.open_dataset(outputFile) as dataset:
    df = dataset.to_dataframe().reset_index()   # reset_index turns the index levels into regular columns

In [26]:
# Placeholder columns for the date parts; they are appended after the existing columns.
for datePart in ('year', 'month', 'day', 'hour'):
    df[datePart] = None

# Target names for the first 21 columns, in positional order, so the frame
# matches the column names expected when it is posted to the database.
# NOTE(review): this is purely positional - it assumes the dataset always yields
# its columns in this order; confirm against the netcdf variable order.
columnNames = [
    'lon',
    'lat',
    'datetime',
    'dewpoint_temperature',
    'temperature',
    'evaporation_from_bare_soil',
    'skin_reservoir_content',
    'skin_temperature',
    'snowmelt',
    'soil_temperature_level_1',
    'soil_temperature_level_2',
    'soil_temperature_level_3',
    'soil_temperature_level_4',
    'surface_net_solar_radiation',
    'surface_pressure',
    'volumetric_soil_water_layer_1',
    'volumetric_soil_water_layer_2',
    'volumetric_soil_water_layer_3',
    'volumetric_soil_water_layer_4',
    'leaf_area_index_high_vegetation',
    'leaf_area_index_low_vegetation',
]

# One rename per position, applied in order.
for position, newName in enumerate(columnNames):
    df.rename(columns={df.columns[position]: newName}, inplace=True)

# Every renamed column except the timestamp is cast to float, so missing or
# masked values are represented as NaN.
floatColumns = [name for name in columnNames if name != 'datetime']
df[floatColumns] = df[floatColumns].astype(float)

# NaN values are swapped for None (they are replaced, not dropped).
# NOTE(review): older pandas releases reportedly ignored an explicit value=None
# here and padded instead - confirm the pandas version in use.
df = df.replace(np.nan, None)


In [27]:
# Preview of the renamed frame; year/month/day/hour are still empty at this point.
df

Unnamed: 0,lon,lat,datetime,dewpoint_temperature,temperature,evaporation_from_bare_soil,skin_reservoir_content,skin_temperature,snowmelt,soil_temperature_level_1,...,volumetric_soil_water_layer_1,volumetric_soil_water_layer_2,volumetric_soil_water_layer_3,volumetric_soil_water_layer_4,leaf_area_index_high_vegetation,leaf_area_index_low_vegetation,year,month,day,hour
0,-115.0,53.0,1995-01-01 04:00:00,258.381165,262.071106,-7.539711e-10,0.000115,260.917725,0.000000,270.601624,...,0.342025,0.326936,0.325364,0.339111,1.028697,0.847050,,,,
1,-115.0,53.0,1995-01-01 15:00:00,252.468689,256.917389,-1.885383e-09,0.000115,255.403214,0.000000,270.401154,...,0.341537,0.326920,0.325347,0.339111,1.028697,0.847050,,,,
2,-115.0,53.0,1995-01-02 04:00:00,256.430634,258.859070,-3.765308e-10,0.000115,258.125427,0.000000,270.490234,...,0.341080,0.326859,0.325333,0.339096,1.025895,0.846912,,,,
3,-115.0,53.0,1995-01-02 15:00:00,245.580505,248.302505,-3.922651e-08,0.000116,248.369461,0.000000,269.923767,...,0.340928,0.326813,0.325317,0.339081,1.025895,0.846912,,,,
4,-115.0,53.0,1995-01-03 04:00:00,253.586838,256.139221,0.000000e+00,0.000116,255.435898,0.000000,270.123779,...,0.340561,0.326721,0.325288,0.339081,1.023051,0.846912,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7497,-114.0,52.0,1995-01-29 15:00:00,264.683014,270.005188,-6.034497e-09,0.000144,267.600403,0.000000,268.843109,...,0.316573,0.274857,0.257156,0.262130,1.269526,2.305913,,,,
7498,-114.0,52.0,1995-01-30 04:00:00,271.451782,276.032104,-7.165909e-09,0.000070,274.736420,0.000056,271.746735,...,0.340500,0.275787,0.257156,0.262130,1.269279,2.305671,,,,
7499,-114.0,52.0,1995-01-30 15:00:00,266.943176,268.965576,-1.169246e-08,0.000142,265.785431,0.000108,271.336121,...,0.338028,0.277085,0.257156,0.262130,1.269279,2.305671,,,,
7500,-114.0,52.0,1995-01-31 04:00:00,270.431915,272.523651,-7.539711e-10,0.000088,271.186615,0.000000,271.692200,...,0.348098,0.278884,0.257172,0.262130,1.269032,2.305429,,,,


In [28]:
# Turn each (lon, lat) pair into a point geometry, tagged as EPSG:4326.
lonLatPoints = gpd.points_from_xy(df.lon, df.lat)
pointsFrame = gpd.GeoDataFrame(df, crs="EPSG:4326", geometry=lonLatPoints)

# Reproject the points to EPSG:3347, the CRS the agriculture regions were loaded with.
projectedPoints = pointsFrame.to_crs(crs='EPSG:3347')

# Left join: every point is kept and picks up the columns of the region it lies within.
df = gpd.sjoin(projectedPoints, agRegions, how='left', predicate='within')

In [30]:
# The point geometry and the join's index_right column are removed in place.
joinOnlyColumns = ['geometry', 'index_right']
df.drop(columns=joinOnlyColumns, inplace=True)

In [44]:
# Split the timestamp into year / month / day / hour columns in one vectorised pass.
# This replaces a row-by-row loop that addressed rows by the labels 0..n-1, which
# only works while the index is a unique, gap-free range (a left spatial join does
# not guarantee that) and is far slower on thousands of rows.
timestamps = pd.to_datetime(df['datetime'])
df['year'] = timestamps.dt.year
df['month'] = timestamps.dt.month
df['day'] = timestamps.dt.day
df['hour'] = timestamps.dt.hour

In [47]:
# Append the rows to the target table in the public schema; the dataframe index is not written.
df.to_sql(
    name=TABLE,
    con=conn,
    schema='public',
    if_exists='append',
    index=False,
)

502

In [35]:
# Report which year/month finished, then run the database cleanup.
successMessage = f'[SUCCESS] data was pulled for {year}/{month}'
print(successMessage)
db.cleanup()

[SUCCESS] data was pulled for 1995/1
