pip install cdsapi  
pip install pygrib

In [474]:
import os, sys, cdsapi, pygrib, calendar
from QueryHandler import QueryHandler
from shapely.geometry import Point
from dotenv import load_dotenv
import sqlalchemy as sq 
import geopandas as gpd
import numpy as np

sys.path.append('../')
from DataService import DataService


# Load environment variables (python-dotenv; presumably from a local .env file)
# before reading the PostgreSQL connection settings below.
load_dotenv()
# PostgreSQL connection settings; each is None when the variable is not set.
PG_DB = os.getenv('POSTGRES_DB')
PG_ADDR = os.getenv('POSTGRES_ADDR')
PG_PORT = os.getenv('POSTGRES_PORT')
PG_USER = os.getenv('POSTGRES_USER')
PG_PW = os.getenv('POSTGRES_PW')

# Inclusive range of calendar months pulled for every year.
MIN_MONTH = 3
MAX_MONTH = 12

# Inclusive range of years pulled.
MIN_YEAR = 1995
MAX_YEAR = 2023

In [475]:
# Copernicus Climate Data Store client used by the download loop (c.retrieve).
# NOTE(review): cdsapi presumably reads its credentials from its own config
# (e.g. ~/.cdsapirc) - confirm this is set up before running.
c = cdsapi.Client()
# Builds the SQL strings and interprets the results for the table/row checks.
queryHandler = QueryHandler()
# Database service wrapper; the cells below call db.execute() and db.connect().
db = DataService(PG_DB, PG_ADDR, PG_PORT, PG_USER, PG_PW)

In [476]:
def createTable(db):
    """Create the ``copernicus_satelite_data`` table if it is not there yet.

    Asks the module-level ``queryHandler`` for a table-existence query, runs
    it through ``db.execute`` and only issues the create-table statement when
    the handler reports that the table is missing.

    Args:
        db: object exposing ``execute(query)``.
    """
    existsStmt = sq.text(queryHandler.tableExistsReq('copernicus_satelite_data'))
    alreadyPresent = queryHandler.readTableExists(db.execute(existsStmt))

    # Nothing to do when the table is already in place.
    if alreadyPresent:
        return

    createStmt = sq.text(queryHandler.createCopernicusTableReq())
    db.execute(createStmt)

In [477]:
def calcAgRegion(agRegions: gpd.GeoDataFrame, point: Point) -> str:
    """Return the ``car_name`` of the first region whose geometry contains ``point``.

    Rows of ``agRegions`` are scanned in order; the search stops at the first
    match. An empty string is returned when no region contains the point.

    NOTE(review): the result of ``contains`` is indexed with ``[0]``, so
    ``point`` has to be array-like (the caller in this notebook passes a
    one-element GeoSeries) even though it is annotated as ``Point``.

    Args:
        agRegions: frame with ``geometry`` and ``car_name`` columns.
        point: the location to look up (see the note above).

    Returns:
        The matching region name, or ``''`` if there is none.
    """
    for _, row in agRegions.iterrows():
        isInside = row['geometry'].contains(point)[0]
        if isInside:
            return row['car_name']

    return ''

In [478]:
def storeData(db, lon, lat, year, month, day, hour, region, attr, value):
    """Insert or update one attribute value for a grid point and timestamp.

    A row is identified by ``(lon, lat, timestamp)``. If the module-level
    ``queryHandler`` reports that such a row already exists, only ``attr`` is
    updated; otherwise a new row is inserted.

    Args:
        db: object exposing ``execute(query)``.
        lon, lat: coordinates of the grid point.
        year, month, day: date components (strings or ints, padded or not);
            passed through unchanged to the insert query.
        hour: time of day as an ``'HH:MM'`` string.
        region: name of the region the point belongs to.
        attr: name of the attribute being stored.
        value: value of the attribute.

    Raises:
        ValueError: if the date/time components do not form a valid timestamp.
    """
    # np.datetime64 only parses ISO-8601 strings with zero-padded fields
    # ('1995-03-01', not '1995-3-1'), so normalise the components first.
    timestamp = np.datetime64(
        f'{int(year):04d}-{int(month):02d}-{int(day):02d}T{hour}'
    )
    query = sq.text(queryHandler.createRowExistsInDBReq(lon, lat, timestamp))
    rowExists = queryHandler.readRowExistsInDB(db.execute(query))
    # The insert query takes the hour of day as an integer.
    hour = int(hour.split(':')[0])

    if rowExists:
        query = sq.text(queryHandler.createUpdateRowReq(lon, lat, timestamp, attr, value))
    else:
        query = sq.text(queryHandler.createInsertRowReq(lon, lat, timestamp, year, month, day, hour, region, attr, value))

    db.execute(query)

In [479]:
def processFile(grbs, db, agRegions: gpd.GeoDataFrame, year, month, day, hour, attr):
    """Store every grid value of a GRIB message that falls inside a known region.

    Reads the values, latitudes and longitudes from ``grbs[1]``, looks up the
    region of each grid point with ``calcAgRegion`` and, when a region is
    found, writes the value through ``storeData``.

    Args:
        grbs: opened GRIB file; ``grbs[1].data()`` must return
            ``(values, lats, lons)`` as equally shaped 2-D sequences.
        db: database handle forwarded to ``storeData``.
        agRegions: region geometries (EPSG:4326) with a ``car_name`` column.
        year, month, day, hour: timestamp components forwarded to ``storeData``.
        attr: name of the attribute the values belong to.
    """
    # NOTE(review): pygrib messages are presumably 1-indexed, so grbs[1] is the
    # first (and, for a single-variable request, only) message - confirm.
    listOfdata, listOfLats, listOfLons = grbs[1].data()

    for rowIndex, row in enumerate(listOfdata):
        for colIndex, value in enumerate(row):
            x = listOfLons[rowIndex][colIndex]
            y = listOfLats[rowIndex][colIndex]

            # calcAgRegion indexes the result of contains(), so it needs a
            # one-element GeoSeries rather than a bare Point.
            point = gpd.GeoSeries(Point(x, y), crs='EPSG:4326')
            region = calcAgRegion(agRegions, point)

            # The store must happen per grid point. Previously this check sat
            # outside the inner loop, so only the last point of each row was
            # ever written.
            if region:
                print('about to store in db')
                storeData(db, x, y, year, month, day, hour, region, attr, value)

In [480]:
def loadGeometry(conn) -> gpd.GeoDataFrame:
    """Load the census agricultural regions and reproject them to EPSG:4326.

    Reads ``car_name`` and ``geometry`` from ``public.census_ag_regions``,
    labels the geometries as EPSG:3347 (overriding any CRS already attached)
    and converts them to EPSG:4326.

    Args:
        conn: database connection handed to ``GeoDataFrame.from_postgis``.

    Returns:
        The regions as a GeoDataFrame in EPSG:4326.
    """
    regionSql = sq.text('select car_name, geometry FROM public.census_ag_regions')
    regions = gpd.GeoDataFrame.from_postgis(regionSql, conn, geom_col='geometry')

    # Force the source CRS first, then reproject.
    return regions.set_crs("EPSG:3347", allow_override=True).to_crs("EPSG:4326")
    

In [481]:
# Request parameters: every year/month in the configured inclusive ranges,
# every hour of the day, and the ERA5-Land variables to pull.
years = [str(year) for year in range(MIN_YEAR, MAX_YEAR + 1)]
months = [str(month) for month in range(MIN_MONTH, MAX_MONTH + 1)]

attrs = [
    '2m_dewpoint_temperature', '2m_temperature', 'evaporation_from_bare_soil', 'skin_reservoir_content', 'skin_temperature',
    'snowmelt', 'soil_temperature_level_1', 'soil_temperature_level_2', 'soil_temperature_level_3', 'soil_temperature_level_4',
    'surface_net_solar_radiation', 'surface_pressure', 'volumetric_soil_water_layer_1', 'volumetric_soil_water_layer_2', 
    'volumetric_soil_water_layer_3', 'volumetric_soil_water_layer_4'
]

# '00:00' .. '23:00'
hours = [f'{hourOfDay:02d}:00' for hourOfDay in range(24)]


conn = db.connect()

createTable(db)
agRegions = loadGeometry(conn)

for year in years:
    print(f'Pulling data for {year} ...')

    for month in months:
        # The number of days depends on the month and on leap years.
        numDays = calendar.monthrange(int(year), int(month))[1]
        days = [str(day) for day in range(1, numDays + 1)]

        for day in days:
            for hour in hours:
                for attr in attrs:
                    # One request per (day, hour, variable); the result is
                    # written to a scratch file that is deleted after processing.
                    c.retrieve(
                        'reanalysis-era5-land',
                        {
                            'format': 'grib',
                            'variable': [attr],
                            'year': year,
                            'month': month,
                            'day': [day],
                            'time': [hour],
                            # NOTE(review): presumably [north, west, south, east]
                            # per the CDS API convention - confirm.
                            'area': [61, -125, 48, -88],
                        },
                        'download.grib'
                    )

                    # The two '2m_' variables are stored without that prefix
                    # (presumably because a column name cannot start with a
                    # digit - confirm against the table definition). The loop
                    # variable is left untouched.
                    if attr == '2m_dewpoint_temperature' or attr == '2m_temperature':
                        column = attr[3:]
                    else:
                        column = attr

                    # read the file, process it, delete it then go onto the next set of data
                    grbs = pygrib.open('download.grib')
                    try:
                        processFile(grbs, db, agRegions, year, month, day, hour, column)
                    finally:
                        # Close the handle before deleting: an open handle leaks
                        # one file descriptor per request and prevents removal
                        # on Windows.
                        grbs.close()
                        os.remove('download.grib')

print('[SUCCESS] data pulled successfully')

Pulling data for 1995 ...


2023-06-11 22:16:06,666 INFO Welcome to the CDS
2023-06-11 22:16:06,668 INFO Sending request to https://cds.climate.copernicus.eu/api/v2/resources/reanalysis-era5-land
2023-06-11 22:16:06,857 INFO Request is queued
2023-06-11 22:16:08,003 INFO Request is running
2023-06-11 22:16:09,651 INFO Request is completed
2023-06-11 22:16:09,653 INFO Downloading https://download-0014-clone.copernicus-climate.eu/cache-compute-0014/cache/data9/adaptor.mars.internal-1686521766.93957-7263-7-b2e27730-c39c-44c7-a520-b3cdd58f590d.grib to download.grib (96K)
2023-06-11 22:16:10,803 INFO Download rate 83.6K/s  


processing file
