In [1]:
########
#
# In this notebook we read in some ERA5 data and apply a country mask to it,
# for all years from 1979-2020.
#
# This script is a simple starter script, it can be adapted to read in
# multiple years of data or to read in different fields. 
# 
# Other functions are also available to load the data in the libraries: 
# - cfpython 
# - xarray
# - iris
#
#########

import numpy as np
from netCDF4 import Dataset
import matplotlib.pyplot as plt
import cartopy.io.shapereader as shpreader
import shapely.geometry
from datetime import datetime
from dateutil.relativedelta import *
import pandas as pd


def load_country_mask(COUNTRY,data_dir,filename,nc_key):

    '''

    Build a 0/1 country mask on the lat/lon grid of an ERA5 netCDF file.

    The country outline is taken from the Natural Earth 10m
    'admin_0_countries' shapefile (matched on its NAME_LONG attribute). Every
    grid point of the file is then tested against that outline.

    Args:
        COUNTRY (str): The NAME_LONG of the country in the Natural Earth
            shapefile, e.g. 'United Kingdom', 'France'.

        data_dir (str): The path where the data is stored, including the
            trailing separator (it is concatenated directly with filename).
            e.g '/home/users/zd907959/'

        filename (str): The filename of a .netcdf file
            e.g. 'ERA5_1979_01.nc'

        nc_key (str): The name of a variable in the file, e.g. 't2m','ssrd'.
            Only its shape is read (and printed, as a sanity check); the
            values themselves are not needed to build the mask.

    Returns:
        MASK_MATRIX_RESHAPE (array): Dimensions [lat,lon], with 1.0 where the
            grid point lies inside the country border and 0.0 elsewhere.

    Raises:
        ValueError: if no record in the shapefile has NAME_LONG == COUNTRY.

    '''

    # Local import: the file only imports `shapely.geometry` at the top.
    from shapely.prepared import prep

    # loop through the countries and extract the appropriate geometry
    countries_shp = shpreader.natural_earth(resolution='10m',category='cultural',
                                            name='admin_0_countries')
    country_shapely = []
    for country in shpreader.Reader(countries_shp).records():
        if country.attributes['NAME_LONG'] == COUNTRY:
            print('Found country')
            country_shapely.append(country.geometry)

    # Fail early, and with a readable message, instead of hitting an
    # IndexError on country_shapely[0] after the file has been read.
    if not country_shapely:
        raise ValueError(
            "No country with NAME_LONG == %r in the Natural Earth "
            "'admin_0_countries' shapefile" % (COUNTRY,))

    # read the grid of the file to be masked; try/finally guarantees the
    # file handle is released even if a variable is missing
    file_str = data_dir + filename
    dataset = Dataset(file_str,mode='r')
    try:
        lons = dataset.variables['longitude'][:]
        lats = dataset.variables['latitude'][:]
        # Only the shape is needed, so the (large) data array is not loaded.
        data_shape = dataset.variables[nc_key].shape # [time,lat,lon]
    finally:
        dataset.close()

    print(data_shape)

    # NOTE(review): the point-in-polygon test assumes the file's longitudes
    # use the same convention as the shapefile -- confirm for other domains.
    LONS, LATS = np.meshgrid(lons,lats) # make grids of the lat and lon data
    x, y = LONS.flatten(), LATS.flatten() # flatten to loop over every point

    # Preparing the geometry once makes the repeated contains() calls much
    # cheaper; the result of each test is unchanged. As before, only the
    # first matching record is used.
    country_outline = prep(country_shapely[0])

    MASK_MATRIX = np.zeros((len(x),1))
    for i in range(0,len(x)):
        my_point = shapely.geometry.Point(x[i],y[i])
        if country_outline.contains(my_point):
            MASK_MATRIX[i,0] = 1.0 # 1 inside the country, 0 outside

    MASK_MATRIX_RESHAPE = np.reshape(MASK_MATRIX,(len(lats),len(lons)))

    return MASK_MATRIX_RESHAPE

In [2]:
def load_country_weather_data(MASK_MATRIX_RESHAPE, data_dir,filename,nc_key):

    '''

    Load one ERA5 netCDF variable and average it over a country mask.

    The variable is read as [time,lat,lon], optionally unit-converted, and
    averaged over every grid cell where the mask is non-zero, giving one
    value per time step.

    Note that unit conversions are currently only implemented for nc_keys of
    't2m' and 'ssrd'.

    Cells are selected with the mask directly rather than by multiplying by
    the mask and treating zeros as missing, so genuine zero values inside the
    country (e.g. night-time 'ssrd') are kept in the average. All selected
    cells are weighted equally.

    Args:

    MASK_MATRIX_RESHAPE (array): Dimensions [lat,lon], with 1s within a
        country border and zeros outside it.

    data_dir (str): the path where the data is stored, including the trailing
        separator (it is concatenated directly with filename)

    filename (str): the filename of a .netcdf (.nc) file

    nc_key (str) : the name of the variable to load e.g. 't2m' or 'ssrd'

    Returns:

    country_timeseries (array): Dimensions [time]. Mean of the variable over
        the masked cells at each time step (NaN where no value is available).

    lats (array): the 'latitude' variable of the file.

    lons (array): the 'longitude' variable of the file.

    Raises:

    ValueError: if the mask shape does not match the [lat,lon] shape of the data.

    '''

    filestr = data_dir + filename
    dataset = Dataset(filestr,mode='r')
    # try/finally guarantees the file is closed even if a variable is missing
    try:
        lons = dataset.variables['longitude'][:]
        lats = dataset.variables['latitude'][:]
        data = dataset.variables[nc_key][:]
    finally:
        dataset.close()

    if nc_key == 't2m':
        data = data-273.15 # convert from K to degCelsius
    if nc_key == 'ssrd':
        data = data/3600. # convert Jh-1m-2 to Wm-2

    # Work on a plain float array; entries flagged as missing by a masked
    # array become NaN so that nanmean skips them.
    values = np.ma.filled(np.ma.asarray(data).astype(float), np.nan)

    inside_country = np.asarray(MASK_MATRIX_RESHAPE) != 0
    if inside_country.shape != values.shape[1:]:
        raise ValueError(
            "mask shape %s does not match the [lat,lon] shape of the data %s"
            % (inside_country.shape, values.shape[1:]))

    # An all-zero mask has nothing to average: return NaNs explicitly rather
    # than taking the mean of an empty selection.
    if not inside_country.any():
        return (np.full(values.shape[0], np.nan),lats,lons)

    # values[:, inside_country] has shape [time, n_country_cells]
    country_timeseries = np.nanmean(values[:, inside_country],axis=1)

    return (country_timeseries,lats,lons)
    

In [8]:
# Build the United Kingdom mask once, on the grid of one sample ERA5 file;
# the result is reused for every file loaded further down.
country_mask = load_country_mask(
    'United Kingdom',
    '/gws/pw/j05/cop26_hackathons/oxford/Data/ERA5_data_EU_domain/field_set_1/',
    'ERA5_1hr_field_set_1_2018_01.nc',
    't2m',
)

Found country
(744, 149, 229)


In [9]:
def load_variable_data(var,field_set_num,start_year=1979,end_year=2020,mask=None):

    '''
    Collect an hourly, country-averaged time series of one ERA5 variable.

    For every year in [start_year, end_year] the twelve monthly files are
    loaded with load_country_weather_data and joined into one row of the
    result. February is always cut to its first 28 days, so every year has
    8760 values and leap-year days (Feb 29) are dropped. Without this a leap
    year yields 8784 values, which cannot be stored in an 8760-wide row.

    Args:

    var (str): the name of the variable in the netCDF to collect (ex: t2m, ssrd, etc)

    field_set_num (str): which 'field_set_<n>' directory contains the variable

    start_year (int): first year to load (inclusive). Defaults to 1979.

    end_year (int): last year to load (inclusive). Defaults to 2020.

    mask (array): [lat,lon] country mask passed to load_country_weather_data.
        Defaults to the notebook-level `country_mask`, which must then
        already exist.

    Returns:

    2d-array (array-obj): dims (# of years, 8760)

    Raises:

    ValueError: if end_year is before start_year.

    '''
    HOURS_PER_YEAR = 8760    # 365 days * 24 hours
    FEB_HOURS_KEPT = 28 * 24 # assumes hourly files that start at the 1st of the month

    if end_year < start_year:
        raise ValueError("end_year (%s) is before start_year (%s)" % (end_year, start_year))

    if mask is None:
        mask = country_mask # looked up at call time, as the original did

    file_loc = f'/gws/pw/j05/cop26_hackathons/oxford/Data/ERA5_data_EU_domain/field_set_{field_set_num}/'

    ERA5_var_data = np.zeros([end_year - start_year + 1, HOURS_PER_YEAR])

    for qyear in range(start_year, end_year + 1): # loop over the years

        print(qyear)
        monthly_series = []

        for qmonth in range(1,13): # loop over the months
            # NOTE(review): the file-name prefix is fixed to 'field_set_1'
            # although the directory follows field_set_num -- confirm this
            # matches the files stored in the other field-set directories.
            file_name = 'ERA5_1hr_field_set_1_' + str(qyear) + '_' + str(qmonth).zfill(2) + '.nc'

            var_data,lats,lons = load_country_weather_data(mask,file_loc,file_name,var)

            if qmonth == 2:
                # Drop Feb 29 (no-op when February already has 28 days).
                var_data = var_data[:FEB_HOURS_KEPT]

            monthly_series.append(var_data)

        ERA5_var_data[qyear - start_year,:] = np.concatenate(monthly_series)

    return ERA5_var_data

# Collect data 

In [10]:

# Collect the ERA5 variables of interest. Each live call below loops over
# every year and month, so it is slow.
#
# Calls kept for reference only (currently disabled).
# NOTE(review): most of them omit the required field_set_num argument, and the
# cells further down use the ERA5_*_data names they would define -- confirm
# which field set each variable lives in before re-enabling them.
#ERA5_ssrd_data = load_variable_data("ssrd")
#ERA5_msl_data = load_variable_data("msl")
#ERA5_t2m_data = load_variable_data("t2m")
#ERA5_u10_data = load_variable_data("u10")
#ERA5_v10_data = load_variable_data("v10")
#ERA5_u100_data = load_variable_data("u100")
#ERA5_v100_data = load_variable_data("v100")
#ERA5_v100_data = load_variable_data("v100","1")

# NOTE(review): the target names say "cc" and "precip", but the variables
# requested are "u100" and "v100" -- confirm the intended variable keys.
ERA5_cc_data = load_variable_data(var="u100", field_set_num="3")
ERA5_precip_data = load_variable_data(var="v100", field_set_num="2")

1979




KeyboardInterrupt: 

### Convert country data to CSV file

In [9]:
# Collapse every (year, hour) array into one continuous 1-D hourly series.
ERA5_t2m_hourly = ERA5_t2m_data.flatten()
ERA5_ssrd_hourly = ERA5_ssrd_data.flatten()
ERA5_msl_hourly = ERA5_msl_data.flatten() / 100  # presumably Pa -> hPa; confirm

# Wind speed is the magnitude of the (u, v) components at each height;
# compute it on the 2-D arrays, then flatten like the other variables.
ERA5_w10_data = np.sqrt(ERA5_u10_data**2 + ERA5_v10_data**2)
ERA5_w10_hourly = ERA5_w10_data.flatten()

ERA5_w100_data = np.sqrt(ERA5_u100_data**2 + ERA5_v100_data**2)
ERA5_w100_hourly = ERA5_w100_data.flatten()

# Hourly timestamps from 1 Jan 1979 00:00 to 31 Dec 2020 23:00 (inclusive).
hours = pd.date_range(
    datetime(1979, 1, 1, 0),
    datetime(2020, 12, 31, 23),
    freq="1H",
)

# The two lengths differ at this point: `hours` still contains the Feb 29
# days, which the data arrays do not account for.
print(hours.shape)
print(ERA5_t2m_hourly.shape)

(368184,)


In [10]:
# Remove all Feb 29 timestamps from the hourly index.
# The filter is identical for every variable, so it is computed once instead
# of re-running strftime over the whole index five times.
_not_feb29 = hours.strftime("%m%d") != "0229"

ERA5_t2m_hours = hours[_not_feb29]
ERA5_ssrd_hours = hours[_not_feb29]
ERA5_msl_hours = hours[_not_feb29]
ERA5_w10_hours = hours[_not_feb29]
ERA5_w100_hours = hours[_not_feb29]

# the data and timestamp lengths are expected to match now
print(ERA5_t2m_hourly.shape)
print(ERA5_t2m_hours.shape)

NameError: name 'ERA5_t2m_hourly' is not defined

In [212]:
# Read in monthly NAO values from NCDC.NOAA. The second line of the file is
# used as the header; the 'Date' column holds YYYYMM stamps.
nao_monthly = pd.read_csv("https://www.ncdc.noaa.gov/teleconnections/nao/data.csv",header=1)
nao_monthly["Date"] = pd.to_datetime(nao_monthly["Date"],format="%Y%m")

# Select 1979-2020. January 2021 is kept for now so that the hourly
# resampling below still covers all of December 2020.
nao_monthly = nao_monthly[(nao_monthly["Date"]>=datetime(1979,1,1,0)) & (nao_monthly["Date"]<=datetime(2021,1,1,0))]

# Convert the monthly timescale to hourly by repeating each monthly value.
# ffill() replaces Resampler.pad(), which is deprecated and removed in
# pandas 2.0; the result is the same.
nao_hourly = nao_monthly.set_index("Date").resample("1H").ffill()

# remove Feb 29th and the trailing Jan 1, 2021 entry
hourly_nao_value = nao_hourly[nao_hourly.index.strftime("%m%d") != "0229"][:-1].Value


Create Pandas dataframe and produce CSV

In [214]:
# Assemble one column per variable, then index the rows by the Feb-29-free
# hourly timestamps.
df = pd.DataFrame(
    {
        "t2m": ERA5_t2m_hourly,
        "ssrd": ERA5_ssrd_hourly,
        "msl": ERA5_msl_hourly,
        "w10": ERA5_w10_hourly,
        "w100": ERA5_w100_hourly,
        "nao": hourly_nao_value,
    }
)
df.index = ERA5_t2m_hours

# Write the combined table to CSV (the target directory must already exist).
df.to_csv("country_data/ERA5_t2m_hourly_UK.csv")