In [1]:
#  This notebook will look over the weather data from multiple folders and subfolders and extract point values

In [23]:
# import all the things 
from multiprocessing import Pool
from tqdm.auto import tqdm
from pathlib import Path
import datetime as dt
import os
import re
import rasterio
import geopandas as gpd

In [24]:
# point this at the folder that holds the location data
input_folder = '/mnt/2TB Working/Projects/Dam forecast/v4'

# 'all dams centroids with water and dates sample.gpkg' is just a small sample;
# use 'all dams centroids with water and dates.gpkg' for all dams
input_file = 'all dams centroids with water and dates WGS84.gpkg'
input_path = os.path.join(input_folder,input_file)
# make sure this exists (displays True when the file is found)
os.path.isfile(input_path)

True

In [25]:
# point this at your weather data folder (it is searched recursively for raster files)
weather_folder = '/mnt/2TB Working/Projects/Dam forecast/Weather raster data'

In [26]:
# output folder: one CSV per time step is written here
time_step_folder = '/mnt/2TB Working/Projects/Dam forecast/v4/forecast/time steps'
# create the folder (and any missing parents); do nothing if it is already there
output_dir = Path(time_step_folder)
output_dir.mkdir(parents=True, exist_ok=True)

In [27]:
# Depending on how you got your weather data it may be in one of many raster formats;
# this script works with .adf, .nc and .flt files.
# You can download data from
# https://dapds00.nci.org.au/thredds/catalogs/gh70/v2-0/v2-0.html

# File-name endings that identify a raster file. '1.adf' matches e.g. 'w001001.adf';
# '.flt' carries its dot so it agrees with the extension check in extract_raster_vals.
raster_suffixes = ('1.adf', '.nc', '.flt')

# get a reference to all raster files anywhere under weather_folder
raster_list = [
    os.path.join(root, file)
    for root, dirs, files in os.walk(weather_folder)
    for file in files
    if file.endswith(raster_suffixes)
]
len(raster_list)

951

In [28]:
# preview the first few raster files (the full list is too long to display usefully)
raster_list[:10]

['/mnt/2TB Working/Projects/Dam forecast/Weather raster data/VPD/y2010/vpd1008/w001001.adf',
 '/mnt/2TB Working/Projects/Dam forecast/Weather raster data/VPD/y2010/vpd1002/w001001.adf',
 '/mnt/2TB Working/Projects/Dam forecast/Weather raster data/VPD/y2010/vpd1010/w001001.adf',
 '/mnt/2TB Working/Projects/Dam forecast/Weather raster data/VPD/y2010/vpd2010annu/w001001.adf',
 '/mnt/2TB Working/Projects/Dam forecast/Weather raster data/VPD/y2010/vpd1006/w001001.adf',
 '/mnt/2TB Working/Projects/Dam forecast/Weather raster data/VPD/y2010/vpd1009/w001001.adf',
 '/mnt/2TB Working/Projects/Dam forecast/Weather raster data/VPD/y2010/vpd1004/w001001.adf',
 '/mnt/2TB Working/Projects/Dam forecast/Weather raster data/VPD/y2010/vpd1012/w001001.adf',
 '/mnt/2TB Working/Projects/Dam forecast/Weather raster data/VPD/y2010/vpd1001/w001001.adf',
 '/mnt/2TB Working/Projects/Dam forecast/Weather raster data/VPD/y2010/vpd1011/w001001.adf',
 '/mnt/2TB Working/Projects/Dam forecast/Weather raster data/VPD/y

In [29]:
# depending on your data source the climate variable labels may be inconsistent
# the labels below are for rain and temp; a raster is kept when its path contains any of them
include_types =['tavg','avgt','rain','Rain','Temp']

In [30]:
# limit raster list to only the variables listed above
# any() keeps each raster at most once, even when its path contains several of the
# labels (appending once per matching label would sample the same raster repeatedly)
short_list = [
    raster
    for raster in raster_list
    if any(inc_type in raster for inc_type in include_types)
]


In [31]:
# report how many rasters survived the filter and preview the first few of them
print(len(short_list))
short_list[:10]

590


['/mnt/2TB Working/Projects/Dam forecast/Weather raster data/2019 - all rasters/rain_y2019/y2019/rain1902/w001001.adf',
 '/mnt/2TB Working/Projects/Dam forecast/Weather raster data/2019 - all rasters/rain_y2019/y2019/rain1908/w001001.adf',
 '/mnt/2TB Working/Projects/Dam forecast/Weather raster data/2019 - all rasters/rain_y2019/y2019/rain1905/w001001.adf',
 '/mnt/2TB Working/Projects/Dam forecast/Weather raster data/2019 - all rasters/rain_y2019/y2019/rain1910/w001001.adf',
 '/mnt/2TB Working/Projects/Dam forecast/Weather raster data/2019 - all rasters/rain_y2019/y2019/rain1912/w001001.adf',
 '/mnt/2TB Working/Projects/Dam forecast/Weather raster data/2019 - all rasters/rain_y2019/y2019/rain1901/w001001.adf',
 '/mnt/2TB Working/Projects/Dam forecast/Weather raster data/2019 - all rasters/rain_y2019/y2019/rain1911/w001001.adf',
 '/mnt/2TB Working/Projects/Dam forecast/Weather raster data/2019 - all rasters/rain_y2019/y2019/rain1909/w001001.adf',
 '/mnt/2TB Working/Projects/Dam forecast

In [None]:
# open vector data (it is reprojected to WGS 84 in the next cell)
points = gpd.read_file(input_path)
points

In [None]:
# reproject the points to EPSG:4326 (WGS 84)
points = points.to_crs("EPSG:4326")

In [None]:
# end dates of the time series: every listed month of every listed year
years = [*range(2021, 2022)]
print(years)
months = [*range(1, 13)]
print(months)

In [None]:
# length of the look-back window: set it in years, it is converted to months
historic_years = 1
historic_months = 12 * historic_years
print(historic_months, 'months')

In [None]:
# number of calendar months between two dates
def diff_month(d1, d2):
    """Return how many calendar months d1 lies after d2.

    Only the ``year`` and ``month`` attributes of the arguments are read, so
    the day of the month is ignored. The result is negative when d1 falls in
    an earlier month than d2.
    """
    year_gap = d1.year - d2.year
    month_gap = d1.month - d2.month
    return 12 * year_gap + month_gap

In [None]:
# pulls metadata out of the file name of an nc raster
def nc_getter(raster):
    """Parse type, year and month from the file name of a .nc raster.

    The file name is split on '_': the third field is taken as the raster
    type and the fifth field is expected to start with a YYYYMM stamp.
    The type 'tavg' is reported as 'avgt'.

    Returns a tuple (raster_name, raster_type, raster_year, raster_month),
    where raster_name is the file name including its extension.
    Raises IndexError when the name has fewer than five '_'-separated fields
    and ValueError when the stamp is not numeric.
    """
    raster_name = os.path.basename(raster)
    parts = raster_name.split('_')
    raster_type = 'avgt' if parts[2] == 'tavg' else parts[2]
    date_token = parts[4]
    raster_year = int(date_token[:4])
    raster_month = int(date_token[4:6])
    return (raster_name, raster_type, raster_year, raster_month)

In [None]:
# extracts metadata from adf files
def adf_getter(raster):
    """Parse type, year and month from the parent folder name of an .adf raster.

    The metadata lives in the name of the folder that contains the file,
    e.g. '.../rain1902/w001001.adf' -> folder 'rain1902'. The leading
    non-digit characters are the raster type, the first two digits are the
    two-digit year and the last two digits of the name are the month.
    The type 'tavg' is reported as 'avgt'.

    Returns a tuple (raster_name, raster_type, raster_year, raster_month),
    where raster_name is the folder name.
    Raises TypeError when the folder name does not contain the expected
    digits (the regex search returns None).
    """
    # take the parent folder name with os.path, like the other getters do,
    # rather than splitting on a hard-coded '/'
    raster_name = os.path.basename(os.path.dirname(raster))
    # raw strings: '\D' and '\d' are invalid escape sequences in normal literals
    raster_type = re.search(r'(\D*)', raster_name)[0]
    if raster_type == 'tavg':
        raster_type = 'avgt'
    # NOTE(review): the century is hard-coded, so every raster is assumed to be from 20xx
    raster_year = int('20' + re.search(r'(\d\d)', raster_name)[0])
    raster_month = int(re.search(r'(\d\d$)', raster_name)[0])
    return (raster_name, raster_type, raster_year, raster_month)

In [None]:
# quick check: run the adf parser on the first raster in short_list
adf_getter(short_list[0])

In [None]:
def flt_getter(raster):
    """Parse type, year and month from the file name of a .flt raster.

    The file name (with '.flt' removed) is split on '_': the second field is
    taken as the raster type and the last field is expected to start with a
    YYYYMM stamp. The type 'tavg' is reported as 'avgt'.

    Returns a tuple (raster_name, raster_type, raster_year, raster_month),
    where raster_name is the file name without '.flt'.
    Raises IndexError when the name contains no '_' and ValueError when the
    stamp is not numeric.
    """
    raster_name = os.path.basename(raster).replace('.flt','')
    fields = raster_name.split('_')
    raster_type = fields[1]
    if raster_type == 'tavg':
        raster_type = 'avgt'
    stamp = fields[-1]
    return (raster_name, raster_type, int(stamp[0:4]), int(stamp[4:6]))

In [None]:
# quick check: run the flt parser on one entry of short_list
# NOTE(review): assumes short_list[40] is a .flt file; on a path whose file name
# has no '_' the parser raises IndexError — confirm the index
flt_getter(short_list[40])

In [None]:
# func to extract point values from raster, it will also pull raster meta with the funcs above
def extract_raster_vals(raster):
    """Sample one raster at every point and label the result by its month offset.

    Reads these module-level names, which must be set before the call:
    ``sample_date`` (date of the current time step), ``historic_months``
    (size of the look-back window) and ``sample_coord_list`` (the (x, y)
    coordinates to sample).

    Parameters
    ----------
    raster : str
        Path to an .adf, .flt or .nc raster file.

    Returns
    -------
    dict or list
        ``{'name': <column name>, 'list': <first-band value per point>}`` when
        the raster is dated on or before ``sample_date`` and at most
        ``historic_months`` months earlier; an empty list otherwise, and for
        any path containing 'annu' (yearly totals).
    """
    # exclude 'annu' in path, this removes yearly totals
    if 'annu' in raster:
        return []

    # pick the metadata parser from the file extension
    if raster.endswith('.adf'):
        raster_name, raster_type, raster_year, raster_month = adf_getter(raster)
    elif raster.endswith('.flt'):
        raster_name, raster_type, raster_year, raster_month = flt_getter(raster)
    else:
        # neither adf nor flt, so assume it is nc
        raster_name, raster_type, raster_year, raster_month = nc_getter(raster)

    # build date object from raster date, we need to set a day so just pick the 1st
    raster_date = dt.date(year=raster_year, month=raster_month, day=1)

    # only rasters dated on or before the sample date are of interest
    if sample_date < raster_date:
        return []

    # how many months between the raster and the sample
    month_dif = diff_month(sample_date, raster_date)

    # only rasters inside the selected look-back window are of interest
    if month_dif > historic_months:
        return []

    # make column name from month dif (month offset)
    col_name = raster_type+'_'+str(month_dif)+'_'+'months before'

    # open the raster in a with-block so the file handle is always closed,
    # then pull out all point values at once (first band only)
    with rasterio.open(raster) as src:
        raster_vals = [val[0] for val in src.sample(sample_coord_list)]

    # return a dict of name and raster point values
    return {'name': col_name, 'list': raster_vals}
        

In [None]:
# one {'year', 'month'} entry per time step, years outermost
start_times = [
    {'year' : year, 'month' : month}
    for year in years
    for month in months
]
len(start_times)

In [None]:
# grab a real copy of the input data, so that columns added to sample_chunk
# later do not also modify `points`
sample_chunk = points.copy()
# build list of (x, y) coords to sample, one per point, in row order
print('setting up')
sample_coord_list = [
    (geom.x, geom.y)
    for geom in tqdm(sample_chunk['geometry'], total=sample_chunk.shape[0])
]


In [None]:
# loop over each year month combo and write one CSV of sampled values per time step
for time_step in tqdm(start_times):
    # extract each year and month
    sample_year = time_step['year']
    sample_month = time_step['month']
    # module-level date read by extract_raster_vals inside the worker processes;
    # it must be set before the Pool below is created
    sample_date = dt.date(year=sample_year, month=sample_month, day=1)
    # build filename from start date
    file_name = str(sample_year)+'-'+str(sample_month)+'.csv'
    save_path = os.path.join(time_step_folder,file_name)
    print(save_path)
    # skip time steps that were already exported
    if os.path.isfile(save_path):
        continue

    # use a pool of worker processes to extract data from every raster
    with Pool() as p:
        multi_cols = list(tqdm(p.imap(extract_raster_vals, short_list),
                               total=len(short_list),leave=False,
                               desc='Month chunk '+str(sample_date)))

    # start every time step from a fresh copy of the points, so columns from a
    # previous time step can never leak into this step's file
    step_frame = sample_chunk.copy()
    for col in multi_cols:
        # rasters outside the window come back as [], only dicts carry data
        if isinstance(col, dict):
            step_frame[col['name']] = col['list']
    # export to disk
    step_frame.to_csv(save_path)


In [48]:
# for i in short_list:
#     extract_raster_vals(i)


IndexError: list index out of range