In [1]:
#  This notebook walks over the weather data in multiple folders and subfolders and extracts point values

In [1]:
# import all the things 
from multiprocess import Pool
from tqdm.auto import tqdm
from pathlib import Path
import datetime as dt
import os
# import re
import rasterio
import geopandas as gpd
from threading import Thread
import pandas as pd

In [2]:
# first and last year (both inclusive) of the time-step end dates to generate
min_year = 2020
max_year = 2021

In [3]:
# point this at the folder that holds the location data
input_folder = '/Users/nicholaswright/Documents/Dam-Forecast/data'#'/mnt/2TB Working/Projects/Dam forecast/v4'

# 'all dams centroids with water and dates sample.gpkg' is only a small sample;
# use 'all dams centroids with water and dates.gpkg' to process all dams
input_file = 'all dams centroids with water and dates sample.gpkg'#'all dams centroids with water and dates WGS84.gpkg'
input_path = os.path.join(input_folder,input_file)
# make sure the input file exists (this should display True)
os.path.isfile(input_path)

True

In [4]:
# point this at your weather data folder (it is searched recursively for raster files)
weather_folder = '/Users/nicholaswright/Downloads/test_dl'#'/mnt/2TB Working/Projects/Dam forecast/Weather raster data'

In [5]:
# this is our output folder; one file per time step is written here
# NOTE(review): the folder is named 'feather files' but the export step writes .pkl files - confirm the intended format
time_step_folder = '/Users/nicholaswright/Downloads/feather files'#'/mnt/2TB Working/Projects/Dam forecast/v4/forecast/time steps'
# make this folder (and any missing parents) if it does not exist
Path(time_step_folder).mkdir(parents=True, exist_ok=True)

In [6]:
# climate variable labels expected in the raster file names; used by the missing-data check
climate_types = ['rain','tavg']

In [7]:
# Depending on how you obtained your weather data it may be in one of several raster formats.
# This script works with both .nc and .adf files; the filter below keeps files whose
# names end in '1.adf', '.nc' or 'flt'.
# NOTE(review): only the .nc metadata parser (nc_getter) is currently enabled - confirm before using other formats.
# You can download data from
# https://dapds00.nci.org.au/thredds/catalogs/gh70/v2-0/v2-0.html

# collect the full path of every raster file found anywhere under weather_folder
raster_suffixes = ('1.adf','.nc','flt')
raster_list = [
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk(weather_folder)
    for name in names
    if name.endswith(raster_suffixes)
]
len(raster_list)

72

In [8]:
# preview the first 10 raster paths that were found
raster_list[:10]

['/Users/nicholaswright/Downloads/test_dl/ANUClimate_v2-0_rain_monthly_202103.nc',
 '/Users/nicholaswright/Downloads/test_dl/ANUClimate_v2-0_rain_monthly_202012.nc',
 '/Users/nicholaswright/Downloads/test_dl/ANUClimate_v2-0_tavg_monthly_202008.nc',
 '/Users/nicholaswright/Downloads/test_dl/ANUClimate_v2-0_rain_monthly_201905.nc',
 '/Users/nicholaswright/Downloads/test_dl/ANUClimate_v2-0_tavg_monthly_202109.nc',
 '/Users/nicholaswright/Downloads/test_dl/ANUClimate_v2-0_rain_monthly_202002.nc',
 '/Users/nicholaswright/Downloads/test_dl/ANUClimate_v2-0_rain_monthly_202107.nc',
 '/Users/nicholaswright/Downloads/test_dl/ANUClimate_v2-0_rain_monthly_201901.nc',
 '/Users/nicholaswright/Downloads/test_dl/ANUClimate_v2-0_rain_monthly_201911.nc',
 '/Users/nicholaswright/Downloads/test_dl/ANUClimate_v2-0_rain_monthly_202006.nc']

In [9]:
# depending on your data source the climate variable labels may be inconsistent;
# the labels below cover rain and temperature (matching is case-sensitive)
include_types =['tavg','avgt','rain','Rain','Temp']

In [10]:
# Limit the raster list to only the variables listed in include_types.
# any() stops at the first matching label, so a raster whose path contains more than
# one label is kept exactly once instead of being appended (and later sampled) once per match.
# NOTE: labels are matched against the full path, not just the file name.
short_list = [
    raster
    for raster in raster_list
    if any(inc_type in raster for inc_type in include_types)
]


In [11]:
# bare file names (folder stripped) of the selected rasters
file_names = [os.path.basename(raster) for raster in short_list]
len(file_names)

72

In [12]:
# years and months that will be used as the end dates of the time series
years = [*range(min_year, max_year+1)]
months = [*range(1, 13)]
print(years)
print(months)

[2020, 2021]
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]


In [16]:
# Build the file name expected for every year / month / climate type, starting one
# year before min_year, and report any that were not found among the selected rasters.
expected_names = [
    f'ANUClimate_v2-0_{climate_type}_monthly_{year}{str(month).zfill(2)}.nc'
    for year in range(min_year-1, max_year+1)
    for month in months
    for climate_type in climate_types
]
absent_names = [name for name in expected_names if name not in file_names]
for name in absent_names:
    print(name, 'missing')
missing_data = len(absent_names) > 0
if missing_data:
    print('You have missing climate data!')

In [28]:
# open the vector (point) data and preview the first rows
points = gpd.read_file(input_path)
points.head()

Unnamed: 0,file_name,area,area_2,SRC_DATE,file_name_pred,class,class_name,geometry
0,arcgisonline_1686584.tif,1472.265188,293.852789,20160217.0,arcgisonline_1686584_pred_and_score.tif,0,dam and water,POINT (-943988.599 -3637895.597)
1,arcgisonline_1686963.tif,3511.531725,1348.727385,20150121.0,arcgisonline_1686963_pred_and_score.tif,0,dam and water,POINT (-965660.198 -3651584.893)
2,arcgisonline_437392.tif,7400.931025,,20161230.0,arcgisonline_437392_pred_and_score.tif,2,no water,POINT (1448235.502 -3334977.496)
3,arcgisonline_1747284.tif,2925.907475,2924.952538,20160302.0,arcgisonline_1747284_pred_and_score.tif,1,no dam,POINT (1299932.576 -4777809.258)
4,arcgisonline_411890.tif,5270.39767,15.310323,20150315.0,arcgisonline_411890_pred_and_score.tif,2,no water,POINT (1189137.120 -3799738.803)


In [30]:
# make sure the points are in WGS 84 (EPSG:4326), reprojecting only when needed
already_wgs84 = points.crs.to_epsg() == 4326
if not already_wgs84:
    points = points.to_crs("EPSG:4326")

In [31]:
# how many years of data do you want to extract before each sample date?
historic_years = 1
# rasters dated from 0 up to (and including) this many months before a sample date are kept
historic_months = historic_years*12
print(historic_months,'months')

12 months


In [32]:
# func to calc the number of months between two dates
def diff_month(d1, d2):
    """Return how many calendar months d1 is after d2.

    Only the ``year`` and ``month`` attributes are used, so the day of the
    month is ignored. The result is negative when d1 is earlier than d2.
    """
    years_apart = d1.year - d2.year
    months_apart = d1.month - d2.month
    return years_apart * 12 + months_apart

In [33]:
# extracts metadata from nc files 
def nc_getter(raster):
    """Parse metadata out of the file name of a .nc raster path.

    The file name is split on '_': field 2 is taken as the climate variable and
    field 4 must start with the date as YYYYMM, e.g.
    'ANUClimate_v2-0_rain_monthly_202103.nc'.

    Returns:
        (raster_name, raster_type, raster_year, raster_month), where the
        variable 'tavg' is reported as 'avgt' and year/month are ints.
    """
    raster_name = os.path.basename(raster)
    fields = raster_name.split('_')
    # normalise the temperature label so both spellings share one name
    raster_type = 'avgt' if fields[2] == 'tavg' else fields[2]
    date_field = fields[4]
    return raster_name, raster_type, int(date_field[:4]), int(date_field[4:6])

In [34]:
# # extracts metadata from adf files
# def adf_getter(raster):
#     raster_name = raster.split('/')[-2]
#     raster_type = re.search('(\D*)',raster_name)[0]
#     if raster_type == 'tavg':
#         raster_type = 'avgt'
#     raster_year = int('20'+re.search('(\d\d)',raster_name)[0])
#     raster_month = int(re.search('(\d\d$)',raster_name)[0])
#     return(raster_name,raster_type,raster_year,raster_month)

In [35]:
# def flt_getter(raster):
#     raster_name = os.path.basename(raster).replace('.flt','')
#     raster_type = raster_name.split('_')[1]
#     if raster_type == 'tavg':
#         raster_type = 'avgt'
#     raster_year = int(raster_name.split('_')[-1][0:4])
#     raster_month = int(raster_name.split('_')[-1][4:6])
#     return(raster_name,raster_type,raster_year,raster_month)

In [51]:
# func to extract point values from raster, it will also pull raster meta with the funcs above
def extract_raster_vals(raster):
    """Sample one raster at every point for the current time step.

    Reads the module-level globals ``sample_date``, ``historic_months`` and
    ``sample_coord_list``; they are not parameters because this function is
    mapped over raster paths only.

    Args:
        raster: path to a raster whose file name ``nc_getter`` can parse.

    Returns:
        ``{'date': <column name>, 'point_values': [<value per point>]}`` when
        the raster is dated on or before ``sample_date`` and at most
        ``historic_months`` months earlier; otherwise ``None``.
    """
    _raster_name, raster_type, raster_year, raster_month = nc_getter(raster)
    # build a date from the raster year/month; a day is required so use the 1st
    raster_date = dt.date(year=raster_year, month=raster_month, day=1)

    # skip rasters that are more recent than the sample date
    if sample_date < raster_date:
        return None

    # how many months the raster is before the sample date
    month_dif = diff_month(sample_date, raster_date)

    # skip rasters that fall outside the selected history window
    if month_dif > historic_months:
        return None

    # column name is built from the variable and the month offset
    col_name = raster_type+'_'+str(month_dif)+'_'+'months before'

    # the context manager closes the dataset once sampling is done, so file
    # handles do not accumulate across the many rasters handled by each worker
    with rasterio.open(raster) as src:
        # pull out all point values at once, keeping the first value of each sample
        raster_vals = [val[0] for val in src.sample(sample_coord_list)]

    # return a dict of column name and raster point values
    return {'date' : col_name,
            'point_values' : raster_vals}

        

In [39]:
# build one {'year', 'month'} dict for every year and month combination
start_times = [
    {'year' : year, 'month' : month}
    for year in years
    for month in months
]
len(start_times)

24

In [40]:
# sample_chunk refers to the same frame as points (an alias, not a copy)
sample_chunk = points
# build the list of (x, y) coordinates to sample, one tuple per point
sample_coord_list = [
    (geom.x, geom.y)
    for geom in tqdm(sample_chunk['geometry'], total=sample_chunk.shape[0])
]


  0%|          | 0/3000 [00:00<?, ?it/s]

In [41]:
# preview the points that will be sampled
sample_chunk.head()

Unnamed: 0,file_name,area,area_2,SRC_DATE,file_name_pred,class,class_name,geometry
0,arcgisonline_1686584.tif,1472.265188,293.852789,20160217.0,arcgisonline_1686584_pred_and_score.tif,0,dam and water,POINT (121.81365 -33.00993)
1,arcgisonline_1686963.tif,3511.531725,1348.727385,20150121.0,arcgisonline_1686963_pred_and_score.tif,0,dam and water,POINT (121.56879 -33.11645)
2,arcgisonline_437392.tif,7400.931025,,20161230.0,arcgisonline_437392_pred_and_score.tif,2,no water,POINT (147.19718 -29.86992)
3,arcgisonline_1747284.tif,2925.907475,2924.952538,20160302.0,arcgisonline_1747284_pred_and_score.tif,1,no dam,POINT (147.48199 -42.99337)
4,arcgisonline_411890.tif,5270.39767,15.310323,20150315.0,arcgisonline_411890_pred_and_score.tif,2,no water,POINT (144.99259 -34.25548)


In [52]:
def compile_and_export(multi_cols,save_path):
    """Join the sampled raster columns onto the points and pickle the result.

    Args:
        multi_cols: iterable of dicts with 'date' (column name) and
            'point_values' (list of values) keys; ``None`` entries are skipped.
        save_path: destination handed to ``DataFrame.to_pickle``.

    Reads the module-level ``sample_chunk`` frame; the new columns are matched
    to it by index.
    """
    # one column per used raster, keyed by its column name
    monthly_point_sample = {
        entry['date']: entry['point_values']
        for entry in multi_cols
        if entry is not None
    }

    point_sample_df = pd.DataFrame(monthly_point_sample)
    joined_data = pd.merge(sample_chunk, point_sample_df, left_index=True, right_index=True)
    joined_data.to_pickle(save_path)


In [53]:
# Loop over each year / month combination and write one pickle per time step.
# Exports run on background threads so the next time step can start sampling
# straight away; the threads are collected and joined after the loop so this
# cell only finishes once every file has actually been written.
export_threads = []
for time_step in tqdm(start_times):
    # extract each year and month
    sample_year = time_step['year']
    sample_month = time_step['month']
    # NOTE(review): extract_raster_vals reads sample_date as a global, so it must be
    # assigned before the Pool is created; this presumably relies on the worker
    # processes inheriting the notebook's globals - confirm on your platform.
    sample_date = dt.date(year=sample_year, month=sample_month, day=1)

    # build the output file name from the sample date
    file_name = str(sample_year)+'-'+str(sample_month)+'.pkl'
    save_path = os.path.join(time_step_folder,file_name)

    # skip time steps that have already been exported
    if os.path.isfile(save_path):
        continue

    # use a pool of worker processes to sample every raster for this time step
    with Pool() as p:
        multi_cols = list(tqdm(p.imap(extract_raster_vals, short_list),
                               total=len(short_list),leave=False,
                               desc='Month chunk '+str(sample_date)))

    export = Thread(target=compile_and_export, args=(multi_cols,save_path))
    export.start()
    export_threads.append(export)

# wait for all background exports to finish writing their files
for export in export_threads:
    export.join()

  0%|          | 0/24 [00:00<?, ?it/s]

Month chunk 2020-01-01:   0%|          | 0/72 [00:00<?, ?it/s]

Month chunk 2020-02-01:   0%|          | 0/72 [00:00<?, ?it/s]

Month chunk 2020-03-01:   0%|          | 0/72 [00:00<?, ?it/s]

Month chunk 2020-04-01:   0%|          | 0/72 [00:00<?, ?it/s]

Month chunk 2020-05-01:   0%|          | 0/72 [00:00<?, ?it/s]

Month chunk 2020-06-01:   0%|          | 0/72 [00:00<?, ?it/s]

Process ForkPoolWorker-43:


KeyboardInterrupt: 