This notebook loops over the weather data rasters and extracts the values at each dam point.

In [1]:
from tqdm.auto import tqdm
from pathlib import Path
import os
import zipfile
from glob import glob

from multiprocessing import Pool
import rasterio
import geopandas as gpd
import pandas as pd

In [2]:
# the point data ships zipped inside the repo's data folder;
# extract it alongside the archive itself
zipped_point_data = os.path.join(
    os.getcwd(),
    'data',
    'all dams centroids with water and dates.zip',
)
with zipfile.ZipFile(zipped_point_data, mode='r') as zip_ref:
    zip_ref.extractall(path=os.path.dirname(zipped_point_data))

In [3]:
# derive the GeoPackage path by swapping only the file extension;
# str.replace('.zip', ...) would also rewrite a '.zip' that appears in a parent folder name
point_data_input_path = os.path.splitext(zipped_point_data)[0] + '.gpkg'
# check that the extracted file is where we expect it (shown as the cell output)
os.path.isfile(point_data_input_path)

True

In [4]:
# read in variables stored by the download notebook
%store -r dam_forcast_working_dir
%store -r ANU_cliamte_downlaod_dir
%store -r min_year
%store -r max_year
%store -r climate_types
# persist the point data path with %store so it can be restored with `%store -r`
%store point_data_input_path

Stored 'point_data_input_path' (str)


In [5]:
# output folder for the time step data
time_step_folder = os.path.join(dam_forcast_working_dir,'monthly points')
%store time_step_folder
# create this folder if it does not exist
Path(time_step_folder).mkdir(parents=True, exist_ok=True)
time_step_folder

Stored 'time_step_folder' (str)


'/home/nick/Documents/Work code/Weather-to-water/working/monthly points'

In [6]:
# collect every NetCDF raster in the download folder;
# os.path.join keeps the pattern portable, and sorting gives a reproducible
# processing order because glob returns matches in arbitrary order
raster_list = sorted(glob(os.path.join(ANU_cliamte_downlaod_dir, '*.nc')))
# number of rasters found
len(raster_list)

144

In [7]:
# load the dam point layer into a GeoDataFrame
dam_points = gpd.read_file(point_data_input_path)
# preview the first five rows
dam_points.head(5)

Unnamed: 0,file_name,dam_area,water_area,SRC_DATE,file_name_pred,class,class_name,geometry
0,arcgisonline_1286093.tif,1782.758036,1231.106859,20170126.0,arcgisonline_1286093_pred_and_score.tif,0,dam and water,POINT (1317606.405 -4019274.986)
1,arcgisonline_1337387.tif,1917.350912,1042.750433,20151214.0,arcgisonline_1337387_pred_and_score.tif,0,dam and water,POINT (1213139.420 -4269770.352)
2,arcgisonline_1702518.tif,2300.790143,698.20756,20141023.0,arcgisonline_1702518_pred_and_score.tif,0,dam and water,POINT (1172386.054 -4556884.461)
3,arcgisonline_1274916.tif,403.480805,90.273377,20160112.0,arcgisonline_1274916_pred_and_score.tif,0,dam and water,POINT (826008.345 -4034749.462)
4,arcgisonline_704977.tif,1774.881116,749.984968,20160502.0,arcgisonline_704977_pred_and_score.tif,0,dam and water,POINT (1651414.073 -3700910.019)


In [8]:
# reproject to WGS 84 if not already
# a layer without CRS metadata cannot be reprojected, so fail with a clear message
# rather than an AttributeError from calling .to_epsg() on None
if dam_points.crs is None:
    raise ValueError('dam_points has no CRS defined; cannot reproject to EPSG:4326')
if dam_points.crs.to_epsg() != 4326:
    dam_points = dam_points.to_crs("EPSG:4326")

In [9]:
# build one (x, y) tuple per point geometry
coord_list = [
    (x, y)
    for x, y in zip(dam_points['geometry'].x, dam_points['geometry'].y)
]
# show the first pair as a sanity check
coord_list[0]

(146.6588069486813, -36.09348834105702)

In [12]:
def extract_points_from_raster(raster_path):
    """Sample one climate raster at every dam point and pickle the result.

    The output file name is built from the raster file name: the text after
    the last underscore (minus ``.nc``) is used as the date, and the third
    underscore-separated token as the climate type. The function reads the
    module-level ``coord_list`` (points to sample) and ``time_step_folder``
    (output directory).

    Args:
        raster_path: path to the climate raster to sample.

    Returns:
        The path of the pickle that was written, or ``None`` when the pickle
        already existed and the raster was skipped.
    """
    file_name = os.path.basename(raster_path)
    # get the date from the file name
    date = file_name.split("_")[-1].replace('.nc', '')
    # get the climate type from the file name
    climate_type = file_name.split("_")[2]
    export_path = os.path.join(time_step_folder, f'{date}_{climate_type}.pkl')

    # skip if already done
    if os.path.isfile(export_path):
        return None

    # the context manager closes the dataset even if sampling raises,
    # so worker processes do not accumulate open file handles
    with rasterio.open(raster_path) as src:
        # keep the first element of each sampled array (one array per point)
        raster_vals = [sampled[0] for sampled in src.sample(coord_list)]

    # place into a single-column df named after the date and save to disk
    point_samp_df = pd.DataFrame(raster_vals, columns=[date])
    point_samp_df.to_pickle(export_path)
    return export_path

In [11]:
# run the extraction across a pool of worker processes, one raster per task,
# with a progress bar over the results as they come back
with Pool() as p:
    export_paths = [
        export_path
        for export_path in tqdm(
            p.imap(extract_points_from_raster, raster_list),
            desc='Extracting points from rasters',
            total=len(raster_list),
        )
    ]

Extracting points from rasters:   0%|          | 0/144 [00:00<?, ?it/s]