# Start

This script fills in slope, relief, aspect, and vegetation cover values for in-situ observations. It can only fill in vegetation cover values that are saved individually for each site (one file per site).

In [None]:
# Notebook settings
working_dir = "../.."  # This repository's root directory
insitu_data_path = "Data/pcs/pcs.csv"  # In-situ observations CSV, joined onto working_dir when read
output_dir_path = "output/csv/in-situ_topography_pcs.csv"  # Output CSV file (a file path, despite the name)

In [None]:
import glob
import os
import sys

import numpy as np
import rasterio
from pyproj import Transformer
from rasterio.merge import merge
from rasterio.transform import rowcol
from rasterio.warp import transform
from tqdm import tqdm

tqdm.pandas()

sys.path.append(working_dir)
from Utils.barra2 import *
from Utils.vpd import calculate_vpd

In [None]:
# Make sure the folder for CSV outputs (<working_dir>/output/csv) exists

output_dir = os.path.join(working_dir, "output", "csv")
os.makedirs(name=output_dir, exist_ok=True)

In [None]:
# Load the in-situ observations table and preview the first rows

df = pd.read_csv(filepath_or_buffer=os.path.join(working_dir, insitu_data_path))
df.head(n=5)

In [None]:
# Build sites_df: one row per distinct (SiteID, X, Y), later filled with slope, aspect, and relief

sites_df = df.loc[:, ['SiteID', 'X', 'Y']].drop_duplicates(ignore_index=True)
sites_df

## Fill in slope, aspect, and relief of each site

In [None]:
# Add slope, aspect, and relief data to sites_df

# Reprojects EPSG:4326 coordinates to EPSG:28355; always_xy=True fixes the axis order to (x, y).
transformer = Transformer.from_crs(crs_from="EPSG:4326", crs_to="EPSG:28355", always_xy=True)


def get_value_from_latlon(data, trans, x, y):
    """Sample a 2-D raster array at a longitude/latitude position.

    The point is reprojected with the notebook-level ``transformer`` and then
    converted to a pixel row/column with ``rowcol``.

    Args:
        data: 2-D array of raster values indexed as ``data[row, col]``.
        trans: Raster transform passed straight to ``rowcol``.
        x: Longitude of the point.
        y: Latitude of the point.

    Returns:
        The value of the pixel containing the point, or ``np.nan`` (after
        printing a notice) when the point lies outside the raster.
    """
    # Find corresponding row and column with latlon
    x_proj, y_proj = transformer.transform([x], [y])
    rows, cols = rowcol(trans, x_proj, y_proj)
    # np.ravel accepts both the sequence and the scalar return forms of rowcol.
    row, col = int(np.ravel(rows)[0]), int(np.ravel(cols)[0])

    # Check bounds explicitly: a negative index would not raise in numpy, it
    # would wrap around and silently return a pixel from the opposite edge.
    n_rows, n_cols = data.shape[0], data.shape[1]
    if 0 <= row < n_rows and 0 <= col < n_cols:
        return data[row, col]

    print(
        f"There is a row with lon {x}, lat {y} outside tif files' area. The value is set to np.nan"
    )
    return np.nan


def add_data_to_df(df, property, tif_files_keyword=""):
    """Add a column named ``property`` to ``df``, sampled from tif rasters.

    Rasters are looked up as ``<working_dir>/output/tif/*{property}.tif``.

    Args:
        df: DataFrame with 'X' and 'Y' columns; it is modified in place.
        property: Raster property name; used both as the file-name suffix and
            as the name of the new column.
        tif_files_keyword: If empty, all matching rasters are merged into one
            mosaic. Otherwise the first matching raster (in sorted order)
            whose file name contains this keyword is used on its own.

    Returns:
        The same ``df`` object with the new column added.

    Raises:
        FileNotFoundError: If no suitable tif file is found.
    """
    # Find all related tif files; sorted so the merge order and the
    # "first match" below do not depend on the file system's listing order.
    pattern = os.path.join(working_dir, 'output', 'tif', f'*{property}.tif')
    files = sorted(glob.glob(pattern))
    print(f"There are {len(files)} {property} files: {files}")

    if tif_files_keyword == "":
        if not files:
            raise FileNotFoundError(f"No tif files match {pattern}")
        # Merge all tif files, making sure every opened dataset is closed again
        src_files_to_mosaic = []
        try:
            for path in files:
                src_files_to_mosaic.append(rasterio.open(path))
            mosaic, out_trans = merge(src_files_to_mosaic)
        finally:
            for src in src_files_to_mosaic:
                src.close()
        data = mosaic[0]
    else:
        # Match on the file name only, so directory names cannot match the keyword
        matches = [f for f in files if tif_files_keyword in os.path.basename(f)]
        if not matches:
            raise FileNotFoundError(
                f"No tif file matching {pattern} contains keyword '{tif_files_keyword}'"
            )
        with rasterio.open(matches[0]) as src:
            data = src.read(1)
            out_trans = src.transform

    # Add data to df
    df[property] = df.apply(
        lambda row: get_value_from_latlon(data, out_trans, row['X'], row['Y']), axis=1
    )
    return df

In [None]:
# Sample each topographic raster (the "pcs" tif of each property) at every site
for topo_property in ('slope', 'aspect', 'relief'):
    sites_df = add_data_to_df(sites_df, topo_property, tif_files_keyword="pcs")
sites_df

In [None]:
# Investigate missing values: keep the sites where any topographic value is NaN
sites_with_nans_df = sites_df[sites_df[['slope', 'aspect', 'relief']].isna().any(axis=1)]
sites_with_nans_df

## Fill in vegetation cover

In [None]:
# Find X, Y in DEA crs for each site

# Use a dedicated name instead of rebinding `transformer`: get_value_from_latlon
# reads the global `transformer` (EPSG:28355), so overwriting it here would make
# any later re-run of the slope/aspect/relief cell sample with the wrong CRS.
dea_transformer = Transformer.from_crs("EPSG:4326", "EPSG:32754", always_xy=True)
sites_df['dea_x'], sites_df['dea_y'] = dea_transformer.transform(
    sites_df['X'].values, sites_df['Y'].values
)
sites_df.head()

In [None]:
# Read the veg cover NetCDF of each site and put it in sites_df


def get_veg_ds(row):
    """Load the 'veg_cover' variable of one site from its NetCDF file.

    Args:
        row: Mapping/Series with a 'SiteID' entry; the file read is
            ``<working_dir>/Data/veg_cover/veg_cover_{SiteID}.nc``.

    Returns:
        The 'veg_cover' variable of that file, loaded into memory.
    """
    site_id = row['SiteID']
    veg_cover_data_path = os.path.join(working_dir, "Data", "veg_cover", f"veg_cover_{site_id}.nc")

    # Load the values eagerly and close the file, so one file handle per site
    # does not stay open for the rest of the session.
    with xr.open_dataset(veg_cover_data_path) as veg_ds:
        return veg_ds['veg_cover'].load()


# One veg_cover series per site, stored as an object column
sites_df['veg_cover_point_series'] = sites_df.apply(get_veg_ds, axis=1)

In [None]:
# Add the site data to df

site_value_columns = ["slope", "aspect", "relief", "dea_x", "dea_y", "veg_cover_point_series"]

# sites_df is unique on (SiteID, X, Y), so merge on all three keys: joining on
# SiteID alone would duplicate observations whenever a site has more than one
# coordinate pair. Dropping columns left over from a previous run keeps the
# cell re-runnable (no slope_x / slope_y suffixes).
df = df.drop(columns=site_value_columns, errors="ignore").merge(
    sites_df[["SiteID", "X", "Y"] + site_value_columns],
    on=["SiteID", "X", "Y"],
    how="left",
    validate="many_to_one",
)
df.head()

In [None]:
# Fill in veg cover in df row by row


def get_veg_cover(row, max_gap_days=30):
    """Return the vegetation cover closest in time to one observation.

    Args:
        row: Mapping/Series with 'UTC_Datetime' (anything ``np.datetime64``
            accepts) and 'veg_cover_point_series' (a series with a 'time'
            dimension; presumably the xarray object built by get_veg_ds).
        max_gap_days: Largest accepted distance, in whole days, between the
            observation time and the nearest non-NaN vegetation cover value.

    Returns:
        The nearest non-NaN value, capped at 100, or ``np.nan`` when the series
        has no valid value or the nearest one is more than ``max_gap_days`` away.
    """
    point_series = row['veg_cover_point_series']

    # Drop NaNs
    valid_series = point_series.dropna(dim='time')
    if valid_series.sizes['time'] == 0:
        return np.nan

    # Find the nearest time, within max_gap_days, with non-NaN veg_cover
    time_deltas = np.abs(valid_series['time'] - np.datetime64(row['UTC_Datetime']))
    smallest_gap = time_deltas.min()
    if smallest_gap > np.timedelta64(max_gap_days, 'D'):
        print("min time_deltas: ", smallest_gap / np.timedelta64(1, 'D'))
        return np.nan

    nearest_idx = time_deltas.argmin().item()
    veg_cover = valid_series.isel(time=nearest_idx).item()

    # Values above 100 are capped at 100
    return veg_cover if veg_cover < 100 else 100


# Look up vegetation cover for every observation (with a tqdm progress bar), then preview
veg_cover_per_row = df.progress_apply(get_veg_cover, axis=1)
df['veg_cover'] = veg_cover_per_row
df.head()

# Save df to csv

In [None]:
# save df
# output_dir_path already contains the 'output/csv/' prefix, so it is resolved
# against working_dir; joining it onto output_dir would point at
# <working_dir>/output/csv/output/csv/..., a directory that is never created.
output_csv_path = os.path.join(working_dir, output_dir_path)
os.makedirs(os.path.dirname(output_csv_path), exist_ok=True)

# Helper columns are not part of the output; errors='ignore' keeps the cell re-runnable.
df = df.drop(columns=['dea_x', 'dea_y', 'veg_cover_point_series'], errors='ignore')
df.to_csv(output_csv_path, index=False)
df.head()