# Start

This script cleans field data from Nick's PhD and enriches it with slope, aspect, relief, and vegetation cover values.

## 📄 **What this script does**
1. **Cleans** the field data by selecting only observations where `Height` equals 1, and `Datetime` is after the installation date/time and before the removal date/time.
2. Removes all columns except `SiteID`, `X`, `Y`, `Datetime`, `C`, and `RH`. The `C` column is renamed to `Temperature`.
3. **Corrects relative humidity** (`RH`) using equations from the supplementary material of this publication: [https://doi.org/10.1016/j.agrformet.2013.03.008](https://doi.org/10.1016/j.agrformet.2013.03.008).
4. **Calculates VPD** (vapor pressure deficit) from the corrected temperature and RH.
5. Generates `site_data_summary_phd.csv`, which lists each site (with its X and Y coordinates), the dates for which data is available, and the first and last of those dates.
6. For each site in `sites_df`, **fills in slope, aspect, and relief** using precomputed `.tif` files from `topography_calculation.ipynb`. These topographic values are then used to enrich the full field dataset.
7. The vegetation cover data is derived from the `veg_cover` field of the DEA Fractional Cover product, retrieved via the ARE NCI. For each observation, **the `veg_cover` value is taken from the `veg_cover` data point closest in time and location.**
8. The resulting observations, with their slope, aspect, relief, and vegetation cover, are saved as `in-situ_topography_phd.csv`.

## ⚠️ **Important notes**
* **Before running the script**, set all variables in the first cell, and delete the second cell if not using a Google Colab environment.  
  *(The script was developed for use in Google Colab and has not been tested outside of it.)*
* The `DEA_Fractional_cover_veg_cover.nc` file, which contains vegetation cover data from the DEA Fractional Cover product, has an incorrect `crs` attribute. The correct coordinate reference system is EPSG:32754.
* For site observations whose corresponding vegetation cover value in the DEA Fractional Cover product is `NaN`, **the nearest valid vegetation cover value in time is used.** If no valid value exists within 30 days of the observation, `veg_cover` is left as `NaN`.

# TODO (MEDIUM): Edit veg_cover related content






In [None]:
# Set variables
# File name of the field data; it is looked up (and, if missing, saved) under ../Data.
data_file_name = "T_RH_2020-11-10(in).csv"
# Link the field data is downloaded from when the local file does not exist yet.
data_url = "https://anu365.sharepoint.com/:x:/r/sites/ANU-OptusBushfireResearchCoE/Shared%20Documents/Projects/2024_5%20NSSN%20FMC%20monitoring/T_RH_2020-11-10.csv?d=w7e97fe709fc24f839be6d929d48bde0e&csf=1&web=1&e=3XhSoD"

In [None]:
import sys
sys.path.append('..')
from Utils.vpd import calculate_vpd
from Utils.barra2 import *

In [None]:
# Generate output directory

import os
# Directory the CSV outputs of this notebook are written to.
output_dir = os.path.join("..", "output/csv")
# exist_ok=True keeps this cell safe to re-run when the directory already exists.
os.makedirs(output_dir, exist_ok=True)

In [None]:
# Download data with a link

import os
import requests

def download_file(url, filename, timeout=60):
    """Stream the content at *url* into the local file *filename*.

    The data is first written to ``<filename>.part`` and only moved to
    *filename* once the download has completed, so an interrupted transfer
    never leaves a truncated file that a later ``os.path.exists`` check would
    mistake for a finished download.

    Parameters
    ----------
    url : str
        Address to download from.
    filename : str
        Destination path of the downloaded file.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 60).
        Without a timeout a stalled connection would hang the notebook.

    Returns
    -------
    None
        Request failures are reported with ``print`` rather than raised.
    """
    partial_path = f"{filename}.part"
    try:
        # The context manager releases the streamed connection in every case.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Raise an exception for bad status codes

            with open(partial_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)
        os.replace(partial_path, filename)
    except requests.exceptions.RequestException as e:
        # Discard whatever was written before the failure.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        print(f"Error downloading data: {e}")
    else:
        print(f"Data downloaded successfully to {filename}")

# Download only when the file is not already present locally.
data_path = os.path.join("..", "Data", data_file_name)
if not os.path.exists(data_path):
    download_file(data_url, data_path)

# Read and explore the data

In [None]:
# Read data

import pandas as pd

# Load the raw field data.
df = pd.read_csv(data_path)
# Do not truncate the column list when a dataframe is displayed.
pd.set_option('display.max_columns', None)
df.head()

In [None]:
# Column names, dtypes and non-null counts of the raw data.
df.info()

# Data cleaning and correction

In [None]:
# Convert all dates into datetime objects

from datetime import datetime

# Layout of the date/time strings in the raw file: month/day/year hour:minute.
# Named DATETIME_FORMAT so that the builtin `format` is not shadowed.
DATETIME_FORMAT = '%m/%d/%Y %H:%M'


def _combine_date_and_time(frame, date_col, time_col):
    """Build a datetime Series from a date column and an HHMM time column.

    The time column is converted to text and left-padded with zeros to four
    characters (e.g. ``930`` becomes ``0930``); the padded text is written
    back to ``frame[time_col]``. The first two characters are read as the hour
    and the next two as the minute.
    """
    frame[time_col] = frame[time_col].astype(str).str.zfill(4)
    hhmm = frame[time_col]
    stamp = frame[date_col].astype(str) + ' ' + hhmm.str.slice(0, 2) + ':' + hhmm.str.slice(2, 4)
    return pd.to_datetime(stamp, format=DATETIME_FORMAT)


df['Instalation datetime'] = _combine_date_and_time(df, 'Instalation date', 'Instalation time')
df['Removal datetime'] = _combine_date_and_time(df, 'Removal date', 'Removal time')

# The 'Date' column already holds date and time, so it is renamed after parsing.
df['Date'] = pd.to_datetime(df['Date'], format=DATETIME_FORMAT)
df = df.rename(columns={'Date': 'Datetime'})

df.head()

In [None]:
# Data cleaning

# Drop the rows whose Height is the string 'zero'.
height_is_kept = df['Height'] != 'zero'
df = df[height_is_kept]  # 120 to 108 sites

# Keep only readings taken strictly between installation and removal.
after_installation = df['Datetime'] > df['Instalation datetime']
before_removal = df['Datetime'] < df['Removal datetime']
df = df[after_installation & before_removal]  # 108 to 103 sites

# Reduce to the columns used downstream; 'C' holds the temperature reading.
kept_columns = ['SiteID', 'X', 'Y', 'Datetime', 'C', 'RH']
df = df[kept_columns].rename(columns={'C': 'Temperature'})
df.head()

In [None]:
# Summarise the cleaned dataframe: schema, spatial extent and time span.

# df.info() prints its report itself and returns None, so wrapping it in
# print() would only add a stray "None" line.
df.info()

# A bare expression in the middle of a cell is evaluated but never displayed,
# so both sets of bounds are printed explicitly.
print("Latlon bound of dataframe")
print(df['X'].min(), df['X'].max(), df['Y'].min(), df['Y'].max())

print("Datetime bound of dataframe")
print(df['Datetime'].min(), df['Datetime'].max())

In [None]:
# Handle 'RH' > 100

# Find MaxRH for each site and date
# (.dt.date gives the same calendar dates as calling .date() row by row).
df['Date'] = df['Datetime'].dt.date
df['MaxRH'] = df.groupby(['SiteID', 'Date'])['RH'].transform('max')

# Find MaxRH95 for each site: the 95th percentile of its daily maxima
df['MaxRH95'] = df.groupby(['SiteID'])['MaxRH'].transform(lambda x: x.quantile(0.95))

# Apply the correction to the RH values.
# If RH is greater than MaxRH95 or negative, RH = 100, else RH = RH*100/MaxRH95.
# NOTE(review): negative readings are also set to 100 - confirm this matches
# the correction described in the referenced publication.
rh_is_capped = (df['RH'] > df['MaxRH95']) | (df['RH'] < 0)
df['RH'] = (df['RH'] * 100 / df['MaxRH95']).mask(rh_is_capped, 100)
df

In [None]:
# Check the correctness of the calculation
# df.groupby(['MaxRH95'])['SiteID'].apply(lambda x: x.unique()).reset_index()

# The helper columns were only needed for the RH correction; remove them in place.
df.drop(columns=['MaxRH', 'MaxRH95', 'Date'], inplace=True)
df.head()

In [None]:
# Calculate VPD

# calculate_vpd comes from Utils.vpd and is called once per row with the
# temperature and the corrected RH.
# NOTE(review): presumably expects temperature in degrees Celsius and RH in percent - confirm against Utils.vpd.
df['VPD'] = df.apply(lambda row: calculate_vpd(row['Temperature'], row['RH']), axis=1)
df.head()

# Get a list of each site (x and y coordinates) and the dates we have data for.

In [None]:
# One row per (SiteID, X, Y) listing every calendar day that has data.

def _joined_unique_days(timestamps):
    """Return the distinct 'YYYY-MM-DD' days in *timestamps*, sorted and joined by ', '."""
    day_labels = timestamps.dt.strftime('%Y-%m-%d').unique()
    return ', '.join(sorted(day_labels))


days_per_site = df.groupby(['SiteID', 'X', 'Y'])['Datetime'].apply(_joined_unique_days)
# reset_index turns the grouped Series back into a DataFrame
sites_df = days_per_site.reset_index().rename(columns={'Datetime': 'Dates'})
sites_df

In [None]:
# Earliest day with data for every (SiteID, X, Y), formatted as 'YYYY-MM-DD'.

def _first_day(timestamps):
    """Return the earliest timestamp of the group as a 'YYYY-MM-DD' string."""
    return min(timestamps).strftime('%Y-%m-%d')


first_day_per_site = df.groupby(['SiteID', 'X', 'Y'])['Datetime'].apply(_first_day)
# reset_index turns the grouped Series back into a DataFrame
start_date_df = first_day_per_site.reset_index().rename(columns={'Datetime': 'start_date'})
start_date_df

In [None]:
# Latest day with data for every (SiteID, X, Y), formatted as 'YYYY-MM-DD'.
# The end date is the maximum timestamp of the group; taking the minimum
# would simply repeat the start date.
end_date_df = (
    df.groupby(['SiteID', 'X', 'Y'])['Datetime']
    .apply(lambda x: max(x).strftime('%Y-%m-%d'))
    .reset_index()  # Convert the data in Series to DataFrame
    .rename(columns={'Datetime': 'end_date'})
)
end_date_df

In [None]:
# Attach the start and end dates to the per-site summary. With no explicit
# key, merge joins on the columns the two frames have in common.
sites_df = sites_df.merge(start_date_df).merge(end_date_df)
sites_df

In [None]:
# Save to CSV
# index=False leaves the dataframe's row index out of the file.
sites_df.to_csv(os.path.join(output_dir, 'site_data_summary_phd.csv'), index=False)

# Add biophysical properties into dataframe

In [None]:
!pip install rasterio

## Fill in slope, aspect, and relief of each site

In [None]:
# Add slope, aspect, and relief data
# TODO (LOW): The biophysical data are retrieved several times for each site. Make it more efficient.

import rasterio
from rasterio.merge import merge
from rasterio.transform import rowcol
from rasterio.warp import transform
import glob
import os
import numpy as np

def get_value_from_latlon(data, crs, trans, x, y):
    """Return the value of a raster band at a longitude/latitude position.

    Parameters
    ----------
    data : array indexed as ``data[row, col]``
        Values of a single raster band.
    crs : coordinate reference system of the raster
    trans : affine transform of the raster
    x, y : float
        Longitude and latitude of the point, in EPSG:4326.

    Returns
    -------
    The value stored at the cell containing the point, or ``np.nan`` (after
    printing a message) when the point falls outside the raster.
    """
    # Reproject the point into the raster CRS and find its row and column.
    x_proj, y_proj = transform('EPSG:4326', crs, [x], [y])
    rows, cols = rowcol(trans, x_proj, y_proj)

    try:
        row, col = int(rows[0]), int(cols[0])
    except (ValueError, OverflowError):
        # A non-finite index cannot be inside the raster.
        row = col = -1

    # Explicit bounds check: a negative index does not raise in NumPy, it
    # wraps around and would silently return a value from the opposite edge.
    if 0 <= row < data.shape[0] and 0 <= col < data.shape[1]:
        return data[row, col]

    print(f"There is a row with lon {x}, lat {y} outside tif files' area. The value is set to np.nan")
    return np.nan

def add_data_to_df(property, df):
    """Add a column named *property* to *df*, sampled from the matching tif files.

    All files matching ``../output/*<property>.tif`` are merged into one
    mosaic, and for every row of *df* the first band of the mosaic is sampled
    at the row's ``X`` (longitude) and ``Y`` (latitude).

    Parameters
    ----------
    property : str
        Name of the property (e.g. ``'slope'``); used both as the file name
        suffix and as the name of the new column.
    df : pandas.DataFrame
        Frame with ``X`` and ``Y`` columns. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same *df*, with the new column added.

    Raises
    ------
    FileNotFoundError
        If no tif file matches the property.
    """
    from contextlib import ExitStack

    # TODO (LOW): There is no more need to merge all tif files.
    # Find all related tif files
    files = glob.glob(os.path.join("..", 'output', f"*{property}.tif"))
    print(f"There are {len(files)} {property} files: {files}")
    if not files:
        raise FileNotFoundError(f"No '*{property}.tif' file found in {os.path.join('..', 'output')}")

    # Merge all tif files; the ExitStack closes every opened dataset once the
    # mosaic and the CRS have been read.
    with ExitStack() as stack:
        src_files_to_mosaic = [stack.enter_context(rasterio.open(f)) for f in files]
        mosaic, out_trans = merge(src_files_to_mosaic)
        crs = src_files_to_mosaic[0].crs
    data = mosaic[0]

    # Add data to df
    # TODO (LOW): rows and cols are recomputed for every property; they could
    # be computed once per site and reused.
    df[property] = df.apply(lambda row: get_value_from_latlon(data, crs, out_trans, row['X'], row['Y']), axis=1)
    return df

In [None]:
# Sample each topographic property once per site; every call adds one column to sites_df.
sites_df = add_data_to_df('slope', sites_df)
sites_df = add_data_to_df('aspect', sites_df)
sites_df = add_data_to_df('relief', sites_df)
sites_df

In [None]:
# Investigate missing values
# TODO (MEDIUM): There are 5 sites in total with NaNs. Explore whether we can leave it as is.
# NOTE: Site id 251 has row: [23133], col: [1081].
# Select the sites for which at least one of the three topographic values is missing.
has_missing_topography = sites_df[['slope', 'aspect', 'relief']].isna().any(axis=1)
sites_with_nans_df = sites_df[has_missing_topography]
sites_with_nans_df

In [None]:
# Add the properties to df
# Left join: every observation is kept and receives the values of its site.
# NOTE(review): the join key is SiteID alone while sites_df is grouped by (SiteID, X, Y);
# if a SiteID ever had more than one coordinate pair, observations would be duplicated - confirm SiteID is unique in sites_df.
df = df.merge(sites_df[["SiteID", "slope", "aspect", "relief"]], on=['SiteID'], how='left')
df

In [None]:
# Non-null counts show how many observations received slope, aspect and relief values.
df.info()

In [None]:
# Remove every observation of site 251.
# NOTE(review): presumably because its topographic values are missing - confirm and document the reason.
df = df[df['SiteID'] != 251]
df.info()

## Fill in vegetation cover

In [None]:
# Load veg cover NetCDF
# TODO(LOW): Correct the incorrectly saved 'crs' attribute of veg cover NetCDF from ARE NCI

import xarray as xr

veg_cover_data_path = os.path.join("..", "Data", "Vegetation_cover", "veg_cover_phd.nc")
veg_ds = xr.open_dataset(veg_cover_data_path)
# Reminder for the reader: the crs attribute shown in the dataset summary below is not the real one.
print("The actual crs of the veg_ds is EPSG:32754. The attribute was incorrectly saved when downloading the data.")
veg_ds

In [None]:
# Visualise veg_cover of the first time step

import matplotlib.pyplot as plt

# isel(time=0) picks the first entry along the time dimension.
veg_ds["veg_cover"].isel(time=0).plot()
plt.title("veg_cover layer (e.g. time=0)")
plt.show()

In [None]:
# Find coordinate of each observations in EPSG:32754

from pyproj import Transformer

# always_xy=True: coordinates are passed and returned in (x, y) order, i.e. (longitude, latitude) on input.
transformer = Transformer.from_crs("EPSG:4326", "EPSG:32754", always_xy=True)
# dea_x / dea_y hold each observation's position in the CRS of the veg cover data.
df['dea_x'], df['dea_y'] = transformer.transform(df['X'].values, df['Y'].values)
df.head()

In [None]:
# Fill in veg cover in df row by row

def get_veg_cover(row):
    """Return the vegetation cover for one observation row.

    Reads ``dea_x``, ``dea_y`` and ``Datetime`` from *row* and looks the value
    up in the module-level ``veg_ds``: the pixel nearest to the location is
    selected, time steps without a value are dropped, and the remaining time
    step closest to the observation time is used. Values of 100 or more are
    returned as 100.

    Returns ``np.nan`` when the pixel has no valid value at all, when the
    closest valid time step is more than 30 days away, or when the lookup
    raises an exception.
    """
    obs_x, obs_y, obs_time = row['dea_x'], row['dea_y'], row['Datetime']

    try:
        # Time series of the pixel nearest to the observation
        pixel_series = veg_ds.sel(x=obs_x, y=obs_y, method='nearest')['veg_cover']

        # Keep only the time steps that carry a value
        usable = pixel_series.dropna(dim='time')
        if usable.sizes['time'] == 0:
            return np.nan

        # Distance in time between every usable step and the observation
        gaps = np.abs(usable['time'] - np.datetime64(obs_time))
        if gaps.min() > np.timedelta64(30, 'D'):
            print("min time_deltas: ", gaps.min() / np.timedelta64(1, 'D'))
            return np.nan

        closest = gaps.argmin().item()
        cover = usable.isel(time=closest).item()

        if cover < 100:
            return cover
        return 100

    except Exception:
        return np.nan

# One lookup per observation row.
df['veg_cover'] = df.apply(get_veg_cover, axis=1)
df.head()

In [None]:
# The projected coordinates were only needed for the veg cover lookup; remove them in place.
df.drop(columns=['dea_x', 'dea_y'], inplace=True)
df

# Save df to csv

In [None]:
# save df
# index=False leaves the dataframe's row index out of the file.
df.to_csv(os.path.join(output_dir, 'in-situ_topography_phd.csv'), index=False)
df

In [None]:
# NOTE(review): `stop` is not defined in this notebook, so this cell presumably exists to raise a
# NameError and halt "Run all" before the scratch cells below - confirm.
stop

# Notes

## Investigation of correctness of codes above


In [None]:
# Check for accuracy of the veg_cover section
# Scratch checks: only the last expression of a cell is displayed, so run the
# lines one at a time to see each result.

# Check for NaN and deal with it
df.isna().sum()  # 9661 rows

# Check for sites with some rows with NaN veg_cover
df[df["veg_cover"].isna()]["SiteID"].unique()  # Not all sites

# Check for sites with all rows with NaN veg_cover
nan_sites = df.groupby("SiteID")["veg_cover"].apply(lambda x: x.isna().all())
nan_siteIDs = nan_sites[nan_sites].index.tolist()
nan_siteIDs  # 0 site

# Inspect different dates
df[df['Datetime'].dt.strftime('%Y-%m-%d') == '2018-12-21']  # 120 rows
df[df['Datetime'].dt.strftime('%Y-%m-%d') == '2019-01-20'].isna().sum()  # 48 rows
df[df['Datetime'].dt.strftime('%Y-%m-%d') == '2019-01-20']  # 120 rows

In [None]:
# Investigate whether I need to redownload the veg_cover data from NCI because of its insufficient coverage
# Note: this cell replaces `df` with the contents of a previously saved CSV.

# Check whether the veg_cover of SiteID 67, 68, 69, 70 on 2018-12-21 are NaN or not
df = pd.read_csv(os.path.join(output_dir, 'in-situ_topography_with_NaN_veg_cover.csv'))
# Datetime is read back as text, so it is parsed again.
df['Datetime'] = pd.to_datetime(df['Datetime'])
first_date_df = df[df['SiteID'].isin([67, 68, 69, 70]) & (df['Datetime'].dt.strftime('%Y-%m-%d') == '2018-12-21')]
first_date_df['SiteID'].unique()

# Check the last date
last_date_df = df[df['SiteID'].isin([302, 303, 304, 305]) & (df['Datetime'].dt.strftime('%Y-%m-%d') == '2020-11-06')]
last_date_df['SiteID'].unique()

In [None]:
# Investigate site 302
# Note: this cell replaces `df` with the contents of a previously saved CSV.

df = pd.read_csv(os.path.join(output_dir, 'in-situ_topography.csv'))
# Datetime is read back as text, so it is parsed again.
df['Datetime'] = pd.to_datetime(df['Datetime'])
df[df['SiteID'].isin([302]) & (df['Datetime'].dt.strftime('%Y-%m-%d') == '2020-11-06')]

In [None]:
# Investigate change in vegetation cover.
from pyproj import Transformer

# Project one longitude/latitude pair into the CRS of the veg cover data.
transformer = Transformer.from_crs("EPSG:4326", "EPSG:32754", always_xy=True)
dea_x, dea_y = transformer.transform(150.3096, -35.46728)
# Full time series of the pixel nearest to that position.
point_series = veg_ds.sel(x=dea_x, y=dea_y, method='nearest')['veg_cover']
print(point_series.values)

# visualise the point_series
point_series.plot()
plt.title("Vegetation cover of site 302")
plt.show()

In [None]:
# Plot the DEM and mark the sites that have missing topographic values.
import matplotlib.pyplot as plt

dem_path = os.path.join("..", "output", "batemans_bay_dem_no_neg.tif")
with rasterio.open(dem_path) as src:
    dem_data = src.read(1)  # first band

    # Convert the sites' longitude/latitude into row/column indices of this raster.
    lon = sites_with_nans_df['X'].tolist()
    lat = sites_with_nans_df['Y'].tolist()
    x_proj, y_proj = transform('EPSG:4326', src.crs, lon, lat)
    rows, cols = rowcol(src.transform, x_proj, y_proj)

plt.imshow(dem_data, cmap='terrain')
# imshow places columns on the horizontal axis and rows on the vertical axis.
plt.scatter(cols, rows, color='red', s=10)
plt.colorbar(label='DEM (metres)')
plt.title('DEM Map')
plt.show()

## Others

In [None]:
# Download a single veg_cover tif from the DEA data server as a test file.
url = "https://data.dea.ga.gov.au/derivative/ga_ls_fc_3/2-5-1/090/084/2015/01/02/ga_ls_fc_3_090084_2015-01-02_final_veg_cover.tif"
# "Data" (capitalised) matches the data directory used by the rest of the
# notebook; a lower-case "data" is a different directory on case-sensitive
# file systems.
download_file(url, os.path.join("..", "Data", "test.tif"))

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# NOTE(review): `mosaic` is only assigned inside add_data_to_df in this notebook, so it is not
# available here unless it was defined manually - confirm before running this cell.
plt.imshow(mosaic[0], cmap='terrain')
plt.colorbar(label='Slope (degrees)')
plt.title('Slope Map')
plt.show()

In [None]:
# TODO (HIGH): Update df with the new code. Below is not usable.
# add_data_to_df('slope')
# orig_len = len(df)
# df = df[(df['slope'] != np.nan) & (df['slope'] != -9999.0)]
# print("Remove rows with no slope data.")
# print(f"There are {orig_len - len(df)} rows out of {orig_len} rows with no slope data.")
# add_data_to_df('aspect')
# add_data_to_df('relief')

In [None]:
# Visualise relief calculated
# TODO (LOW): Correct scaling of the plot

import matplotlib.pyplot as plt

relief_path = os.path.join("..", "output", "batemans_bay_relief.tif")
with rasterio.open(relief_path) as src:
    relief_data = src.read(1)  # first band

    # Convert the longitude/latitude of the sites with missing values into row/column indices of this raster.
    lon = sites_with_nans_df['X'].tolist()
    lat = sites_with_nans_df['Y'].tolist()
    x_proj, y_proj = transform('EPSG:4326', src.crs, lon, lat)
    rows, cols = rowcol(src.transform, x_proj, y_proj)

plt.imshow(relief_data, cmap='terrain')
# imshow places columns on the horizontal axis and rows on the vertical axis.
plt.scatter(cols, rows, color='red', s=10)
plt.colorbar(label='Relief (metres)')
plt.title('Relief Map')
plt.show()