# Start

This script prepares vapor pressure deficit (VPD) data from BARRA2 and matches it with VPD from field observations for subsequent analysis.

📄 **What this script does**
1. Loads cleaned field data with topography — for example, output from `Nick_phd_data_complilation.ipynb`.
2. Downloads and explores **BARRA2 data** (`tas` for temperature and `hurs` for relative humidity) for the grid cells closest to the field sites, spanning from the first to the last month of the field observations.
3. Matches BARRA2 temperature and RH values to the field observations based on the nearest grid cell and UTC timestamp.
4. Calculates **VPD** (vapor pressure deficit) from the matched BARRA2 temperature and RH.
5. Saves the combined field and BARRA2 data in the `output/csv` folder under the name set in `output_file_name` (e.g. `barra2_vpd_phd.csv`).

⚠️ **Important notes**
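* Before running the script, set all variables in the **first cell**. When not using a Google Colab environment, set `colab = False` there so that the **second cell** uses the local working directory instead of mounting Google Drive; do not delete the second cell, because it defines `working_dir`.  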
  *(The script was developed for use in Google Colab and has not been tested outside of it.)*
* The field data timestamps must be in **Australian local time** — the script interprets them in the `Australia/Sydney` time zone (AEST/AEDT). The script will convert them to **UTC** before matching with BARRA2 data.
* BARRA2's temperature and RH are **instantaneous** readings.
* Field observations with **ambiguous timestamps** (e.g., during the daylight saving transition from AEDT to AEST) or **non-existent timestamps** (the hour skipped when AEST changes to AEDT) will be **excluded** from the output.




In [None]:
# True when running in Google Colab (Google Drive is mounted); False for a local run.
colab = False
# Field-data CSV read from <working_dir>/output/csv.
input_file_name = 'in-situ_topography_phd.csv'
# Name of the combined CSV written to <working_dir>/output/csv at the end.
output_file_name = 'barra2_vpd_phd.csv'

# Set to True to fetch the monthly BARRA2 CSV subsets for every grid cell.
download_barra2_data = False  # around 30 min in Colab
# NOTE(review): only referenced by the commented-out exploration cells.
explore_barra2_data = False

In [None]:
# Setting the main working directory

if not colab:
    # Local run: the project root is the parent folder, which is also put on
    # sys.path so that project modules can be imported.
    working_dir = ".."
    import sys
    sys.path.append('..')
else:
    # Colab run: work out of the mounted Google Drive folder.
    working_dir = "/content/drive/My Drive/Work/2025.04 ANU Bushfire"
    from google.colab import drive
    drive.mount('/content/drive')

# Loading in-situ and remote data


In [None]:
# Load in-situ_topography.csv as the main df

import pandas as pd
import os

# Read the cleaned field observations and parse their timestamps.
# NOTE(review): 'Datetime' is expected to be timezone-naive local time here;
# it is localised and converted to UTC further down the notebook.
input_csv_path = os.path.join(working_dir, "output", "csv", input_file_name)
df = pd.read_csv(input_csv_path)
df['Datetime'] = pd.to_datetime(df['Datetime'])

# Series.min()/max() skip missing timestamps (NaT). The builtin min()/max()
# can return NaT when one is present, which would make strftime() fail.
# One extra day is added to the end of the observation period.
first_datetime = df['Datetime'].min()
last_datetime = df['Datetime'].max() + pd.Timedelta(days=1)
print("First date: ", first_datetime.strftime("%Y%m%d"), ", last date: ", last_datetime.strftime("%Y%m%d"))

df.head()

## Downloading BARRA2 data

In [None]:
# Find barra2 grid cell coordinates
import urllib.request
import xarray as xr

# Download one barra2 file to extract grid cells
url = "https://thredds.nci.org.au/thredds/fileServer/ob53/output/reanalysis/AUST-04/BOM/ERA5/historical/hres/BARRA-C2/v1/1hr/tas/latest/tas_AUST-04_ERA5_historical_hres_BOM_BARRA-C2_v1_1hr_201812-201812.nc"
sampled_barra2_file_path = os.path.join(working_dir, "Data", "barra2_tas_201812.nc")
if not os.path.exists(sampled_barra2_file_path):
    # Make sure the target folder exists before downloading into it.
    os.makedirs(os.path.dirname(sampled_barra2_file_path), exist_ok=True)
    # Download to a temporary name and rename on success, so an interrupted
    # download is never mistaken for a complete file on the next run.
    partial_download_path = sampled_barra2_file_path + ".part"
    urllib.request.urlretrieve(url, partial_download_path)
    os.replace(partial_download_path, sampled_barra2_file_path)

# Only the coordinate axes are needed; read them into NumPy arrays and close
# the file handle straight away.
with xr.open_dataset(sampled_barra2_file_path) as ds:
    lats = ds['lat'].values
    lons = ds['lon'].values
print("lats: ", lats)
print("lons: ", lons)

In [None]:
# Make a list of locations of all sites and reduce it to barra2 grid cell locations

import numpy as np

def find_nearest_grid_point(x, y, lon_grid, lat_grid):
    """Snap a point to the closest grid longitude and closest grid latitude.

    Each axis is searched independently by absolute difference; on a tie the
    first (lowest-index) grid value wins.

    Args:
        x: Longitude of the point.
        y: Latitude of the point.
        lon_grid: 1-D NumPy array of grid longitudes.
        lat_grid: 1-D NumPy array of grid latitudes.

    Returns:
        Tuple ``(nearest_lon, nearest_lat)`` taken from the two grids.
    """
    lon_index = np.argmin(np.abs(lon_grid - x))
    lat_index = np.argmin(np.abs(lat_grid - y))
    return lon_grid[lon_index], lat_grid[lat_index]

# Attach the nearest BARRA2 grid-cell coordinates to every field observation.
nearest_cells = df.apply(
    lambda site: pd.Series(find_nearest_grid_point(site['X'], site['Y'], lons, lats)),
    axis=1,
)
df[['barra2_X', 'barra2_Y']] = nearest_cells

# Reduce to the distinct grid cells so each one is downloaded only once.
barra2_cell_locations_list = list({(x, y) for x, y in df[['barra2_X', 'barra2_Y']].to_numpy()})
print("barra2_cell_locations_list length: ", len(barra2_cell_locations_list))
print("barra2_cell_locations_list: ", [(str(x), str(y)) for x, y in barra2_cell_locations_list])

In [None]:
 # Use Python loop and curl to download all barra2 data

import urllib
import calendar
from datetime import datetime, timedelta

if download_barra2_data:
    barra2_data_dir = os.path.join(working_dir, "Data/barra2")
    os.makedirs(barra2_data_dir, exist_ok=True)

    for i, (x, y) in enumerate(barra2_cell_locations_list):
        print(f"Downloading data for barra2 cell coordinate {i + 1} of {len(barra2_cell_locations_list)}")
        current_dt = first_datetime.replace(day=1)

        while current_dt <= last_datetime:
            year = current_dt.year
            month = current_dt.month
            yyyymm = f"{year}{month:02d}"

            # Determine time_start and time_end for this month
            month_start = current_dt
            last_day = calendar.monthrange(year, month)[1]
            month_end = datetime(year, month, last_day)

            raw_start = month_start.strftime('%Y-%m-%dT00:00:00Z')
            raw_end = month_end.strftime('%Y-%m-%dT23:00:00Z')
            time_start = urllib.parse.quote(raw_start, safe='')
            time_end = urllib.parse.quote(raw_end, safe='')

            # Download temperature and relative humidity data
            for var in ['tas', 'hurs']:
                url = f"https://thredds.nci.org.au/thredds/ncss/grid/ob53/output/reanalysis/AUST-04/BOM/ERA5/historical/hres/BARRA-C2/v1/1hr/{var}/latest/{var}_AUST-04_ERA5_historical_hres_BOM_BARRA-C2_v1_1hr_{yyyymm}-{yyyymm}.nc?var={var}&latitude={y}&longitude={x}&time_start={time_start}&time_end={time_end}&timeStride=&vertCoord=&accept=csv"
                output_file_path = os.path.join(barra2_data_dir, f"barra2_data_{var}_{x}_{y}_{yyyymm}.csv")
                print(f"Downloading {output_file_path} from {url}")
                !curl -s -L "{url}" -o "{output_file_path}" -C -

            # Go to next month
            if month == 12:
                current_dt = datetime(year + 1, 1, 1)
            else:
                current_dt = datetime(year, month + 1, 1)

In [None]:
# # BARRA2 data exploration

# barra2_data_dir = os.path.join(working_dir, "Data/barra2")
# barra2_df = pd.read_csv(os.path.join(barra2_data_dir, os.listdir(barra2_data_dir)[0]))
# barra2_df.info()

In [None]:
# barra2_df.head()

In [None]:
# # BARRA2 netCDF data exploration

# import xarray as xr

# if explore_barra2_data:
#     tas_url = "https://thredds.nci.org.au/thredds/fileServer/ob53/output/reanalysis/AUST-04/BOM/ERA5/historical/hres/BARRA-C2/v1/1hr/tas/latest/tas_AUST-04_ERA5_historical_hres_BOM_BARRA-C2_v1_1hr_201812-201812.nc"
#     hurs_url = "https://thredds.nci.org.au/thredds/fileServer/ob53/output/reanalysis/AUST-04/BOM/ERA5/historical/hres/BARRA-C2/v1/1hr/hurs/latest/hurs_AUST-04_ERA5_historical_hres_BOM_BARRA-C2_v1_1hr_201812-201812.nc"
#     !curl -L {tas_url} -o "barra2_tas.nc"
#     !curl -L {hurs_url} -o "barra2_hurs.nc"

#     tas_ds = xr.open_dataset("barra2_tas.nc")
#     hurs_ds = xr.open_dataset("barra2_hurs.nc")
#     tas_ds

In [None]:
# if explore_barra2_data:
#     hurs_ds

# Combining in-situ and remote data into a single dataframe

In [None]:
# Generate UTC_Datetime for in-situ observations

from pytz import timezone

# Interpret the field timestamps as Sydney local time. Ambiguous or
# non-existent wall-clock times (around daylight-saving changes) become NaT.
aus_tz = timezone('Australia/Sydney')
df['Datetime'] = df['Datetime'].dt.tz_localize(aus_tz, ambiguous='NaT', nonexistent='NaT')

# Drop rows without a usable timestamp. The explicit copy makes df an
# independent frame, so the column assignment below does not raise pandas'
# chained-assignment (SettingWithCopy) warning.
df = df[df['Datetime'].notna()].copy()
df['UTC_Datetime'] = df['Datetime'].dt.tz_convert('UTC')

df.head()

In [None]:
# For each row, open barra2 data csv file one-by-one to get data (40 min)

from scipy.constants import convert_temperature

from functools import lru_cache


@lru_cache(maxsize=None)
def _read_barra2_csv(file_path):
    """Read one monthly BARRA2 CSV, memoised by path.

    Many observations share the same grid cell and month, so caching avoids
    re-parsing the same file for every row. The cached frame is shared between
    calls and must not be modified.
    """
    return pd.read_csv(file_path)


def get_barra2_value(row, var):
    """Look up the BARRA2 value of ``var`` for one field observation.

    The observation's UTC timestamp is rounded to the nearest hour and matched
    against the ``time`` column of the monthly CSV for the observation's grid
    cell, read from the module-level ``barra2_data_dir``.

    Args:
        row: Mapping/Series with ``UTC_Datetime`` (timestamp), ``barra2_X``
            and ``barra2_Y`` (grid-cell coordinates used in the file name).
        var: BARRA2 variable name, e.g. ``'tas'`` or ``'hurs'``.

    Returns:
        The matched value rounded to 3 decimals; ``'tas'`` is converted from
        Kelvin to Celsius first.

    Raises:
        FileNotFoundError: If the monthly CSV for this cell does not exist.
        IndexError: If the CSV has no entry for the rounded timestamp or no
            column whose name contains ``var``.
    """
    # Round once; both the file month and the lookup key use the rounded hour.
    rounded_time = row['UTC_Datetime'].round('h')
    yyyymm = rounded_time.strftime('%Y%m')
    file_path = os.path.join(barra2_data_dir, f"barra2_data_{var}_{row['barra2_X']}_{row['barra2_Y']}_{yyyymm}.csv")
    df_barra2 = _read_barra2_csv(file_path)

    target_time = rounded_time.strftime('%Y-%m-%dT%H:%M:%SZ')
    # The value column is identified by the variable name appearing in its header.
    value_columns = df_barra2.columns[df_barra2.columns.str.contains(var)]
    matches = df_barra2.loc[df_barra2['time'] == target_time, value_columns]
    if matches.empty:
        raise IndexError(f"No '{var}' value for {target_time} in {file_path}")

    barra2_value = matches.values[0][0]
    if var == 'tas':
        barra2_value = convert_temperature(barra2_value, 'Kelvin', 'Celsius')

    return round(barra2_value, 3)

# Folder holding the monthly BARRA2 CSV files read by get_barra2_value.
barra2_data_dir = os.path.join(working_dir, "Data/barra2")

# Row-wise lookup of the matched temperature ('tas') and relative humidity
# ('hurs'); the variable name is forwarded as the second positional argument.
df['barra2_Temperature'] = df.apply(get_barra2_value, axis=1, args=('tas',))
df['barra2_RH'] = df.apply(get_barra2_value, axis=1, args=('hurs',))
df.head()

In [None]:
# Investigate null values.
# Show every observation that has at least one missing value in any column.
df.loc[df.isna().any(axis="columns")]

# Calculating remote VPD from remote temperature and remote relative humidity

In [None]:
from Utils.vpd import calculate_vpd

def _vpd_for_row(obs):
    """Return calculate_vpd() for one row's BARRA2 temperature and relative humidity."""
    return calculate_vpd(obs['barra2_Temperature'], obs['barra2_RH'])


df['barra2_VPD'] = df.apply(_vpd_for_row, axis=1)
df.head()

# Save the resulting dataframes

In [None]:
# Write the combined field + BARRA2 table without the index column.
output_csv_path = os.path.join(working_dir, "output", "csv", output_file_name)
df.to_csv(output_csv_path, index=False)