# Start
What this script does
1.   Load clean field data with topography, ex. output from Nick_phd_data_complilation.ipynb.
2.   Download and explore ‘tas’ and ‘hurs’ BARRA2 data of the BARRA2 grid cells that are the closest to field sites, between the first month and the last month of the field observations.
3.   Add BARRA2’s temperature and RH, that has **the closest location and UTC timing**, to field data observations.
4.   Calculate BARRA2’s VPD from barra2_Temperature and barra2_RH.
5.   Save the matched field and remote data as barra2.csv in the ‘output/csv’ folder of the working directory.

Important notes
*   Before running the script, please set all the variables in the first cell and delete the second cell if Google Colab environment is not used. (The script was made to be used with Google Colab environment and has not been tested outside it)
*   The field data should has Australian time to work with this script, the time will be converted to UTC time before matching with BARRA2 data.
*   BARRA2's temperature and RH are instatanuous data.
*   Field observations with ambiguous Australian time during the conversion from AEDT to AEST are dropped.






In [23]:
working_dir = "/content/drive/My Drive/Work/2025.04 ANU Bushfire"
download_barra2_data = False  # around 30 min
explore_barra2_data = False

In [24]:
# Connect to Google Drive

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Loading in-situ and remote data


In [25]:
# Load in-situ_topography.csv as the main df

import pandas as pd
import os

df = pd.read_csv(os.path.join(working_dir, "output/csv/in-situ_topography.csv"))
df['Datetime'] = pd.to_datetime(df['Datetime'])

first_datetime, last_datetime = min(df['Datetime']), (max(df['Datetime']) + pd.Timedelta(days=1))
print("First date: ", first_datetime.strftime("%Y%m%d"), ", last date: ", last_datetime.strftime("%Y%m%d"))

df.head()

First date:  20181221 , last date:  20201107


Unnamed: 0.1,Unnamed: 0,SiteID,X,Y,Datetime,Temperature,RH,VPD,slope,aspect,relief
0,0,79,150.2953,-35.48254,2019-01-20 04:00:00,17.941,97.525473,0.050885,7.76997,187.29736,27.341133
1,1,79,150.2953,-35.48254,2019-01-20 05:00:00,17.753,97.881611,0.043049,7.76997,187.29736,27.341133
2,2,79,150.2953,-35.48254,2019-01-20 06:00:00,17.878,98.236778,0.036114,7.76997,187.29736,27.341133
3,3,79,150.2953,-35.48254,2019-01-20 07:00:00,18.066,97.406114,0.05376,7.76997,187.29736,27.341133
4,4,79,150.2953,-35.48254,2019-01-20 08:00:00,18.379,97.881611,0.044776,7.76997,187.29736,27.341133


## Downloading BARRA2 data

In [26]:
# Find barra2 grid cell coordinates
import xarray as xr

url = "https://thredds.nci.org.au/thredds/fileServer/ob53/output/reanalysis/AUST-04/BOM/ERA5/historical/hres/BARRA-C2/v1/1hr/tas/latest/tas_AUST-04_ERA5_historical_hres_BOM_BARRA-C2_v1_1hr_201812-201812.nc"
ds = xr.open_dataset(url)

lats = ds['lat'].values
lons = ds['lon'].values
print("lats: ", lats)
print("lons: ", lons)

lats:  [-45.69 -45.65 -45.61 ...  -5.09  -5.05  -5.01]
lons:  [108.02 108.06 108.1  ... 159.82 159.86 159.9 ]


In [27]:
# Make a list of locations of all sites and reduce it to barra2 grid cell locations

import numpy as np

def find_nearest_grid_point(x, y, lon_grid, lat_grid):
    nearest_lon = lon_grid[np.abs(lon_grid - x).argmin()]
    nearest_lat = lat_grid[np.abs(lat_grid - y).argmin()]
    return nearest_lon, nearest_lat

df[['barra2_X', 'barra2_Y']] = df.apply(
    lambda row: pd.Series(find_nearest_grid_point(row['X'], row['Y'], lons, lats)),
    axis=1
)
barra2_cell_locations_list = list(set((x, y) for x, y in df[['barra2_X', 'barra2_Y']].values))
print("barra2_cell_locations_list length: ", len(barra2_cell_locations_list))
print("barra2_cell_locations_list: ", [(str(x), str(y)) for x, y in barra2_cell_locations_list])

barra2_cell_locations_list length:  22
barra2_cell_locations_list:  [('150.1', '-35.53'), ('150.14', '-35.57'), ('150.18', '-35.73'), ('150.34', '-35.49'), ('150.3', '-35.45'), ('150.14', '-35.77'), ('150.1', '-35.57'), ('150.26', '-35.57'), ('150.3', '-35.49'), ('150.26', '-35.61'), ('150.02', '-36.13'), ('150.06', '-36.13'), ('150.1', '-35.81'), ('150.18', '-35.77'), ('150.06', '-36.17'), ('150.14', '-35.73'), ('150.22', '-35.65'), ('150.18', '-35.65'), ('150.38', '-35.45'), ('150.22', '-35.69'), ('150.34', '-35.45'), ('150.26', '-35.53')]


In [28]:
 # Use Python loop and curl to download all barra2 data

import urllib
import calendar
from datetime import datetime, timedelta

if download_barra2_data:
    barra2_data_dir = os.path.join(working_dir, "Data/barra2")
    os.makedirs(barra2_data_dir, exist_ok=True)

    for i, (x, y) in enumerate(barra2_cell_locations_list):
        print(f"Downloading data for barra2 cell coordinate {i + 1} of {len(barra2_cell_locations_list)}")
        current_dt = first_datetime.replace(day=1)

        while current_dt <= last_datetime:
            year = current_dt.year
            month = current_dt.month
            yyyymm = f"{year}{month:02d}"

            # Determine time_start and time_end for this month
            month_start = current_dt
            last_day = calendar.monthrange(year, month)[1]
            month_end = datetime(year, month, last_day)

            raw_start = month_start.strftime('%Y-%m-%dT00:00:00Z')
            raw_end = month_end.strftime('%Y-%m-%dT23:00:00Z')
            time_start = urllib.parse.quote(raw_start, safe='')
            time_end = urllib.parse.quote(raw_end, safe='')

            # Download temperature and relative humidity data
            for var in ['tas', 'hurs']:
                url = f"https://thredds.nci.org.au/thredds/ncss/grid/ob53/output/reanalysis/AUST-04/BOM/ERA5/historical/hres/BARRA-C2/v1/1hr/{var}/latest/{var}_AUST-04_ERA5_historical_hres_BOM_BARRA-C2_v1_1hr_{yyyymm}-{yyyymm}.nc?var={var}&latitude={y}&longitude={x}&time_start={time_start}&time_end={time_end}&timeStride=&vertCoord=&accept=csv"
                output_file_path = os.path.join(barra2_data_dir, f"barra2_data_{var}_{x}_{y}_{yyyymm}.csv")
                print(f"Downloading {output_file_path} from {url}")
                !curl -s -L "{url}" -o "{output_file_path}" -C -

            # Go to next month
            if month == 12:
                current_dt = datetime(year + 1, 1, 1)
            else:
                current_dt = datetime(year, month + 1, 1)

In [30]:
# BARRA2 data exploration

barra2_data_dir = os.path.join(working_dir, "Data/barra2")
barra2_df = pd.read_csv(os.path.join(barra2_data_dir, os.listdir(barra2_data_dir)[0]))
barra2_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 720 entries, 0 to 719
Data columns (total 5 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   time                            720 non-null    object 
 1   station                         720 non-null    object 
 2   latitude[unit="degrees_north"]  720 non-null    float64
 3   longitude[unit="degrees_east"]  720 non-null    float64
 4   tas[unit="K"]                   720 non-null    float64
dtypes: float64(3), object(2)
memory usage: 28.3+ KB


In [31]:
barra2_df.head()

Unnamed: 0,time,station,"latitude[unit=""degrees_north""]","longitude[unit=""degrees_east""]","tas[unit=""K""]"
0,2019-04-01T00:00:00Z,GridPointRequestedAt[35.570S_150.140E],-35.569,150.14,289.3125
1,2019-04-01T01:00:00Z,GridPointRequestedAt[35.570S_150.140E],-35.569,150.14,289.890625
2,2019-04-01T02:00:00Z,GridPointRequestedAt[35.570S_150.140E],-35.569,150.14,290.796875
3,2019-04-01T03:00:00Z,GridPointRequestedAt[35.570S_150.140E],-35.569,150.14,291.140625
4,2019-04-01T04:00:00Z,GridPointRequestedAt[35.570S_150.140E],-35.569,150.14,291.59375


In [32]:
# BARRA2 netCDF data exploration

import xarray as xr

if explore_barra2_data:
    tas_url = "https://thredds.nci.org.au/thredds/fileServer/ob53/output/reanalysis/AUST-04/BOM/ERA5/historical/hres/BARRA-C2/v1/1hr/tas/latest/tas_AUST-04_ERA5_historical_hres_BOM_BARRA-C2_v1_1hr_201812-201812.nc"
    hurs_url = "https://thredds.nci.org.au/thredds/fileServer/ob53/output/reanalysis/AUST-04/BOM/ERA5/historical/hres/BARRA-C2/v1/1hr/hurs/latest/hurs_AUST-04_ERA5_historical_hres_BOM_BARRA-C2_v1_1hr_201812-201812.nc"
    !curl -L {tas_url} -o "barra2_tas.nc"
    !curl -L {hurs_url} -o "barra2_hurs.nc"

    tas_ds = xr.open_dataset("barra2_tas.nc")
    hurs_ds = xr.open_dataset("barra2_hurs.nc")
    tas_ds

In [33]:
if explore_barra2_data:
    hurs_ds

# Combining in-situ and remote data into a single dataframe

In [34]:
# Generate UTC_Datetime for in-situ observations

from pytz import timezone

aus_tz = timezone('Australia/Sydney')
df['Datetime'] = df['Datetime'].dt.tz_localize(aus_tz, ambiguous='NaT', nonexistent='NaT')
df = df[~df['Datetime'].isna()]
df['UTC_Datetime'] = df['Datetime'].dt.tz_convert('UTC')

df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['UTC_Datetime'] = df['Datetime'].dt.tz_convert('UTC')


Unnamed: 0.1,Unnamed: 0,SiteID,X,Y,Datetime,Temperature,RH,VPD,slope,aspect,relief,barra2_X,barra2_Y,UTC_Datetime
0,0,79,150.2953,-35.48254,2019-01-20 04:00:00+11:00,17.941,97.525473,0.050885,7.76997,187.29736,27.341133,150.3,-35.49,2019-01-19 17:00:00+00:00
1,1,79,150.2953,-35.48254,2019-01-20 05:00:00+11:00,17.753,97.881611,0.043049,7.76997,187.29736,27.341133,150.3,-35.49,2019-01-19 18:00:00+00:00
2,2,79,150.2953,-35.48254,2019-01-20 06:00:00+11:00,17.878,98.236778,0.036114,7.76997,187.29736,27.341133,150.3,-35.49,2019-01-19 19:00:00+00:00
3,3,79,150.2953,-35.48254,2019-01-20 07:00:00+11:00,18.066,97.406114,0.05376,7.76997,187.29736,27.341133,150.3,-35.49,2019-01-19 20:00:00+00:00
4,4,79,150.2953,-35.48254,2019-01-20 08:00:00+11:00,18.379,97.881611,0.044776,7.76997,187.29736,27.341133,150.3,-35.49,2019-01-19 21:00:00+00:00


In [40]:
# For each row, open barra2 data csv file one-by-one to get data (40 min)

from scipy.constants import convert_temperature

def get_barra2_value(row, var):
    yyyymm = row['UTC_Datetime'].round('h').strftime('%Y%m')
    file_path = os.path.join(barra2_data_dir, f"barra2_data_{var}_{row['barra2_X']}_{row['barra2_Y']}_{yyyymm}.csv")
    df_barra2 = pd.read_csv(file_path)

    target_time = row['UTC_Datetime'].round('h').strftime('%Y-%m-%dT%H:%M:%SZ')
    column_name = df_barra2.columns[df_barra2.columns.str.contains(var)]
    barra2_value = df_barra2.loc[df_barra2['time'] == target_time, column_name].values[0][0]
    if var == 'tas':
        barra2_value = convert_temperature(barra2_value, 'Kelvin', 'Celsius')

    return round(barra2_value, 3)

df['barra2_Temperature'] = df.apply(lambda row: get_barra2_value(row, 'tas'), axis=1)
df['barra2_RH'] = df.apply(lambda row: get_barra2_value(row, 'hurs'), axis=1)
df.head()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
target time 2020-03-24T07:00:00Z
                     time                                 station  \
0    2020-03-01T00:00:00Z  GridPointRequestedAt[36.130S_150.060E]   
1    2020-03-01T01:00:00Z  GridPointRequestedAt[36.130S_150.060E]   
2    2020-03-01T02:00:00Z  GridPointRequestedAt[36.130S_150.060E]   
3    2020-03-01T03:00:00Z  GridPointRequestedAt[36.130S_150.060E]   
4    2020-03-01T04:00:00Z  GridPointRequestedAt[36.130S_150.060E]   
..                    ...                                     ...   
739  2020-03-31T19:00:00Z  GridPointRequestedAt[36.130S_150.060E]   
740  2020-03-31T20:00:00Z  GridPointRequestedAt[36.130S_150.060E]   
741  2020-03-31T21:00:00Z  GridPointRequestedAt[36.130S_150.060E]   
742  2020-03-31T22:00:00Z  GridPointRequestedAt[36.130S_150.060E]   
743  2020-03-31T23:00:00Z  GridPointRequestedAt[36.130S_150.060E]   

     latitude[unit="degrees_north"]  longitude[unit="degrees_east"]  \
0 

Unnamed: 0.1,Unnamed: 0,SiteID,X,Y,Datetime,Temperature,RH,VPD,slope,aspect,relief,barra2_X,barra2_Y,UTC_Datetime,barra2_Temperature,barra2_RH
0,0,79,150.2953,-35.48254,2019-01-20 04:00:00+11:00,17.941,97.525473,0.050885,7.76997,187.29736,27.341133,150.3,-35.49,2019-01-19 17:00:00+00:00,18.444,87.379
1,1,79,150.2953,-35.48254,2019-01-20 05:00:00+11:00,17.753,97.881611,0.043049,7.76997,187.29736,27.341133,150.3,-35.49,2019-01-19 18:00:00+00:00,18.288,88.496
2,2,79,150.2953,-35.48254,2019-01-20 06:00:00+11:00,17.878,98.236778,0.036114,7.76997,187.29736,27.341133,150.3,-35.49,2019-01-19 19:00:00+00:00,18.35,88.078
3,3,79,150.2953,-35.48254,2019-01-20 07:00:00+11:00,18.066,97.406114,0.05376,7.76997,187.29736,27.341133,150.3,-35.49,2019-01-19 20:00:00+00:00,18.584,86.7
4,4,79,150.2953,-35.48254,2019-01-20 08:00:00+11:00,18.379,97.881611,0.044776,7.76997,187.29736,27.341133,150.3,-35.49,2019-01-19 21:00:00+00:00,19.225,82.294


In [42]:
# Investigate null values.
df[df.isna().any(axis=1)]

Unnamed: 0.1,Unnamed: 0,SiteID,X,Y,Datetime,Temperature,RH,VPD,slope,aspect,relief,barra2_X,barra2_Y,UTC_Datetime,barra2_Temperature,barra2_RH


# Calculating remote VPD from remote temperature and remote relative humidity

In [43]:
# Write a function for deriving VPD from temp and RH

import math
import numpy as np

def calculate_vpd(temp, rh):
    if pd.isna(temp) or pd.isna(rh):
        return np.nan
    es = 0.6108 * math.exp(17.27 * temp / (237.3 + temp))
    e = es * rh / 100
    vpd = es - e
    return vpd

df['barra2_VPD'] = df.apply(lambda row: calculate_vpd(row['barra2_Temperature'], row['barra2_RH']), axis=1)
df.head()

Unnamed: 0.1,Unnamed: 0,SiteID,X,Y,Datetime,Temperature,RH,VPD,slope,aspect,relief,barra2_X,barra2_Y,UTC_Datetime,barra2_Temperature,barra2_RH,barra2_VPD
0,0,79,150.2953,-35.48254,2019-01-20 04:00:00+11:00,17.941,97.525473,0.050885,7.76997,187.29736,27.341133,150.3,-35.49,2019-01-19 17:00:00+00:00,18.444,87.379,0.267858
1,1,79,150.2953,-35.48254,2019-01-20 05:00:00+11:00,17.753,97.881611,0.043049,7.76997,187.29736,27.341133,150.3,-35.49,2019-01-19 18:00:00+00:00,18.288,88.496,0.241775
2,2,79,150.2953,-35.48254,2019-01-20 06:00:00+11:00,17.878,98.236778,0.036114,7.76997,187.29736,27.341133,150.3,-35.49,2019-01-19 19:00:00+00:00,18.35,88.078,0.251536
3,3,79,150.2953,-35.48254,2019-01-20 07:00:00+11:00,18.066,97.406114,0.05376,7.76997,187.29736,27.341133,150.3,-35.49,2019-01-19 20:00:00+00:00,18.584,86.7,0.284754
4,4,79,150.2953,-35.48254,2019-01-20 08:00:00+11:00,18.379,97.881611,0.044776,7.76997,187.29736,27.341133,150.3,-35.49,2019-01-19 21:00:00+00:00,19.225,82.294,0.394566


# Save the resulting dataframes

In [44]:
df.to_csv(os.path.join(working_dir, "output/csv/barra2.csv"), index=False)