# Start


This script prepares vapor pressure deficit (VPD) data from SILO and matches it with VPD from field observations for subsequent analysis.


📄 **What this script does**
1. Loads cleaned field data with topography — for example, output from `Nick_phd_data_complilation.ipynb`.
2. Downloads **SILO data** for the grid cells closest to the field sites, spanning from the first day to one day after the last day of the field observations.
3. Finds, for each site, the field observations with the highest and lowest temperature within each 9 am-to-9 am day and stores them separately in `df_max_temp` and `df_min_temp`. When several observations share the highest or lowest temperature, the one with the lowest RH is selected.
4. Matches SILO temperature and RH values to the field observations in `df_max_temp` and `df_min_temp` based on the nearest SILO grid cell and `silo_observation_date`.
5. For both `df_max_temp` and `df_min_temp`, calculates **VPD** (vapor pressure deficit) from the matched SILO temperature and RH.
6. Saves the combined field and SILO data as `silo_max_temp_vpd.csv` and `silo_min_temp_vpd.csv` in the `output/csv` folder.


⚠️ **Important notes**
* Before running the script, set all variables in the **first cell**, and delete the **second cell** if not using a Google Colab environment.  
  *(The script was developed for use in Google Colab and has not been tested outside of it.)*
* The **first and last observation dates** of each site are **disregarded** because there is no temperature data for the whole day for those dates.
* Observations with timestamps after 9:00 am on 1 April 2020 and up to 9:00 am on 2 April 2020 have `silo_observation_date` set to 2 April 2020.


In [None]:
# Name of the cleaned in-situ CSV; it is read from <working_dir>/output/csv.
site_data_file_name = "in-situ_topography_phd.csv"
# Project root on the mounted Google Drive; the input, SILO and output paths are built from it.
working_dir = "/content/drive/My Drive/Work/2025.04 ANU Bushfire"
# True downloads the SILO CSVs with curl; False skips the download step.
download_silo_data = False

In [None]:
# Connect to Google Drive

# Colab-only step: mounts Google Drive at /content/drive, the prefix used by working_dir.
# Delete this cell when running outside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Loading in-situ and remote data


In [None]:
# Load in-situ_topography.csv as the main df

import pandas as pd
import os

# Read the cleaned in-situ observations and parse their timestamps
site_data_path = os.path.join(working_dir, "output", "csv", site_data_file_name)
df = pd.read_csv(site_data_path)
df['Datetime'] = pd.to_datetime(df['Datetime'])

# SILO request window: first observation day to one day after the last observation.
# Series.min()/max() are vectorised and skip NaT, unlike the builtin min()/max(),
# which iterate element by element and give order-dependent results when NaT is present.
first_date = df['Datetime'].min().strftime("%Y%m%d")
day_after_last_date = (df['Datetime'].max() + pd.Timedelta(days=1)).strftime("%Y%m%d")
print("First date: ", first_date, ", One day after the last date: ", day_after_last_date)

df.head()

First date:  20181221 , One day after the last date:  20201107


Unnamed: 0,SiteID,X,Y,Datetime,Temperature,RH,VPD,slope,aspect,relief,veg_cover
0,79,150.2953,-35.4825,2019-01-20 04:00:00,17.9,97.5,0.0509,7.77,187.3,27.34,96.0
1,79,150.2953,-35.4825,2019-01-20 05:00:00,17.8,97.9,0.043,7.77,187.3,27.34,96.0
2,79,150.2953,-35.4825,2019-01-20 06:00:00,17.9,98.2,0.0361,7.77,187.3,27.34,96.0
3,79,150.2953,-35.4825,2019-01-20 07:00:00,18.1,97.4,0.0538,7.77,187.3,27.34,96.0
4,79,150.2953,-35.4825,2019-01-20 08:00:00,18.4,97.9,0.0448,7.77,187.3,27.34,96.0


In [None]:
# Load silo data

def nearest_silo_cell(x, y):
    """Snap an (x, y) coordinate pair to the nearest node of a 0.05-spaced grid.

    Each coordinate is divided by 0.05, rounded to the nearest whole grid
    index, scaled back, and finally rounded to 3 decimals to strip the
    floating-point noise left by the multiplication.

    Returns:
        Tuple ``(snapped_x, snapped_y)``.
    """
    def snap(coordinate):
        grid_index = round(coordinate / 0.05)
        return round(grid_index * 0.05, 3)

    return (snap(x), snap(y))

# Make a list of locations of all sites and reduce it to silo grid cell locations
df['silo_X'] = df.apply(lambda row: nearest_silo_cell(row['X'], row['Y'])[0], axis=1)
df['silo_Y'] = df.apply(lambda row: nearest_silo_cell(row['X'], row['Y'])[1], axis=1)
silo_cell_locations_list = list(set((x, y) for x, y in df[['silo_X', 'silo_Y']].values))
print("silo_cell_locations_list: ", silo_cell_locations_list)

# Use Python loop and curl to download all silo data
silo_data_dir = os.path.join(working_dir, "Data/silo")
if download_silo_data:
    os.makedirs(silo_data_dir, exist_ok=True)
    for x, y in silo_cell_locations_list:
        output_file_path = os.path.join(silo_data_dir, f"silo_data_{x}_{y}.csv")
        url = f"https://www.longpaddock.qld.gov.au/cgi-bin/silo/DataDrillDataset.php?lat={y}&lon={x}&format=csv&start={first_date}&finish={day_after_last_date}&username=noemail@net.com&dataset=Official&comment=xnhg"
        print(f"Downloading {url} to {output_file_path}")
        !curl -L "{url}" -o "{output_file_path}" -C -

silo_cell_locations_list:  [(np.float64(150.15), np.float64(-35.75)), (np.float64(150.05), np.float64(-36.15)), (np.float64(150.2), np.float64(-35.75)), (np.float64(150.3), np.float64(-35.45)), (np.float64(150.1), np.float64(-35.55)), (np.float64(150.35), np.float64(-35.45)), (np.float64(150.25), np.float64(-35.6)), (np.float64(150.15), np.float64(-35.8)), (np.float64(150.15), np.float64(-35.55)), (np.float64(150.3), np.float64(-35.5)), (np.float64(150.1), np.float64(-35.8)), (np.float64(150.25), np.float64(-35.55)), (np.float64(150.2), np.float64(-35.65))]


# Combining in-situ and remote data into a single dataframe

In [None]:
# Find in-situ data's hottest and coldest observation of each day

# Assign silo_observation_date to each observation: readings taken after 09:00
# are attributed to the following calendar day, all others to their own day
obs_hour = df['Datetime'].dt.hour
obs_minute = df['Datetime'].dt.minute
mask = obs_hour.gt(9) | (obs_hour.eq(9) & obs_minute.gt(0))
following_day = df['Datetime'] + pd.Timedelta(days=1)
# Series.where keeps the original timestamp where ~mask holds and takes the shifted one elsewhere
df['silo_observation_date'] = df['Datetime'].where(~mask, following_day).dt.date

# Remove rows of the first and last dates of each site because those days lack some hours.
# The per-site limits are looked up through SiteID instead of being merged into df, so df
# keeps its original columns and this cell can be re-run without a column-name clash.
site_date_limits = df.groupby('SiteID')['silo_observation_date'].agg(['min', 'max'])
site_first_date = df['SiteID'].map(site_date_limits['min'])
site_last_date = df['SiteID'].map(site_date_limits['max'])
is_partial_day = (
    (df['silo_observation_date'] == site_first_date)
    | (df['silo_observation_date'] == site_last_date)
)
# .copy() yields an independent frame, so helper columns can be added to it afterwards
df_date_filtered = df[~is_partial_day].copy()

# Keep only the observations that reach their site's maximum temperature of the day
daily_temperature = df_date_filtered.groupby(['SiteID', 'silo_observation_date'])['Temperature']
df_date_filtered['Day_max_temp'] = daily_temperature.transform('max')
reaches_daily_max = df_date_filtered['Temperature'].eq(df_date_filtered['Day_max_temp'])
df_max_temp = df_date_filtered[reaches_daily_max]
print("In-situ day maximum temperature observations:")
print(df_max_temp.head())
print('\n')

# Keep only the observations that reach their site's minimum temperature of the day
daily_temperature = df_date_filtered.groupby(['SiteID', 'silo_observation_date'])['Temperature']
df_date_filtered['Day_min_temp'] = daily_temperature.transform('min')
reaches_daily_min = df_date_filtered['Temperature'].eq(df_date_filtered['Day_min_temp'])
df_min_temp = df_date_filtered[reaches_daily_min]
print("In-situ day minimum temperature observations:")
df_min_temp.head()

# -------------------------------------- Investigation code below --------------------------------------
# df_max_temp[df_max_temp['Datetime'].dt.hour < 9]

# import datetime
# df[(df['SiteID'] == 79) & (df['Date'] == datetime.date(2019, 1, 30))]
# df[(df['SiteID'] == 90) & (df['Date'] == datetime.date(2019, 2, 19))]  ## Hottest in the morning
# df[(df['SiteID'] == 90) & (df['Date'] == datetime.date(2019, 3, 6))]
# df[(df['SiteID'] == 142) & (df['Date'] == datetime.date(2019, 3, 27))]
# df[(df['SiteID'] == 78) & (df['Date'] == datetime.date(2019, 1, 19))]

# df_date_filtered.groupby(['SiteID', 'Date'])['Datetime'].apply(lambda x: x.nunique())

In-situ day maximum temperature observations:
     SiteID         X        Y            Datetime  Temperature    RH     VPD  \
10       79  150.2953 -35.4825 2019-01-20 14:00:00         23.8  80.3  0.5818   
34       79  150.2953 -35.4825 2019-01-21 14:00:00         27.3  69.9  1.0920   
59       79  150.2953 -35.4825 2019-01-22 15:00:00         32.9  57.3  2.1317   
81       79  150.2953 -35.4825 2019-01-23 13:00:00         24.4  69.1  0.9430   
109      79  150.2953 -35.4825 2019-01-24 17:00:00         22.6  81.1  0.5200   

     slope  aspect  relief  veg_cover  silo_X  silo_Y silo_observation_date  \
10    7.77   187.3   27.34       96.0   150.3   -35.5            2019-01-21   
34    7.77   187.3   27.34       96.0   150.3   -35.5            2019-01-22   
59    7.77   187.3   27.34       85.0   150.3   -35.5            2019-01-23   
81    7.77   187.3   27.34       85.0   150.3   -35.5            2019-01-24   
109   7.77   187.3   27.34       85.0   150.3   -35.5            2019-01

Unnamed: 0,SiteID,X,Y,Datetime,Temperature,RH,VPD,slope,aspect,relief,veg_cover,silo_X,silo_Y,silo_observation_date,Day_max_temp,Day_min_temp
1,79,150.2953,-35.4825,2019-01-20 05:00:00,17.8,97.9,0.043,7.77,187.3,27.34,96.0,150.3,-35.5,2019-01-20,20.3,17.8
24,79,150.2953,-35.4825,2019-01-21 04:00:00,19.4,95.9,0.0937,7.77,187.3,27.34,96.0,150.3,-35.5,2019-01-21,23.8,19.4
51,79,150.2953,-35.4825,2019-01-22 07:00:00,19.3,95.7,0.0956,7.77,187.3,27.34,96.0,150.3,-35.5,2019-01-22,27.3,19.3
74,79,150.2953,-35.4825,2019-01-23 06:00:00,21.6,97.0,0.0763,7.77,187.3,27.34,85.0,150.3,-35.5,2019-01-23,32.9,21.6
97,79,150.2953,-35.4825,2019-01-24 05:00:00,19.1,97.3,0.0601,7.77,187.3,27.34,85.0,150.3,-35.5,2019-01-24,24.4,19.1


In [None]:
# Use only one observation per site per day: among tied rows keep the one with the lowest RH

site_day = ['SiteID', 'silo_observation_date']
# idxmin returns, for every site/day group, the index label of the lowest-RH row
driest_max_rows = df_max_temp.groupby(site_day)['RH'].idxmin()
driest_min_rows = df_min_temp.groupby(site_day)['RH'].idxmin()
df_max_temp = df_max_temp.loc[driest_max_rows]
df_min_temp = df_min_temp.loc[driest_min_rows]
df_max_temp.head()

# Investigating the number of obs per site per day
# df_max_temp.groupby(['SiteID', 'silo_observation_date'])['Datetime'].apply(lambda x: x.nunique())
# Check the accuracy of the code
# import datetime
# df_max_temp[(df_max_temp['SiteID'] == 264) & (df_max_temp['silo_observation_date'] == datetime.date(2020, 3, 21))]

Unnamed: 0,SiteID,X,Y,Datetime,Temperature,RH,VPD,slope,aspect,relief,veg_cover,silo_X,silo_Y,silo_observation_date,Day_max_temp
7235,67,150.2697,-35.5448,2018-12-22 12:15:00,25.4,47.0,1.716,9.81,91.5,63.24,86.7,150.25,-35.55,2018-12-23,25.4
7259,67,150.2697,-35.5448,2018-12-23 12:15:00,26.1,43.2,1.9212,9.81,91.5,63.24,86.7,150.25,-35.55,2018-12-24,26.1
7282,67,150.2697,-35.5448,2018-12-24 11:15:00,30.4,39.7,2.6111,9.81,91.5,63.24,86.7,150.25,-35.55,2018-12-25,30.4
7306,67,150.2697,-35.5448,2018-12-25 11:15:00,38.5,22.2,5.2871,9.81,91.5,63.24,86.7,150.25,-35.55,2018-12-26,38.5
7330,67,150.2697,-35.5448,2018-12-26 11:15:00,32.2,47.3,2.5396,9.81,91.5,63.24,86.7,150.25,-35.55,2018-12-27,32.2


In [None]:
# Fill in df with silo data at the hottest time of the day (max_temp, RH_tmax)

# Look up SILO values row by row; each SILO csv is read and parsed once, then reused
import functools


@functools.lru_cache(maxsize=128)
def _load_silo_csv(file_path):
    """Read one SILO csv and add its parsed date columns (cached per file path)."""
    df_silo = pd.read_csv(file_path)
    df_silo['Datetime'] = pd.to_datetime(df_silo['YYYY-MM-DD'], format='%Y-%m-%d')
    # Pre-compute the plain dates once so each lookup is a simple equality test
    df_silo['_silo_date'] = df_silo['Datetime'].dt.date
    return df_silo


def get_silo_value(row, column):
    """Return the SILO value of `column` for the grid cell and date of `row`.

    Args:
        row: Mapping or Series providing 'silo_X', 'silo_Y' (used to build the
            csv file name inside `silo_data_dir`) and 'silo_observation_date'
            (matched against the dates in the csv's 'YYYY-MM-DD' column).
        column: Name of the SILO csv column to read, e.g. 'max_temp'.

    Returns:
        The value of `column` in the first csv record matching the date.

    Raises:
        IndexError: If the csv holds no record for the requested date.

    Note:
        Loaded files are cached by path. Re-run this cell (which redefines the
        cache) after re-downloading SILO files so fresh data is picked up.
    """
    file_path = os.path.join(silo_data_dir, f"silo_data_{row['silo_X']}_{row['silo_Y']}.csv")
    df_silo = _load_silo_csv(file_path)

    target_date = row['silo_observation_date']
    matches = df_silo.loc[df_silo['_silo_date'] == target_date, column]
    if matches.empty:
        raise IndexError(f"No SILO record for {target_date} in {file_path}")

    return matches.values[0]

# Each target column is filled from the matching SILO field via a row-wise lookup
for target_column, silo_column in (('silo_Temperature', 'max_temp'), ('silo_RH', 'rh_tmax')):
    df_max_temp[target_column] = df_max_temp.apply(get_silo_value, axis=1, args=(silo_column,))
df_max_temp.head()

Unnamed: 0,SiteID,X,Y,Datetime,Temperature,RH,VPD,slope,aspect,relief,veg_cover,silo_X,silo_Y,silo_observation_date,Day_max_temp,silo_Temperature,silo_RH
7235,67,150.2697,-35.5448,2018-12-22 12:15:00,25.4,47.0,1.716,9.81,91.5,63.24,86.7,150.25,-35.55,2018-12-23,25.4,22.3,58.3
7259,67,150.2697,-35.5448,2018-12-23 12:15:00,26.1,43.2,1.9212,9.81,91.5,63.24,86.7,150.25,-35.55,2018-12-24,26.1,24.5,50.1
7282,67,150.2697,-35.5448,2018-12-24 11:15:00,30.4,39.7,2.6111,9.81,91.5,63.24,86.7,150.25,-35.55,2018-12-25,30.4,26.6,52.6
7306,67,150.2697,-35.5448,2018-12-25 11:15:00,38.5,22.2,5.2871,9.81,91.5,63.24,86.7,150.25,-35.55,2018-12-26,38.5,26.9,60.7
7330,67,150.2697,-35.5448,2018-12-26 11:15:00,32.2,47.3,2.5396,9.81,91.5,63.24,86.7,150.25,-35.55,2018-12-27,32.2,30.0,54.4


In [None]:
# Fill in df with silo data at the coldest time of the day (min_temp, RH_tmin)

# Each target column is filled from the matching SILO field via a row-wise lookup
for target_column, silo_column in (('silo_Temperature', 'min_temp'), ('silo_RH', 'rh_tmin')):
    df_min_temp[target_column] = df_min_temp.apply(get_silo_value, axis=1, args=(silo_column,))
df_min_temp.head()

Unnamed: 0,SiteID,X,Y,Datetime,Temperature,RH,VPD,slope,aspect,relief,veg_cover,silo_X,silo_Y,silo_observation_date,Day_max_temp,Day_min_temp,silo_Temperature,silo_RH
7251,67,150.2697,-35.5448,2018-12-23 04:15:00,11.7,93.4,0.0915,9.81,91.5,63.24,86.7,150.25,-35.55,2018-12-23,25.4,11.7,11.9,100.0
7275,67,150.2697,-35.5448,2018-12-24 04:15:00,11.4,96.6,0.046,9.81,91.5,63.24,86.7,150.25,-35.55,2018-12-24,26.1,11.4,11.4,100.0
7300,67,150.2697,-35.5448,2018-12-25 05:15:00,14.5,96.6,0.0562,9.81,91.5,63.24,86.7,150.25,-35.55,2018-12-25,30.4,14.5,15.4,100.0
7320,67,150.2697,-35.5448,2018-12-26 01:15:00,18.2,95.5,0.0935,9.81,91.5,63.24,86.7,150.25,-35.55,2018-12-26,38.5,18.2,17.3,100.0
7348,67,150.2697,-35.5448,2018-12-27 05:15:00,18.1,97.9,0.0438,9.81,91.5,63.24,86.7,150.25,-35.55,2018-12-27,32.2,18.1,17.3,100.0


In [None]:
# Investigate null values. Site 251 is located in the ocean.
print("df_max_temp length: ", len(df_max_temp))
print("df_min_temp length: ", len(df_min_temp))
# Only the last expression of a cell is rendered, so the maximum-temperature rows are
# printed explicitly; the minimum-temperature rows remain the cell's displayed value.
print(df_max_temp[(df_max_temp.isna().any(axis=1)) & (df_max_temp['SiteID'] != 251)])
df_min_temp[(df_min_temp.isna().any(axis=1)) & (df_min_temp['SiteID'] != 251)]

df_max_temp length:  1047
df_min_temp length:  1047


Unnamed: 0,SiteID,X,Y,Datetime,Temperature,RH,VPD,slope,aspect,relief,veg_cover,silo_X,silo_Y,silo_observation_date,Day_max_temp,Day_min_temp,silo_Temperature,silo_RH
9708,74,150.3074,-35.4805,2019-01-05 09:00:00,23.2,73.5,0.7510,14.28,329.5,61.14,,150.3,-35.5,2019-01-05,35.2,23.2,20.7,88.1
9728,74,150.3074,-35.4805,2019-01-06 05:00:00,13.1,99.5,0.0070,14.28,329.5,61.14,,150.3,-35.5,2019-01-06,25.7,13.1,13.7,100.0
9751,74,150.3074,-35.4805,2019-01-07 04:00:00,14.6,94.3,0.0942,14.28,329.5,61.14,,150.3,-35.5,2019-01-07,22.7,14.6,14.5,100.0
9757,74,150.3074,-35.4805,2019-01-07 10:00:00,18.7,83.5,0.3555,14.28,329.5,61.14,,150.3,-35.5,2019-01-08,21.2,18.7,18.7,100.0
9802,74,150.3074,-35.4805,2019-01-09 07:00:00,18.2,96.3,0.0781,14.28,329.5,61.14,,150.3,-35.5,2019-01-09,30.7,18.2,17.7,100.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20831,284,150.3117,-35.4767,2020-09-16 03:09:00,13.6,95.2,0.0745,22.67,4.3,101.27,,150.3,-35.5,2020-09-16,23.6,13.6,10.5,100.0
20958,284,150.3117,-35.4767,2020-09-17 00:19:00,19.1,49.8,1.1095,22.67,4.3,101.27,,150.3,-35.5,2020-09-17,27.6,19.1,13.5,98.9
21124,284,150.3117,-35.4767,2020-09-18 03:59:00,11.1,96.2,0.0504,22.67,4.3,101.27,,150.3,-35.5,2020-09-18,25.1,11.1,11.2,100.0
21161,284,150.3117,-35.4767,2020-09-18 10:09:00,12.1,97.6,0.0335,22.67,4.3,101.27,,150.3,-35.5,2020-09-19,15.1,12.1,12.6,100.0


# Calculating remote VPD from remote temperature and remote relative humidity

In [None]:
# Write a function for deriving VPD from temp and RH

import math
import numpy as np

def calculate_vpd(temp, rh):
    """Return the vapour pressure deficit for a temperature and relative humidity.

    Saturation vapour pressure is 0.6108 * exp(17.27 * temp / (237.3 + temp));
    the actual vapour pressure is that value scaled by rh / 100, and the
    deficit is the difference between the two.
    NOTE(review): these constants are normally used with temp in degrees
    Celsius, giving kPa - confirm against the units of the field data.

    Returns NaN when either input is missing.
    """
    missing_input = pd.isna(temp) or pd.isna(rh)
    if missing_input:
        return np.nan

    saturation_pressure = 0.6108 * math.exp(17.27 * temp / (237.3 + temp))
    actual_pressure = saturation_pressure * rh / 100
    return saturation_pressure - actual_pressure

# Derive the SILO VPD row by row for both the daily-maximum and daily-minimum tables
for frame in (df_max_temp, df_min_temp):
    frame['silo_VPD'] = frame.apply(
        lambda obs: calculate_vpd(obs['silo_Temperature'], obs['silo_RH']), axis=1
    )
df_max_temp.head()

Unnamed: 0,SiteID,X,Y,Datetime,Temperature,RH,VPD,slope,aspect,relief,veg_cover,silo_X,silo_Y,silo_observation_date,Day_max_temp,silo_Temperature,silo_RH,silo_VPD
7235,67,150.2697,-35.5448,2018-12-22 12:15:00,25.4,47.0,1.716,9.81,91.5,63.24,86.7,150.25,-35.55,2018-12-23,25.4,22.3,58.3,1.122841
7259,67,150.2697,-35.5448,2018-12-23 12:15:00,26.1,43.2,1.9212,9.81,91.5,63.24,86.7,150.25,-35.55,2018-12-24,26.1,24.5,50.1,1.53425
7282,67,150.2697,-35.5448,2018-12-24 11:15:00,30.4,39.7,2.6111,9.81,91.5,63.24,86.7,150.25,-35.55,2018-12-25,30.4,26.6,52.6,1.650716
7306,67,150.2697,-35.5448,2018-12-25 11:15:00,38.5,22.2,5.2871,9.81,91.5,63.24,86.7,150.25,-35.55,2018-12-26,38.5,26.9,60.7,1.392979
7330,67,150.2697,-35.5448,2018-12-26 11:15:00,32.2,47.3,2.5396,9.81,91.5,63.24,86.7,150.25,-35.55,2018-12-27,32.2,30.0,54.4,1.934838


# Save the resulting dataframes

In [None]:
# Drop helper columns; errors='ignore' keeps this cell re-runnable once they are already gone
df_max_temp.drop(columns=['Day_max_temp'], inplace=True, errors='ignore')
df_min_temp.drop(columns=['Day_max_temp', 'Day_min_temp'], inplace=True, errors='ignore')

In [None]:
# Write both result tables to csv, without the index and with floats at 4 decimals
output_files = (
    (df_max_temp, "output/csv/silo_max_temp_vpd.csv"),
    (df_min_temp, "output/csv/silo_min_temp_vpd.csv"),
)
for frame, relative_path in output_files:
    frame.to_csv(os.path.join(working_dir, relative_path), index=False, float_format='%.4f')