# Start


This script prepares vapor pressure deficit (VPD) data from SILO and matches it with VPD from field observations for subsequent analysis.


📄 **What this script does**
1. Loads cleaned field data with topography — for example, output from `Nick_phd_data_complilation.ipynb`.
2. Downloads **SILO data** for the grid cells closest to the field sites, spanning from the first day to one day after the last day of the field observations.
3. Find the field observations with the highest and lowest temperature between 9am to 9am of each day and put them separately in `df_max_temp` and `df_min_temp`. When there are multiple observations with the highest or lowest temperatures, the ones with the lowest RH are selected.
4. Matches SILO temperature and RH values to the field observations in `df_max_temp` and `df_min_temp` based on the nearest SILO grid cell and `silo_observation_date`.
5. For both `df_max_temp` and `df_min_temp`, calculates **VPD** (vapor pressure deficit) from the matched SILO temperature and RH.
6. Saves the combined field and SILO data as `silo_max_temp.csv` and `silo_min_temp.csv` in the `output/csv` folder.


⚠️ **Important notes**
* Before running the script, set all variables in the **first cell**, and delete the **second cell** if not using a Google Colab environment.  
  *(The script was developed for use in Google Colab and has not been tested outside of it.)*
* The **first and last observation dates** of each site are **disregarded** because there is no temperature data for the whole day for those dates.
* Obsevations with timestamps between 8.59 am of 1 April 2020 to 9.00 am of 2 April 202 have `silo_observation_date` as 2 April 2020.


In [1]:
working_dir = "/content/drive/My Drive/Work/2025.04 ANU Bushfire"
download_silo_data = False

In [2]:
# Connect to Google Drive

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Loading in-situ and remote data


In [3]:
# Load in-situ_topography.csv as the main df

import pandas as pd
import os

df = pd.read_csv(os.path.join(working_dir, "output/csv/in-situ_topography.csv"))
df['Datetime'] = pd.to_datetime(df['Datetime'])

first_date, day_after_last_date = min(df['Datetime']).strftime("%Y%m%d"), (max(df['Datetime']) + pd.Timedelta(days=1)).strftime("%Y%m%d")
print("First date: ", first_date, ", One day after the last date: ", day_after_last_date)

df.head()

First date:  20181221 , One day after the last date:  20201107


Unnamed: 0.1,Unnamed: 0,SiteID,X,Y,Datetime,Temperature,RH,VPD,slope,aspect,relief
0,0,79,150.2953,-35.48254,2019-01-20 04:00:00,17.941,97.525473,0.050885,7.76997,187.29736,27.341133
1,1,79,150.2953,-35.48254,2019-01-20 05:00:00,17.753,97.881611,0.043049,7.76997,187.29736,27.341133
2,2,79,150.2953,-35.48254,2019-01-20 06:00:00,17.878,98.236778,0.036114,7.76997,187.29736,27.341133
3,3,79,150.2953,-35.48254,2019-01-20 07:00:00,18.066,97.406114,0.05376,7.76997,187.29736,27.341133
4,4,79,150.2953,-35.48254,2019-01-20 08:00:00,18.379,97.881611,0.044776,7.76997,187.29736,27.341133


In [4]:
# Load silo data

def nearest_silo_cell(x, y):
    return (round(round(x / 0.05) * 0.05, 3), round(round(y / 0.05) * 0.05, 3))

# Make a list of locations of all sites and reduce it to silo grid cell locations
df['silo_X'] = df.apply(lambda row: nearest_silo_cell(row['X'], row['Y'])[0], axis=1)
df['silo_Y'] = df.apply(lambda row: nearest_silo_cell(row['X'], row['Y'])[1], axis=1)
silo_cell_locations_list = list(set((x, y) for x, y in df[['silo_X', 'silo_Y']].values))
print("silo_cell_locations_list: ", silo_cell_locations_list)

# Use Python loop and curl to download all silo data
silo_data_dir = os.path.join(working_dir, "Data/silo")
if download_silo_data:
    os.makedirs(silo_data_dir, exist_ok=True)
    for x, y in silo_cell_locations_list:
        output_file_path = os.path.join(silo_data_dir, f"silo_data_{x}_{y}.csv")
        url = f"https://www.longpaddock.qld.gov.au/cgi-bin/silo/DataDrillDataset.php?lat={y}&lon={x}&format=csv&start={first_date}&finish={day_after_last_date}&username=noemail@net.com&dataset=Official&comment=xnhg"
        print(f"Downloading {url} to {output_file_path}")
        !curl -L "{url}" -o "{output_file_path}" -C -

silo_cell_locations_list:  [(np.float64(150.15), np.float64(-35.75)), (np.float64(150.05), np.float64(-36.15)), (np.float64(150.2), np.float64(-35.75)), (np.float64(150.3), np.float64(-35.45)), (np.float64(150.1), np.float64(-35.55)), (np.float64(150.35), np.float64(-35.45)), (np.float64(150.25), np.float64(-35.6)), (np.float64(150.15), np.float64(-35.8)), (np.float64(150.15), np.float64(-35.55)), (np.float64(150.3), np.float64(-35.5)), (np.float64(150.1), np.float64(-35.8)), (np.float64(150.25), np.float64(-35.55)), (np.float64(150.2), np.float64(-35.65))]


# Combining in-situ and remote data into a single dataframe

In [5]:
# Find in-situ data's hottest and coldest observation of each day

# Assign silo_observation_date to each observation
mask = (df['Datetime'].dt.hour > 9) | ((df['Datetime'].dt.hour == 9) & (df['Datetime'].dt.minute > 0))
df.loc[mask, 'silo_observation_date'] = df.loc[mask, 'Datetime'] + pd.Timedelta(days=1)
df.loc[~mask, 'silo_observation_date'] = df.loc[~mask, 'Datetime']
df['silo_observation_date'] = pd.to_datetime(df['silo_observation_date']).dt.date

# Remove rows of the first and last dates of each site because those days lack some hours
first_last_dates = df.groupby('SiteID')['silo_observation_date'].agg(['min', 'max']).reset_index()
df = df.merge(first_last_dates, on='SiteID', how='left')
df_date_filtered = df[(df['silo_observation_date'] != df['min']) & (df['silo_observation_date'] != df['max'])]
df_date_filtered = df_date_filtered.drop(columns=['min', 'max'])

# Select only rows with max tempurature of the day
df_date_filtered['Day_max_temp'] = df_date_filtered.groupby(['SiteID', 'silo_observation_date'])['Temperature'].transform('max')
df_max_temp = df_date_filtered[df_date_filtered['Temperature'] == df_date_filtered['Day_max_temp']]
print("In-situ day maximum temperature observations:")
print(df_max_temp.head())
print('\n')

# Select only rows with min tempurature of the day
df_date_filtered['Day_min_temp'] = df_date_filtered.groupby(['SiteID', 'silo_observation_date'])['Temperature'].transform('min')
df_min_temp = df_date_filtered[df_date_filtered['Temperature'] == df_date_filtered['Day_min_temp']]
print("In-situ day minimum temperature observations:")
df_min_temp.head()

# -------------------------------------- Investigation code below --------------------------------------
# df_max_temp[df_max_temp['Datetime'].dt.hour < 9]

# import datetime
# df[(df['SiteID'] == 79) & (df['Date'] == datetime.date(2019, 1, 30))]
# df[(df['SiteID'] == 90) & (df['Date'] == datetime.date(2019, 2, 19))]  ## Hottest in the morning
# df[(df['SiteID'] == 90) & (df['Date'] == datetime.date(2019, 3, 6))]
# df[(df['SiteID'] == 142) & (df['Date'] == datetime.date(2019, 3, 27))]
# df[(df['SiteID'] == 78) & (df['Date'] == datetime.date(2019, 1, 19))]

# df_date_filtered.groupby(['SiteID', 'Date'])['Datetime'].apply(lambda x: x.nunique())

In-situ day maximum temperature observations:
     Unnamed: 0  SiteID         X         Y            Datetime  Temperature  \
10           10      79  150.2953 -35.48254 2019-01-20 14:00:00       23.820   
34           34      79  150.2953 -35.48254 2019-01-21 14:00:00       27.320   
59           59      79  150.2953 -35.48254 2019-01-22 15:00:00       32.877   
81           81      79  150.2953 -35.48254 2019-01-23 13:00:00       24.383   
109         109      79  150.2953 -35.48254 2019-01-24 17:00:00       22.632   

            RH       VPD    slope     aspect     relief  silo_X  silo_Y  \
10   80.290150  0.581801  7.76997  187.29736  27.341133   150.3   -35.5   
34   69.941776  1.091962  7.76997  187.29736  27.341133   150.3   -35.5   
59   57.327511  2.131724  7.76997  187.29736  27.341133   150.3   -35.5   
81   69.113052  0.943042  7.76997  187.29736  27.341133   150.3   -35.5   
109  81.075206  0.519960  7.76997  187.29736  27.341133   150.3   -35.5   

    silo_observation_d

Unnamed: 0.1,Unnamed: 0,SiteID,X,Y,Datetime,Temperature,RH,VPD,slope,aspect,relief,silo_X,silo_Y,silo_observation_date,Day_max_temp,Day_min_temp
1,1,79,150.2953,-35.48254,2019-01-20 05:00:00,17.753,97.881611,0.043049,7.76997,187.29736,27.341133,150.3,-35.5,2019-01-20,20.318,17.753
24,24,79,150.2953,-35.48254,2019-01-21 04:00:00,19.443,95.853469,0.093664,7.76997,187.29736,27.341133,150.3,-35.5,2019-01-21,23.82,19.443
51,51,79,150.2953,-35.48254,2019-01-22 07:00:00,19.318,95.733139,0.095636,7.76997,187.29736,27.341133,150.3,-35.5,2019-01-22,27.32,19.318
74,74,79,150.2953,-35.48254,2019-01-23 06:00:00,21.632,97.049976,0.076264,7.76997,187.29736,27.341133,150.3,-35.5,2019-01-23,32.877,21.632
97,97,79,150.2953,-35.48254,2019-01-24 05:00:00,19.13,97.287724,0.060084,7.76997,187.29736,27.341133,150.3,-35.5,2019-01-24,24.383,19.13


In [6]:
# Use only one observation per sit per day

df_max_temp = df_max_temp.loc[
    df_max_temp.groupby(['SiteID', 'silo_observation_date'])['RH'].idxmin()
]
df_min_temp = df_min_temp.loc[
    df_min_temp.groupby(['SiteID', 'silo_observation_date'])['RH'].idxmin()
]
df_max_temp.head()

# Investigating the number of obs per site per day
# df_max_temp.groupby(['SiteID', 'silo_observation_date'])['Datetime'].apply(lambda x: x.nunique())
# Check the accuracy of the code
# import datetime
# df_max_temp[(df_max_temp['SiteID'] == 264) & (df_max_temp['silo_observation_date'] == datetime.date(2020, 3, 21))]

Unnamed: 0.1,Unnamed: 0,SiteID,X,Y,Datetime,Temperature,RH,VPD,slope,aspect,relief,silo_X,silo_Y,silo_observation_date,Day_max_temp
7235,7235,67,150.2697,-35.54484,2018-12-22 12:15:00,25.369,47.004939,1.71602,9.808307,91.47359,63.235474,150.25,-35.55,2018-12-23,25.369
7259,7259,67,150.2697,-35.54484,2018-12-23 12:15:00,26.119,43.246507,1.921196,9.808307,91.47359,63.235474,150.25,-35.55,2018-12-24,26.119
7282,7282,67,150.2697,-35.54484,2018-12-24 11:15:00,30.367,39.742369,2.611086,9.808307,91.47359,63.235474,150.25,-35.55,2018-12-25,30.367
7306,7306,67,150.2697,-35.54484,2018-12-25 11:15:00,38.476,22.215786,5.287134,9.808307,91.47359,63.235474,150.25,-35.55,2018-12-26,38.476
7330,7330,67,150.2697,-35.54484,2018-12-26 11:15:00,32.239,47.303414,2.539643,9.808307,91.47359,63.235474,150.25,-35.55,2018-12-27,32.239


In [7]:
# Fill in df with silo data at the hottest time of the day (max_temp, RH_tmax)

# For each row, open silo data csv file one-by-one to get data
def get_silo_value(row, column):
    file_path = os.path.join(silo_data_dir, f"silo_data_{row['silo_X']}_{row['silo_Y']}.csv")
    df_silo = pd.read_csv(file_path)

    df_silo['Datetime'] = pd.to_datetime(df_silo['YYYY-MM-DD'], format='%Y-%m-%d')
    target_date = row['silo_observation_date']
    silo_value = df_silo.loc[df_silo['Datetime'].dt.date == target_date, column].values[0]

    return silo_value

df_max_temp['silo_Temperature'] = df_max_temp.apply(lambda row: get_silo_value(row, 'max_temp'), axis=1)
df_max_temp['silo_RH'] = df_max_temp.apply(lambda row: get_silo_value(row, 'rh_tmax'), axis=1)
df_max_temp.head()

Unnamed: 0.1,Unnamed: 0,SiteID,X,Y,Datetime,Temperature,RH,VPD,slope,aspect,relief,silo_X,silo_Y,silo_observation_date,Day_max_temp,silo_Temperature,silo_RH
7235,7235,67,150.2697,-35.54484,2018-12-22 12:15:00,25.369,47.004939,1.71602,9.808307,91.47359,63.235474,150.25,-35.55,2018-12-23,25.369,22.3,58.3
7259,7259,67,150.2697,-35.54484,2018-12-23 12:15:00,26.119,43.246507,1.921196,9.808307,91.47359,63.235474,150.25,-35.55,2018-12-24,26.119,24.5,50.1
7282,7282,67,150.2697,-35.54484,2018-12-24 11:15:00,30.367,39.742369,2.611086,9.808307,91.47359,63.235474,150.25,-35.55,2018-12-25,30.367,26.6,52.6
7306,7306,67,150.2697,-35.54484,2018-12-25 11:15:00,38.476,22.215786,5.287134,9.808307,91.47359,63.235474,150.25,-35.55,2018-12-26,38.476,26.9,60.7
7330,7330,67,150.2697,-35.54484,2018-12-26 11:15:00,32.239,47.303414,2.539643,9.808307,91.47359,63.235474,150.25,-35.55,2018-12-27,32.239,30.0,54.4


In [8]:
# Fill in df with silo data at the coldest time of the day (min_temp, RH_tmin)

df_min_temp['silo_Temperature'] = df_min_temp.apply(lambda row: get_silo_value(row, 'min_temp'), axis=1)
df_min_temp['silo_RH'] = df_min_temp.apply(lambda row: get_silo_value(row, 'rh_tmin'), axis=1)
df_min_temp.head()

Unnamed: 0.1,Unnamed: 0,SiteID,X,Y,Datetime,Temperature,RH,VPD,slope,aspect,relief,silo_X,silo_Y,silo_observation_date,Day_max_temp,Day_min_temp,silo_Temperature,silo_RH
7251,7251,67,150.2697,-35.54484,2018-12-23 04:15:00,11.724,93.356963,0.091491,9.808307,91.47359,63.235474,150.25,-35.55,2018-12-23,25.369,11.724,11.9,100.0
7275,7275,67,150.2697,-35.54484,2018-12-24 04:15:00,11.41,96.592081,0.04597,9.808307,91.47359,63.235474,150.25,-35.55,2018-12-24,26.119,11.41,11.4,100.0
7300,7300,67,150.2697,-35.54484,2018-12-25 05:15:00,14.481,96.592081,0.056203,9.808307,91.47359,63.235474,150.25,-35.55,2018-12-25,30.367,14.481,15.4,100.0
7320,7320,67,150.2697,-35.54484,2018-12-26 01:15:00,18.176,95.519926,0.093496,9.808307,91.47359,63.235474,150.25,-35.55,2018-12-26,38.476,18.176,17.3,100.0
7348,7348,67,150.2697,-35.54484,2018-12-27 05:15:00,18.113,97.893001,0.043798,9.808307,91.47359,63.235474,150.25,-35.55,2018-12-27,32.239,18.113,17.3,100.0


In [9]:
# Investigate null values. Site 251 is located in the ocean.
print("df_max_temp length: ", len(df_max_temp))
print("df_min_temp length: ", len(df_min_temp))
df_max_temp[(df_max_temp.isna().any(axis=1)) & (df_max_temp['SiteID'] != 251)]
df_min_temp[(df_min_temp.isna().any(axis=1)) & (df_min_temp['SiteID'] != 251)]

df_max_temp length:  1047
df_min_temp length:  1047


Unnamed: 0.1,Unnamed: 0,SiteID,X,Y,Datetime,Temperature,RH,VPD,slope,aspect,relief,silo_X,silo_Y,silo_observation_date,Day_max_temp,Day_min_temp,silo_Temperature,silo_RH


# Calculating remote VPD from remote temperature and remote relative humidity

In [10]:
# Write a function for deriving VPD from temp and RH

import math
import numpy as np

def calculate_vpd(temp, rh):
    if pd.isna(temp) or pd.isna(rh):
        return np.nan
    es = 0.6108 * math.exp(17.27 * temp / (237.3 + temp))
    e = es * rh / 100
    vpd = es - e
    return vpd

df_max_temp['silo_VPD'] = df_max_temp.apply(lambda row: calculate_vpd(row['silo_Temperature'], row['silo_RH']), axis=1)
df_min_temp['silo_VPD'] = df_min_temp.apply(lambda row: calculate_vpd(row['silo_Temperature'], row['silo_RH']), axis=1)
df_max_temp.head()

Unnamed: 0.1,Unnamed: 0,SiteID,X,Y,Datetime,Temperature,RH,VPD,slope,aspect,relief,silo_X,silo_Y,silo_observation_date,Day_max_temp,silo_Temperature,silo_RH,silo_VPD
7235,7235,67,150.2697,-35.54484,2018-12-22 12:15:00,25.369,47.004939,1.71602,9.808307,91.47359,63.235474,150.25,-35.55,2018-12-23,25.369,22.3,58.3,1.122841
7259,7259,67,150.2697,-35.54484,2018-12-23 12:15:00,26.119,43.246507,1.921196,9.808307,91.47359,63.235474,150.25,-35.55,2018-12-24,26.119,24.5,50.1,1.53425
7282,7282,67,150.2697,-35.54484,2018-12-24 11:15:00,30.367,39.742369,2.611086,9.808307,91.47359,63.235474,150.25,-35.55,2018-12-25,30.367,26.6,52.6,1.650716
7306,7306,67,150.2697,-35.54484,2018-12-25 11:15:00,38.476,22.215786,5.287134,9.808307,91.47359,63.235474,150.25,-35.55,2018-12-26,38.476,26.9,60.7,1.392979
7330,7330,67,150.2697,-35.54484,2018-12-26 11:15:00,32.239,47.303414,2.539643,9.808307,91.47359,63.235474,150.25,-35.55,2018-12-27,32.239,30.0,54.4,1.934838


# Save the resulting dataframes

In [16]:
# Drop helper columns
df_max_temp.drop(columns=['Day_max_temp'], inplace=True)
df_min_temp.drop(columns=['Day_max_temp', 'Day_min_temp'], inplace=True)

KeyError: "['Day_max_temp'] not found in axis"

In [17]:
df_max_temp.to_csv(os.path.join(working_dir, "output/csv/silo_max_temp.csv"), index=False)
df_min_temp.to_csv(os.path.join(working_dir, "output/csv/silo_min_temp.csv"), index=False)