# Ground Truth Builder
This script takes in the ground truth measurements for all populations and builds an hourly data file in the same format as the GARISOM output to allow for easy comparison between the files.

Variables:
- GW, stomatal conductance
- P-PD and P-MD, predawn and midday pressures
- E, transpiration estimated from GW
- leaftemp (added separately in leaftemp-data_transform.ipynb)

K-plant is no longer included since the pressure and stomatal conductance data was collected on different days.

All data except for leaftemp were collected during 7 measurement periods over the course of the study: 4 pre-drought, 1 drought, and 2 post-drought. Some of these measurement periods contain both AM and PM data; others do not.

In [193]:
import pandas as pd
from datetime import datetime
import numpy as np
from collections import defaultdict

## Hourly weather data

Get the hourly weather data, so everything can be aligned properly to the model outputs.

In [194]:
# Load the hourly weather table; the slice discards the file's final row.
hourly_weather = pd.read_csv("../DBG/dataset.csv").iloc[:-1]
# Add string-typed copies of Year/Day/Hour under the names used as the index
hourly_weather = hourly_weather.assign(**{
    'year': hourly_weather['Year'].astype(int).astype(str),
    'julian-day': hourly_weather['Day'].astype(int).astype(str),
    'standard-time': hourly_weather['Hour'].astype(int).astype(str),
})
# Replace the original columns with a (year, julian-day, standard-time) index
hourly_weather = (
    hourly_weather
    .drop(columns=['Year', 'Day', 'Hour'])
    .set_index(['year', 'julian-day', 'standard-time'])
)
hourly_weather.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Solar_Wm2,Rain_mm,Wind_ms.1,Tair_C,Tsoil_C,D_kPa
year,julian-day,standard-time,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2023,201,0,0.0,6.57,1.01,36.498,34.13,4.600884
2023,201,1,0.0,0.25,0.5,34.624,32.68,4.072462
2023,201,2,0.0,0.25,0.0,30.874,31.31,3.128441
2023,201,3,0.0,0.25,0.0,30.925,30.04,3.083842
2023,201,4,0.0,0.25,0.0,30.293,28.91,2.853573


In [195]:
# Treatments and their corresponding (start, end) julian-days.
# Both bounds are treated as inclusive where this mapping is used to
# select rows (>= start and <= end).
treatments = { 
    "Pre-water stress": (201, 237),
    "Water stress": (238, 241),
    "Post-water stress": (242, 265),
}

## GW
GW data contains measurements twice a day during each measurement period.

Data was collected for 3 leaves per replicate, each with 3 measurements. To estimate the actual GSW per replicate, we average over the leaves.

Our experiments are done at the population level, so similarly, all data is averaged for each period across every genotype for a single population.

GW is stored in mol m-2 s-1, so we multiply by 1000 to get mmol m-2 s-1.

In [196]:
# Load the porometer CSV and preview its first rows.
gw = pd.read_csv("porometer.all.days.csv")
gw.head()

Unnamed: 0,pot,leaf,population,pop.short,pop.geno,elevation,genotype,rep,observation,am.pm,...,treatment.levels,gsw,rh,t.ref,t.leaf,delta.t,svp,lsvp,vpd,vpd.kpa
0,1,1,NRV-NEW,NRV,NRV101,666,101,4,172,am,...,predrought,0.21,21.31,40.59,34.34,6.25,7609.8,5420.2,3798.55,3.8
1,1,1,NRV-NEW,NRV,NRV101,666,101,4,173,am,...,predrought,0.342,21.42,40.6,35.17,5.43,7613.83,5675.1,4044.22,4.04
2,1,2,NRV-NEW,NRV,NRV101,666,101,4,174,am,...,predrought,0.158,20.51,40.61,36.7,3.91,7617.88,6172.31,4609.88,4.61
3,1,2,NRV-NEW,NRV,NRV101,666,101,4,175,am,...,predrought,0.239,20.13,40.64,36.36,4.28,7630.01,6058.67,4522.75,4.52
4,1,3,NRV-NEW,NRV,NRV101,666,101,4,176,am,...,predrought,0.181,19.79,40.66,36.35,4.31,7638.11,6055.36,4543.78,4.54


In [197]:
# Keep only the columns used downstream. The explicit copy makes `gw` an
# independent frame, so the column assignments in the following cells modify
# it directly instead of writing into a slice of the full table (which is
# what triggers pandas' SettingWithCopyWarning).
gw = gw[["population", "am.pm", "time", "date", "gsw", "rh", "t.ref", "t.leaf"]].copy()

In [198]:
# Scale gsw by 1000 (mol m-2 s-1 -> mmol m-2 s-1)
gw['gsw'] = gw['gsw'] * 1000

In [199]:
# Short lowercase codes for the population labels found in the raw sheets.
# The same mapping is reused for every DataFrame that carries a population column.
pop_rename_dict = {
    'CCR-COL': 'ccr', 'JLA-JAK': 'jla',
    'NRV-NEW': 'nrv', 'TSZ-SAN': 'tsz',
}
# A label missing from the mapping raises KeyError rather than passing through
gw['population'] = [pop_rename_dict[label] for label in gw['population']]

In [200]:
# Parse the text date and time columns into datetimes.
# 'time' carries no date part, so parsed values land on 1900-01-01.
gw = gw.assign(
    date=pd.to_datetime(gw['date'], format="%m/%d/%y"),
    time=pd.to_datetime(gw['time'], format="%H:%M:%S"),
)

### Calculate transpiration

We can derive E from the stomatal conductance data.

In [201]:
def calc_e_water(T):
    """
    Saturation vapor pressure over water, following Buck (1996).

    T is the temperature in degrees C (scalar or numpy array).
    The result is in kPa.
    """
    exponent = (18.678 - (T / 234.5)) * (T / (257.14 + T))
    return 0.61121 * np.exp(exponent)

In [202]:
elevation = 380  # elevation in meters at DBG

def calc_transpiration(row):
    """
    Add vapor pressures and transpiration to one porometer reading.

    E = gsw * (VPleaf - VPair) / Atm pressure

    Reads 'rh' (relative humidity, %), 'gsw' (stomatal conductance,
    mmol m-2 s-1), 't.ref' (air temperature, C) and 't.leaf' (leaf
    temperature, C) from `row`, writes 'vpleaf', 'vpair' (kPa) and 'e'
    (mmol m-2 s-1) back into it, and returns the same row.
    """
    humidity = row['rh']
    conductance = row['gsw']
    air_temp = row['t.ref']
    leaf_temp = row['t.leaf']

    # Saturation vapor pressure at leaf temperature (kPa)
    leaf_vp = calc_e_water(leaf_temp)
    row['vpleaf'] = leaf_vp

    # Actual vapor pressure of the air: saturation value scaled by RH (kPa)
    air_vp = calc_e_water(air_temp) * humidity/100
    row['vpair'] = air_vp

    # Atmospheric pressure (kPa) at the site elevation from the hypsometric
    # equation; the air temperature is converted to Kelvin
    pressure = 101.325 * np.exp(-elevation / (29.27 * (air_temp + 273.15)))

    # Transpiration rate (mmol m-2 s-1)
    row['e'] = conductance * ((leaf_vp - air_vp) / pressure)
    return row

In [203]:
# Compute transpiration row by row, then discard the inputs and the
# intermediate vapor pressures so only gsw and e remain as measurements
gw = gw.apply(calc_transpiration, axis=1).drop(
    columns=['rh', 't.ref', 't.leaf', 'vpleaf', 'vpair']
)

In [204]:
gw

Unnamed: 0,population,am.pm,time,date,gsw,e
0,nrv,am,1900-01-01 08:46:00,2023-07-20,210.0,8.210969
1,nrv,am,1900-01-01 08:47:00,2023-07-20,342.0,14.237808
2,nrv,am,1900-01-01 08:47:00,2023-07-20,158.0,7.498486
3,nrv,am,1900-01-01 08:48:00,2023-07-20,239.0,11.127999
4,nrv,am,1900-01-01 08:48:00,2023-07-20,181.0,8.466637
...,...,...,...,...,...,...
4713,ccr,am,1900-01-01 08:31:16,2023-09-22,115.0,2.138685
4714,ccr,am,1900-01-01 08:31:32,2023-09-22,7.0,0.122529
4715,ccr,am,1900-01-01 08:31:42,2023-09-22,264.0,4.851376
4716,ccr,am,1900-01-01 08:32:05,2023-09-22,151.0,2.589536


In [None]:
# Afternoon treatment-level mean and standard error of GW and E,
# written to one CSV per population.
for pop in gw['population'].unique():
    # Keep this population's readings whose clock hour is 15, 16 or 17,
    # i.e. times from 15:00:00 through 17:59:59.
    pop_gw = gw[(gw['population'] == pop) &
                (gw['time'].dt.hour >= 15) &
                (gw['time'].dt.hour <= 17)
               ].copy()
    # Integer day of year. `.dt.dayofyear` replaces strftime('%-j'): the '-'
    # flag is a platform-specific extension that is rejected on Windows.
    pop_gw['julian-day'] = pop_gw['date'].dt.dayofyear
    results = []
    for treatment, (start_jd, end_jd) in treatments.items():
        # Treatment bounds are inclusive on both ends
        mask = (pop_gw['julian-day'] >= start_jd) & (pop_gw['julian-day'] <= end_jd)
        period_data = pop_gw[mask]
        avg = period_data[['gsw', 'e']].mean()
        # Standard error of the mean: sample std / sqrt(number of readings)
        stderr = period_data[['gsw', 'e']].std() / (len(period_data) ** 0.5)
        results.append({
            'treatment': treatment,
            'GW_avg': avg['gsw'],
            'GW_stderr': stderr['gsw'],
            'E-MD_avg': avg['e'],
            'E-MD_stderr': stderr['e']
        })
    pd.DataFrame(results).to_csv(f'./ground/{pop}_gw_treatment_averages_pm.csv', index=False)

### Group everything and get averages, time variance, and stderr

In [206]:
# One group per population, AM/PM label and calendar date
gw_grouped = gw.groupby(["population", "am.pm", "date"])

In [207]:
# Summarise the measurement times within each group so that the earliest
# (min) and latest (max) data collection time is known for each point.
gw_time_desc = gw_grouped['time'].describe().reset_index()
gw_time_desc

Unnamed: 0,population,am.pm,date,count,mean,min,25%,50%,75%,max
0,ccr,am,2023-07-20,30,1900-01-01 08:01:50.000000256,1900-01-01 07:32:00,1900-01-01 07:41:15,1900-01-01 08:06:30,1900-01-01 08:13:00,1900-01-01 08:37:00
1,ccr,am,2023-07-21,66,1900-01-01 08:05:11.818181888,1900-01-01 07:37:00,1900-01-01 07:45:00,1900-01-01 08:03:30,1900-01-01 08:20:00,1900-01-01 08:45:00
2,ccr,am,2023-08-09,36,1900-01-01 08:03:14.083333120,1900-01-01 07:44:32,1900-01-01 07:48:01.249999872,1900-01-01 07:58:44.500000,1900-01-01 08:13:33.249999872,1900-01-01 08:35:33
3,ccr,am,2023-08-10,54,1900-01-01 08:17:09.481481728,1900-01-01 07:36:50,1900-01-01 07:45:23,1900-01-01 08:14:31.500000,1900-01-01 08:35:27,1900-01-01 09:42:51
4,ccr,am,2023-08-16,54,1900-01-01 08:00:35.259258880,1900-01-01 07:33:49,1900-01-01 07:40:44,1900-01-01 07:57:38,1900-01-01 08:16:13,1900-01-01 08:34:28
...,...,...,...,...,...,...,...,...,...,...
91,tsz,pm,2023-08-24,36,1900-01-01 15:38:59.416666624,1900-01-01 15:18:57,1900-01-01 15:30:14.249999872,1900-01-01 15:36:54,1900-01-01 15:53:44.249999872,1900-01-01 15:56:12
92,tsz,pm,2023-08-28,90,1900-01-01 16:45:49.399999744,1900-01-01 15:50:01,1900-01-01 16:15:06,1900-01-01 16:39:26,1900-01-01 17:21:27.249999872,1900-01-01 17:41:13
93,tsz,pm,2023-09-09,90,1900-01-01 15:57:31.966666752,1900-01-01 15:21:21,1900-01-01 15:35:23.750000128,1900-01-01 16:01:14,1900-01-01 16:13:35.500000,1900-01-01 16:33:05
94,tsz,pm,2023-09-20,54,1900-01-01 15:56:37.425926400,1900-01-01 15:29:48,1900-01-01 15:46:53.249999872,1900-01-01 16:05:13.500000,1900-01-01 16:09:47.249999872,1900-01-01 16:14:09


In [208]:
def to_time(val):
    """Return the time-of-day component of `val` after datetime conversion."""
    stamp = pd.to_datetime(val)
    return stamp.time()

# Latest reading of every group labelled 'am', as a plain time object
am_rows = gw_time_desc.loc[gw_time_desc['am.pm'] == 'am'].copy()
am_rows['max_time_obj'] = [to_time(stamp) for stamp in am_rows['max']]

# 12pm as a time object
noon = datetime.strptime('12:00:00', '%H:%M:%S').time()

# Sanity check: no 'am' group may contain a reading at or after noon
invalid_am = am_rows.loc[am_rows['max_time_obj'] >= noon]
assert invalid_am.empty

In [209]:
# Sanity check: no group labelled 'pm' may contain a reading before noon
pm_rows = gw_time_desc.loc[gw_time_desc['am.pm'] == 'pm'].copy()
pm_rows['min_time_obj'] = [pd.to_datetime(stamp).time() for stamp in pm_rows['min']]
invalid_pm = pm_rows.loc[pm_rows['min_time_obj'] < noon]
assert invalid_pm.empty

In [210]:
# Group means; the mean of 'time' is the average clock time of the readings
gw_clean = gw_grouped.mean().reset_index()
# Stitch the calendar date and the mean clock time (truncated to whole
# seconds by the string formatting) into a single timestamp
gw_clean['datetime'] = pd.to_datetime(
    gw_clean['date'].dt.strftime('%m/%d/%y').str.cat(
        gw_clean['time'].dt.strftime('%H:%M:%S'), sep=' '
    ),
    format="%m/%d/%y %H:%M:%S",
)
gw_clean = gw_clean.drop(['date', 'time'], axis=1)
gw_clean

Unnamed: 0,population,am.pm,gsw,e,datetime
0,ccr,am,270.800000,9.494919,2023-07-20 08:01:50
1,ccr,am,274.196970,7.849517,2023-07-21 08:05:11
2,ccr,am,365.000000,8.873387,2023-08-09 08:03:14
3,ccr,am,278.481481,7.734187,2023-08-10 08:17:09
4,ccr,am,312.444444,7.582487,2023-08-16 08:00:35
...,...,...,...,...,...
91,tsz,pm,119.305556,6.104042,2023-08-24 15:38:59
92,tsz,pm,14.055556,1.248760,2023-08-28 16:45:49
93,tsz,pm,51.466667,3.984540,2023-09-09 15:57:31
94,tsz,pm,95.981481,4.600917,2023-09-20 15:56:37


In [211]:
# Standard error of the mean for gsw and e in each group:
# sample standard deviation divided by sqrt(number of gsw readings)
gw_std_err = gw_grouped[['gsw', 'e']].std().reset_index().drop(columns=['date'])
gw_std_err[['gsw', 'e']] = gw_std_err[['gsw', 'e']].div(
    gw_grouped['gsw'].count().values ** 0.5, axis=0
)
# Rows are in the same group order as gw_clean, so its timestamps line up
gw_std_err['datetime'] = gw_clean['datetime']
gw_std_err

Unnamed: 0,population,am.pm,gsw,e,datetime
0,ccr,am,11.864309,0.520121,2023-07-20 08:01:50
1,ccr,am,13.769755,0.340581,2023-07-21 08:05:11
2,ccr,am,15.928511,0.388277,2023-08-09 08:03:14
3,ccr,am,15.044582,0.414667,2023-08-10 08:17:09
4,ccr,am,17.779154,0.554289,2023-08-16 08:00:35
...,...,...,...,...,...
91,tsz,pm,9.969542,0.463341,2023-08-24 15:38:59
92,tsz,pm,1.460111,0.111024,2023-08-28 16:45:49
93,tsz,pm,4.375000,0.314294,2023-09-09 15:57:31
94,tsz,pm,9.785059,0.421108,2023-09-20 15:56:37


In [212]:
# Add year, julian-day, and hour to data sheets for easy concatenation with the hourly weather file
def add_time_indices(df):
    """
    Replace the 'datetime' column with string 'year', 'julian-day' and
    'standard-time' columns.

    Each timestamp is first rounded to a whole hour (minutes >= 30 round up,
    otherwise down; seconds are ignored) and the three index values are then
    read from the rounded timestamp. Deriving them from the rounded value
    means a time of 23:30 or later becomes hour 0 of the following day
    instead of an out-of-range hour 24.

    Parameters
    ----------
    df : DataFrame with a 'datetime' column. It is not modified.

    Returns
    -------
    A new DataFrame without 'datetime' and with the three columns appended,
    each holding un-padded integer strings (e.g. '2023', '201', '8').
    """
    stamps = pd.to_datetime(df['datetime'])

    # Whole hours since midnight after rounding; may be 24 when rounding up from 23:30+
    hours = stamps.dt.hour + (stamps.dt.minute >= 30).astype(int)
    # Midnight + hours lets pandas carry an hour of 24 into the next day
    rounded = stamps.dt.normalize() + pd.to_timedelta(hours, unit='h')

    # drop() returns a new frame, so the caller's DataFrame is left untouched
    out = df.drop(columns=['datetime'])
    out['year'] = rounded.dt.year.astype(str)
    # dt.dayofyear is the portable equivalent of strftime('%-j')
    out['julian-day'] = rounded.dt.dayofyear.astype(str)
    out['standard-time'] = rounded.dt.hour.astype(str)

    return out

In [213]:
# Swap the datetime column for year / julian-day / standard-time in both tables
gw_clean = gw_clean.pipe(add_time_indices)
gw_std_err = gw_std_err.pipe(add_time_indices)

In [214]:
gw_clean

Unnamed: 0,population,am.pm,gsw,e,year,julian-day,standard-time
0,ccr,am,270.800000,9.494919,2023,201,8
1,ccr,am,274.196970,7.849517,2023,202,8
2,ccr,am,365.000000,8.873387,2023,221,8
3,ccr,am,278.481481,7.734187,2023,222,8
4,ccr,am,312.444444,7.582487,2023,228,8
...,...,...,...,...,...,...,...
91,tsz,pm,119.305556,6.104042,2023,236,16
92,tsz,pm,14.055556,1.248760,2023,240,17
93,tsz,pm,51.466667,3.984540,2023,252,16
94,tsz,pm,95.981481,4.600917,2023,263,16


In [215]:
gw_std_err

Unnamed: 0,population,am.pm,gsw,e,year,julian-day,standard-time
0,ccr,am,11.864309,0.520121,2023,201,8
1,ccr,am,13.769755,0.340581,2023,202,8
2,ccr,am,15.928511,0.388277,2023,221,8
3,ccr,am,15.044582,0.414667,2023,222,8
4,ccr,am,17.779154,0.554289,2023,228,8
...,...,...,...,...,...,...,...
91,tsz,pm,9.969542,0.463341,2023,236,16
92,tsz,pm,1.460111,0.111024,2023,240,17
93,tsz,pm,4.375000,0.314294,2023,252,16
94,tsz,pm,9.785059,0.421108,2023,263,16


## P-PD and P-MD

Pressures were collected at midday and predawn for every measurement period, one measurement per pot.

Measurement periods for pressures lag behind the stomatal conductance measurements by 4-5 days.

In [216]:
# Load the water potential CSV and preview its first rows.
p = pd.read_csv("dbg_cottonwood_waterpotential.csv")
p.head()

Unnamed: 0,date,interval,treatment,meas.week,week.of.year,pot,water.potential,time,time.of.day,pop.geno,elevation,population,genotype,chamber
0,230725,predrought1,predrought,1,29,1,-2.36,15:06,midday,NRV101,666,NRV-NEW,101,Ogle
1,230725,predrought1,predrought,1,29,2,-2.36,15:25,midday,NRV101,666,NRV-NEW,101,Hultine
2,230725,predrought1,predrought,1,29,3,-1.81,15:22,midday,NRV101,666,NRV-NEW,101,Hultine
3,230725,predrought1,predrought,1,29,7,-2.17,16:16,midday,NRV105,666,NRV-NEW,105,Ogle
4,230725,predrought1,predrought,1,29,8,-2.28,15:36,midday,NRV105,666,NRV-NEW,105,Hultine


In [217]:
# Keep only the columns used downstream. The explicit copy makes `p` an
# independent frame, so the column assignments in the following cells modify
# it directly instead of writing into a slice of the full table (which is
# what triggers pandas' SettingWithCopyWarning).
p = p[["date", "water.potential", "time", "time.of.day", "population"]].copy()

In [218]:
# Same short population codes as the porometer data; unknown labels raise KeyError
p['population'] = [pop_rename_dict[label] for label in p['population']]

In [219]:
# Flip the sign so water potentials are expressed in -MPa
p['water.potential'] = p['water.potential'] * -1

In [220]:
# Parse the yymmdd dates and HH:MM times into datetimes.
# 'time' carries no date part, so parsed values land on 1900-01-01.
p = p.assign(
    date=pd.to_datetime(p['date'], format="%y%m%d"),
    time=pd.to_datetime(p['time'], format="%H:%M"),
)

In [221]:
# Label predawn readings 'am' and every other reading 'pm', then drop the source column
p['am.pm'] = ['am' if label == 'predawn' else 'pm' for label in p['time.of.day']]
p = p.drop('time.of.day', axis=1)

In [222]:
# One group per calendar date, population and AM/PM label
p_grouped = p.groupby(['date', 'population', 'am.pm'])

In [223]:
# Summarise the measurement times within each group (count, mean, min, quartiles, max)
p_time_desc = p_grouped['time'].describe().reset_index()
p_time_desc

Unnamed: 0,date,population,am.pm,count,mean,min,25%,50%,75%,max
0,2023-07-25,ccr,pm,15,1900-01-01 14:54:59.999999488,1900-01-01 13:37:00,1900-01-01 14:20:00,1900-01-01 14:49:00,1900-01-01 15:29:00,1900-01-01 16:18:00
1,2023-07-25,jla,pm,15,1900-01-01 14:54:16,1900-01-01 13:34:00,1900-01-01 14:01:00,1900-01-01 14:33:00,1900-01-01 15:46:00,1900-01-01 16:56:00
2,2023-07-25,nrv,pm,15,1900-01-01 15:11:48.000000512,1900-01-01 13:47:00,1900-01-01 14:27:30,1900-01-01 15:18:00,1900-01-01 15:40:30,1900-01-01 16:37:00
3,2023-07-25,tsz,pm,15,1900-01-01 14:49:23.999999488,1900-01-01 13:43:00,1900-01-01 14:14:00,1900-01-01 15:02:00,1900-01-01 15:19:00,1900-01-01 15:41:00
4,2023-07-26,ccr,am,15,1900-01-01 03:39:16.000000256,1900-01-01 02:25:00,1900-01-01 03:12:30,1900-01-01 03:47:00,1900-01-01 04:08:00,1900-01-01 04:30:00
5,2023-07-26,jla,am,15,1900-01-01 03:28:19.999999488,1900-01-01 02:36:00,1900-01-01 02:56:00,1900-01-01 03:17:00,1900-01-01 04:11:00,1900-01-01 04:27:00
6,2023-07-26,nrv,am,15,1900-01-01 03:43:52.000000256,1900-01-01 02:38:00,1900-01-01 03:20:30,1900-01-01 03:53:00,1900-01-01 04:05:00,1900-01-01 04:30:00
7,2023-07-26,tsz,am,15,1900-01-01 03:32:40,1900-01-01 02:33:00,1900-01-01 03:10:00,1900-01-01 03:39:00,1900-01-01 03:58:30,1900-01-01 04:21:00
8,2023-08-11,ccr,am,15,1900-01-01 03:37:15.999999744,1900-01-01 02:33:00,1900-01-01 03:15:00,1900-01-01 03:34:00,1900-01-01 04:09:30,1900-01-01 04:35:00
9,2023-08-11,ccr,pm,15,1900-01-01 15:00:08.000000256,1900-01-01 14:00:00,1900-01-01 14:24:30,1900-01-01 14:42:00,1900-01-01 15:51:30,1900-01-01 16:07:00


In [224]:
# Group means: average water potential and average clock time per group
p_clean = p_grouped.mean().reset_index()
p_clean

Unnamed: 0,date,population,am.pm,water.potential,time
0,2023-07-25,ccr,pm,2.448667,1900-01-01 14:55:00.000000256
1,2023-07-25,jla,pm,2.084,1900-01-01 14:54:16.000000000
2,2023-07-25,nrv,pm,2.214667,1900-01-01 15:11:48.000000256
3,2023-07-25,tsz,pm,2.279333,1900-01-01 14:49:24.000000256
4,2023-07-26,ccr,am,0.652,1900-01-01 03:39:16.000000256
5,2023-07-26,jla,am,0.526,1900-01-01 03:28:20.000000256
6,2023-07-26,nrv,am,0.502667,1900-01-01 03:43:52.000000000
7,2023-07-26,tsz,am,0.536,1900-01-01 03:32:40.000000000
8,2023-08-11,ccr,am,0.627467,1900-01-01 03:37:15.999999744
9,2023-08-11,ccr,pm,2.078,1900-01-01 15:00:08.000000000


In [225]:
# Join the calendar date and the mean clock time (truncated to whole seconds
# by the string formatting) into one timestamp, then drop the two source columns
p_clean['datetime'] = pd.to_datetime(
    p_clean['date'].dt.strftime("%Y/%m/%d") + p_clean['time'].dt.strftime(' %H:%M:%S')
)
p_clean = p_clean.drop(['date', 'time'], axis=1)
p_clean

Unnamed: 0,population,am.pm,water.potential,datetime
0,ccr,pm,2.448667,2023-07-25 14:55:00
1,jla,pm,2.084,2023-07-25 14:54:16
2,nrv,pm,2.214667,2023-07-25 15:11:48
3,tsz,pm,2.279333,2023-07-25 14:49:24
4,ccr,am,0.652,2023-07-26 03:39:16
5,jla,am,0.526,2023-07-26 03:28:20
6,nrv,am,0.502667,2023-07-26 03:43:52
7,tsz,am,0.536,2023-07-26 03:32:40
8,ccr,am,0.627467,2023-08-11 03:37:15
9,ccr,pm,2.078,2023-08-11 15:00:08


In [226]:
# Standard error of the mean water potential per group:
# sample standard deviation divided by sqrt(number of readings)
p_std_err = p_grouped['water.potential'].std().reset_index().drop(columns=['date'])
p_std_err['water.potential'] = (
    p_std_err['water.potential'] / (p_grouped['water.potential'].count().values ** 0.5)
)
# Rows are in the same group order as p_clean, so its timestamps line up
p_std_err['datetime'] = p_clean['datetime']
p_std_err

Unnamed: 0,population,am.pm,water.potential,datetime
0,ccr,pm,0.073697,2023-07-25 14:55:00
1,jla,pm,0.040916,2023-07-25 14:54:16
2,nrv,pm,0.058145,2023-07-25 15:11:48
3,tsz,pm,0.073287,2023-07-25 14:49:24
4,ccr,am,0.04562,2023-07-26 03:39:16
5,jla,am,0.057411,2023-07-26 03:28:20
6,nrv,am,0.04823,2023-07-26 03:43:52
7,tsz,am,0.046821,2023-07-26 03:32:40
8,ccr,am,0.03445,2023-08-11 03:37:15
9,ccr,pm,0.076527,2023-08-11 15:00:08


In [227]:
# Swap the datetime column for year / julian-day / standard-time in both tables
p_clean = p_clean.pipe(add_time_indices)
p_std_err = p_std_err.pipe(add_time_indices)

In [228]:
p_clean

Unnamed: 0,population,am.pm,water.potential,year,julian-day,standard-time
0,ccr,pm,2.448667,2023,206,15
1,jla,pm,2.084,2023,206,15
2,nrv,pm,2.214667,2023,206,15
3,tsz,pm,2.279333,2023,206,15
4,ccr,am,0.652,2023,207,4
5,jla,am,0.526,2023,207,3
6,nrv,am,0.502667,2023,207,4
7,tsz,am,0.536,2023,207,4
8,ccr,am,0.627467,2023,223,4
9,ccr,pm,2.078,2023,223,15


In [229]:
p_std_err

Unnamed: 0,population,am.pm,water.potential,year,julian-day,standard-time
0,ccr,pm,0.073697,2023,206,15
1,jla,pm,0.040916,2023,206,15
2,nrv,pm,0.058145,2023,206,15
3,tsz,pm,0.073287,2023,206,15
4,ccr,am,0.04562,2023,207,4
5,jla,am,0.057411,2023,207,3
6,nrv,am,0.04823,2023,207,4
7,tsz,am,0.046821,2023,207,4
8,ccr,am,0.03445,2023,223,4
9,ccr,pm,0.076527,2023,223,15


## Concatenate everything and expand ranges to cover data collection period variation

In [230]:
# Split water potential into a predawn column (rows labelled 'am') and a
# midday column (rows labelled 'pm'); the other column is NaN on each row
p_clean['P-PD'] = p_clean['water.potential'].where(p_clean['am.pm'] == 'am')
p_clean['P-MD'] = p_clean['water.potential'].where(p_clean['am.pm'] == 'pm')
p_clean = p_clean.drop(['water.potential', 'am.pm'], axis=1)

In [231]:
# Same predawn / midday split for the standard errors
p_std_err['P-PD'] = p_std_err['water.potential'].where(p_std_err['am.pm'] == 'am')
p_std_err['P-MD'] = p_std_err['water.potential'].where(p_std_err['am.pm'] == 'pm')
p_std_err = p_std_err.drop(['water.potential', 'am.pm'], axis=1)

In [232]:
# Switch to the output column names and drop the AM/PM label
gw_clean = (
    gw_clean
    .rename(columns={"gsw": 'GW', "e": "E-MD"})
    .drop(columns=['am.pm'])
)

In [233]:
# Switch to the output column names and drop the AM/PM label
gw_std_err = (
    gw_std_err
    .rename(columns={"gsw": 'GW', "e": "E-MD"})
    .drop(columns=['am.pm'])
)

In [234]:
# This cell builds a dictionary of the data collection time windows.
# The spread of collection times differs per variable and per data
# collection point across the genotypes and leaves of a population.

# Keyed by output variable name; filled in at the end of this cell.
time_offsets = {}
# Helper to round time down/up to nearest 30 minutes
def round_time_to_30(dt, direction='down'):
    """
    Round `dt` to a half-hour boundary (:00 or :30).

    Parameters
    ----------
    dt : datetime-like supporting `.replace()` (e.g. pandas Timestamp).
    direction : 'down' rounds to the boundary at or before `dt`; any other
        value rounds to the boundary at or after `dt`.

    Returns
    -------
    The rounded value with seconds and microseconds set to zero.

    Seconds and microseconds are taken into account when rounding up, so
    08:00:30 becomes 08:30 and 08:30:15 becomes 09:00 rather than being
    rounded down to the boundary they have already passed.
    """
    floored = dt.replace(minute=0 if dt.minute < 30 else 30, second=0, microsecond=0)
    if direction == 'down':
        return floored

    # A value sitting exactly on a boundary is already its own ceiling
    on_boundary = dt.minute in (0, 30) and dt.second == 0 and dt.microsecond == 0
    if on_boundary:
        return floored
    # Otherwise the ceiling is the next boundary after the floor
    return floored + pd.Timedelta(minutes=30)
        
def format_offsets_for_lookup(offsets):
    """
    Index a list of offset records as lookup[population][date][am.pm].

    Each record is a dict with 'population', 'date', 'am.pm', 'min' and
    'max' keys; the stored value is the (min, max) tuple. A later record
    with the same three keys overwrites an earlier one. The result is a
    nested defaultdict, so reading a missing population or date creates
    an empty entry instead of raising.
    """
    nested = defaultdict(lambda: defaultdict(dict))
    fields = ('population', 'date', 'am.pm', 'min', 'max')
    for record in offsets:
        pop, date, ampm, earliest, latest = (record[name] for name in fields)
        nested[pop][date][ampm] = (earliest, latest)
    return nested

def extract_time_offsets(desc_df, min_col='min', max_col='max', group_cols=['population', 'am.pm', 'date']):
    """
    Turn a describe() table of measurement times into a collection-window lookup.

    For every row, the earliest time (`min_col`) is rounded down and the
    latest time (`max_col`) rounded up to a half hour, both formatted as
    'HH:MM', and stored together with whichever `group_cols` the row has.
    Rows for which any step raises are skipped silently. The records are
    returned through format_offsets_for_lookup().

    `group_cols` is only read, never modified.
    """
    records = []
    for _, desc_row in desc_df.iterrows():
        try:
            window_start = round_time_to_30(pd.to_datetime(desc_row[min_col]), 'down')
            window_end = round_time_to_30(pd.to_datetime(desc_row[max_col]), 'up')
            record = {key: desc_row[key] for key in group_cols if key in desc_row}
            record['min'] = window_start.strftime('%H:%M')
            record['max'] = window_end.strftime('%H:%M')
            records.append(record)
        except Exception:
            # Best effort: a row that cannot be processed contributes no window
            continue
    return format_offsets_for_lookup(records)

# Collection windows per source table; pressures are split into predawn
# ('am') and midday ('pm') groups
gw_offsets = extract_time_offsets(gw_time_desc)
p_pd_offsets = extract_time_offsets(p_time_desc.loc[p_time_desc['am.pm'] == 'am'])
p_md_offsets = extract_time_offsets(p_time_desc.loc[p_time_desc['am.pm'] == 'pm'])

# GW and E-MD come from the same porometer readings, so they share one lookup
time_offsets.update({
    'GW': gw_offsets,
    'E-MD': gw_offsets,
    'P-PD': p_pd_offsets,
    'P-MD': p_md_offsets,
})

In [235]:
# Identifier columns that are carried over unchanged into every expanded row
keep = ['population', 'year', 'julian-day', 'standard-time']

def _time_str_to_float(tstr):
    """Convert an 'HH:MM' string to fractional hours, e.g. '07:30' -> 7.5."""
    h, m = map(int, tstr.split(":"))
    return h + m / 60.0

def expand_time_ranges(pop, df, time_offsets):
    """
    Repeat each measurement over every hour of its data collection window.

    Parameters
    ----------
    pop : population code used as the second-level key of `time_offsets`.
    df : DataFrame with string 'year', 'julian-day' and 'standard-time'
        columns plus one column per measured variable.
    time_offsets : mapping
        variable -> population -> date -> 'am'/'pm' -> ('HH:MM', 'HH:MM'),
        giving the start and end of the collection window.

    Returns
    -------
    DataFrame with one row per (input row, variable, hour in window). Each
    row keeps the identifier columns in `keep` and the value of that one
    variable; every other column is blanked. Columns that end up entirely
    empty are dropped. Variables with no window recorded for the row's
    population/date/half-day contribute no rows.
    """
    expanded_rows = []
    for _, row in df.iterrows():
        # These depend only on the row, so compute them once per row
        date = pd.to_datetime(f"{row['year']}/{row['julian-day']}", format="%Y/%j")
        am_or_pm = 'am' if int(row['standard-time']) < 12 else 'pm'

        for col in time_offsets:
            try:
                window_start, window_end = time_offsets[col][pop][date][am_or_pm]
            except KeyError:
                # No collection window for this variable/population/date/half-day
                continue

            first_hour = int(_time_str_to_float(window_start) // 1)
            last_hour = int(_time_str_to_float(window_end) // 1)

            # Both the first and the last clock hour of the window are included
            for hour in range(first_hour, last_hour + 1):
                new_row = row.copy()
                for n_col in new_row.index:
                    if n_col not in keep and n_col != col:
                        new_row[n_col] = None
                new_row['standard-time'] = str(hour)
                expanded_rows.append(new_row)

    return pd.DataFrame(expanded_rows).dropna(axis=1, how='all')

In [236]:
def create_pop_sum_dt(hourly_weather, pop_combined):
    """
    Left-join a population's measurements onto the hourly weather table.

    The weather index is turned back into columns and the two tables are
    matched on year, julian-day and standard-time. Every weather hour is
    kept; hours without a measurement get missing values.
    """
    weather_rows = hourly_weather.reset_index()
    return pd.merge(
        weather_rows,
        pop_combined,
        how='left',
        on=['year', 'julian-day', 'standard-time'],
    )

In [237]:
def get_pop_sum_dt(pop, combined_new_dt, time_offsets, expand=True):
    """
    Build the hourly table for one population.

    Selects the rows of `combined_new_dt` belonging to `pop`, optionally
    spreads each measurement over its collection window (`expand`),
    collapses rows sharing the same year / julian-day / standard-time to
    the first non-missing value per column, and joins the result onto the
    module-level `hourly_weather` table.
    """
    pop_rows = combined_new_dt[combined_new_dt['population'] == pop]
    if expand:
        pop_rows = expand_time_ranges(pop, pop_rows, time_offsets)
    pop_rows = pop_rows.drop(columns=['population'])

    hourly_rows = pop_rows.groupby(
        ['year', 'julian-day', 'standard-time'], as_index=False
    ).first()
    return create_pop_sum_dt(hourly_weather, hourly_rows)

In [238]:
# Outer-join the stomatal conductance and pressure tables (means, then
# standard errors) so hours present in only one source are still kept
combined_new_dt = pd.merge(
    gw_clean, p_clean,
    how='outer', on=['population', 'year', 'julian-day', 'standard-time'],
)
combined_new_stderr = pd.merge(
    gw_std_err, p_std_err,
    how='outer', on=['population', 'year', 'julian-day', 'standard-time'],
)

In [239]:
# Population codes to export, one pair of CSV files each
populations = ['ccr', 'jla', 'nrv', 'tsz']

In [240]:
# Write the hourly ground-truth table and the standard-error table for each population.
for pop in populations:

    # Means are placed only at each group's rounded mean hour (no expansion)
    pop_sum_dt = get_pop_sum_dt(pop, combined_new_dt, time_offsets, expand=False)
    # Standard errors are repeated across every hour of the collection window
    pop_sum_stderr = get_pop_sum_dt(pop, combined_new_stderr, time_offsets, expand=True)
    # DataFrame.drop returns a new frame and leaves the original untouched, so
    # the result has to be assigned for the columns to actually be removed
    # from the file written below.
    pop_sum_stderr = pop_sum_stderr.drop(columns=["standard-time", "Solar_Wm2", "Rain_mm", "Wind_ms.1", "Tair_C", "Tsoil_C", "D_kPa"])

    pop_sum_dt.to_csv(f"./ground/{pop}_hourly_data.csv", index=False)
    pop_sum_stderr.to_csv(f"./ground/{pop}_std_error.csv", index=False)