In [12]:
import numpy as np
import pandas as pd

df = pd.read_csv('data.csv')

unnessary_features = ['Observation_ID', 'Update_Datetime', 'Observation_Time', 
                      'Site_ID', 'Site_Name', 
                      'Individual_ID', 
                      'Phenophase_ID', 'Phenophase_Category', 'Phenophase_Description', 
                      'Intensity_Category_ID', 'Intensity_Value', 'Abundance_Value', 
                      'Kingdom', 'Species_Category']

df = df.drop(columns=unnessary_features)
df = df[df['Day_of_Year'] <= 365]

In [None]:
# ---------- Drop Phenophase_Status == -1 and report ----------
before = len(df)
df = df[df['Phenophase_Status'] != -1].copy()
after = len(df)
print(f"Rows before: {before}, after dropping Phenophase_Status==-1: {after}, dropped: {before-after}")

Rows before: 6503, after dropping Phenophase_Status==-1: 6487, dropped: 16


In [None]:
# ---------- Fix -9999 error codes in climate features ----------
climate_features = ['AGDD','Tmax','Tmin','Prcp','Accum_Prcp','Daylength']
# convert -9999 -> NaN
df[climate_features] = df[climate_features].replace(-9999, np.nan)

# convert Observation_Date to datetime & create year
df['Observation_Date'] = pd.to_datetime(df['Observation_Date'])
df['year'] = df['Observation_Date'].dt.year

# create site_id from lat/lon (simple deterministic encoding)
df['site_id'] = (df['Latitude'].round(4).astype(str) + '_' + df['Longitude'].round(4).astype(str))
# Optionally map to integers:
df['site_id'] = pd.factorize(df['site_id'])[0] + 1  # site_id as ints starting at 1

# create daily mean temperature (pyPhenology examples use "temperature")
df['temperature'] = (df['Tmax'] + df['Tmin']) / 2.0

In [None]:
# ---------- Interpolate climate missing values per site (time-series interpolation) ----------
# sort then interpolate by site
df = df.sort_values(['site_id','Observation_Date'])
df[climate_features] = df.groupby('site_id')[climate_features].transform(
    lambda g: g.interpolate(method='linear', limit_direction='both')
)
# if still NaN (edges), fill with column median (per-site median could be used)
df[climate_features] = df[climate_features].fillna(df[climate_features].median())

# also recompute temperature if it used NaNs earlier
df['temperature'] = (df['Tmax'] + df['Tmin']) / 2.0

In [None]:
# ---------- Create observations DataFrame (first bloom DOY) ----------
# define first bloom per site-species-year: min Day_of_Year where Phenophase_Status==1
bloom_obs = (
    df[df['Phenophase_Status'] == 1]
    .groupby(['site_id','Species','year'], as_index=False)['Day_of_Year']
    .min()
    .rename(columns={'Species':'species', 'Day_of_Year':'doy'})
)
# If you prefer use Species_ID instead of species string:
# bloom_obs = bloom_obs.rename(columns={'Species_ID':'species_id', ...})

# pyPhenology example uses a 'phenophase' id; set a single id (e.g. 501 for 'flowers')
bloom_obs['phenophase'] = 501

# final observations DataFrame columns: ['species','site_id','year','doy','phenophase']
observations = bloom_obs[['species','site_id','year','doy','phenophase']]

print("Observations (first bloom) sample:")
print(observations.head())

Observations (first bloom) sample:
  species  site_id  year  doy  phenophase
0  annuus        1  2018   76         501
1  annuus        3  2018  317         501
2  annuus        4  2019  131         501
3  annuus        5  2021  117         501
4  annuus        5  2022    4         501


In [None]:
# ---------- Create predictors DataFrame (daily temperature series) ----------
# pyPhenology predictors expected columns: at least ['site_id','temperature','year','doy']
predictors = df[['site_id','temperature','year','Day_of_Year']].rename(columns={'Day_of_Year':'doy'})
# ensure types
predictors = predictors.astype({'site_id': int, 'year': int, 'doy': float, 'temperature': float})

print("Predictors sample:")
print(predictors.head())

Predictors sample:
   site_id  temperature  year    doy
0        1        20.00  2018   76.0
1        1        20.00  2018   76.0
2        2        15.00  2018  150.0
3        2        15.00  2018  150.0
4        3        10.25  2018  317.0


In [30]:
predictors.describe()

Unnamed: 0,site_id,temperature,year,doy
count,6487.0,6487.0,6487.0,6487.0
mean,12.505164,16.553695,2019.226761,184.94173
std,2.222914,5.002215,2.216171,102.479229
min,1.0,-3.0,2016.0,1.0
25%,12.0,12.4025,2017.0,97.0
50%,12.0,16.5,2019.0,185.0
75%,12.0,20.25,2021.0,273.0
max,34.0,33.125,2023.0,365.0


In [None]:
# ---------- Split train/test (by year) ----------
years = sorted(observations['year'].unique())
# hold out the latest year as test
test_year = years[-1]
train_years = years[:-1]

obs_train = observations[observations['year'].isin(train_years)]
obs_test  = observations[observations['year'] == test_year]

preds_train = predictors[predictors['year'].isin(train_years)]
preds_test  = predictors[predictors['year'] == test_year]

print(f"Train site-year combos: {len(obs_train)}; Test site-year combos: {len(obs_test)}")

Train site-year combos: 46; Test site-year combos: 3


In [None]:
# ---------- Train model with pyPhenology ----------
from pyPhenology import models

# ThermalTime model (standard growing-degree-days)
model = models.ThermalTime()

# Fit on training observations + predictors
# NOTE: fit may take a bit; uses scipy optimizers internally.
model.fit(obs_train, preds_train)

# fitted parameters
params = model.get_params()
print("Fitted model parameters:", params)

  warn("""Dropped temperature data for doy {d} due to missing data. Most likely from leap year mismatch""".format(d=first_doy_column))
  warn("""Dropped temperature data for doy {d} due to missing data. Most likely from leap year mismatch""".format(d=last_doy_column))
 Missing data from: 
    site_id  year
0         1  2018
1         3  2018
2         4  2019
3         5  2021
4         5  2022
5         6  2021
6         7  2022
7         9  2016
8        10  2017
10       11  2017
11       12  2016
12       12  2017
13       12  2018
14       12  2019
15       12  2020
16       12  2021
17       12  2022
18       13  2016
19       13  2017
20       13  2018
21       13  2019
22       13  2020
23       13  2021
24       13  2022
25       14  2016
26       15  2017
27       16  2020
28       16  2021
29       16  2017
30       16  2018
31       16  2019
34       17  2017
35       17  2018
36       17  2019
37       19  2018
38       20  2018
39       21  2018
40       22  2018
41      

Fitted model parameters: {'t1': np.float64(-19.69325300434207), 'T': np.float64(1.9736485233480372), 'F': np.float64(824.1150140305742)}


In [None]:
# ---------- Predict on test ----------
# prepare to_predict - unique site/species/year combos we want predictions for
to_predict = obs_test[['site_id','species','year']].drop_duplicates().reset_index(drop=True)

# pyPhenology predict API: predict(to_predict, predictors)
# try that call and fall back to predict(predictors) if needed
try:
    pred_vals = model.predict(to_predict, preds_test)
except TypeError:
    # fallback (some versions accept only predictors)
    pred_vals = model.predict(preds_test)

# pred_vals expected to be an array/list of predicted DOY for each row in to_predict
if len(pred_vals) == len(to_predict):
    to_predict['predicted_doy'] = np.array(pred_vals).reshape(-1)
else:
    print(f"Warning: Model returned {len(pred_vals)} predictions for {len(to_predict)} test cases. Filling with NaN.")
    to_predict['predicted_doy'] = np.nan

# join with observed
eval_df = to_predict.merge(
    obs_test.rename(columns={'species':'species','doy':'observed_doy'}),
    on=['site_id','species','year'],
    how='left'
)

# compute RMSE / MAE only for rows with valid predictions
from sklearn.metrics import mean_squared_error, mean_absolute_error
valid = eval_df.dropna(subset=['observed_doy', 'predicted_doy'])
if len(valid) > 0:
    rmse = np.sqrt(mean_squared_error(valid['observed_doy'], valid['predicted_doy']))
    mae  = mean_absolute_error(valid['observed_doy'], valid['predicted_doy'])
    print(f"Test RMSE = {rmse:.3f} days; MAE = {mae:.3f} days")
else:
    print("No valid predictions to evaluate (all predicted_doy are NaN).")
print(eval_df.head())


No valid predictions to evaluate (all predicted_doy are NaN).
   site_id      species  year  predicted_doy  observed_doy  phenophase
0        8       annuus  2023            NaN           223         501
1       12  californica  2023            NaN           128         501
2       13  californica  2023            NaN           115         501


  warn("""Dropped temperature data for doy {d} due to missing data. Most likely from leap year mismatch""".format(d=first_doy_column))
  warn("""Dropped temperature data for doy {d} due to missing data. Most likely from leap year mismatch""".format(d=last_doy_column))
 Missing data from: 
   site_id  year
0        8  2023
1       12  2023
2       13  2023
  warn('Dropped {n0} of {n1} observations because of missing data'.format(n0=n_dropped, n1=original_sample_size) +


In [None]:
# ---------- Save trained model ----------
import joblib
joblib.dump(model, 'pyphenology_thermaltime_model.pkl')
print("Model saved to pyphenology_thermaltime_model.pkl")

Model saved to pyphenology_thermaltime_model.pkl
