### Statistics on Delta T Values
- Run each algorithm in `calendar::delta_t` on the test data and compare its predictions against the observed values
- Data collected from https://maia.usno.navy.mil/ser7/deltat.data

In [1]:
import common
from datetime import datetime

# Fail fast with a readable message if the data file is missing, rather than
# erroring later when the file is opened for parsing.
assert common.USNO_DATA_PATH.exists(), 'Dataset not found'

In [2]:
# Read the dataset, keeping only lines that contain non-whitespace text.
# The `with` block closes the file handle as soon as reading finishes (the
# previous `open(...).readlines()` never closed it). `lines` is a list rather
# than a lazy `filter`, so it can be iterated more than once.
with common.USNO_DATA_PATH.open('r') as usno_file:
  lines = [raw for raw in usno_file if raw.strip()]

def parse_line(line: str) -> tuple[float, float]:
  '''Convert one whitespace-separated `year month day delta_t` record
  into a `(fractional_year, delta_t)` pair.

  The fractional year is the calendar year plus the share of that year
  that has elapsed at the start of the given date. The share is computed
  from the actual day count of the year, so leap years are handled.
  '''
  year_s, month_s, day_s, delta_t_s = line.split()
  y = int(year_s)

  # Anchors for measuring how far into the year the date lies.
  jan_first = datetime(y, 1, 1)
  next_jan_first = datetime(y + 1, 1, 1)

  days_elapsed = (datetime(y, int(month_s), int(day_s)) - jan_first).days
  days_in_year = (next_jan_first - jan_first).days

  return float(year_s) + days_elapsed / days_in_year, float(delta_t_s)

# One (fractional_year, delta_t) tuple per non-blank dataset line.
parsed = [parse_line(record) for record in lines]

In [3]:
# Spot-check the parser: show every 41st sample instead of the whole list.
parsed[::41]

[(1973.0849315068492, 43.4724),
 (1976.4972677595629, 46.997),
 (1979.9150684931508, 50.4599),
 (1983.3287671232877, 53.3024),
 (1986.7479452054795, 55.1898),
 (1990.1616438356164, 56.9755),
 (1993.5808219178082, 59.6343),
 (1997.0, 62.295),
 (2000.4153005464482, 63.9691),
 (2003.8328767123287, 64.5544),
 (2007.2465753424658, 65.2494),
 (2010.6657534246576, 66.2349),
 (2014.0849315068492, 67.3136),
 (2017.495890410959, 68.8245),
 (2020.9153005464482, 69.363),
 (2024.3306010928961, 69.2018)]

In [4]:
import numpy as np
from common import delta_t_algo1, delta_t_algo2, delta_t_algo3, delta_t_algo4

# Split the parsed (year, delta_t) pairs into two parallel arrays.
years    = np.array([sample[0] for sample in parsed])
observed = np.array([sample[1] for sample in parsed])

# Wrap each algorithm with np.vectorize so it is applied element-wise
# to an array of years.
vf_algo1, vf_algo2, vf_algo3, vf_algo4 = map(
  np.vectorize,
  (delta_t_algo1, delta_t_algo2, delta_t_algo3, delta_t_algo4),
)

# Evaluate every algorithm at each observed point in time.
algo1_pred, algo2_pred, algo3_pred, algo4_pred = (
  vectorized(years)
  for vectorized in (vf_algo1, vf_algo2, vf_algo3, vf_algo4)
)

# Sanity check: every prediction array should be as long as the input.
print(f'{len(parsed)} - {len(algo1_pred)} - {len(algo2_pred)} - {len(algo3_pred)} - {len(algo4_pred)}')


617 - 617 - 617 - 617 - 617


In [5]:
import pandas as pd

# Put the observations and the four predictions side by side; the keyword
# order fixes the column order of the resulting frame.
df = pd.DataFrame(dict(
  year=years,
  observed=observed,
  algo1=algo1_pred,
  algo2=algo2_pred,
  algo3=algo3_pred,
  algo4=algo4_pred,
))

# Preview every 33rd row from position 300 onward rather than the full frame.
df.iloc[300::33]

Unnamed: 0,year,observed,algo1,algo2,algo3,algo4
300,1998.084932,63.0217,62.904837,62.994011,62.994011,62.994011
333,2000.833333,64.04,64.036667,64.098147,64.098147,64.098147
366,2003.580822,64.5371,64.586164,64.484109,64.484109,64.484109
399,2006.328767,64.948,65.231507,65.182797,65.079329,64.935156
432,2009.084932,65.8025,66.333973,66.308186,65.886885,65.813947
465,2011.832877,66.5383,67.433151,67.514753,66.692033,66.551103
498,2014.580822,67.4989,68.532329,68.805727,67.497181,67.516171
531,2017.328767,68.7623,73.282616,70.181108,68.490395,68.697633
564,2020.084699,69.3752,77.459776,71.645263,69.576156,69.389833
597,2022.832877,69.1942,81.672074,73.189839,70.718996,69.197565


In [6]:
from sklearn import metrics

# (row label, scoring function) pairs, in the order the rows appear in the
# summary table. Each function is called as score(observed, predicted).
scorers = [
  ('MAE',       metrics.mean_absolute_error),
  ('MSE',       metrics.mean_squared_error),
  ('RMSE',      metrics.root_mean_squared_error),
  ('R2',        metrics.r2_score),
  ('MAPE',      metrics.mean_absolute_percentage_error),
  ('Max Error', metrics.max_error),
]

# Prediction arrays in the same order as the Algo1..Algo4 table columns.
predictions = (algo1_pred, algo2_pred, algo3_pred, algo4_pred)

# One tuple per metric: (label, score for algo1, ..., score for algo4).
results = [
  (label, *(score(observed, predicted) for predicted in predictions))
  for label, score in scorers
]

pd.DataFrame(results, columns=['Metric', 'Algo1', 'Algo2', 'Algo3', 'Algo4'])

Unnamed: 0,Metric,Algo1,Algo2,Algo3,Algo4
0,MAE,1.682193,0.619127,0.157011,0.03135
1,MSE,14.443584,1.537807,0.161808,0.001619
2,RMSE,3.800472,1.240083,0.402254,0.040242
3,R2,0.7399,0.972307,0.997086,0.999971
4,MAPE,0.0247,0.009124,0.002372,0.000546
5,Max Error,14.914294,4.913019,2.199797,0.134313


In [7]:
# Conclusion drawn from the metrics table in the previous cell, where Algo4
# has the lowest value on every error metric and the highest R2.
'Apparently Algo4 is the best on the test data.'

'Apparently Algo4 is the best on the test data.'