# Best ETS model to decompose the CO$_2$ time series.

This notebook selects the 5 best ETS models for decomposing the time series of each site and then keeps the most frequent ones. It then creates .csv files with the results for each site and for all sites together.

## Libraries

In [2]:
import numpy as np
import pandas as pd
import glob
import ast
import warnings
import statsmodels.api as sm
from math import sqrt
from datetime import *
from multiprocessing import cpu_count
from joblib import Parallel
from joblib import delayed
from warnings import catch_warnings
from warnings import filterwarnings
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from sklearn.metrics import mean_squared_error
from numpy import array
from sklearn import metrics
from sklearn.model_selection import ParameterGrid

## Paths

In [3]:
# Directory the input .csv files are read from.
pathr = '/home/_ehoyos/Documents/Data_CO2/Final_info/'
# Directory the result .csv files are written to.
paths = '/home/_ehoyos/Documents/Data_CO2/Final_results/'

### Definition of functions

In [4]:
### Functions to select the best models
def exp_smoothing_forecast_b(history,config):
    """Fit one ETS configuration to `history` and score the fitted values.

    `config` is a 6-item sequence: trend, damped trend, seasonal, seasonal
    period, remove-bias flag and model number. The model number is only an
    identifier and is not used for fitting.

    Returns a tuple (rmse, r2, smoothing_level, smoothing_trend,
    smoothing_seasonal, damping_trend); the last four are read from the
    parameters of the fitted model.
    """
    trend,damped,seasonal_kind,period,remove_bias,_=config
    observed=array(history)
    fitted=ExponentialSmoothing(
        observed,
        trend=trend,
        damped_trend=damped,
        seasonal=seasonal_kind,
        seasonal_periods=period,
    ).fit(optimized=True,remove_bias=remove_bias)
    # predict over the same index range as the input (0 .. len-1), so the
    # scores below compare the model against the data it was fitted on
    fitted_values=fitted.predict(0,len(observed)-1)

    rmse,r2=measure_rmse(observed,fitted_values)
    params=fitted.params
    return (rmse,r2,params["smoothing_level"],params["smoothing_trend"],
            params["smoothing_seasonal"],params["damping_trend"])

def measure_rmse(actual, predicted):
    """Return (rmse, r2) of `predicted` against `actual`.

    rmse is the square root of sklearn's mean squared error and r2 is
    sklearn's r2_score, both computed on the same pair of sequences.
    """
    mse=mean_squared_error(actual, predicted)
    return (sqrt(mse),metrics.r2_score(actual,predicted))

def walk_forward_validation_b(dataf2,cfg):
    """Score configuration `cfg` on the whole of `dataf2`.

    Despite the name, no train/test split is made here: every value of
    `dataf2` is copied into a list and handed to exp_smoothing_forecast_b,
    whose 6-item result (rmse, r2, alpha, beta, gamma, phi) is returned.
    """
    rmse,r2,alpha,beta,gamma,phi=exp_smoothing_forecast_b(list(dataf2),cfg)
    return (rmse,r2,alpha,beta,gamma,phi)

def score_model_b(dataf2,cfg,debug=False):
    """Score one configuration, returning None results on failure.

    Parameters
    ----------
    dataf2 : series to fit.
    cfg : configuration passed to walk_forward_validation_b; its str() is
        used as the key of the result.
    debug : when True, warnings are shown and any exception propagates;
        when False, warnings are silenced and a failing configuration
        yields None for every result field.

    Returns
    -------
    (key, rmse, r2, alpha, beta, gamma, phi)
    """
    key=str(cfg)
    result=r2=aa=bb=gg=ff=None # rmse, r2 and the values of alpha, beta, gamma and phi.
    if debug:
        # show all warnings and fail on exception if debugging
        result,r2,aa,bb,gg,ff=walk_forward_validation_b(dataf2,cfg)
    else:
        try:
            # never show warnings when grid searching, too noisy
            with catch_warnings():
                filterwarnings("ignore")
                result,r2,aa,bb,gg,ff=walk_forward_validation_b(dataf2,cfg)
        except Exception:
            # Deliberate best-effort: one failure during model validation
            # suggests an unstable config, so report it as unscored.
            # KeyboardInterrupt/SystemExit are intentionally not caught.
            result=r2=aa=bb=gg=ff=None
    return (key,result,r2,aa,bb,gg,ff)

def grid_search_b(dataf,cfg_list,parallel=True):
    """Score every configuration in `cfg_list` on `dataf`.

    Parameters
    ----------
    dataf : series handed to score_model_b for every configuration.
    cfg_list : iterable of configurations.
    parallel : when True, configurations are scored with joblib using one
        job per CPU and the 'multiprocessing' backend; otherwise they are
        scored sequentially in this process.

    Returns
    -------
    List of score_model_b results whose error (item 1) is not None, sorted
    by that error in ascending order.
    """
    # Use the `dataf` argument: the series must come from the caller, not
    # from whatever module-level variable happens to exist.
    if parallel:
        # execute configs in parallel
        executor=Parallel(n_jobs=cpu_count(),backend='multiprocessing')
        scores=executor(delayed(score_model_b)(dataf,cfg) for cfg in cfg_list)
    else:
        scores=[score_model_b(dataf,cfg) for cfg in cfg_list]
    # remove empty results
    scores=[r for r in scores if r[1] is not None]
    # sort configs by error, asc
    scores.sort(key=lambda tup:tup[1])
    return scores

def exp_smoothing_configs_b(seasonal=(None,)):
    """Create the list of exponential smoothing configs to try.

    Parameters
    ----------
    seasonal : iterable of seasonal periods to combine with the other
        options. The default is a single period of None (an immutable
        tuple, so the default cannot be altered between calls).

    Returns
    -------
    List of configs [trend, damped, seasonal, period, remove_bias, number].
    Every combination of the options appears once, with remove_bias varying
    fastest and trend slowest; `number` counts the configs from 1 in that
    order.
    """
    from itertools import product

    t_params=['add','mul',None] # trend.
    d_params=[True,False] # damped trend.
    s_params=['add','mul',None] # seasonal.
    r_params=[True,False] # remove bias.
    # product() iterates its last argument fastest, which matches nesting
    # the loops as trend > damped > seasonal > period > remove_bias.
    combos=product(t_params,d_params,s_params,seasonal,r_params)
    return [[t,d,s,p,r,c] for c,(t,d,s,p,r) in enumerate(combos,start=1)]

### Function to find the start, end and length of each run of consecutive non-missing data

def get_nnan_inds(series):
    """Locate the runs of consecutive non-NaN values in `series`.

    Positions are counted from 0 regardless of the index of `series`.
    Returns a list with one tuple (start, end, length) per run, where `end`
    is one past the last position of the run; the list is empty when the
    series holds no valid value.
    """
    valid_pos=np.flatnonzero(series.reset_index(drop=True).notna().to_numpy())
    if valid_pos.size==0:
        return []
    # a jump larger than 1 between neighbouring positions starts a new run
    run_starts=np.flatnonzero(np.diff(valid_pos)>1)+1
    runs=[]
    for run in np.split(valid_pos,run_starts):
        first=run[0]
        stop=run[-1]+1
        runs.append((first,stop,stop-first))
    return runs

### Procedure

In [13]:
# every .csv file found in the input directory.
filesf=glob.glob(pathr+"/*.csv")
# names of sites: the file name cut at its first '.' and then at its first '_'.
namesf=[fname.split('/')[-1].split('.')[0].split('_')[0] for fname in filesf]

bbm_s=[] # most frequent models of each site (one array per site that produced results).
info_bbm=[] # rows of the summary table: site code followed by its models.

# To debug a single site, restrict the range, e.g. range(46,47).
for ii in range(len(namesf)):
    bm_s=[] # best models of every decomposed portion of site ii.
    info_model=[] # rows of the per-site table.
    print('i='+str(ii),namesf[ii])
    start_run=datetime.now()

    # Use the file at position ii directly: looking the name up with
    # namesf.index() returns the first match, so two files sharing a site
    # name would both load the first one.
    # Only the first 8 header lines are needed; the context manager closes
    # the handle instead of leaving it open for every site.
    with open(filesf[ii],"r") as fh:
        line=[fh.readline() for _ in range(8)]
    dataf=pd.read_csv(filesf[ii],skiprows=11)
    code=line[1].split(',')[1].strip() # second field of header line 2.
    dataf["date"]=pd.to_datetime(dataf["date"])
    nHeights=int(line[7].split(',')[1].strip()) # second field of header line 8.
    titles2=dataf.columns[7:7+nHeights] # columns of data for each height.
    # columns actually decomposed: the third group of nHeights columns
    # (presumably the gap-filled series, e.g. 'CO2_110.0_fill' -- confirm).
    titlesf=dataf.columns[7+2*nHeights:7+2*nHeights+nHeights]

    ### time resolution
    w=dataf['date'][1].minute-dataf['date'][0].minute
    if w==30:
        dt=0.5 # time step [h].
    elif w==0:
        dt=1
    else:
        # Without a recognised time step, dt would be undefined (first site)
        # or silently reused from the previous site; skip this site instead.
        print('dt is not 1 h or 0.5 h')
        continue
    minb=3*24/dt # minimum length to analyze: number of samples in 3*24 h.

    for jj in range(nHeights):
        print(titlesf[jj])
        series=dataf[titlesf[jj]]
        nnan_index=np.array(get_nnan_inds(series)) # start, end and length of consecutive non nan data.
        c=0 # number of the decomposed portion.
        for i in range(nnan_index.shape[0]):
            if nnan_index[i,2]<minb:
                continue # portion too short to analyze.
            c=c+1
            print(c,str(i)+'/'+str(nnan_index.shape[0]))
            dataf2=series[nnan_index[i,0]:nnan_index[i,1]]
            cfg_list=exp_smoothing_configs_b(seasonal=[24/dt])
            scores=grid_search_b(dataf2,cfg_list)
            if len(scores)<5:
                # Failed configs are dropped by the grid search; with fewer
                # than 5 left there is no top 5 and indexing would raise.
                print('fewer than 5 models could be fitted; portion skipped')
                continue
            # model number (last item of the config) of the 5 lowest errors.
            bm=[ast.literal_eval(score[0])[5] for score in scores[:5]]
            info=np.append([titles2[jj],c,nnan_index[i,0],nnan_index[i,1],nnan_index[i,2]],bm)
            info_model.append(info)
            bm_s.append(bm)

    ### save the best 5 models of each part of the decomposed time series in a .csv file
    info_model_db=pd.DataFrame(info_model,
            columns=['Height','n','t0','tf','Length','m1','m2','m3','m4','m5'])
    info_model_db.to_csv(paths+code+'_best_ETSmodels.csv',header=True)

    ### select the 5 most frequent models of the site.
    bm_s=np.array(bm_s)
    values,counts=np.unique(bm_s,return_counts=True)
    # A stable sort makes ties resolve to the lowest model number on every
    # run; with 5 or fewer distinct values this simply keeps all of them.
    ind=np.argsort(-1*counts,kind='stable')[:5]
    val=values[ind]

    # Sites with no analyzable portion give an empty array; keep it out of
    # bbm_s so the arrays collected there all have the same length.
    if val.size>0:
        bbm_s.append(val)
    info=np.append([code],val)
    info_bbm.append(info)
    
## 5 best models considering all sites.
# Flatten the per-site arrays explicitly and ignore empty ones: passing a
# list of arrays of different lengths straight to np.unique fails.
site_models=[np.asarray(v).ravel() for v in bbm_s if np.size(v)>0]
if site_models:
    all_models=np.concatenate(site_models)
else:
    all_models=np.array([],dtype=int)
values,counts=np.unique(all_models,return_counts=True)
# A stable sort makes ties resolve to the lowest model number on every run;
# with 5 or fewer distinct values this simply keeps all of them.
ind=np.argsort(-1*counts,kind='stable')[:5]
best=values[ind]
count=counts[ind]
print(best)

# the last two lines of the table hold the best models considering all sites
# and how many sites selected each of them.
info=np.append(['all sites'],best)
info_bbm.append(info)
info=np.append(['count'],count)
info_bbm.append(info)

# Pad short rows (sites or totals with fewer than 5 models) so the table can
# always be built with its 6 columns; padded cells are written as empty.
columns_bbm=['Site','m1','m2','m3','m4','m5']
rows_bbm=[list(row)+[np.nan]*(len(columns_bbm)-len(row)) for row in info_bbm]
best_models_db=pd.DataFrame(rows_bbm,columns=columns_bbm)
best_models_db.to_csv(paths+'zBest_ETSmodels_all.csv',header=True)


i=46 HEL
CO2_110.0_fill
1 0/1
[ 1  2  3  4 21]
