# Decomposition of the CO$_2$ time series.

This notebook decomposes the CO$_2$ time series with the best model selected in the '2_Best_ETS_model' notebook and extracts the residuals. It creates new .csv files with the time series of the deterministic and stochastic components at each height.

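Best model: cfg=['add',True,'mul',xx,True, 3], i.e. (trend, damped trend, seasonal, seasonal periods, remove bias, n), where xx is the number of samples per day (24 for hourly data, 48 for half-hourly data). The last entry is not used in this notebook.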

## Libraries

In [5]:
import numpy as np
import pandas as pd
import glob
import ast
import warnings
import statsmodels.api as sm
from math import sqrt
from datetime import *
from multiprocessing import cpu_count
from joblib import Parallel
from joblib import delayed
from warnings import catch_warnings
from warnings import filterwarnings
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from sklearn.metrics import mean_squared_error
from numpy import array
from sklearn import metrics
from sklearn.model_selection import ParameterGrid

## Paths

In [6]:
# Input and output directories used by the procedure below.
# `paths` must end with '/' because output file names are appended to it by
# plain string concatenation.
pathr='/home/_ehoyos/Documents/Data_CO2/Final_info/' # path to read the data.
paths='/home/_ehoyos/Documents/Data_CO2/Final_results/' # path to save the results.

### Definition of functions

In [7]:
def exp_smoothing_forecast_r(history,config):
    """Fit an exponential smoothing model to ``history`` and return its in-sample fit.

    history: sequence of observations (converted to a numpy array).
    config: six entries (trend, damped_trend, seasonal, seasonal_periods,
        remove_bias, n); the sixth entry is unpacked but not used here.
    Returns the model predictions for positions 0 .. len(history)-1.
    """
    trend,damped,seasonal,periods,remove_bias,_unused=config
    values=array(history)
    fitted=ExponentialSmoothing(
        values,
        trend=trend,
        damped_trend=damped,
        seasonal=seasonal,
        seasonal_periods=periods,
    ).fit(optimized=True,remove_bias=remove_bias)
    # Predict over the same span that was used for fitting (in-sample values).
    last_position=len(values)-1
    return fitted.predict(0,last_position)

def walk_forward_validation_r(dataf2,cfg):
    """Return the in-sample model values for one gap-free stretch of data.

    dataf2: iterable with the observations of the stretch.
    cfg: model configuration forwarded to ``exp_smoothing_forecast_r``.
    Returns whatever ``exp_smoothing_forecast_r`` returns for this stretch.
    """
    history=list(dataf2)
    # Silence warnings only while the model is being fitted. The filter used to
    # be installed after the fit, so it did not cover the first call and then
    # disabled every warning for the rest of the kernel session.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        yhat=exp_smoothing_forecast_r(history,cfg)
    return yhat
def get_nnan_inds(series):
    """Locate the runs of consecutive non-NaN values in ``series``.

    The index of ``series`` is dropped first, so every position is a 0-based
    row position. Returns a list with one ``(start, end, length)`` tuple per
    run, where ``end`` is exclusive (``length == end - start``). The list is
    empty when ``series`` holds no non-NaN value.
    """
    flat=series.reset_index(drop=True)
    valid_pos=flat[flat.notna()].index.to_numpy()
    if valid_pos.size==0:
        return []
    # A jump larger than 1 between neighbouring valid positions starts a new run.
    cut_points=np.flatnonzero(np.diff(valid_pos)>1)+1
    runs=[]
    for block in np.split(valid_pos,cut_points):
        first=block[0]
        stop=block[-1]+1
        runs.append((first,stop,stop-first))
    return runs

### Procedure

In [19]:
# Read data
# Sorted so that the site index printed below is reproducible between runs.
filesf=sorted(glob.glob(pathr+"/*.csv"))
# names of ICOS sites: file name without directory, extension and '_' suffix.
namesf=[f.split('/')[-1].split('.')[0].split('_')[0] for f in filesf]

for ii in range(len(namesf)):
    print('i='+str(ii),namesf[ii])
    # Use the file at position ii directly. Looking the name up with
    # namesf.index() returns the first match, so a second file of the same
    # site would never be processed.
    fname=filesf[ii]
    dataf=pd.read_csv(fname,skiprows=11)
    # The first 8 lines are 'key,value' metadata; the handle is closed right after reading.
    with open(fname,"r") as fp:
        line=fp.readlines()[0:8]
    site=line[0].split(',')[1].strip()
    code=line[1].split(',')[1].strip()
    country=line[2].split(',')[1].strip()
    latitude=line[3].split(',')[1].strip()
    longitude=line[4].split(',')[1].strip()
    altitude=line[5].split(',')[1].strip()
    units=line[6].split(',')[1].strip()
    nHeights=int(line[7].split(',')[1].strip())
    dataf["date"]=pd.to_datetime(dataf["date"])
    # Columns that are decomposed (third group of nHeights columns after the first 7).
    # NOTE(review): what distinguishes this group from titles2 is not visible here - confirm.
    titlesf=dataf.columns[7+2*nHeights:7+2*nHeights+nHeights]
    # Column names of each height; used as prefix of the new output columns.
    titles2=dataf.columns[7:7+nHeights]

    # Time resolution, from the elapsed time between the first two records.
    # Comparing only the minute fields fails for half-hourly data that starts
    # at minute 30 (the difference is -30).
    step=(dataf['date'].iloc[1]-dataf['date'].iloc[0]).total_seconds()
    if step==1800:
        dt=0.5 # delta of time [h].
        xx=48  # samples per day (seasonal period).
    elif step==3600:
        dt=1
        xx=24
    else:
        # Skip the site: continuing would reuse dt and xx of the previous site
        # (or fail with a NameError on the first one).
        print('dt is not 1 h or 0.5 h')
        continue

    cfg=['add',True,'mul',xx,True, 3]# Best model identified previously*.

    # Decompose the time series and extract the residuals.
    for jj in range(nHeights):
        print(titles2[jj]+'('+str(jj+1)+'/'+str(nHeights)+')')
        series=dataf[titlesf[jj]]
        # Fresh NaN-filled arrays for every height. Sharing them between
        # heights left the values of the previous height wherever the current
        # height has no fitted segment.
        residuals_add=np.full(len(dataf),np.nan) # additive residuals (observed - model).
        residuals_mul=np.full(len(dataf),np.nan) # multiplicative residuals ((observed - model)/model); not written to the output file.
        determin=np.full(len(dataf),np.nan) # deterministic component returned by the model.
        nnan_index=get_nnan_inds(series) # start, end and length of consecutive non-NaN data.
        for i,(start,end,length) in enumerate(nnan_index):
            print(str(i)+'/'+str(len(nnan_index)))
            if length >= 2*24/dt: # the method requires at least 2 periods.
                dataf2=series.iloc[start:end]
                fill=np.array(walk_forward_validation_r(dataf2,cfg))
                observed=dataf2.to_numpy()
                residuals_add[start:end]=observed-fill
                residuals_mul[start:end]=(observed-fill)/fill
                determin[start:end]=fill
        dataf[titles2[jj]+"_resid"]=residuals_add
        dataf[titles2[jj]+"_deterministic"]=determin
    # Write the metadata header first, then append the table to the same file.
    header='Site,'+site+'\nCode,'+code+'\nCountry,'+country+'\nLatitude,'+str(latitude)+'\nLongitude,'+str(longitude)+'\nAltitude,'+str(altitude)+'\nUnits,'+units+'\nnHeights,'+str(nHeights)+'\n'+'\n'
    with open(paths+code+'_decomposed.csv', 'w') as fp:
        fp.write(header)
    dataf.to_csv(paths+code+'_decomposed.csv',header=True,mode='a')

i=46 HEL
CO2_110.0(1/1)
0/1
