## Part 2

In [1]:
import numpy as np
import pandas as pd
import optuna
from matplotlib import pyplot as plt
import seaborn as sns
import sklearn
from pmdarima.arima import auto_arima

from sklearn.model_selection import KFold, ShuffleSplit, cross_val_score, TimeSeriesSplit

from sklearn.ensemble import RandomForestRegressor
from sklearn import svm
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import ElasticNet

from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from scipy.stats.mstats import winsorize
import warnings

# Notebook-wide display/config tweaks.
# Show every column when a DataFrame is rendered (no truncation).
pd.options.display.max_columns = None
# Silence all warnings for the rest of the session.
warnings.simplefilter("ignore")

In [2]:
# Load the raw export. `parse_dates=True` only asks pandas to parse the index,
# so the timestamp column is still plain text after read_csv and must be
# converted explicitly before it is used in any comparison.
df = pd.read_csv('data/cosmote.csv', parse_dates=True)
df['PERIOD_START_TIME'] = pd.to_datetime(df['PERIOD_START_TIME'])

# Keep only rows after the cutoff. The comparison is done on real datetimes
# (not on the raw strings), so it does not depend on the textual date format.
cutoff = pd.Timestamp('2022-01-01 23:00:00')
df = df.loc[df['PERIOD_START_TIME'] > cutoff].reset_index(drop=True)

# NOTE(review): the original author left an open question next to this drop,
# pointing at `df.corr().energy_mean.sort_values(ascending=False)` — the
# rationale for excluding these three columns should be confirmed.
df = df.drop(columns=['TCH_CONGESTION', 'TCH_BLOCKING', 'AVG_UL_MAC_UE_TPUT'])

In [3]:
# common_timeframes = []
# for group_name, group_data in df.groupby('ID'):
#     group_data = group_data.sort_values(by='PERIOD_START_TIME')
#     common_timeframe = pd.Interval(group_data['PERIOD_START_TIME'].min(), group_data['PERIOD_START_TIME'].max())
#     common_timeframes.append(common_timeframe)

# # Step 4: Calculate the overall common timeframe using IntervalIndex
# overall_common_timeframe = pd.IntervalIndex(common_timeframes).min()  # Using min to get the overall common timeframe

# print(common_timeframes)

In [3]:
# Restrict the frame to the analysis window (both bounds exclusive).
window_start, window_end = '2022-01-07 23:00:00', '2022-03-08'
period_start = df['PERIOD_START_TIME']
df = df.loc[period_start.gt(window_start) & period_start.lt(window_end)]

In [4]:
# Forward-fill gaps. `DataFrame.ffill()` replaces the deprecated
# `fillna(method='ffill')` spelling and behaves the same.
# NOTE(review): the fill runs over the whole frame without grouping by ID, so
# a missing value in a station's first row is filled from whatever row
# precedes it in the frame — confirm that is acceptable.
df = df.ffill()

# Snapshot of the filled data before outlier clipping.
basedf = df.copy()

# Clip the lowest and highest 1% of every column from the fourth onward.
# NOTE(review): assumes the first three columns are identifiers/timestamps and
# not KPIs — verify against the CSV layout. The limits are computed over all
# stations together, not per station.
for col in df.columns[3:]:
    df[col] = winsorize(df[col], limits=[0.01, 0.01])

In [21]:
# Seasonal-ARIMA search: for every station, every KPI column and every
# time-series split, let auto_arima choose the model orders and record them
# together with RMSE/MAPE on the fitting part and on the held-out tail of the
# training window. Combinations already present in the checkpoint file are
# skipped; `results` therefore only contains rows computed in this run.
results = pd.DataFrame()
checkpoint = pd.read_csv('out/interim_sarima_results.csv')

# Number of trailing training samples auto_arima holds out for its 'oob' score.
oos = 24
# The splitter does not depend on the series, so build it once.
# NOTE(review): the test fold produced by each split is never evaluated below;
# only the training window (fit part + held-out tail) is scored.
tscv = TimeSeriesSplit(n_splits=3, test_size=24*1)

records = []
try:
    for station, stationdf in df.groupby('ID'):
        stationdf = stationdf.sort_values('PERIOD_START_TIME').reset_index(drop=True)
        print(station)

        # NOTE(review): assumes the first three columns are not KPIs — verify.
        for col in stationdf.columns[3:]:
            print(f"Processing {col} for {station}")
            ts = stationdf.loc[:, col]

            for j, (train_index, _test_index) in enumerate(tscv.split(ts)):
                # Skip work that is already recorded in the checkpoint: a
                # single matching row is enough.
                already_done = (
                    (checkpoint['ID'] == station)
                    & (checkpoint['column'] == col)
                    & (checkpoint['split'] == j+1)
                ).any()
                if already_done:
                    print(f"{station},{col},{j+1} exists cont..")
                    continue

                model = auto_arima(
                    ts.iloc[train_index],
                    max_p=24,
                    max_d=2,
                    max_q=24,
                    m=24,
                    max_P=7,
                    out_of_sample_size=oos,
                    information_criterion='oob',
                    stepwise=True,
                    trace=True,
                )
                params = model.to_dict()

                # Split the training window into the part used for fitting and
                # the trailing `oos` samples held out by auto_arima.
                val_index = train_index[-oos:]
                fit_index = train_index[:-oos]

                # NOTE(review): these are the model's fitted values for the
                # whole training window; the "val" metrics below use them for
                # the held-out tail rather than `model.oob_preds_` — confirm
                # this is the intended validation score.
                fitted = model.fittedvalues()

                # RMSE is taken as sqrt(MSE) so the code does not rely on the
                # `squared=False` argument, which newer scikit-learn removed.
                train_rmse = np.sqrt(mean_squared_error(ts.iloc[fit_index], fitted[fit_index]))
                val_rmse = np.sqrt(mean_squared_error(ts.iloc[val_index], fitted[val_index]))

                # Both series are shifted by +1 before MAPE; presumably to
                # avoid dividing by zero-valued observations.
                train_mape = mean_absolute_percentage_error(ts.iloc[fit_index]+1, fitted[fit_index]+1)
                val_mape = mean_absolute_percentage_error(ts.iloc[val_index]+1, fitted[val_index]+1)

                records.append({'ID': station,
                                'column': col,
                                'AR': params['order'][0],
                                'I': params['order'][1],
                                'MA': params['order'][2],
                                'S-AR': params['seasonal_order'][0],
                                'S-I': params['seasonal_order'][1],
                                'S-MA': params['seasonal_order'][2],
                                'train_rmse': train_rmse,
                                'train_mape': train_mape,
                                'val_rmse': val_rmse,
                                'val_mape': val_mape,
                                'split': j+1
                                })
finally:
    # Build the frame once instead of concatenating per row. Doing it in
    # `finally` keeps every completed row available in `results` even when
    # the long-running loop is interrupted.
    results = pd.DataFrame(records)

89
Processing INCOMING_HO_SEIZURES for 89
89,INCOMING_HO_SEIZURES,1] exists cont..
89,INCOMING_HO_SEIZURES,2] exists cont..
89,INCOMING_HO_SEIZURES,3] exists cont..
Processing CALL_REQUESTS for 89
89,CALL_REQUESTS,1] exists cont..
89,CALL_REQUESTS,2] exists cont..
89,CALL_REQUESTS,3] exists cont..
Processing TCH_NORMAL_SEIZURES for 89
89,TCH_NORMAL_SEIZURES,1] exists cont..
89,TCH_NORMAL_SEIZURES,2] exists cont..
89,TCH_NORMAL_SEIZURES,3] exists cont..
Processing TCSH_TRAFFIC for 89
Performing stepwise search to minimize oob
 ARIMA(2,0,2)(1,0,1)[24] intercept   : OOB=inf, Time=11.85 sec
 ARIMA(0,0,0)(0,0,0)[24] intercept   : OOB=1346.487, Time=0.15 sec
 ARIMA(1,0,0)(1,0,0)[24] intercept   : OOB=inf, Time=3.73 sec


KeyboardInterrupt: 

In [None]:
# Persist the collected model-selection results as CSV (index included).
results_path = 'out/sarima_results.csv'
results.to_csv(results_path)