In [1]:
import math
import pandas as pd
import numpy as np
import numpy.linalg as la

from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from statsmodels.tsa.arima.model import ARIMA

In [2]:
def csvIndexToDatetime(path, start, freq):
    df = pd.read_csv(path)
    df = df.drop(columns=['Unnamed: 0'])
    time_len = df.shape[0]
    num_nodes = df.shape[1]
    print(time_len, num_nodes)
    rng = pd.date_range(start, periods=time_len, freq=freq)
    idx = pd.DatetimeIndex(rng)
    df.index = idx
    return df, idx

In [3]:
def evaluation(a,b):
    rmse = math.sqrt(mean_squared_error(a,b))
    mae = mean_absolute_error(a, b)
    mape = mean_absolute_percentage_error(a, b)
    F_norm = la.norm(a-b)/la.norm(a)
    return rmse, mae, mape, 1-F_norm

In [4]:
def predictARIMA(df, idx, timelag):
    rmse, mae, mape, acc =  [], [], [], []
    
    for i in range(df.shape[1]):
        print(i, end = ': ')
        ts = df.iloc[:-timelag, i]
        ts_log = np.log(ts)
        ts_log = np.array(ts_log, dtype=np.float64)
        where_are_inf = np.isinf(ts_log)
        ts_log[where_are_inf] = 0
        ts_log = pd.Series(ts_log)
        ts_log.index = idx[:-timelag]
        print('Data Load Finish', end = ', ')
        
        model = ARIMA(ts_log, order=[1,0,0])
        properModel = model.fit()
        predict_ts = properModel.forecast(steps=timelag)
        log_recover = np.exp(predict_ts)
        testX = org_df.iloc[-timelag:, 0]
        print('Predict Finish', end = ', ')
        
        er_rmse, er_mae, er_mape, er_acc = evaluation(testX[-1], log_recover[-1])
        print(round(er_rmse,4), round(er_mae,4), round(er_mape,4), round(er_acc,4), end = ' ')
        
        rmse.append(er_rmse)
        mae.append(er_mae)
        mape.append(er_mape)
        acc.append(er_acc)
        print('Evaluation Finish')
        
    acc = np.mat(acc)
    acc[acc<0] = 0
    print('RMSE: ' + str(np.mean(rmse)) + ', MAE: ' + str(np.mean(mae)) + ', MAPE: ' + str(np.mean(mape))  + '\n'
          + 'ACC: ' + str(np.mean(acc)))

In [5]:
path_0 = '../Data/Gangnam/speed_gangnam_0.csv'
path_5 = '../Data/Gangnam/speed_gangnam_5.csv'
path_10 = '../Data/Gangnam/speed_gangnam_10.csv'
path_20 = '../Data/Gangnam/speed_gangnam_20.csv'

In [6]:
start = '1/10/2020'
freq = '5min'

In [7]:
org_path = '../Data/Gangnam/speed_gangnam_0.csv'
org_df, org_idx = csvIndexToDatetime(org_path, start, freq)

2880 506


### Unobserved Nodes = 0%

In [10]:
mean_squared_error([1], 3)

TypeError: Expected sequence or array-like, got <class 'int'>

In [8]:
df, idx = csvIndexToDatetime(path_0, start, freq)
(df == 0).sum().sum()

2880 506


0

In [9]:
predictARIMA(df, idx, 3)

0: Data Load Finish, Predict Finish, 

TypeError: Singleton array 31.77 cannot be considered a valid collection.

### Unobserved Nodes = 5%

In [None]:
df, idx = csvIndexToDatetime(path_5, start, freq)
(df == 0).sum().sum()

In [None]:
predictARIMA(df, idx, 3)

### Unobserved Nodes = 10%

In [None]:
df, idx = csvIndexToDatetime(path_10, start, freq)
(df == 0).sum().sum()

In [None]:
predictARIMA(df, idx, 3)

### Unobserved Nodes = 20%

In [None]:
df, idx = csvIndexToDatetime(path_20, start, freq)
(df == 0).sum().sum()

In [None]:
predictARIMA(df, idx, 3)