In [1]:
from _utils import *
from config import *

import time
import argparse
import seaborn as sns
import matplotlib.pyplot as plt

from scipy import sparse
from scipy.sparse.linalg import spsolve as spsolve

import warnings
warnings.filterwarnings('ignore')

np.random.seed(28)

# 1 Functions

In [2]:
class args_config():
    """Result-folder layout for the HaLRTC model on a single dataset.

    Instantiation creates ``<result_folder_path>/HaLRTC/<dataset_name>``;
    ``experiment_config`` then creates one sub-folder per experiment and
    returns the output paths used inside it.
    """

    def __init__(self, dataset_name):
        """Create the model-level and dataset-level result folders.

        Args:
            dataset_name: Name of the dataset; used as the sub-folder name.
        """
        self.model_name = 'HaLRTC'
        self.model_result_save_path = os.path.join(result_folder_path, self.model_name)
        create_folder(self.model_result_save_path)

        self.dataset_name = dataset_name
        self.dataset_result_save_path = os.path.join(self.model_result_save_path, self.dataset_name)
        create_folder(self.dataset_result_save_path)

    def experiment_config(self, experiment_name):
        """Create the folder of one experiment and return its output paths.

        Args:
            experiment_name: Sub-folder name of the experiment.

        Returns:
            An ``argparse.Namespace`` with the attributes ``data_complete_path``,
            ``best_data_complete_path``, ``metrics_analysis_path``,
            ``plt_result_path`` and ``final_complete_save_path``.
        """
        experiment_dir = os.path.join(self.dataset_result_save_path, experiment_name)
        self.experiment_result_save_path = experiment_dir
        create_folder(experiment_dir)

        # Option name -> default location inside the experiment folder.
        path_defaults = {
            'data_complete_path': os.path.join(experiment_dir, 'data_complete.csv'),
            'best_data_complete_path': os.path.join(experiment_dir, 'best_data_complete.csv'),
            'metrics_analysis_path': os.path.join(experiment_dir, 'metrics_analysis'),
            'plt_result_path': os.path.join(experiment_dir, 'plt_result.png'),
            'final_complete_save_path': os.path.join(experiment_dir, 'final_complete.csv'),
        }
        create_folder([path_defaults['metrics_analysis_path']])

        parser = argparse.ArgumentParser(description='data complete configs')
        for option_name, default_path in path_defaults.items():
            parser.add_argument('--' + option_name, type=str, default=default_path)
        # parse_known_args ignores the unrelated arguments the notebook kernel passes in.
        known_args, _unknown = parser.parse_known_args()
        return known_args

In [3]:
# One configuration object per dataset, created in the same order as before.
flow_sim_args, flow_zcity_args, pres_sim_args, pres_zcity_args = map(
    args_config, ('flow_sim', 'flow_zcity', 'pres_sim', 'pres_zcity')
)

In [4]:
def generate_Psi(dim_time, time_lags):
    """Build sparse 0/1 selection matrices for a set of time lags.

    Every matrix has shape ``(dim_time - max_lag, dim_time)`` with a single 1
    per row, where ``max_lag = max(time_lags)``.

    Args:
        dim_time: Number of time steps (number of columns of each matrix).
        time_lags: Sequence of lags.

    Returns:
        A list of ``len(time_lags) + 1`` ``scipy.sparse.coo_matrix`` objects.
        In the first one, row ``r`` selects column ``r + max_lag``; in the
        ``i``-th following one, row ``r`` selects column
        ``r + max_lag - time_lags[i]``.
    """
    max_lag = np.max(time_lags)
    n_rows = dim_time - max_lag

    def build_selector(lag=None):
        # Fresh index/value arrays per matrix so the results share no buffers.
        row_idx = np.arange(0, n_rows)
        col_idx = np.arange(0, n_rows) + max_lag
        if lag is not None:
            col_idx = col_idx - lag
        ones = np.ones(n_rows)
        return sparse.coo_matrix((ones, (row_idx, col_idx)), shape = (n_rows, dim_time))

    lagged = [build_selector(time_lags[k]) for k in range(len(time_lags))]
    return [build_selector()] + lagged

In [5]:
def print_result(it, tol, var, var_hat):
    """Print the final iteration summary and return the error metrics.

    Args:
        it: Number of iterations that were run.
        tol: Last relative change between successive estimates.
        var: Reference values.
        var_hat: Estimated values at the same positions as ``var``.

    Returns:
        Tuple ``(mape, rmse, smape)`` as computed by ``compute_mape``,
        ``compute_rmse`` and ``compute_smape``.
    """
    print('Iter: {}'.format(it))
    print('Tolerance: {:.6}'.format(tol))

    num = len(var)
    mape, rmse, smape = (
        compute_mape(var, var_hat),
        compute_rmse(var, var_hat),
        compute_smape(var, var_hat),
    )

    report_lines = (
        ('complete Num: {}', num),
        ('complete MAPE: {:.6}', mape),
        ('complete RMSE: {:.6}', rmse),
        ('complete SMAPE: {:.6}', smape),
    )
    for template, value in report_lines:
        print(template.format(value))
    return mape, rmse, smape

In [6]:
def compute_true_metrics(raw_data_value, denorm_sparse_value, denorm_data_complete_value):
    """Score the completed data on the entries that were hidden from the model.

    The evaluated positions are those where the raw data is non-zero while the
    sparse (masked) data is zero.

    Args:
        raw_data_value: Array of raw values.
        denorm_sparse_value: Array of de-normalised sparse values (0 = missing).
        denorm_data_complete_value: Array of de-normalised completed values.

    Returns:
        Tuple ``(mape, rmse, smape)`` over the evaluated positions.
    """
    has_ground_truth = raw_data_value != 0
    was_masked = denorm_sparse_value == 0
    pos_test = np.where(has_ground_truth & was_masked)

    metric_functions = (compute_mape, compute_rmse, compute_smape)
    true_mape, true_rmse, true_smape = (
        metric(raw_data_value[pos_test], denorm_data_complete_value[pos_test])
        for metric in metric_functions
    )
    return true_mape, true_rmse, true_smape

In [7]:
def fill_data_after_preprocess(best_data_complete_path, data_after_preprocess_path, data_after_complete_path):
    """Fill the zero entries of a pre-processed CSV with completed values and save it.

    Every cell of the pre-processed data that equals 0 is replaced by the cell
    at the same *position* (row/column number, not label) of the completed
    data. All other cells are written back unchanged (as float64).

    Args:
        best_data_complete_path: CSV with the completed (imputed) data.
        data_after_preprocess_path: CSV with the data whose zeros are filled.
        data_after_complete_path: Destination CSV path.

    Returns:
        None. The result is written to ``data_after_complete_path``.
    """
    best_data_complete_df = read_csv_data(best_data_complete_path)
    data_after_preprocess_df = read_csv_data(data_after_preprocess_path)
    data_after_preprocess_df = data_after_preprocess_df.astype('float64')

    # One vectorised positional assignment instead of a Python-level
    # .iloc get/set per zero cell, which is very slow on large tables.
    filled_values = data_after_preprocess_df.to_numpy(copy=True)
    zero_rows, zero_cols = np.nonzero(filled_values == 0)
    filled_values[zero_rows, zero_cols] = best_data_complete_df.to_numpy()[zero_rows, zero_cols]
    data_after_preprocess_df.iloc[:, :] = filled_values

    data_after_preprocess_df.to_csv(data_after_complete_path, index=True, header=True)
    print('Data after complete saved in {}'.format(data_after_complete_path))

In [8]:
def svt(mat, tau):
    """Singular value thresholding of a matrix.

    Computes the reduced SVD of ``mat``, subtracts ``tau`` from every singular
    value, clips the result at zero and re-assembles the matrix.

    Args:
        mat: 2-D array.
        tau: Threshold subtracted from the singular values.

    Returns:
        Array of the same shape as ``mat`` with shrunken singular values.
    """
    left, singular_values, right_h = np.linalg.svd(mat, full_matrices = False)
    shrunk = np.maximum(singular_values - tau, 0)
    return left @ np.diag(shrunk) @ right_h

In [9]:
def HaLRTC_imputer(dense_tensor, sparse_tensor, alpha: list, rho: float, epsilon: float, maxiter: int):
    """Impute the missing entries of a tensor with the HaLRTC iteration.

    Entries of ``sparse_tensor`` that are 0 or NaN are treated as missing and
    are the only entries that get overwritten; observed entries are kept.
    ``dense_tensor`` is used only to report errors on the held-out positions
    (missing in ``sparse_tensor`` but non-zero in ``dense_tensor``).

    Args:
        dense_tensor: Reference tensor, same shape as ``sparse_tensor``.
        sparse_tensor: Partially observed tensor (0 or NaN = missing). It is
            not modified; NaNs are replaced on an internal copy.
        alpha: Per-mode weights; needs one entry per tensor mode.
        rho: Initial penalty parameter. It is multiplied by 1.05 at every
            iteration and capped at 1e5.
        epsilon: Convergence threshold on the relative change between
            successive estimates (relative to the norm of the observed tensor).
        maxiter: Iteration budget. At least 30 iterations are always run,
            even when ``maxiter`` is smaller or convergence comes earlier.

    Returns:
        The completed tensor, with every value below 1 raised to 1.
    """
    dim = np.array(sparse_tensor.shape)
    n_modes = len(dim)

    nan_mask = np.isnan(sparse_tensor)
    if nan_mask.any():
        pos_test = np.where((dense_tensor != 0) & nan_mask)
        # Zero the NaNs on a copy so the caller's array is left untouched.
        sparse_tensor = sparse_tensor.copy()
        sparse_tensor[nan_mask] = 0
    else:
        pos_test = np.where((dense_tensor != 0) & (sparse_tensor == 0))
    pos_miss = np.where(sparse_tensor == 0)

    dense_test = dense_tensor[pos_test]
    del dense_tensor  # only the held-out values are needed from here on
    tensor_hat = sparse_tensor.copy()
    B = [np.zeros(sparse_tensor.shape) for _ in range(n_modes)]
    Y = [np.zeros(sparse_tensor.shape) for _ in range(n_modes)]
    last_ten = sparse_tensor.copy()
    snorm = np.linalg.norm(sparse_tensor)

    min_iter = 30  # never stop before this many iterations
    it = 0
    while True:
        rho = min(rho * 1.05, 1e5)
        # Per-mode low-rank update: threshold the singular values of each unfolding.
        # ten2mat / mat2ten are presumably the mode-k unfold / fold helpers from _utils.
        for k in range(n_modes):
            B[k] = mat2ten(svt(ten2mat(tensor_hat + Y[k] / rho, k), alpha[k] / rho), dim, k)
        # Average over all modes (previously hard-coded as 3) and write only the missing entries.
        tensor_hat[pos_miss] = ((sum(B) - sum(Y) / rho) / n_modes)[pos_miss]
        # Multiplier update.
        for k in range(n_modes):
            Y[k] = Y[k] - rho * (B[k] - tensor_hat)
        tol = np.linalg.norm((tensor_hat - last_ten)) / snorm
        last_ten = tensor_hat.copy()
        it += 1
        if it % 50 == 0:
            print('Iter: {}'.format(it))
            print('Tolerance: {:.6}'.format(tol))
            print('MAPE: {:.6}'.format(compute_mape(dense_test, tensor_hat[pos_test])))
            print('RMSE: {:.6}'.format(compute_rmse(dense_test, tensor_hat[pos_test])))
            print()
        if ((tol < epsilon) or (it >= maxiter)) and it >= min_iter:
            break

    # Clamp the estimate from below at 1.
    tensor_hat[tensor_hat < 1] = 1
    print_result(it, tol, dense_test, tensor_hat[pos_test])
    return tensor_hat

In [10]:
class HaLRTC_dataComplete():
    """Run HaLRTC completion for one dataset / experiment and save the results.

    Construction loads the raw data, the normalised data and the sparse
    (masked) data and reshapes them into 3-way tensors. ``run`` performs the
    imputation, writes the completed data and metrics, and plots the result.
    """

    def __init__(self, dataset_name, experiment_name, experiment_args):
        """Resolve all input/output paths and load the data.

        Args:
            dataset_name: One of 'flow_sim', 'pres_sim', 'flow_zcity', 'pres_zcity'.
            experiment_name: Name of the missing-data experiment (sub-folder name).
            experiment_args: Object exposing ``data_complete_path``,
                ``best_data_complete_path``, ``metrics_analysis_path``,
                ``plt_result_path`` and ``final_complete_save_path``.

        Raises:
            ValueError: If ``dataset_name`` is not one of the known datasets.
        """
        if dataset_name == 'flow_sim':
            self.raw_dataset_path = flow_sim_path
        elif dataset_name == 'pres_sim':
            self.raw_dataset_path = pres_sim_path
        elif dataset_name == 'flow_zcity':
            self.raw_dataset_path = flow_zcity_path
        elif dataset_name == 'pres_zcity':
            self.raw_dataset_path = pres_zcity_path
        else:
            # Fail here with a clear message instead of an AttributeError on
            # self.raw_dataset_path a few lines below.
            raise ValueError(
                "Unknown dataset_name '{}'; expected one of "
                "'flow_sim', 'pres_sim', 'flow_zcity', 'pres_zcity'.".format(dataset_name)
            )

        self.raw_data_df = read_csv_data(self.raw_dataset_path)
        self.experiment_args = experiment_args

        self.data_norm_for_completing_path = os.path.join(dataset_sparse_path, dataset_name, 'data_norm_for_completing.csv')
        self.sparse_mat_for_completing_path = os.path.join(dataset_sparse_path, dataset_name, experiment_name, 'sparse_mat_for_completing.csv')
        self.denorm_sparse_for_completing_path = os.path.join(dataset_sparse_path, dataset_name, experiment_name, 'denorm_sparse_for_completing.csv')

        self.data_complete_path = self.experiment_args.data_complete_path
        self.best_data_complete_path = self.experiment_args.best_data_complete_path
        self.metrics_analysis_path = self.experiment_args.metrics_analysis_path
        self.plt_result_path = self.experiment_args.plt_result_path
        self.final_complete_save_path = self.experiment_args.final_complete_save_path

        self._prepareForImputation()

    def _prepareForImputation(self):
        """Load the normalised and sparse data and fold them into 3-way tensors.

        Both tensors end up with axes (CSV column, slot within day, day); the
        number of rows must therefore be a multiple of the slots per day.
        """
        slots_per_day = int(24*60/15)  # 96 fifteen-minute slots per day

        self.norm_data_df = read_csv_data(self.data_norm_for_completing_path)
        self.dense_mat = self.norm_data_df.values
        self.dense_tensor = self.dense_mat.T.reshape([self.dense_mat.shape[1], -1, slots_per_day]).transpose(0, 2, 1)

        self.sparse_mat_df = read_csv_data(self.sparse_mat_for_completing_path)
        # Note: unlike dense_mat, sparse_mat is stored already transposed.
        self.sparse_mat = self.sparse_mat_df.T.values
        self.sparse_tensor = self.sparse_mat.reshape([self.dense_mat.shape[1], -1, slots_per_day]).transpose(0, 2, 1)
        self.denorm_sparse_mat_df = read_csv_data(self.denorm_sparse_for_completing_path)

    def halrtc_complete(self):
        """Impute the sparse tensor, de-normalise, save the results and metrics.

        Writes the completed data to ``data_complete_path`` and
        ``best_data_complete_path`` and two metric CSVs into
        ``metrics_analysis_path``.

        Returns:
            Tuple ``(mape_list, rmse_list, smape_list)`` of per-sensor metrics
            as returned by ``result_analysis``.
        """
        metrics_all_sensors_df = pd.DataFrame(columns=['MAPE', 'RMSE', 'SMAPE'])
        metrics_single_sensor_df = pd.DataFrame(columns=self.raw_data_df.columns)

        start = time.time()
        # Solver hyper-parameters: equal weight per tensor mode.
        alpha = np.ones(3) / 3
        rho = 1e-5
        epsilon = 1e-4
        maxiter = 200
        self.tensor_hat = HaLRTC_imputer(self.dense_tensor, self.sparse_tensor, alpha, rho, epsilon, maxiter)
        # Back to a 2-D table with the index/columns of the normalised data.
        self.mat_hat = ten2mat(self.tensor_hat, 0)
        self.data_complete_df = pd.DataFrame(self.mat_hat.T, index=self.norm_data_df.index, columns=self.norm_data_df.columns)
        self.denorm_data_complete_df = reverse_one2hundred_normalization(self.data_complete_df, self.raw_data_df)

        self.denorm_data_complete_df = self.denorm_data_complete_df.round(3)
        self.denorm_data_complete_df.to_csv(self.data_complete_path, index=True, header=True)

        mape_all_sensors, rmse_all_sensors, smape_all_sensors = compute_true_metrics(self.raw_data_df.values, self.denorm_sparse_mat_df.values, self.denorm_data_complete_df.values)
        print('MAPE: %.6f, RMSE: %.6f, SMAPE: %.6f' % (mape_all_sensors, rmse_all_sensors, smape_all_sensors))
        # A single run is performed, so its result is also the "best" result.
        self.denorm_data_complete_df.to_csv(self.best_data_complete_path, index=True, header=True)

        mape_single_sensor_list, rmse_single_sensor_list, smape_single_sensor_list = result_analysis(self.raw_data_df, self.denorm_sparse_mat_df, self.denorm_data_complete_df)
        metrics_all_sensors_df.loc['all_sensors'] = [mape_all_sensors, rmse_all_sensors, smape_all_sensors]
        metrics_single_sensor_df.loc['MAPE'] = mape_single_sensor_list
        metrics_single_sensor_df.loc['RMSE'] = rmse_single_sensor_list
        metrics_single_sensor_df.loc['SMAPE'] = smape_single_sensor_list
        metrics_single_sensor_df.to_csv(os.path.join(self.metrics_analysis_path, 'metrics_single_sensor.csv'), index=True, header=True)
        metrics_all_sensors_df.to_csv(os.path.join(self.metrics_analysis_path, 'metrics_all_sensors.csv'), index=True, header=True)
        end = time.time()
        print('Running time: %d seconds.' % (end - start))
        return mape_single_sensor_list, rmse_single_sensor_list, smape_single_sensor_list

    def run(self):
        """Run the full pipeline: impute, fill the raw data's zeros, plot."""
        print('\n--------------------------- Beginning ---------------------------')
        best_mape_single_sensor_list, best_rmse_single_sensor_list, best_smape_single_sensor_list = self.halrtc_complete()
        fill_data_after_preprocess(self.best_data_complete_path, self.raw_dataset_path, self.final_complete_save_path)
        plt_result(self.raw_data_df, read_csv_data(self.best_data_complete_path), self.denorm_sparse_mat_df, self.plt_result_path,
                   best_mape_single_sensor_list, best_rmse_single_sensor_list, best_smape_single_sensor_list)
        print('\n--------------------------- Ending ---------------------------')


# 2 Experiments

------------

## 2.1 30% Random Missing

In [None]:
experiment_name = 'random_0.3'
dataset_name_list = ['flow_sim', 'flow_zcity', 'pres_sim', 'pres_zcity']

# Dataset name -> its pre-built configuration object.
args_by_dataset = {
    'flow_sim': flow_sim_args,
    'flow_zcity': flow_zcity_args,
    'pres_sim': pres_sim_args,
    'pres_zcity': pres_zcity_args,
}

for dataset_name in dataset_name_list:
    experiment_args = args_by_dataset[dataset_name].experiment_config(experiment_name)
    halrtc_dataComplete = HaLRTC_dataComplete(dataset_name, experiment_name, experiment_args)
    halrtc_dataComplete.run()

## 2.2 60% Random Missing

In [None]:
experiment_name = 'random_0.6'
dataset_name_list = ['flow_sim', 'flow_zcity', 'pres_sim', 'pres_zcity']

# Dataset name -> its pre-built configuration object.
args_by_dataset = {
    'flow_sim': flow_sim_args,
    'flow_zcity': flow_zcity_args,
    'pres_sim': pres_sim_args,
    'pres_zcity': pres_zcity_args,
}

for dataset_name in dataset_name_list:
    experiment_args = args_by_dataset[dataset_name].experiment_config(experiment_name)
    halrtc_dataComplete = HaLRTC_dataComplete(dataset_name, experiment_name, experiment_args)
    halrtc_dataComplete.run()

## 2.3 90% Random Missing

In [None]:
experiment_name = 'random_0.9'
dataset_name_list = ['flow_sim', 'flow_zcity', 'pres_sim', 'pres_zcity']

# Dataset name -> its pre-built configuration object.
args_by_dataset = {
    'flow_sim': flow_sim_args,
    'flow_zcity': flow_zcity_args,
    'pres_sim': pres_sim_args,
    'pres_zcity': pres_zcity_args,
}

for dataset_name in dataset_name_list:
    experiment_args = args_by_dataset[dataset_name].experiment_config(experiment_name)
    halrtc_dataComplete = HaLRTC_dataComplete(dataset_name, experiment_name, experiment_args)
    halrtc_dataComplete.run()

## 2.4 30% Long-Range Missing

In [None]:
experiment_name = 'long_range_0.3'
dataset_name_list = ['flow_sim', 'flow_zcity', 'pres_sim', 'pres_zcity']

# Dataset name -> its pre-built configuration object.
args_by_dataset = {
    'flow_sim': flow_sim_args,
    'flow_zcity': flow_zcity_args,
    'pres_sim': pres_sim_args,
    'pres_zcity': pres_zcity_args,
}

for dataset_name in dataset_name_list:
    experiment_args = args_by_dataset[dataset_name].experiment_config(experiment_name)
    halrtc_dataComplete = HaLRTC_dataComplete(dataset_name, experiment_name, experiment_args)
    halrtc_dataComplete.run()

## 2.5 60% Long-Range Missing

In [None]:
experiment_name = 'long_range_0.6'
dataset_name_list = ['flow_sim', 'flow_zcity', 'pres_sim', 'pres_zcity']

# Dataset name -> its pre-built configuration object.
args_by_dataset = {
    'flow_sim': flow_sim_args,
    'flow_zcity': flow_zcity_args,
    'pres_sim': pres_sim_args,
    'pres_zcity': pres_zcity_args,
}

for dataset_name in dataset_name_list:
    experiment_args = args_by_dataset[dataset_name].experiment_config(experiment_name)
    halrtc_dataComplete = HaLRTC_dataComplete(dataset_name, experiment_name, experiment_args)
    halrtc_dataComplete.run()

## 2.6 30% Block Missing

In [None]:
experiment_name = 'block_0.3'
dataset_name_list = ['flow_sim', 'flow_zcity', 'pres_sim', 'pres_zcity']

# Dataset name -> its pre-built configuration object.
args_by_dataset = {
    'flow_sim': flow_sim_args,
    'flow_zcity': flow_zcity_args,
    'pres_sim': pres_sim_args,
    'pres_zcity': pres_zcity_args,
}

for dataset_name in dataset_name_list:
    experiment_args = args_by_dataset[dataset_name].experiment_config(experiment_name)
    halrtc_dataComplete = HaLRTC_dataComplete(dataset_name, experiment_name, experiment_args)
    halrtc_dataComplete.run()

## 2.7 60% Block Missing

In [None]:
experiment_name = 'block_0.6'
dataset_name_list = ['flow_sim', 'flow_zcity', 'pres_sim', 'pres_zcity']

# Dataset name -> its pre-built configuration object.
args_by_dataset = {
    'flow_sim': flow_sim_args,
    'flow_zcity': flow_zcity_args,
    'pres_sim': pres_sim_args,
    'pres_zcity': pres_zcity_args,
}

for dataset_name in dataset_name_list:
    experiment_args = args_by_dataset[dataset_name].experiment_config(experiment_name)
    halrtc_dataComplete = HaLRTC_dataComplete(dataset_name, experiment_name, experiment_args)
    halrtc_dataComplete.run()

## 2.8 30% Mix Missing

In [None]:
experiment_name = 'mix_0.3'
dataset_name_list = ['flow_sim', 'flow_zcity', 'pres_sim', 'pres_zcity']

# Dataset name -> its pre-built configuration object.
args_by_dataset = {
    'flow_sim': flow_sim_args,
    'flow_zcity': flow_zcity_args,
    'pres_sim': pres_sim_args,
    'pres_zcity': pres_zcity_args,
}

for dataset_name in dataset_name_list:
    experiment_args = args_by_dataset[dataset_name].experiment_config(experiment_name)
    halrtc_dataComplete = HaLRTC_dataComplete(dataset_name, experiment_name, experiment_args)
    halrtc_dataComplete.run()

## 2.9 50% Mix Missing

In [None]:
experiment_name = 'mix_0.5'
dataset_name_list = ['flow_sim', 'flow_zcity', 'pres_sim', 'pres_zcity']

# Dataset name -> its pre-built configuration object.
args_by_dataset = {
    'flow_sim': flow_sim_args,
    'flow_zcity': flow_zcity_args,
    'pres_sim': pres_sim_args,
    'pres_zcity': pres_zcity_args,
}

for dataset_name in dataset_name_list:
    experiment_args = args_by_dataset[dataset_name].experiment_config(experiment_name)
    halrtc_dataComplete = HaLRTC_dataComplete(dataset_name, experiment_name, experiment_args)
    halrtc_dataComplete.run()

## 2.10 70% Mix Missing

In [None]:
experiment_name = 'mix_0.7'
dataset_name_list = ['flow_sim', 'flow_zcity', 'pres_sim', 'pres_zcity']

# Dataset name -> its pre-built configuration object.
args_by_dataset = {
    'flow_sim': flow_sim_args,
    'flow_zcity': flow_zcity_args,
    'pres_sim': pres_sim_args,
    'pres_zcity': pres_zcity_args,
}

for dataset_name in dataset_name_list:
    experiment_args = args_by_dataset[dataset_name].experiment_config(experiment_name)
    halrtc_dataComplete = HaLRTC_dataComplete(dataset_name, experiment_name, experiment_args)
    halrtc_dataComplete.run()