In [1]:
# Project-local helpers and settings. Names used further down without an explicit
# import (np, pd, os, read_csv_data, create_folder, result_folder_path, ...) are
# presumably provided by these two star imports — TODO confirm.
from _utils import *
from config import *
from sklearn.impute import KNNImputer

import time
import argparse
import warnings
# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Seed NumPy's global random number generator.
np.random.seed(28)

# 1 Functions

In [2]:
class args_config():
    """Result-folder layout for the KNN model on one dataset.

    Creating an instance creates ``<result_folder_path>/KNN`` and
    ``<result_folder_path>/KNN/<dataset_name>`` through ``create_folder``.
    """

    def __init__(self, dataset_name):
        """Create the model-level and dataset-level result folders.

        Args:
            dataset_name: Name of the dataset; used as the sub-folder name.
        """
        self.model_name = 'KNN'
        model_dir = os.path.join(result_folder_path, self.model_name)
        self.model_result_save_path = model_dir
        create_folder(model_dir)

        self.dataset_name = dataset_name
        dataset_dir = os.path.join(model_dir, self.dataset_name)
        self.dataset_result_save_path = dataset_dir
        create_folder(dataset_dir)

    def experiment_config(self, experiment_name):
        """Create the folders of one experiment and return its output paths.

        Args:
            experiment_name: Sub-folder name of the experiment inside the
                dataset result folder.

        Returns:
            The ``argparse.Namespace`` of known arguments, holding
            ``data_complete_path``, ``best_data_complete_path``,
            ``metrics_analysis_path``, ``plt_result_path``, ``best_args_path``
            and ``final_complete_save_path``.
        """
        experiment_dir = os.path.join(self.dataset_result_save_path, experiment_name)
        self.experiment_result_save_path = experiment_dir
        create_folder(experiment_dir)

        # (option name, file or folder name inside the experiment folder)
        artefacts = (
            ('data_complete_path', 'data_complete'),
            ('best_data_complete_path', 'best_data_complete.csv'),
            ('metrics_analysis_path', 'metrics_analysis'),
            ('plt_result_path', 'plt_result.png'),
            ('best_args_path', 'best_args_config.yml'),
            ('final_complete_save_path', 'final_complete.csv'),
        )
        default_paths = {}
        for option_name, leaf_name in artefacts:
            default_paths[option_name] = os.path.join(experiment_dir, leaf_name)

        # Only the two folder-type outputs have to exist up front.
        create_folder([default_paths['data_complete_path'], default_paths['metrics_analysis_path']])

        parser = argparse.ArgumentParser(description='data complete configs')
        for option_name, default_path in default_paths.items():
            parser.add_argument('--' + option_name, type=str, default=default_path)

        # parse_known_args skips command-line options this parser does not define.
        known_args, _unknown_args = parser.parse_known_args()
        return known_args

In [3]:
# One result-folder configuration per dataset, built in the order listed.
flow_sim_args, flow_zcity_args, pres_sim_args, pres_zcity_args = [
    args_config(name) for name in ('flow_sim', 'flow_zcity', 'pres_sim', 'pres_zcity')
]

In [4]:
def compute_true_metrics(raw_data_value, denorm_sparse_value, denorm_data_complete_value):
    """Score completed values on the entries that were removed from the raw data.

    Only positions where the raw array is non-zero and the sparse array is zero
    are evaluated (zero presumably encodes "missing" — confirm with the data
    preparation step).

    Args:
        raw_data_value: Array of ground-truth values.
        denorm_sparse_value: Array of the sparsified data, same shape.
        denorm_data_complete_value: Array of the completed data, same shape.

    Returns:
        Tuple ``(mape, rmse, smape)`` as returned by ``compute_mape``,
        ``compute_rmse`` and ``compute_smape`` on the selected entries.
    """
    held_out = np.nonzero((denorm_sparse_value == 0) & (raw_data_value != 0))

    scores = []
    for metric_fn in (compute_mape, compute_rmse, compute_smape):
        scores.append(metric_fn(raw_data_value[held_out], denorm_data_complete_value[held_out]))

    true_mape, true_rmse, true_smape = scores
    return true_mape, true_rmse, true_smape

In [5]:
def fill_data_after_preprocess(best_data_complete_path, data_after_preprocess_path, data_after_complete_path):
    """Replace the zero entries of a preprocessed dataset with completed values.

    Both CSV files are loaded with ``read_csv_data``. Every cell of the
    preprocessed data that equals 0 is overwritten by the cell at the same
    row/column *position* of the completed data; all other cells are kept.
    The result is written to ``data_after_complete_path``.

    Args:
        best_data_complete_path: CSV holding the completed (imputed) data.
        data_after_preprocess_path: CSV holding the data whose zeros are filled.
        data_after_complete_path: Destination CSV (written with index and header).

    Raises:
        IndexError: If a zero cell lies outside the bounds of the completed data.
    """
    best_data_complete_df = read_csv_data(best_data_complete_path)
    data_after_preprocess_df = read_csv_data(data_after_preprocess_path).astype('float64')

    # Work on a private copy of the values and fill all zero cells in a single
    # positional assignment instead of one scalar .iloc get/set per cell.
    filled_values = data_after_preprocess_df.to_numpy(copy=True)
    zero_rows, zero_cols = np.nonzero(filled_values == 0)
    if zero_rows.size:
        best_values = best_data_complete_df.to_numpy()
        filled_values[zero_rows, zero_cols] = best_values[zero_rows, zero_cols]

    data_after_complete_df = pd.DataFrame(
        filled_values,
        index=data_after_preprocess_df.index,
        columns=data_after_preprocess_df.columns,
    )
    data_after_complete_df.to_csv(data_after_complete_path, index=True, header=True)
    print('Data after complete saved in {}'.format(data_after_complete_path))

In [6]:
class KNN_dataComplete():
    """Grid-search ``n_neighbors`` of scikit-learn's ``KNNImputer`` for one experiment.

    The sparse data of ``dataset_name``/``experiment_name`` is imputed once per
    candidate ``n_neighbors``; the candidate with the lowest overall RMSE on the
    held-out entries is kept as the best completion.
    """

    # Candidate neighbour counts tried for every dataset.
    N_NEIGHBORS_SEARCH_SPACE = (5, 10, 15, 20, 30)

    def __init__(self, dataset_name, experiment_name, experiment_args):
        """Load the raw and sparse data and remember the output locations.

        Args:
            dataset_name: One of 'flow_sim', 'pres_sim', 'flow_zcity', 'pres_zcity'.
            experiment_name: Sub-folder of the sparse dataset to complete.
            experiment_args: Object exposing ``data_complete_path``,
                ``best_data_complete_path``, ``metrics_analysis_path``,
                ``plt_result_path``, ``best_args_path`` and
                ``final_complete_save_path``.

        Raises:
            ValueError: If ``dataset_name`` is not a known dataset.
        """
        if dataset_name == 'flow_sim':
            self.raw_dataset_path = flow_sim_path
        elif dataset_name == 'pres_sim':
            self.raw_dataset_path = pres_sim_path
        elif dataset_name == 'flow_zcity':
            self.raw_dataset_path = flow_zcity_path
        elif dataset_name == 'pres_zcity':
            self.raw_dataset_path = pres_zcity_path
        else:
            # Fail here with a clear message instead of an AttributeError later on.
            raise ValueError(
                "Unknown dataset_name {!r}; expected one of 'flow_sim', 'pres_sim', "
                "'flow_zcity', 'pres_zcity'".format(dataset_name)
            )
        # Per-instance list so callers may still adjust it before calling run().
        self.n_neighbors_search_space = list(self.N_NEIGHBORS_SEARCH_SPACE)

        self.experiment_args = experiment_args

        self.raw_data_df = read_csv_data(self.raw_dataset_path)

        self.sparse_for_completing_path = os.path.join(dataset_sparse_path, dataset_name, experiment_name, 'denorm_sparse_for_completing.csv')
        self.sparse_data_df = read_csv_data(self.sparse_for_completing_path)

        self.data_complete_path = self.experiment_args.data_complete_path
        self.best_data_complete_path = self.experiment_args.best_data_complete_path
        self.metrics_analysis_path = self.experiment_args.metrics_analysis_path
        self.plt_result_path = self.experiment_args.plt_result_path
        self.best_args_path = self.experiment_args.best_args_path
        self.final_complete_save_path = self.experiment_args.final_complete_save_path

    def knn_data_complete(self):
        """Impute the sparse data for every candidate ``n_neighbors`` and score it.

        For each candidate the completed data (rounded to 3 decimals) is saved to
        ``<data_complete_path>/n_neighbors=<k>.csv``; the completion with the
        lowest overall RMSE is additionally saved to ``best_data_complete_path``.
        Overall and per-sensor metric tables are written to
        ``metrics_analysis_path`` after the search.

        Returns:
            Tuple ``(best_n_neighbors, best_mape, best_rmse, best_smape)`` where
            the last three are the per-sensor metric arrays of the best candidate.

        Raises:
            RuntimeError: If no candidate yields an RMSE below infinity (empty
                search space, or every RMSE is NaN/inf).
        """
        sensor_columns = self.raw_data_df.columns
        metrics_all_sensors_df = pd.DataFrame(columns=['MAPE', 'RMSE', 'SMAPE'])
        mape_single_sensor_df = pd.DataFrame(columns=sensor_columns)
        rmse_single_sensor_df = pd.DataFrame(columns=sensor_columns)
        smape_single_sensor_df = pd.DataFrame(columns=sensor_columns)
        best_rmse = np.inf
        best_n_neighbors = None
        total_epoch = len(self.n_neighbors_search_space)

        # The imputer input does not depend on n_neighbors: zeros become NaN so
        # that KNNImputer treats them as missing. Built once, reused every epoch.
        sparse_nan_df = self.sparse_data_df.replace(0, np.nan)

        for current_epoch, n_neighbors in enumerate(self.n_neighbors_search_space, start=1):
            start = time.time()
            print('\nepoch: %d/%d, n_neighbors: %d' % (current_epoch, total_epoch, n_neighbors))
            row_label = 'n_neighbors=' + str(n_neighbors)

            imputer = KNNImputer(n_neighbors=n_neighbors, weights='uniform', metric='nan_euclidean')
            data_complete_value = imputer.fit_transform(sparse_nan_df)
            data_complete_df = pd.DataFrame(data_complete_value, columns=self.sparse_data_df.columns, index=self.sparse_data_df.index)
            save_path = os.path.join(self.data_complete_path, row_label + '.csv')

            data_complete_df = data_complete_df.round(3)
            data_complete_df.to_csv(save_path, index=True, header=True)

            # NOTE: the overall metrics use the unrounded imputer output, while the
            # per-sensor metrics below use the rounded frame (kept as in the original).
            mape_all_sensors, rmse_all_sensors, smape_all_sensors = compute_true_metrics(self.raw_data_df.values, self.sparse_data_df.values, data_complete_value)
            print('MAPE: %.6f, RMSE: %.6f, SMAPE: %.6f' % (mape_all_sensors, rmse_all_sensors, smape_all_sensors))
            metrics_all_sensors_df.loc[row_label] = [mape_all_sensors, rmse_all_sensors, smape_all_sensors]

            # Keep the completion with the lowest overall RMSE seen so far.
            if rmse_all_sensors < best_rmse:
                best_rmse = rmse_all_sensors
                best_n_neighbors = n_neighbors
                data_complete_df.to_csv(self.best_data_complete_path, index=True, header=True)

            mape_single_sensor_list, rmse_single_sensor_list, smape_single_sensor_list = result_analysis(self.raw_data_df, self.sparse_data_df, data_complete_df)
            mape_single_sensor_df.loc[row_label] = mape_single_sensor_list
            rmse_single_sensor_df.loc[row_label] = rmse_single_sensor_list
            smape_single_sensor_df.loc[row_label] = smape_single_sensor_list

            end = time.time()
            print('Running time: %d seconds'%(end - start))

        metrics_all_sensors_df.to_csv(os.path.join(self.metrics_analysis_path, 'metrics_all_sensors.csv'), index=True, header=True)
        mape_single_sensor_df.to_csv(os.path.join(self.metrics_analysis_path, 'mape_single_sensor.csv'), index=True, header=True)
        rmse_single_sensor_df.to_csv(os.path.join(self.metrics_analysis_path, 'rmse_single_sensor.csv'), index=True, header=True)
        smape_single_sensor_df.to_csv(os.path.join(self.metrics_analysis_path, 'smape_single_sensor.csv'), index=True, header=True)

        if best_n_neighbors is None:
            # Without this guard the lookup below fails with an opaque KeyError.
            raise RuntimeError(
                'No candidate in n_neighbors_search_space produced an RMSE below infinity '
                '(empty search space or NaN/inf RMSE); cannot select a best completion.'
            )

        best_label = 'n_neighbors=' + str(best_n_neighbors)
        best_mape_single_sensor_list = mape_single_sensor_df.loc[best_label].values
        best_rmse_single_sensor_list = rmse_single_sensor_df.loc[best_label].values
        best_smape_single_sensor_list = smape_single_sensor_df.loc[best_label].values
        return best_n_neighbors, best_mape_single_sensor_list, best_rmse_single_sensor_list, best_smape_single_sensor_list

    def run(self):
        """Run the search, then persist the best setting, the filled data and the plot."""
        best_n_neighbors, best_mape_single_sensor_list, best_rmse_single_sensor_list, best_smape_single_sensor_list = self.knn_data_complete()

        write_yaml(self.best_args_path, {'best_n_neighbors': best_n_neighbors})
        # Zero cells of the raw dataset are filled from the best completion.
        fill_data_after_preprocess(self.best_data_complete_path, self.raw_dataset_path, self.final_complete_save_path)
        plt_result(self.raw_data_df, read_csv_data(self.best_data_complete_path), self.sparse_data_df, self.plt_result_path,
                   best_mape_single_sensor_list, best_rmse_single_sensor_list, best_smape_single_sensor_list)

# 2 Experiment

----

## 2.1 30% Random Missing

In [None]:
experiment_name = 'random_0.3'
dataset_name_list = ['flow_sim', 'flow_zcity', 'pres_sim', 'pres_zcity']

# Result configuration of each dataset, keyed by dataset name.
args_by_dataset = {
    'flow_sim': flow_sim_args,
    'flow_zcity': flow_zcity_args,
    'pres_sim': pres_sim_args,
    'pres_zcity': pres_zcity_args,
}

# Run the KNN completion of this experiment on every dataset in turn.
for dataset_name in dataset_name_list:
    experiment_args = args_by_dataset[dataset_name].experiment_config(experiment_name)
    knn_dataComplete = KNN_dataComplete(dataset_name, experiment_name, experiment_args)
    knn_dataComplete.run()

## 2.2 60% Random Missing

In [None]:
experiment_name = 'random_0.6'
dataset_name_list = ['flow_sim', 'flow_zcity', 'pres_sim', 'pres_zcity']

# Result configuration of each dataset, keyed by dataset name.
args_by_dataset = {
    'flow_sim': flow_sim_args,
    'flow_zcity': flow_zcity_args,
    'pres_sim': pres_sim_args,
    'pres_zcity': pres_zcity_args,
}

# Run the KNN completion of this experiment on every dataset in turn.
for dataset_name in dataset_name_list:
    experiment_args = args_by_dataset[dataset_name].experiment_config(experiment_name)
    knn_dataComplete = KNN_dataComplete(dataset_name, experiment_name, experiment_args)
    knn_dataComplete.run()

## 2.3 90% Random Missing

In [None]:
experiment_name = 'random_0.9'
dataset_name_list = ['flow_sim', 'flow_zcity', 'pres_sim', 'pres_zcity']

# Result configuration of each dataset, keyed by dataset name.
args_by_dataset = {
    'flow_sim': flow_sim_args,
    'flow_zcity': flow_zcity_args,
    'pres_sim': pres_sim_args,
    'pres_zcity': pres_zcity_args,
}

# Run the KNN completion of this experiment on every dataset in turn.
for dataset_name in dataset_name_list:
    experiment_args = args_by_dataset[dataset_name].experiment_config(experiment_name)
    knn_dataComplete = KNN_dataComplete(dataset_name, experiment_name, experiment_args)
    knn_dataComplete.run()

## 2.4 30% Long-Range Missing

In [None]:
experiment_name = 'long_range_0.3'
dataset_name_list = ['flow_sim', 'flow_zcity', 'pres_sim', 'pres_zcity']

# Result configuration of each dataset, keyed by dataset name.
args_by_dataset = {
    'flow_sim': flow_sim_args,
    'flow_zcity': flow_zcity_args,
    'pres_sim': pres_sim_args,
    'pres_zcity': pres_zcity_args,
}

# Run the KNN completion of this experiment on every dataset in turn.
for dataset_name in dataset_name_list:
    experiment_args = args_by_dataset[dataset_name].experiment_config(experiment_name)
    knn_dataComplete = KNN_dataComplete(dataset_name, experiment_name, experiment_args)
    knn_dataComplete.run()

## 2.5 60% Long-Range Missing

In [None]:
experiment_name = 'long_range_0.6'
dataset_name_list = ['flow_sim', 'flow_zcity', 'pres_sim', 'pres_zcity']

# Result configuration of each dataset, keyed by dataset name.
args_by_dataset = {
    'flow_sim': flow_sim_args,
    'flow_zcity': flow_zcity_args,
    'pres_sim': pres_sim_args,
    'pres_zcity': pres_zcity_args,
}

# Run the KNN completion of this experiment on every dataset in turn.
for dataset_name in dataset_name_list:
    experiment_args = args_by_dataset[dataset_name].experiment_config(experiment_name)
    knn_dataComplete = KNN_dataComplete(dataset_name, experiment_name, experiment_args)
    knn_dataComplete.run()

## 2.6 30% Block Missing

In [None]:
experiment_name = 'block_0.3'
dataset_name_list = ['flow_sim', 'flow_zcity', 'pres_sim', 'pres_zcity']

# Result configuration of each dataset, keyed by dataset name.
args_by_dataset = {
    'flow_sim': flow_sim_args,
    'flow_zcity': flow_zcity_args,
    'pres_sim': pres_sim_args,
    'pres_zcity': pres_zcity_args,
}

# Run the KNN completion of this experiment on every dataset in turn.
for dataset_name in dataset_name_list:
    experiment_args = args_by_dataset[dataset_name].experiment_config(experiment_name)
    knn_dataComplete = KNN_dataComplete(dataset_name, experiment_name, experiment_args)
    knn_dataComplete.run()

## 2.7 60% Block Missing

In [None]:
experiment_name = 'block_0.6'
dataset_name_list = ['flow_sim', 'flow_zcity', 'pres_sim', 'pres_zcity']

# Result configuration of each dataset, keyed by dataset name.
args_by_dataset = {
    'flow_sim': flow_sim_args,
    'flow_zcity': flow_zcity_args,
    'pres_sim': pres_sim_args,
    'pres_zcity': pres_zcity_args,
}

# Run the KNN completion of this experiment on every dataset in turn.
for dataset_name in dataset_name_list:
    experiment_args = args_by_dataset[dataset_name].experiment_config(experiment_name)
    knn_dataComplete = KNN_dataComplete(dataset_name, experiment_name, experiment_args)
    knn_dataComplete.run()

## 2.8 30% Mix Missing

In [None]:
experiment_name = 'mix_0.3'
dataset_name_list = ['flow_sim', 'flow_zcity', 'pres_sim', 'pres_zcity']

# Result configuration of each dataset, keyed by dataset name.
args_by_dataset = {
    'flow_sim': flow_sim_args,
    'flow_zcity': flow_zcity_args,
    'pres_sim': pres_sim_args,
    'pres_zcity': pres_zcity_args,
}

# Run the KNN completion of this experiment on every dataset in turn.
for dataset_name in dataset_name_list:
    experiment_args = args_by_dataset[dataset_name].experiment_config(experiment_name)
    knn_dataComplete = KNN_dataComplete(dataset_name, experiment_name, experiment_args)
    knn_dataComplete.run()

## 2.9 50% Mix Missing

In [None]:
experiment_name = 'mix_0.5'
dataset_name_list = ['flow_sim', 'flow_zcity', 'pres_sim', 'pres_zcity']

# Result configuration of each dataset, keyed by dataset name.
args_by_dataset = {
    'flow_sim': flow_sim_args,
    'flow_zcity': flow_zcity_args,
    'pres_sim': pres_sim_args,
    'pres_zcity': pres_zcity_args,
}

# Run the KNN completion of this experiment on every dataset in turn.
for dataset_name in dataset_name_list:
    experiment_args = args_by_dataset[dataset_name].experiment_config(experiment_name)
    knn_dataComplete = KNN_dataComplete(dataset_name, experiment_name, experiment_args)
    knn_dataComplete.run()

## 2.10 70% Mix Missing

In [None]:
experiment_name = 'mix_0.7'
dataset_name_list = ['flow_sim', 'flow_zcity', 'pres_sim', 'pres_zcity']

# Result configuration of each dataset, keyed by dataset name.
args_by_dataset = {
    'flow_sim': flow_sim_args,
    'flow_zcity': flow_zcity_args,
    'pres_sim': pres_sim_args,
    'pres_zcity': pres_zcity_args,
}

# Run the KNN completion of this experiment on every dataset in turn.
for dataset_name in dataset_name_list:
    experiment_args = args_by_dataset[dataset_name].experiment_config(experiment_name)
    knn_dataComplete = KNN_dataComplete(dataset_name, experiment_name, experiment_args)
    knn_dataComplete.run()