In [21]:
import numpy as np
import pandas as pd

def data_loader_air (file_name = 'datasets/pm25_missing.csv', seq_len = 36) :
  """Load complete data and introduce missingness.
  
  Args:
    - file_name: the location of file to be loaded
    - seq_len: sequence length
    - missing_rate: rate of missing data to be introduced
    
  Returns:
    - x: data with missing values
    - m: observation indicator (m=1: observe, m=0: missing)
    - t: time information (time difference between two measurments)
    - ori_x: original data without missing values (for evaluation)
  """
  
  # Load the dataset
  df = pd.read_csv('datasets/pm25_missing_value.csv')
  ori_df = pd.read_csv('datasets/pm25_ground_value.csv')

  df_scaler = (df - df.mean()) / df.std()
  ori_scaler = (ori_df - df.mean()) / df.std()

  data = df_scaler.values
  ori_data = ori_scaler.values
  data = data[::-1]
  ori_data = ori_data[::-1]
  
  # Parameters
  no, dim = data.shape
  no = no // seq_len

  masks = ~np.isnan(data)
  gt_masks = ~np.isnan(ori_data)
  
  # Define original data
  ori_x = list()  
  for i in range(no):
    temp_ori_x = ori_data[i*seq_len:((i+1)*seq_len)]
    ori_x = ori_x + [temp_ori_x]
  
  x = list()
  for i in range(no):
    temp_x = data[i:(i+seq_len)]
    x = x + [temp_x]
    
  # Introduce missingness
  m = list()
  gt_m = list()
  t = list()
  
  for i in range(no):
    # m
    temp_m = masks[i:(i+seq_len)]
    m = m + [temp_m]

    gt_temp_m = gt_masks[i:(i+seq_len)]
    gt_m = gt_m + [gt_temp_m]
    
    # t
    temp_t = np.ones([seq_len, dim])
    for j in range(dim):
      for k in range(1, seq_len):
        if temp_m[k, j] == 0:
          temp_t[k, j] = temp_t[k-1, j] + 1
    t = t + [temp_t]
    
  # Convert into 3d numpy array
  x = np.asarray(x)
  m = np.asarray(m)
  gt_m = np.asarray(gt_m)
  t = np.asarray(t)
  ori_x = np.asarray(ori_x)  
  
  # Fill 0 to the missing values
  x = np.nan_to_num(x, 0.)
  ori_x = np.nan_to_num(ori_x, 0.)

  return x, m, t, ori_x, gt_m

In [14]:
import pandas as pd
df = pd.read_csv("./datasets/pm25_missing.csv")
gt = pd.read_csv("./datasets/pm25_ground.csv")

In [12]:
df= df[df.columns[1:]]

In [13]:
df.to_csv("./datasets/pm25_missing_value.csv", index= False)

In [15]:
gt= gt[gt.columns[1:]]

In [16]:
gt.to_csv("./datasets/pm25_ground_value.csv", index= False)

In [26]:
x, m, t, ori_x,gt_m = data_loader_air()

In [27]:
x.shape

(8723, 36, 36)

In [28]:
m.shape

(8723, 36, 36)

In [29]:
x, m, t, ori_x= data_loader()

In [5]:
x.shape

(725, 7, 5)

In [30]:
m.shape

(725, 7, 5)

In [31]:
t.shape

(725, 7, 5)

In [6]:
ori_x.shape

(725, 7, 5)

In [25]:
def imputation_performance_air (ori_x, imputed_x, gt_m, m, metric_name):
  """Performance metrics for imputation.
  
  Args:
    - ori_x: original complete data (without missing values)
    - imputed_x: imputed data from incomplete data
    - m: observation indicator
    - metric_name: mae, mse, or rmse
    
  Returns:
    - performance: imputation performance in terms or mae, mse, or rmse
  """
  
  assert metric_name in ['mae','mse','rmse']
  
  no, seq_len, dim = ori_x.shape
  
  # Reshape 3d array to 2d array
  ori_x = np.reshape(ori_x, [no * seq_len, dim])
  imputed_x = np.reshape(imputed_x, [no * seq_len, dim])
  m = np.reshape(m, [no * seq_len, dim])
  gt_m = np.reshape(gt_m, [no * seq_len, dim])
  
  # Only compute the imputation performance if m = 0 (missing)
  if metric_name == 'mae':
    performance = mean_absolute_error(ori_x, imputed_x, gt_m - m)
  elif metric_name == 'mse':
    performance = mean_squared_error(ori_x, imputed_x, gt_m - m)
  elif metric_name == 'rmse':
    performance = np.sqrt(mean_squared_error(ori_x, imputed_x, gt_m - m))
    
  return performance

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 14)

In [37]:
mx = np.zeros([1,36,1])

In [41]:
idx = np.where(mx[0,:,0]==1)[0]

In [53]:
8759 // 36

243

In [49]:
import pickle
with open("output.pickle", 'rb') as fw:
    data = pickle.load(fw)

UnicodeDecodeError: 'ascii' codec can't decode byte 0x8e in position 2: ordinal not in range(128)

In [51]:
import pickle

# 바이너리 파일을 열 때는 'rb' 모드로 열기
with open("output.pickle", 'rb') as fw:
    data = pickle.load(fw)


UnicodeDecodeError: 'ascii' codec can't decode byte 0x8e in position 2: ordinal not in range(128)