In [None]:
import random
import numpy as np
import matplotlib.pyplot as plt
import os
import csv
import pickle
from typing import Union, List
import pandas as pd
from MyDataSet import MultiMaskTimeSeriesDataset

def time_unit_to_nanoseconds(time_unit: str):
    """Return the length of one ``time_unit`` expressed in nanoseconds.

    Args:
        time_unit: Unit name. ``'year'`` and ``'week'`` are handled here;
            any other value is forwarded to ``pd.Timedelta`` unchanged.

    Returns:
        A float for ``'year'`` (computed from a 365.2425-day year), otherwise
        the integer ``.value`` of the corresponding ``pd.Timedelta``.
    """
    # A year is computed by hand rather than through pd.Timedelta.
    if time_unit == 'year':
        return 365.2425 * 24 * 60 * 60 * 10**9

    # 'week' is translated to the 'W' alias before building the Timedelta.
    alias = 'W' if time_unit == 'week' else time_unit
    one_unit = pd.Timedelta('1' + alias)
    return one_unit.value

def datetime_encoded(dataset : pd.DataFrame, units: Union[str, List]) -> pd.DataFrame:
    r"""Encode periodic time columns as sine/cosine covariates.

    For every unit in ``units`` the column ``dataset[unit]`` is multiplied by
    :math:`2\pi / T`, where :math:`T` is the unit's length in nanoseconds as
    returned by ``time_unit_to_nanoseconds``. The :math:`\sin` and
    :math:`\cos` of the resulting angle are stored as ``<unit>_sin`` and
    ``<unit>_cos``.

    Args:
        dataset: Frame holding one column per requested unit. The values are
            presumably elapsed nanoseconds within that unit (they are divided
            by the unit's nanosecond length) -- confirm against callers.
        units: A single unit name or a list of unit names. Each name must be
            a column of ``dataset``.

    Returns:
        A ``float32`` DataFrame with two columns (sin, cos) per unit.
    """
    # A bare string would otherwise be iterated character by character
    # ('day' -> 'd', 'a', 'y'), so wrap it to honour the declared signature.
    if isinstance(units, str):
        units = [units]

    features = dict()
    for unit in units:
        period_ns = time_unit_to_nanoseconds(unit)
        # Position inside the period, expressed as an angle in radians.
        angle = dataset[unit] * (2 * np.pi / period_ns)
        features[unit + '_sin'] = np.sin(angle)
        features[unit + '_cos'] = np.cos(angle)
    return pd.DataFrame(features, dtype=np.float32)


# One reference day sampled every 5 minutes (288 steps).
df = pd.DataFrame({'time': pd.date_range('2020-01-01', periods=288, freq='5min')})

# Nanoseconds elapsed since midnight for every step; this is the quantity
# that datetime_encoded scales by the length of a day.
df['day'] = df['time'].dt.time.apply(
    lambda t: (t.hour * 3600 + t.minute * 60 + t.second) * 10**9
)

# Two columns (day_sin, day_cos) per time step.
dateencoded = datetime_encoded(df, ['day'])
print(dateencoded.shape)

file_path = r'D:\WorkPath\Models\ImputeFormer\Data\raw_data\San Diego\Freeways-data_015.csv'

# The first CSV row supplies the column names.
data = pd.read_csv(file_path, header=0)

# Distinct IDs and dates are counted on the raw frame, i.e. before the
# missing values are replaced below.
node_num = len(pd.unique(data['ID']))
date_len = len(pd.unique(data['date']))

# Every missing entry becomes 0.
data = data.fillna(0)
data

In [None]:
# Fold the flat "Average Speed" column into a (date, 288 steps per day, ID) cube.
# NOTE(review): this assumes the CSV rows are ordered by date, then time of
# day, then ID -- confirm against the raw file.
speed_matrix = data["Average Speed"].values.reshape(date_len, 288, node_num)
speed_matrix.shape

In [None]:

# Heat map for the date at index 27, time steps 12..263, all IDs.
# Transposed so that time runs along the x axis and IDs along the y axis.
heatmap = speed_matrix[27, 12:264, :].T

fig, ax = plt.subplots(figsize=(15, 6))
image = ax.imshow(heatmap, aspect='auto', cmap='jet', origin='lower')
fig.colorbar(image)
plt.show()

In [None]:
# Flatten the first 40 dates, time steps 72..263, all IDs into a single row.
x = speed_matrix[:40, 6 * 12:264, :].reshape(1, -1)

# Distribution of the selected values over 100 bins.
fig, ax = plt.subplots()
ax.hist(x[0], bins=100)
plt.show()

In [12]:
# Route identification and the window of IDs taken along it.
route, direction = '015', 'N'
route_start, route_len = 0, 32

# Masking settings handed to the dataset builder.
missing_rate = 0.3
missing_type = ['random', 'linear', 'block', 'mixed']
missing_type_ind = 0
num_masks = 15

# Output split names; data_type_ind selects the entry used for the output folder.
data_type = ['source_train', 'source_test', 'target_train', 'target_test']
data_type_ind = 0

In [None]:
# Keep only the first 40 entries along the date axis; the other axes are untouched.
hot_map_data = speed_matrix[:40]
hot_map_data.shape

In [14]:
train_ratio = 0.8
test_ratio = 0.2  # not read below: the test split is whatever the training split leaves over
missing_rate = 0.3

# Shuffle the date indices once, with a fixed seed, so the split is repeatable.
random.seed(0)
x = list(range( int(hot_map_data.shape[0])  ))
random.shuffle(x)

# Split along the date axis: the first `train_ratio` share of the shuffled
# indices is used for training, the remainder for testing.
split_at = int(train_ratio * hot_map_data.shape[0])
train_days, test_days = x[:split_at], x[split_at:]

# Output folders do not depend on the loop variables, so build them once.
data_root = r'D:\WorkPath\Models\ImputeFormer\Data'
save_path = os.path.join(data_root, data_type[data_type_ind]) + '_SDG015'
save_path_test = os.path.join(data_root, data_type[data_type_ind + 1]) + '_SDG015'

# pd.to_pickle does not create missing parent directories.
for out_dir in (save_path, save_path_test):
    os.makedirs(out_dir, exist_ok=True)

# NOTE(review): range(3) covers 'random', 'linear' and 'block' only; the
# fourth entry of missing_type ('mixed') is never generated -- confirm intended.
for missing_type_ind in range(3):
    # NOTE(review): 51 and 32 look like the ID count and route_len written
    # as literals -- confirm before changing either value.
    for route_start in range(0,51-32,6):
        for time_start in [6,10,14,18]:
            # Window of 4 * 12 time steps starting at step time_start * 12.
            time_window = slice(time_start * 12, (time_start + 4) * 12)
            # route_len consecutive IDs starting at route_start.
            route_window = slice(route_start, route_start + route_len)

            file_name = 'SanDie_go_speed_{}-{}_{}_{}_{}.pkl'.format(
                route, time_start, route_start,
                int(missing_rate * 100), missing_type[missing_type_ind])

            # Training split first, then test split. The last positional
            # argument is num_masks for training and 5 for testing.
            for day_ids, mask_count, out_dir in (
                (train_days, num_masks, save_path),
                (test_days, 5, save_path_test),
            ):
                dataset = MultiMaskTimeSeriesDataset(
                    hot_map_data[day_ids, time_window, route_window],
                    dateencoded.values[time_window, :],
                    missing_rate, missing_type[missing_type_ind], mask_count)

                # Save the dataset
                dataset_file = os.path.join(out_dir, file_name)
                pd.to_pickle(dataset, dataset_file)