In [1]:
import os
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd

In [3]:
# Directory holding the Beijing air-quality HDF5 store. The default is the original
# location; set the BEIJING_DATA_DIR environment variable to run on another machine.
DATA_DIR = Path(os.environ.get('BEIJING_DATA_DIR', '/data/Datastore/Beijing'))
data = DATA_DIR / 'pm2_5_df.h5'
# Displayed so a missing file is obvious before the load in the next cell.
data.exists()

In [4]:
# Load the frame stored under key 'df' in the HDF5 file located above.
df = pd.read_hdf(path_or_buf=data, key='df')

In [5]:
# Preview the first 20 rows.
df.head(20)

In [13]:
import numpy as np
# NO2 reading of the third row (position 2), kept for the NaN check in the next cell.
item = df.iloc[2].loc['NO2']

In [17]:
# Check whether the sampled NO2 reading is NaN.
np.isnan(item)

In [5]:
# Row selector for the window used to build `brits_df`: 2014-05-01 up to and including
# all of 2015-04-30. The upper bound is the exclusive start of May because
# `<= datetime(2015, 4, 30)` compares against midnight and would drop every record
# timestamped later on 30 April.
mask = (df['datetime'] >= datetime(year=2014, month=5, day=1)) & (df['datetime'] < datetime(year=2015, month=5, day=1))

In [6]:
# Keep only the rows inside the window selected by `mask`.
brits_df = df.loc[mask]

In [7]:
# Month-level split of the 2014-05 .. 2015-04 window: June, September, December and March
# are held out for testing, the remaining months are used for training.
# Every window is half-open, [start, end), so consecutive windows never share a row.
training_masks = [
    (df['datetime'] >= start) & (df['datetime'] < end)
    for start, end in [
        (datetime(year=2014, month=5, day=1), datetime(year=2014, month=6, day=1)),
        (datetime(year=2014, month=7, day=1), datetime(year=2014, month=9, day=1)),
        (datetime(year=2014, month=10, day=1), datetime(year=2014, month=12, day=1)),
        (datetime(year=2015, month=1, day=1), datetime(year=2015, month=3, day=1)),
        # Exclusive end on 1 May keeps all of 30 April; `<= datetime(2015, 4, 30)` only
        # matched timestamps up to midnight at the start of that day.
        (datetime(year=2015, month=4, day=1), datetime(year=2015, month=5, day=1)),
    ]
]
test_masks = [
    (df['datetime'] >= start) & (df['datetime'] < end)
    for start, end in [
        (datetime(year=2014, month=6, day=1), datetime(year=2014, month=7, day=1)),
        (datetime(year=2014, month=9, day=1), datetime(year=2014, month=10, day=1)),
        (datetime(year=2014, month=12, day=1), datetime(year=2015, month=1, day=1)),
        (datetime(year=2015, month=3, day=1), datetime(year=2015, month=4, day=1)),
    ]
]

In [8]:
# Stack the rows selected by each training window into one frame.
# A comprehension keeps the loop variable local: the previous `for mask in ...` loop
# overwrote the notebook-level `mask` that `brits_df = df[mask]` depends on, so
# re-running that cell afterwards silently selected a different window.
training_df = pd.concat([df[window] for window in training_masks])
training_df.info()

In [9]:
# Count missing values per column in the training split.
training_df.isna().sum()

In [10]:
# Stack the rows selected by each held-out window into one frame.
# A comprehension keeps the loop variable local: the previous `for mask in ...` loop
# overwrote the notebook-level `mask` that `brits_df = df[mask]` depends on.
test_df = pd.concat([df[window] for window in test_masks])
test_df.info()

In [11]:
# Count missing values per column in the test split.
test_df.isna().sum()

In [12]:
from torch_geometric.data import Data

In [13]:
# Rows of the first training window, used below to prototype a single graph sample.
example_train = df.loc[training_masks[0]]

In [14]:
# Inspect the first row of the example training window.
example_train.iloc[0]

In [15]:
# NOTE(review): not referenced by any other cell visible in this notebook — confirm it is still needed.
batch_size = 5

In [16]:
# Preview the first five rows of the example training window.
example_train.head(5)

In [17]:
# Empty placeholders for the pieces of a graph sample:
#   x          -> [num_nodes, num_node_features]
#   edge_index -> graph connectivity in COO format, shape [2, num_edges], type torch.long
#   y          -> target to train against
x, edge_index, y = [], [], []


In [76]:
import pydantic
from datetime import datetime
import torch
from torch import FloatTensor, LongTensor, BoolTensor
from pydantic import BaseModel
from typing import Optional

class TargetNode(BaseModel):
    """Attributes of the single held-out node that a graph sample is asked to predict.

    In this notebook every field is built as a one-element tensor from the
    (row, feature) pair chosen as the target.
    """
    features: FloatTensor       # held-out feature value
    type_index: LongTensor      # position of the feature in the `features` list
    time: FloatTensor           # time offset of the row the target was taken from
    spatial_index: LongTensor   # position of the row's station in `unique_stations`
    # Declared so the `category_index` entry supplied when the target is built is kept;
    # without the field pydantic's default handling of extra keys silently discards it.
    # Optional with a None default so callers that omit it keep working.
    category_index: Optional[LongTensor] = None

    class Config:
        # torch tensor classes are not pydantic-native types; accept them via isinstance checks.
        arbitrary_types_allowed = True

class ContinuousTimeGraphSample(BaseModel):
    """One graph sample: per-node tensors for the observed values plus the held-out target.

    The optional fields carry an explicit ``None`` default. Under pydantic v1 that is
    already the implicit behaviour of ``Optional[...]``; under pydantic v2 an
    ``Optional`` field without a default is required, which would reject the sample
    built in this notebook (it never supplies ``edge_index``).
    """
    node_features: FloatTensor                      # one value per observed node
    edge_index: Optional[LongTensor] = None         # explicit connectivity, if any
    attention_mask: Optional[BoolTensor] = None     # pairwise node mask, if any
    time: FloatTensor                               # per-node time offset
    target: TargetNode                              # the held-out node
    type_index: LongTensor                          # per-node feature-type index
    spatial_index: LongTensor                       # per-node station index
    category_index: Optional[LongTensor] = None     # per-node category index, if any

    class Config:
        # torch tensor classes are not pydantic-native types; accept them via isinstance checks.
        arbitrary_types_allowed = True


In [19]:
# Distinct wind-direction labels present in the data.
df['wd'].unique()

In [20]:
# Distinct station names present in the data.
df['station'].unique()

In [67]:
import random

# Pick the flat (row, feature) position of the value to hold out as the prediction target.
# The flat position is row * 11 + feature, matching `i*11 + j` in the graph-building cell;
# the bounds restrict the choice to rows 24..35 of the 36 rows processed there.
target_index = random.randint(12*11*2+1, 12*11*3-1)
print(target_index)
# Called `row_index` rather than `set` so the builtin `set` is not shadowed for the rest
# of the session.
row_index, feature_offset = divmod(target_index, 11)
print(row_index)
sample = example_train.iloc[row_index]
# NOTE: `features` is assigned in the following cell; on a fresh kernel that cell has to
# run before this line can succeed.
print(sample[features[feature_offset]])
sample

In [77]:

# Column roles in the air-quality frame.
features = ['PM2.5', 'PM10', 'SO2', 'NO2', 'CO', 'O3', 'TEMP', 'PRES', 'DEWP', 'RAIN', 'WSPM']
time = 'datetime'
category = 'wd'
spatial = 'station'
# Fixed vocabularies: a label's position in the list is its integer index.
# NOTE(review): `list.index` raises ValueError for a value that is not one of these exact
# strings (for example a real NaN instead of the string 'None') — confirm how missing
# wind directions are stored.
unique_wd = ['NNW', 'E', 'NW', 'WNW', 'N', 'ENE', 'NNE', 'W', 'NE', 'SSW', 'ESE', 'SE', 'S', 'SSE', 'SW', 'WSW', 'None']
unique_stations = ['Aotizhongxin', 'Changping', 'Dingling', 'Dongsi', 'Guanyuan', 'Gucheng', 'Huairou', 'Nongzhanguan', 'Shunyi', 'Tiantan', 'Wanliu', 'Wanshouxigong']

# One entry per (row, feature) pair, except the pair selected by `target_index`.
node_features = []
time_index = []
type_index = []
spatial_index = []
category_index = []
# Attributes of the held-out pair, filled in when the loop reaches `target_index`.
target = {}

# Reference time: timestamp of the last of the 36 rows; every offset is measured back from it.
time_start = example_train.iloc[12*3-1][time]

for i in range(12*3):
    sample = example_train.iloc[i]
    # total_seconds() is the full signed offset. `.seconds` is only the within-day
    # component (0..86399), so offsets of a day or more — or negative ones — came out wrong.
    tau = (time_start - sample[time]).total_seconds()
    # Station and wind direction are per-row, so look them up once rather than per feature.
    station_idx = unique_stations.index(sample[spatial])
    wd_idx = unique_wd.index(sample[category])
    for j, key in enumerate(features):
        if i*11 + j != target_index:
            type_index.append(j)
            node_features.append(sample[key])
            spatial_index.append(station_idx)
            category_index.append(wd_idx)
            time_index.append(tau)
        else:
            target['features'] = torch.tensor([sample[key]], dtype=torch.float)
            target["type_index"] = torch.tensor([j], dtype=torch.long)
            target["spatial_index"] = torch.tensor([station_idx], dtype=torch.long)
            target["category_index"] = torch.tensor([wd_idx], dtype=torch.long)
            target["time"] = torch.tensor([tau], dtype=torch.float)
print(target)

In [78]:
# Convert the per-node lists into tensors keyed by the ContinuousTimeGraphSample field names.
graph = dict(
    node_features=torch.tensor(node_features, dtype=torch.float),
    time=torch.tensor(time_index, dtype=torch.float),
    type_index=torch.tensor(type_index, dtype=torch.long),
    spatial_index=torch.tensor(spatial_index, dtype=torch.long),
    category_index=torch.tensor(category_index, dtype=torch.long),
    target=TargetNode(**target),
)
# Pairwise mask: entry [i, j] is True when node j's time value is strictly less than node i's.
graph['attention_mask'] = graph['time'][None, :] < graph['time'][:, None]


In [80]:
# Validate the assembled fields, then write the sample to disk.
graph_sample = ContinuousTimeGraphSample(**graph)
torch.save(graph_sample, "../../AGG/test_data/sample.pt")

In [64]:
now = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
# Report how many entries the `now` list defined on the previous line holds.
print(len(now))

In [51]:
# Tensor of the per-node time offsets collected in `time_index`.
# NOTE(review): this rebinds `time`, which the graph-building cell assigns as the
# 'datetime' column name; that cell re-assigns it at its top before using it.
time = torch.tensor(time_index)

In [57]:
# Pairwise comparisons of the time values: entry [i, j] compares time[j] against time[i].
time2 = time[None, :] >= time[:, None]
time3 = time[None, :] < time[:, None]

In [58]:
# Show the strict less-than comparison matrix.
time3

In [53]:
# Show the greater-or-equal comparison matrix as 0.0 / 1.0 values.
time2.to(torch.float)

In [60]:
# Small descending sequence with ties, used to sanity-check the pairwise comparison.
# NOTE: this rebinds `data`, which earlier in the notebook holds the HDF5 file path.
data = list(reversed([0, 0, 10, 10, 20, 20]))
time_test = torch.tensor(data)

In [61]:
# Show the reversed list.
data

In [63]:
# Entry [i, j] is True when time_test[j] is strictly less than time_test[i].
time_test[None, :] < time_test[:, None]