In [4]:
import pandas as pd
import networkx as nx
import numpy as np
import folium
from tqdm import tqdm
import pickle
from scipy import sparse
import matplotlib.pyplot as plt
import torch
from torch_geometric import utils, data

def haversine_start(df, car1, car2, max_dist = 1500):
    def _edge_weight(x, max_dist):
        return max((max_dist-x)/max_dist,0)
    dfc1 = df[df.car == car1]
    dfc2 = df[df.car == car2]

    point1 = dfc1[['leave_location_lat','leave_location_long']].values[0]
    point2 = dfc2[['leave_location_lat','leave_location_long']].values[0]
    #return point1

    # convert decimal degrees to radians
    lat1, lon1 = map(np.radians, point1)
    lat2, lon2 = map(np.radians, point2)

    # Deltas
    delta_lon = lon2 - lon1 
    delta_lat = lat2 - lat1 
    
    # haversine formula 
    a = np.sin(delta_lat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(delta_lon/2)**2
    c = 2 * np.arcsin(np.sqrt(a)) 
    r = 6371000 # Radius of earth in m
    return _edge_weight(c * r, max_dist)

def haversine_add(current_locs, to_add, max_dist = 1500):
    def _edge_weight(x, max_dist):
        return max((max_dist-x)/max_dist,0)
    def _haversine(point1, lat2, lon2):
        
         # convert decimal degrees to radians
        lat1, lon1 = map(np.radians, point1)

        # Deltas
        delta_lon = lon2 - lon1 
        delta_lat = lat2 - lat1 
        
        # haversine formula 
        a = np.sin(delta_lat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(delta_lon/2)**2
        c = 2 * np.arcsin(np.sqrt(a)) 
        r = 6371000 # Radius of earth in m
        return c * r

    leave_lat_add, leave_long_add = to_add.leave_location_lat, to_add.leave_location_long

    lat2, lon2 = map(np.radians, [leave_lat_add,leave_long_add])

    new_weights = [_edge_weight(_haversine(loc, lat2, lon2), max_dist) for loc in current_locs]

    return new_weights


def delete_rc(mat, i):
    # row
    n = mat.indptr[i+1] - mat.indptr[i]
    if n > 0:
        mat.data[mat.indptr[i]:-n] = mat.data[mat.indptr[i+1]:]
        mat.data = mat.data[:-n]
        mat.indices[mat.indptr[i]:-n] = mat.indices[mat.indptr[i+1]:]
        mat.indices = mat.indices[:-n]
    mat.indptr[i:-1] = mat.indptr[i+1:]
    mat.indptr[i:] -= n
    mat.indptr = mat.indptr[:-1]
    mat._shape = (mat._shape[0]-1, mat._shape[1])

    # col
    mat = mat.tocsc()
    n = mat.indptr[i+1] - mat.indptr[i]
    if n > 0:
        mat.data[mat.indptr[i]:-n] = mat.data[mat.indptr[i+1]:]
        mat.data = mat.data[:-n]
        mat.indices[mat.indptr[i]:-n] = mat.indices[mat.indptr[i+1]:]
        mat.indices = mat.indices[:-n]
    mat.indptr[i:-1] = mat.indptr[i+1:]
    mat.indptr[i:] -= n
    mat.indptr = mat.indptr[:-1]
    mat._shape = (mat._shape[0], mat._shape[1]-1)

    return mat.tocsr()

OSError: dlopen(/opt/anaconda3/envs/GNN_env/lib/python3.9/site-packages/torch_sparse/_convert_cpu.so, 6): Symbol not found: __ZN2at5emptyEN3c108ArrayRefIxEENS0_13TensorOptionsENS0_8optionalINS0_12MemoryFormatEEE
  Referenced from: /opt/anaconda3/envs/GNN_env/lib/python3.9/site-packages/torch_sparse/_convert_cpu.so
  Expected in: /opt/anaconda3/envs/GNN_env/lib/python3.9/site-packages/torch/lib/libtorch_cpu.dylib
 in /opt/anaconda3/envs/GNN_env/lib/python3.9/site-packages/torch_sparse/_convert_cpu.so

In [2]:
df = pd.read_csv('data/processed/VacancySplit.csv', index_col=0, parse_dates = [2]).astype({'time_to_reservation': 'float32', 'park_location_lat': 'float32', 'park_location_long': 'float32', 'leave_location_lat': 'float32', 'leave_location_long': 'float32', 'park_zone': 'int32', 'leave_zone': 'int32', 'park_fuel': 'int8', 'leave_fuel': 'int8', 'moved': 'float32', 'movedTF': 'bool'})
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3401726 entries, 0 to 3401725
Data columns (total 17 columns):
 #   Column               Dtype         
---  ------               -----         
 0   car                  object        
 1   time                 datetime64[ns]
 2   time_to_reservation  float32       
 3   park_location_lat    float32       
 4   park_location_long   float32       
 5   leave_location_lat   float32       
 6   leave_location_long  float32       
 7   park_zone            int32         
 8   leave_zone           int32         
 9   park_fuel            int8          
 10  leave_fuel           int8          
 11  engine               object        
 12  moved                float32       
 13  prev_customer        bool          
 14  next_customer        bool          
 15  movedTF              bool          
 16  action               bool          
dtypes: bool(4), datetime64[ns](1), float32(6), int32(2), int8(2), object(2)
memory usage: 227.1+ MB


In [3]:
Start_Time = pd.Timestamp('2018-07-09 14:27:00')
start_df = df[df.time <= Start_Time]
propegate_df = df[df.time > Start_Time]

# Start
CarID_dict_start = dict(iter(start_df.groupby('car')))
Start_Garph_data = []

for sub_df in CarID_dict_start.values():
    last_obs = sub_df.iloc[-1]
    if last_obs.action: # True is park
        Start_Garph_data.append(last_obs)

start_df_graph = pd.DataFrame(Start_Garph_data).iloc[:,:-1]

In [4]:
max_dist = 1500
def _edge_weight(x, max_dist):
        return max((max_dist-x)/max_dist,0)
A = pd.DataFrame(data = [[haversine_start(start_df_graph, car1, car2) for car1 in start_df_graph.car] for car2 in tqdm(start_df_graph.car)], index = start_df_graph.car, columns=start_df_graph.car, dtype='float16')

# And make it sparse
As = sparse.csr_matrix(A.values)

100%|██████████| 431/431 [04:33<00:00,  1.57it/s]


## Populate

In [5]:
As = sparse.csr_matrix(A.values)
Graph_dict = {pd.Timestamp('2018-07-09 14:27:00'): (start_df_graph ,As)}
node_data = start_df_graph.set_index('car')

In [6]:
positive_time = propegate_df[propegate_df.action].time.diff().shift(-1) > pd.Timedelta(0,'s')
positive_time[-1] = True

new_day = (propegate_df.time.dt.date.diff().shift(-1) > pd.Timedelta(0,'s'))
new_day[-1] = True

i = 0
for idx, next_row in tqdm(propegate_df.iterrows(), total = propegate_df.shape[0]):
    if next_row.action: # True is park
        # Get current locs
        locs = [[attr['leave_location_lat'], attr['leave_location_long']] for _, attr in node_data.iterrows()]
        
        # Add to node data
        node_data = node_data.append(next_row.rename(index = next_row['car']).iloc[1:16], verify_integrity = True)

        # Calculate new weights
        new_weights = haversine_add(locs, next_row, max_dist = 1500)

        # Add new weights to adjacency
        As = sparse.hstack([sparse.vstack([As,sparse.csr_matrix(new_weights)]).tocsc(), sparse.csc_matrix(new_weights+[1]).T]).tocsr()

    else:
        # Getindex
        idx_to_drop = np.where(node_data.index == next_row.car)[0][0]

        # Drop it
        As = delete_rc(As, idx_to_drop)

        # Drop from feature-matrix
        node_data.drop(index = next_row.car, inplace=True)
        
    # Save graph if new time 
    if positive_time.get(idx):
        Graph_dict[next_row.time] = (node_data.copy(), As.copy())

    # Save file every day on last obs
    if new_day[idx]:
        f_name = next_row.time.strftime('%Y%m%d')+'.pickle'
        with open(f_name, 'wb') as handle:
            pickle.dump(Graph_dict, handle, pickle.HIGHEST_PROTOCOL)

        # Clear memory
        Graph_dict = {}


    i += 1

    if i == 7000:
        break

  0%|          | 6999/2800087 [02:05<13:55:48, 55.70it/s]


## To PTG

In [4]:
park_zones = propegate_df.park_zone.unique()
leave_zones = propegate_df.leave_zone.unique()

In [49]:
with open('data/processed/zones.npy', 'wb') as f:
    np.save(f, propegate_df.park_zone.unique())
    np.save(f, propegate_df.leave_zone.unique())

In [43]:
def make_PTG(graph, park_zones, leave_zones):
    attr, adj = graph

    # Slice
    _, labels = connected_components(csgraph=adj, directed=False, return_labels=True)
    newl = labels[-1]
    indices = labels == newl   

    attr = attr[indices]
    adj = adj[indices,:].tocsc()[:,indices].tocsr()

    # Binarize
    attr[["prev_customer", "next_customer"]] = attr[["prev_customer", "next_customer"]].astype(int)

    # One hot encoding
    attr['park_zone'] = pd.Categorical(attr['park_zone'], categories=park_zones)
    attr = pd.get_dummies(attr, columns= ['park_zone'], prefix='pz')

    attr['leave_zone'] = pd.Categorical(attr['leave_zone'], categories=leave_zones)
    attr = pd.get_dummies(attr, columns = ['leave_zone'], prefix='lz')

    attr['engine']= pd.Categorical(attr['engine'], categories=['118I', 'I3', 'COOPER', 'X1'])
    attr = pd.get_dummies(attr, columns = ['engine'], prefix='eng')

    # Get edges
    edge_index, edge_weight = utils.convert.from_scipy_sparse_matrix(adj)

    # Make pytorch data type
    d = data.Data(x = torch.tensor(attr.filter(regex = 'park_fuel|pz|lz|eng').values), edge_index=edge_index, edge_attr=edge_weight, y = torch.tensor(attr.time_to_reservation.values))

    return d

In [10]:
from torch_geometric.datasets import TUDataset
from torch_geometric.loader import DataLoader
loader = DataLoader([make_PTG(g,park_zones,leave_zones) for g in tqdm(tenth.values())], batch_size=32, shuffle=False)

100%|██████████| 1900/1900 [00:34<00:00, 55.63it/s]


In [31]:
next(iter(loader)).ptr

tensor([    0,   441,   883,  1324,  1764,  2205,  2646,  3088,  3530,  3972,
         4415,  4859,  5302,  5745,  6189,  6633,  7073,  7514,  7954,  8394,
         8834,  9274,  9715, 10155, 10594, 11034, 11474, 11915, 12357, 12800,
        13244, 13688, 14130])

In [127]:
from torch_geometric.nn import GCNConv, TransformerConv, BatchNorm, SAGEConv, GNNExplainer
import torch.nn.functional as F
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

class GCN(torch.nn.Module):
    def __init__(self, num_features, hidden_channels, ):
        super(GCN, self).__init__()
        torch.manual_seed(12345)
        self.batch0 = BatchNorm(num_features)

        self.conv1 = GCNConv(num_features, hidden_channels)
        self.batch1 = BatchNorm(hidden_channels)

        self.conv2 = GCNConv(hidden_channels, int(hidden_channels))
        self.batch2 = BatchNorm(int(hidden_channels))

        self.conv3 = GCNConv(int(hidden_channels), int(hidden_channels))
        self.batch3 = BatchNorm(int(hidden_channels))
        
        self.conv4 = GCNConv(int(hidden_channels), 1)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = self.batch1(x)
        x = F.dropout(x, p=0.5, training=self.training)

        x = self.conv2(x, edge_index)
        x = x.relu()
        x = self.batch2(x)
        x = F.dropout(x, p=0.5, training=self.training)

        x = self.conv3(x, edge_index)
        x = x.relu()
        x = self.batch3(x)
        x = F.dropout(x, p=0.5, training=self.training)
  
        x = self.conv4(x, edge_index)
        return x

model = GCN(num_features=366, hidden_channels=512).to(device)
#d.to(device)
print(model)

GCN(
  (batch0): BatchNorm(366)
  (conv1): GCNConv(366, 512)
  (batch1): BatchNorm(512)
  (conv2): GCNConv(512, 512)
  (batch2): BatchNorm(512)
  (conv3): GCNConv(512, 512)
  (batch3): BatchNorm(512)
  (conv4): GCNConv(512, 1)
)


In [128]:
sum(p.numel() for p in model.parameters())

717533

In [None]:
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, weight_decay=5e-4, nesterov=True, momentum=0.9)
criterion = torch.nn.MSELoss()

train_loss_list = []
valid_loss_list = []

def train():
  model.train()
  optimizer.zero_grad()  # Clear gradients.
  out = model(data.x.to(torch.float), data.edge_index)  # Perform a single forward pass.
  loss = criterion(out[data.train_mask], data.y[data.train_mask].to(torch.float))  # Compute the loss solely based on the training nodes.
  loss.backward()  # Derive gradients.
  optimizer.step()  # Update parameters based on gradients.
  return loss

def valid():
  model.eval()
  out = model(data.x.to(torch.float), data.edge_index) 
  loss = criterion(out[data.val_mask], data.y[data.val_mask].to(torch.float))
  return loss

for epoch in range(0, 2501):
  train_loss = train().detach().cpu()
  valid_loss = valid().detach().cpu()
  train_loss_list.append(train_loss)
  valid_loss_list.append(valid_loss)
  if epoch % 100 == 0:
      print(f'Epoch: {epoch:03d}, Train Loss: {train_loss:.4f}, Valid Loss: {valid_loss:.4f}')

In [36]:
from torch_geometric.datasets import TUDataset
from torch_geometric.loader import DataLoader

dataset = TUDataset(root='/tmp/ENZYMES', name='ENZYMES', use_node_attr=True)
loader = DataLoader(dataset, batch_size=32, shuffle=True)

In [140]:
type(dataset)

torch_geometric.datasets.tu_dataset.TUDataset