In [1]:
import pandas as pd
import networkx as nx
import numpy as np
import glob
from tqdm import tqdm
import pickle
from scipy import sparse
import matplotlib.pyplot as plt
import torch
import torch.nn.functional as F
from datetime import date, timedelta
import torch.optim as optim
import torch.nn as nn
from torch_geometric.nn import GCNConv, MessagePassing, Linear
from torch_geometric import utils, data
from torch_geometric.loader import DataLoader
from sklearn.metrics import r2_score
pd.set_option('mode.chained_assignment',None)

device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [2]:
class NNNConv(MessagePassing):
    def __init__(self, in_channels, out_channels):
        super().__init__(aggr='add') #  "Max" aggregation.
        self.mlp = nn.Sequential(nn.Linear(2 * in_channels, out_channels),
                       nn.ReLU(),
                       nn.Linear(out_channels, out_channels))

    def forward(self, x, edge_index, edge_weight):
        return self.propagate(edge_index, x=x, edge_weight=edge_weight)

    def message(self, x_i, x_j, edge_weight):
        tmp = torch.cat([x_i, x_j], dim=1) 
        return edge_weight.view(-1, 1) * self.mlp(tmp)

In [3]:
zones = [int(z[3:]) for z in pd.read_csv('data/processed/SimpleNNData.csv', index_col=0).filter(regex = 'lz').columns]
def make_PTG(graph, zones):
    attr, adj = graph

    # Filter out 
    if (attr.time_to_reservation.values[-1] >= 48) or ~attr.next_customer[-1]:
        return None
    
    if attr.leave_zone[-1] not in zones:
        return None

    # Slice
    _, labels = sparse.csgraph.connected_components(csgraph=adj, directed=False, return_labels=True)
    newl = labels[-1]
    indices = labels == newl   

    attr = attr[indices]
    adj = adj[indices,:].tocsc()[:,indices].tocsr()

    # Time variables
    attr['weekend'] = attr.time.dt.weekday//5

    def circle_transform(col, max_val=86400):
        tot_sec = ((col - col.dt.normalize()) / pd.Timedelta('1 second')).astype(int)
        cos_val = np.cos(2*np.pi*tot_sec/max_val)
        sin_val = np.sin(2*np.pi*tot_sec/max_val)
        return cos_val, sin_val

    attr['Time_Cos'], attr['Time_Sin'] = [x.values for x in circle_transform(attr.time)]

    # drop
    attr.drop(columns=['park_location_lat', 'park_location_long', 'leave_location_lat', 'leave_location_long', 'park_fuel', 'park_zone', 'moved', 'movedTF', 'time', 'prev_customer', 'next_customer'], inplace = True)

    # One hot encoding
    attr['leave_zone'] = pd.Categorical(attr['leave_zone'], categories=zones)
    attr = pd.get_dummies(attr, columns = ['leave_zone'], prefix='lz')

    attr['engine']= pd.Categorical(attr['engine'], categories=['118I', 'I3', 'COOPER', 'X1'])
    attr = pd.get_dummies(attr, columns = ['engine'], prefix='eng')

    # Normalize fuel and dist 
    attr['leave_fuel'] = attr['leave_fuel']/100
    #df['dist_to_station'] = df['dist_to_station']/5320

    # Get edges
    edge_index, edge_weight = utils.convert.from_scipy_sparse_matrix(adj)

    # Make pytorch data type
    d = data.Data(x = torch.tensor(attr.drop(columns = ['time_to_reservation']).to_numpy(dtype = 'float')).float(), edge_index=edge_index, edge_attr=edge_weight.float(), y = torch.tensor(attr.time_to_reservation.values).float())

    return d

In [4]:
sdate = date(2019, 9, 15)   # start date
edate = date(2019, 9, 17)   # end date
delta = edate - sdate       # as timedelta
files = ['data/processed/Graphs/'+(sdate + timedelta(days=i)).strftime("%Y%m%d")+'.pickle' for i in range(delta.days + 1)]

dataset = []

with open(files[0], 'rb') as f:
    graph_collection = pickle.load(f)

for g in graph_collection.values():
    res = make_PTG(g,zones)
    if res:
        dataset.append(res)

train_val_size = int(0.8 * len(dataset))
val_test_size = len(dataset)-train_val_size
train_val_data, test_data = torch.utils.data.random_split(dataset, [train_val_size, val_test_size])
train_size = train_val_size-val_test_size
train_data, val_data = torch.utils.data.random_split(train_val_data, [train_size, val_test_size])
del train_val_data

for file in tqdm(files[1:]):
    dataset = []
    with open(file, 'rb') as f:
        graph_collection = pickle.load(f)

    for g in graph_collection.values():
        res = make_PTG(g,zones)
        if res:
            dataset.append(res)

    train_val_size = int(0.8 * len(dataset))
    val_test_size = len(dataset)-train_val_size
    train_val_data_tmp, test_data_tmp = torch.utils.data.random_split(dataset, [train_val_size, val_test_size])
    train_size = train_val_size-val_test_size
    train_data_tmp, val_data_tmp = torch.utils.data.random_split(train_val_data_tmp, [train_size, val_test_size])

    train_data = torch.utils.data.ConcatDataset([train_data,train_data_tmp])
    val_data = torch.utils.data.ConcatDataset([val_data,val_data_tmp])
    test_data = torch.utils.data.ConcatDataset([test_data,test_data_tmp])

#del train_val_data_tmp, test_data_tmp, train_data_tmp, val_data_tmp, dataset

100%|██████████| 2/2 [00:54<00:00, 27.38s/it]


In [5]:
train_val_size = int(0.8 * len(dataset))
val_test_size = len(dataset)-train_val_size
train_val_data, test_data = torch.utils.data.random_split(dataset, [train_val_size, val_test_size])
train_size = train_val_size-val_test_size
train_data, val_data = torch.utils.data.random_split(train_val_data, [train_size, val_test_size])
del dataset

In [6]:
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
val_loader = DataLoader(val_data, batch_size=32, shuffle=True)
test_loader = DataLoader(test_data, batch_size=1, shuffle=False)

In [7]:
class GCN(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = NNNConv(264, 64)
        self.lin = Linear(64,1)

    def forward(self, data):
        x, edge_index, edge_weight = data.x, data.edge_index, data.edge_attr

        x = self.conv1(x, edge_index, edge_weight)
        x = F.relu(x)
        #x = F.dropout(x, p = 0.2, training=self.training)
        x = self.lin(x)

        return x.squeeze()

GNN = GCN().to(device)
print(GNN, sum(p.numel() for p in GNN.parameters()))

GCN(
  (conv1): NNNConv(
    (mlp): Sequential(
      (0): Linear(in_features=528, out_features=64, bias=True)
      (1): ReLU()
      (2): Linear(in_features=64, out_features=64, bias=True)
    )
  )
  (lin): Linear(64, 1, bias=True)
) 38081


In [8]:
optimizer = optim.Adam(GNN.parameters(), lr=0.03, weight_decay = 0.0001) #Chaged to Adam and learning + regulariztion rate set
criterion = nn.MSELoss(reduction = 'mean')

In [10]:
GNN.state_dict()

OrderedDict([('conv1.mlp.0.weight',
              tensor([[ 0.0207, -0.0294, -0.0187,  ...,  0.0049,  0.0378, -0.0235],
                      [-0.0078, -0.0161, -0.0092,  ..., -0.0219, -0.0432, -0.0143],
                      [-0.0131,  0.0054, -0.0292,  ...,  0.0306,  0.0294,  0.0428],
                      ...,
                      [-0.0030,  0.0230, -0.0241,  ...,  0.0142, -0.0045, -0.0234],
                      [-0.0102, -0.0149,  0.0235,  ...,  0.0073, -0.0253, -0.0353],
                      [ 0.0205, -0.0337, -0.0312,  ...,  0.0382,  0.0267, -0.0306]])),
             ('conv1.mlp.0.bias',
              tensor([-0.0052,  0.0372, -0.0141, -0.0100,  0.0143, -0.0321,  0.0096,  0.0282,
                       0.0086, -0.0050,  0.0208, -0.0098,  0.0402, -0.0398,  0.0303, -0.0351,
                      -0.0363,  0.0397,  0.0168, -0.0134,  0.0371, -0.0336, -0.0087, -0.0318,
                      -0.0031, -0.0183,  0.0081,  0.0331,  0.0264,  0.0333, -0.0200,  0.0218,
                    

In [40]:
# Set number of epochs
num_epochs = 3

# Set up lists for loss/R2
train_r2, train_loss = [], []
valid_r2, valid_loss = [], []
cur_loss = 0
train_losses = []
val_losses = []

for epoch in range(num_epochs):
    ### Train
    cur_loss_train = []
    GNN.train()
    for batch in train_loader:
        optimizer.zero_grad()
        out = GNN(batch)
        batch_loss = criterion(out[batch.ptr[1:]-1],batch.y[batch.ptr[1:]-1])
        batch_loss.backward()
        optimizer.step()

        cur_loss_train.append(batch_loss.item())
    
    train_losses.append(np.mean(cur_loss_train))

    ### Evaluate training
    GNN.eval()
    train_preds, train_targs = [], []
    for batch in train_loader:
        preds = GNN(batch)
        train_targs += list(batch.y.numpy()[batch.ptr[1:]-1])
        train_preds += list(preds.detach().numpy()[batch.ptr[1:]-1])


    ### Evaluate validation
    val_preds, val_targs = [], []
    cur_loss_val = []
    for batch in val_loader:
        preds = GNN(batch)[batch.ptr[1:]-1]
        y_val = batch.y[batch.ptr[1:]-1]
        val_targs += list(y_val.numpy())
        val_preds += list(preds.detach().numpy())
        cur_loss_val.append(criterion(preds, y_val).item())

    val_losses.append(np.mean(cur_loss_val))


    train_r2_cur = r2_score(train_targs, train_preds)
    valid_r2_cur = r2_score(val_targs, val_preds)
    
    train_r2.append(train_r2_cur)
    valid_r2.append(valid_r2_cur)

    print("Epoch %2i: Train Loss %f , Valid Loss %f ,Train R2 %f, Valid R2 %f" % (
                epoch+1, train_losses[-1], val_losses[-1],train_r2_cur, valid_r2_cur))

Epoch  1: Train Loss 22.369354 , Valid Loss 29.154264 ,Train R2 0.180848, Valid R2 0.072168
Epoch  2: Train Loss 18.299884 , Valid Loss 28.489233 ,Train R2 0.348515, Valid R2 0.101050
Epoch  3: Train Loss 15.703176 , Valid Loss 28.154302 ,Train R2 0.340748, Valid R2 0.106605


In [105]:
def haversine_start(df, car1, car2, max_dist = 1500):
    def _edge_weight(x, max_dist):
        return max((max_dist-x)/max_dist,0)
    dfc1 = df[df.car == car1]
    dfc2 = df[df.car == car2]

    point1 = dfc1[['leave_location_lat','leave_location_long']].values[0]
    point2 = dfc2[['leave_location_lat','leave_location_long']].values[0]
    #return point1

    # convert decimal degrees to radians
    lat1, lon1 = map(np.radians, point1)
    lat2, lon2 = map(np.radians, point2)

    # Deltas
    delta_lon = lon2 - lon1 
    delta_lat = lat2 - lat1 
    
    # haversine formula 
    a = np.sin(delta_lat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(delta_lon/2)**2
    c = 2 * np.arcsin(np.sqrt(a)) 
    r = 6371000 # Radius of earth in m
    return _edge_weight(c * r, max_dist)

def haversine_add(current_locs, to_add, max_dist = 1500):
    def _edge_weight(x, max_dist):
        return max((max_dist-x)/max_dist,0)
    def _haversine(point1, lat2, lon2):
        
         # convert decimal degrees to radians
        lat1, lon1 = map(np.radians, point1)

        # Deltas
        delta_lon = lon2 - lon1 
        delta_lat = lat2 - lat1 
        
        # haversine formula 
        a = np.sin(delta_lat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(delta_lon/2)**2
        c = 2 * np.arcsin(np.sqrt(a)) 
        r = 6371000 # Radius of earth in m
        return c * r

    leave_lat_add, leave_long_add = to_add.leave_location_lat, to_add.leave_location_long

    lat2, lon2 = map(np.radians, [leave_lat_add,leave_long_add])

    new_weights = [_edge_weight(_haversine(loc, lat2, lon2), max_dist) for loc in current_locs]

    return new_weights

def delete_rc(mat, i):
    # row
    n = mat.indptr[i+1] - mat.indptr[i]
    if n > 0:
        mat.data[mat.indptr[i]:-n] = mat.data[mat.indptr[i+1]:]
        mat.data = mat.data[:-n]
        mat.indices[mat.indptr[i]:-n] = mat.indices[mat.indptr[i+1]:]
        mat.indices = mat.indices[:-n]
    mat.indptr[i:-1] = mat.indptr[i+1:]
    mat.indptr[i:] -= n
    mat.indptr = mat.indptr[:-1]
    mat._shape = (mat._shape[0]-1, mat._shape[1])

    # col
    mat = mat.tocsc()
    n = mat.indptr[i+1] - mat.indptr[i]
    if n > 0:
        mat.data[mat.indptr[i]:-n] = mat.data[mat.indptr[i+1]:]
        mat.data = mat.data[:-n]
        mat.indices[mat.indptr[i]:-n] = mat.indices[mat.indptr[i+1]:]
        mat.indices = mat.indices[:-n]
    mat.indptr[i:-1] = mat.indptr[i+1:]
    mat.indptr[i:] -= n
    mat.indptr = mat.indptr[:-1]
    mat._shape = (mat._shape[0], mat._shape[1]-1)

    return mat.tocsr()

# Haversine function for stations
def haversine_station(point1, point2):
    # convert decimal degrees to radians
    lat1, lon1 = map(np.radians, point1)
    lat2, lon2 = map(np.radians, point2)

    # Deltas
    delta_lon = lon2 - lon1 
    delta_lat = lat2 - lat1 

    # haversine formula 
    a = np.sin(delta_lat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(delta_lon/2)**2
    c = 2 * np.arcsin(np.sqrt(a)) 
    r = 6371000 # Radius of earth in m
    return c * r

In [180]:
# Load data
df = pd.read_csv('data/processed/VacancySplit.csv', index_col=0, parse_dates = [2]).astype({'time_to_reservation': 'float32', 'park_location_lat': 'float32', 'park_location_long': 'float32', 'leave_location_lat': 'float32', 'leave_location_long': 'float32', 'park_zone': 'int32', 'leave_zone': 'int32', 'park_fuel': 'int8', 'leave_fuel': 'int8', 'moved': 'float32', 'movedTF': 'bool'})

# Time variables
df['weekend'] = df.time.dt.weekday//5

# Time encoding
def circle_transform(col, max_val=86400):
    tot_sec = ((col - col.dt.normalize()) / pd.Timedelta('1 second')).astype(int)
    cos_val = np.cos(2*np.pi*tot_sec/max_val)
    sin_val = np.sin(2*np.pi*tot_sec/max_val)
    return cos_val, sin_val

df['Time_Cos'], df['Time_Sin'] = [x.values for x in circle_transform(df.time)]

# Join weather
df_weather = pd.read_csv('data/processed/weather.csv', index_col=0, parse_dates=[0])


In [181]:
df['timeH'] = df.time.round('H')
df = df.set_index('timeH').join(df_weather).reset_index(drop=True)
len(df)

3401726

In [88]:
# Load data
df = pd.read_csv('data/processed/VacancySplit.csv', index_col=0, parse_dates = [2]).astype({'time_to_reservation': 'float32', 'park_location_lat': 'float32', 'park_location_long': 'float32', 'leave_location_lat': 'float32', 'leave_location_long': 'float32', 'park_zone': 'int32', 'leave_zone': 'int32', 'park_fuel': 'int8', 'leave_fuel': 'int8', 'moved': 'float32', 'movedTF': 'bool'})

# Time variables
df['weekend'] = df.time.dt.weekday//5

# Time encoding
def circle_transform(col, max_val=86400):
    tot_sec = ((col - col.dt.normalize()) / pd.Timedelta('1 second')).astype(int)
    cos_val = np.cos(2*np.pi*tot_sec/max_val)
    sin_val = np.sin(2*np.pi*tot_sec/max_val)
    return cos_val, sin_val

df['Time_Cos'], df['Time_Sin'] = [x.values for x in circle_transform(df.time)]

# Join weather
df_weather = pd.read_csv('data/processed/weather.csv', index_col=0, parse_dates=[0])
df['timeH'] = df.time.round('H')
df = df.set_index('timeH').join(df_weather).reset_index(drop=True)

# Create init graph
Start_Time = pd.Timestamp('2019-08-31 21:00:00')
End_Time = pd.Timestamp('2019-11-01 12:00:00')
start_df = df[df.time <= Start_Time]
propegate_df = df[(df.time > Start_Time) & (df.time <= End_Time)]

# Start
CarID_dict_start = dict(iter(start_df.groupby('car')))
Start_Garph_data = []

for sub_df in CarID_dict_start.values():
    last_obs = sub_df.iloc[-1]
    if last_obs.action: # True is park
        Start_Garph_data.append(last_obs)

start_df_graph = pd.DataFrame(Start_Garph_data).iloc[:,:-1]

In [118]:
Start_Garph_data

[car                      WBA1R5100J7B13378
 time                   2019-08-31 20:07:41
 time_to_reservation               3.598333
 park_location_lat                55.690353
 park_location_long               12.493163
 leave_location_lat               55.690353
 leave_location_long              12.493163
 park_zone                           102631
 leave_zone                          102631
 park_fuel                               96
 leave_fuel                              61
 engine                                118I
 moved                                  0.0
 prev_customer                         True
 next_customer                         True
 movedTF                              False
 action                                True
 weekend                                  1
 Time_Cos                          0.528747
 Time_Sin                         -0.848779
 mean_temp                             24.0
 mean_wind_speed                        3.3
 acc_precip                     

In [184]:
# Haversine function
def haversine(point1, point2):
    # convert decimal degrees to radians
    lat1, lon1 = map(np.radians, point1)
    lat2, lon2 = map(np.radians, point2)

    # Deltas
    delta_lon = lon2 - lon1 
    delta_lat = lat2 - lat1 

    # haversine formula 
    a = np.sin(delta_lat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(delta_lon/2)**2
    c = 2 * np.arcsin(np.sqrt(a)) 
    r = 6371000 # Radius of earth in m
    return c * r

# Add dist to station
with open('data/processed/Train_stations.pickle', 'rb') as handle:
    Stations = pickle.load(handle)


#start_df_graph['dist_to_station'] = [min({k:haversine(v,r[1].values) for k,v in Stations.items()}.values()) for r in start_df_graph[['park_location_lat',	'park_location_long']].iterrows()]
#propegate_df['dist_to_station'] = [min({k:haversine(v,r[1].values) for k,v in Stations.items()}.values()) for r in tqdm(propegate_df[['park_location_lat',	'park_location_long']].iterrows(), total = len(propegate_df))]

In [106]:
# Adj matrix
A = pd.DataFrame(data = [[haversine_start(start_df_graph, car1, car2) for car1 in start_df_graph.car] for car2 in tqdm(start_df_graph.car)], index = start_df_graph.car, columns=start_df_graph.car, dtype='float16')

# And make it sparse
As = sparse.csr_matrix(A.values)

# Populate
Graph_dict = {pd.Timestamp('2019-08-31 21:00:00'): (start_df_graph ,As)}
node_data = start_df_graph.set_index('car')

100%|██████████| 315/315 [02:59<00:00,  1.76it/s]


In [111]:
locs = [[attr['leave_location_lat'], attr['leave_location_long']] for _, attr in node_data.iterrows()]

In [117]:
start_df_graph

Unnamed: 0,car,time,time_to_reservation,park_location_lat,park_location_long,leave_location_lat,leave_location_long,park_zone,leave_zone,park_fuel,...,weekend,Time_Cos,Time_Sin,mean_temp,mean_wind_speed,acc_precip,bright_sunshine,mean_pressure,mean_relative_hum,dist_to_station
2737052,WBA1R5100J7B13378,2019-08-31 20:07:41,3.598333,55.690353,12.493163,55.690353,12.493163,102631,102631,96,...,1,0.528747,-0.848779,24.0,3.3,0.0,40.5,1012.3,72.9,362.850961
2735534,WBA1R5100K7D67688,2019-08-31 16:50:02,16.251944,55.716625,12.595335,55.716625,12.595335,102332,102332,84,...,1,-0.300567,-0.953761,26.1,4.2,0.0,58.2,1014.2,60.9,532.505078
2736688,WBA1R5101J5K57918,2019-08-31 19:12:46,2.161389,55.666649,12.554121,55.666649,12.554121,102812,102812,51,...,1,0.312197,-0.950017,25.1,4.4,0.0,57.2,1012.9,66.7,342.634371
2737288,WBA1R5101J5K57921,2019-08-31 20:54:27,1.548056,55.725277,12.575421,55.725277,12.575421,102311,102311,28,...,1,0.689778,-0.724021,23.2,2.5,0.0,0.0,1012.0,76.6,797.583821
2737298,WBA1R5101J5K58065,2019-08-31 20:56:23,0.907778,55.686661,12.532815,55.686661,12.532815,147161,147161,88,...,1,0.695861,-0.718177,23.2,2.5,0.0,0.0,1012.0,76.6,68.074730
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2737259,WMWXU7109KTM90592,2019-08-31 20:48:33,1.893889,55.634796,12.621698,55.634796,12.621698,185132,185132,25,...,1,0.670912,-0.741537,23.2,2.5,0.0,0.0,1012.0,76.6,1248.688496
2737195,WMWXU7109KTM91323,2019-08-31 20:36:10,0.503889,55.670658,12.582522,55.670658,12.582522,103142,103142,85,...,1,0.629885,-0.776688,23.2,2.5,0.0,0.0,1012.0,76.6,575.759061
2737169,WMWXU710XKTM90603,2019-08-31 20:29:50,15.093333,55.705463,12.535020,55.705463,12.535020,102532,102532,75,...,1,0.608184,-0.793796,24.0,3.3,0.0,40.5,1012.3,72.9,397.887191
2737184,WMWXU710XKTM90617,2019-08-31 20:32:53,1.123333,55.679897,12.525328,55.679897,12.525328,147152,147152,90,...,1,0.618694,-0.785632,23.2,2.5,0.0,0.0,1012.0,76.6,248.411684


In [115]:
next_row.rename(index = next_row['car']).iloc[1:]

time                   2019-08-31 21:00:07
time_to_reservation               6.386111
park_location_lat                55.685123
park_location_long               12.539447
leave_location_lat               55.685123
leave_location_long              12.539447
park_zone                           147121
leave_zone                          147121
park_fuel                               56
leave_fuel                              55
engine                                  I3
moved                                  0.0
prev_customer                         True
next_customer                         True
movedTF                              False
action                                True
weekend                                  1
Time_Cos                          0.707467
Time_Sin                         -0.706747
mean_temp                             23.2
mean_wind_speed                        2.5
acc_precip                             0.0
bright_sunshine                        0.0
mean_pressu

In [40]:
for idx, next_row in tqdm(propegate_df.iterrows(), total = propegate_df.shape[0]):
    if next_row.action: # True is park
        # Get current locs
        locs = [[attr['leave_location_lat'], attr['leave_location_long']] for _, attr in node_data.iterrows()]
        
        # Add to node data
        node_data = node_data.append(next_row.rename(index = next_row['car']).iloc[1:27], verify_integrity = True)

        # Calculate new weights
        new_weights = haversine_add(locs, next_row, max_dist = 1500)

        # Add new weights to adjacency
        As = sparse.hstack([sparse.vstack([As,sparse.csr_matrix(new_weights)]).tocsc(), sparse.csc_matrix(new_weights+[1]).T]).tocsr()

    else:
        # Getindex
        idx_to_drop = np.where(node_data.index == next_row.car)[0][0]

        # Drop it
        As = delete_rc(As, idx_to_drop)

        # Drop from feature-matrix
        node_data.drop(index = next_row.car, inplace=True)
        
    # Save graph if new time 
    if positive_time.get(idx):
        Graph_dict[next_row.time] = (node_data.copy(), As.copy())

    # Save file every day on last obs
    if new_day[idx]:
        f_name = next_row.time.strftime('data/processed/Graphs/%Y%m%d')+'.pickle'
        with open(f_name, 'wb') as handle:
            pickle.dump(Graph_dict, handle, pickle.HIGHEST_PROTOCOL)

        # Clear memory
        Graph_dict = {}

In [102]:
idx, next_row = next(iter(propegate_df.iloc[2:].iterrows()))

In [103]:
next_row

car                      WBY8P2108K7D87711
time                   2019-08-31 21:00:07
time_to_reservation               6.386111
park_location_lat                55.685123
park_location_long               12.539447
leave_location_lat               55.685123
leave_location_long              12.539447
park_zone                           147121
leave_zone                          147121
park_fuel                               56
leave_fuel                              55
engine                                  I3
moved                                  0.0
prev_customer                         True
next_customer                         True
movedTF                              False
action                                True
weekend                                  1
Time_Cos                          0.707467
Time_Sin                         -0.706747
mean_temp                             23.2
mean_wind_speed                        2.5
acc_precip                             0.0
bright_suns

In [185]:
with open('data/processed/Graphs/20190904.pickle', 'rb') as handle:
    ttt = pickle.load(handle)

In [191]:
next(iter(ttt.values()))[0].columns

Index(['time', 'time_to_reservation', 'park_location_lat',
       'park_location_long', 'leave_location_lat', 'leave_location_long',
       'park_zone', 'leave_zone', 'park_fuel', 'leave_fuel', 'engine', 'moved',
       'prev_customer', 'next_customer', 'movedTF', 'action', 'weekend',
       'Time_Cos', 'Time_Sin', 'mean_temp', 'mean_wind_speed', 'acc_precip',
       'bright_sunshine', 'mean_pressure', 'mean_relative_hum',
       'mean_cloud_cover', 'dist_to_station'],
      dtype='object')

In [195]:
import pandas as pd
import glob
from tqdm import tqdm
import pickle

files = glob.glob("data/processed/Graphs/*")

In [196]:
cols = ['time', 'time_to_reservation', 'park_location_lat', 'park_location_long', 'leave_location_lat', 'leave_location_long','park_zone', 'leave_zone', 'park_fuel', 'leave_fuel', 'engine', 'moved','prev_customer', 'next_customer',
        'movedTF', 'action', 'weekend', 'Time_Cos', 'Time_Sin', 'mean_temp', 'mean_wind_speed', 'acc_precip', 'bright_sunshine', 'mean_pressure', 'mean_relative_hum', 'mean_cloud_cover', 'dist_to_station', 'degree']
dfn = pd.DataFrame(columns = cols)
i = 0

for f in tqdm(files):
    with open(f, 'rb') as day_file:
        day = pickle.load(day_file)

    for attr, adj in day.values():
        dfn.loc[i] = list(attr.iloc[-1].values)+[adj.getrow(-1).sum()]
        i += 1

100%|██████████| 61/61 [51:58<00:00, 51.13s/it]


In [207]:
df2 = dfn.sort_values(by='time').copy()

In [208]:
df2['timeh'] = df2.time.round('h').dt.hour
trafic_index = dict(df2.groupby('timeh').mean()['time_to_reservation'])
df2.drop(columns = ['timeh'], inplace=True)
df2['hour_index'] = df2.time.dt.hour.map(trafic_index)

In [209]:
# Only to customers
df2 = df2[df2.next_customer]
df2.drop(columns=['prev_customer', 'next_customer'], inplace = True)

# No more than 2 days
df2 = df2[df2.time_to_reservation < 48]

# Remove zones with too little support
df2 = df2[~df2.leave_zone.isin((df2.leave_zone.value_counts() < 30).index[df2.leave_zone.value_counts() < 30])]

# One hot encoding
df2 = pd.get_dummies(df2, columns = ['engine','leave_zone'], prefix=['eng','lz'])

# Drop those far away
df2 = df2[df2['dist_to_station'] <= 7000]

In [210]:
df2.drop(columns=['park_location_lat', 'park_location_long', 'leave_location_lat', 'leave_location_long', 'park_fuel', 'park_zone', 'moved', 'movedTF', 'action'], inplace = True)

In [211]:
df2

Unnamed: 0,time,time_to_reservation,leave_fuel,weekend,Time_Cos,Time_Sin,mean_temp,mean_wind_speed,acc_precip,bright_sunshine,...,lz_185122,lz_185123,lz_185124,lz_185131,lz_185132,lz_185141,lz_185142,lz_185143,lz_185154,lz_185203
117130,2019-09-01 00:00:18,10.836945,38,1,0.999999,0.001309,22.4,3.6,2.4,0.0,...,0,0,0,0,0,0,0,0,0,0
117131,2019-09-01 00:00:47,10.142222,38,1,0.999994,0.003418,22.4,3.6,2.4,0.0,...,0,0,0,0,0,0,0,0,0,0
117132,2019-09-01 00:01:17,0.511111,32,1,0.999984,0.005600,22.4,3.6,2.4,0.0,...,0,0,0,0,0,0,0,0,0,0
117133,2019-09-01 00:01:23,1.473056,65,1,0.999982,0.006036,22.4,3.6,2.4,0.0,...,0,0,0,0,0,0,0,0,0,0
117134,2019-09-01 00:02:18,0.001389,38,1,0.999950,0.010035,22.4,3.6,2.4,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2864,2019-10-31 23:53:28,2.346944,96,0,0.999594,-0.028503,6.8,2.2,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2865,2019-10-31 23:54:14,10.204445,46,0,0.999683,-0.025159,6.8,2.2,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2866,2019-10-31 23:54:24,2.144167,43,0,0.999701,-0.024432,6.8,2.2,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2867,2019-10-31 23:55:53,0.490278,67,0,0.999839,-0.017961,6.8,2.2,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
