In [1]:
import pandas as pd
import networkx as nx
import numpy as np
import folium
from tqdm import tqdm
import pickle
from scipy import sparse
import matplotlib.pyplot as plt
import torch
from torch_geometric import utils, data

def haversine_start(df, car1, car2, max_dist = 1500):
    def _edge_weight(x, max_dist):
        return max((max_dist-x)/max_dist,0)
    dfc1 = df[df.car == car1]
    dfc2 = df[df.car == car2]

    point1 = dfc1[['leave_location_lat','leave_location_long']].values[0]
    point2 = dfc2[['leave_location_lat','leave_location_long']].values[0]
    #return point1

    # convert decimal degrees to radians
    lat1, lon1 = map(np.radians, point1)
    lat2, lon2 = map(np.radians, point2)

    # Deltas
    delta_lon = lon2 - lon1 
    delta_lat = lat2 - lat1 
    
    # haversine formula 
    a = np.sin(delta_lat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(delta_lon/2)**2
    c = 2 * np.arcsin(np.sqrt(a)) 
    r = 6371000 # Radius of earth in m
    return _edge_weight(c * r, max_dist)

def haversine_add(current_locs, to_add, max_dist = 1500):
    def _edge_weight(x, max_dist):
        return max((max_dist-x)/max_dist,0)
    def _haversine(point1, lat2, lon2):
        
         # convert decimal degrees to radians
        lat1, lon1 = map(np.radians, point1)

        # Deltas
        delta_lon = lon2 - lon1 
        delta_lat = lat2 - lat1 
        
        # haversine formula 
        a = np.sin(delta_lat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(delta_lon/2)**2
        c = 2 * np.arcsin(np.sqrt(a)) 
        r = 6371000 # Radius of earth in m
        return c * r

    leave_lat_add, leave_long_add = to_add.leave_location_lat, to_add.leave_location_long

    lat2, lon2 = map(np.radians, [leave_lat_add,leave_long_add])

    new_weights = [_edge_weight(_haversine(loc, lat2, lon2), max_dist) for loc in current_locs]

    return new_weights


def delete_rc(mat, i):
    # row
    n = mat.indptr[i+1] - mat.indptr[i]
    if n > 0:
        mat.data[mat.indptr[i]:-n] = mat.data[mat.indptr[i+1]:]
        mat.data = mat.data[:-n]
        mat.indices[mat.indptr[i]:-n] = mat.indices[mat.indptr[i+1]:]
        mat.indices = mat.indices[:-n]
    mat.indptr[i:-1] = mat.indptr[i+1:]
    mat.indptr[i:] -= n
    mat.indptr = mat.indptr[:-1]
    mat._shape = (mat._shape[0]-1, mat._shape[1])

    # col
    mat = mat.tocsc()
    n = mat.indptr[i+1] - mat.indptr[i]
    if n > 0:
        mat.data[mat.indptr[i]:-n] = mat.data[mat.indptr[i+1]:]
        mat.data = mat.data[:-n]
        mat.indices[mat.indptr[i]:-n] = mat.indices[mat.indptr[i+1]:]
        mat.indices = mat.indices[:-n]
    mat.indptr[i:-1] = mat.indptr[i+1:]
    mat.indptr[i:] -= n
    mat.indptr = mat.indptr[:-1]
    mat._shape = (mat._shape[0], mat._shape[1]-1)

    return mat.tocsr()

In [2]:
df = pd.read_csv('data/processed/VacancySplit.csv', index_col=0, parse_dates = [2]).astype({'time_to_reservation': 'float32', 'park_location_lat': 'float32', 'park_location_long': 'float32', 'leave_location_lat': 'float32', 'leave_location_long': 'float32', 'park_zone': 'int32', 'leave_zone': 'int32', 'park_fuel': 'int8', 'leave_fuel': 'int8', 'moved': 'float32', 'movedTF': 'bool'})
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3401786 entries, 0 to 3401785
Data columns (total 17 columns):
 #   Column               Dtype         
---  ------               -----         
 0   car                  object        
 1   time                 datetime64[ns]
 2   time_to_reservation  float32       
 3   park_location_lat    float32       
 4   park_location_long   float32       
 5   leave_location_lat   float32       
 6   leave_location_long  float32       
 7   park_zone            int32         
 8   leave_zone           int32         
 9   park_fuel            int8          
 10  leave_fuel           int8          
 11  engine               object        
 12  moved                float32       
 13  prev_customer        bool          
 14  next_customer        bool          
 15  movedTF              bool          
 16  action               bool          
dtypes: bool(4), datetime64[ns](1), float32(6), int32(2), int8(2), object(2)
memory usage: 227.1+ MB


In [3]:
Start_Time = pd.Timestamp('2018-07-09 14:27:00')
start_df = df[df.time <= Start_Time]
propegate_df = df[df.time > Start_Time]

# Start
CarID_dict_start = dict(iter(start_df.groupby('car')))
Start_Garph_data = []

for sub_df in CarID_dict_start.values():
    last_obs = sub_df.iloc[-1]
    if last_obs.action: # True is park
        Start_Garph_data.append(last_obs)

start_df_graph = pd.DataFrame(Start_Garph_data).iloc[:,:-1]

In [4]:
max_dist = 1500
def _edge_weight(x, max_dist):
        return max((max_dist-x)/max_dist,0)
A = pd.DataFrame(data = [[haversine_start(start_df_graph, car1, car2) for car1 in start_df_graph.car] for car2 in tqdm(start_df_graph.car)], index = start_df_graph.car, columns=start_df_graph.car, dtype='float16')

# And make it sparse
As = sparse.csr_matrix(A.values)

100%|██████████| 431/431 [06:05<00:00,  1.18it/s]


## Populate

In [5]:
propegate_df.iloc[60910:60920]

Unnamed: 0,car,time,time_to_reservation,park_location_lat,park_location_long,leave_location_lat,leave_location_long,park_zone,leave_zone,park_fuel,leave_fuel,engine,moved,prev_customer,next_customer,movedTF,action
662559,WBY1Z21040V307824,2018-07-26 14:35:59,1.638333,55.636917,12.654957,55.636917,12.654957,185121,185121,58,58,I3,0.0,True,True,False,True
662560,WMWXR3103KTK54014,2018-07-26 14:36:02,1.9175,55.697792,12.540138,55.697792,12.540138,102452,102452,60,60,COOPER,0.0,True,True,False,True
662561,WBY1Z21070V308224,2018-07-26 14:36:22,2.289444,55.656803,12.636684,55.656803,12.636684,103212,103212,61,61,I3,0.0,True,True,False,False
662562,WBA1R5107J5K58068,2018-07-26 14:36:27,1.254722,55.666653,12.506654,55.666653,12.506654,102731,102731,92,90,118I,0.0,True,True,False,True
662563,WBY1Z21010V307943,2018-07-26 14:36:27,3.909167,55.711037,12.55636,55.711037,12.55636,102413,102413,68,68,I3,0.0,True,True,False,True
662564,WBY1Z210X0V308184,2018-07-26 14:36:32,2.628056,55.679779,12.603418,55.679779,12.603418,103132,103132,30,30,I3,0.0,True,True,False,False
662565,WBY1Z21040V308097,2018-07-26 14:37:00,3.636111,55.65144,12.538199,55.65144,12.538199,102852,102852,79,79,I3,0.0,True,True,False,True
662566,WBA1R5101J5K57918,2018-07-26 14:37:10,0.502222,55.686848,12.54034,55.686848,12.54034,147161,147161,75,75,118I,0.0,True,True,False,False
662567,WBY1Z21010V308042,2018-07-26 14:37:25,0.404167,55.69519,12.555033,55.69519,12.555033,102443,102443,77,76,I3,0.0,True,True,False,True
662568,WBY1Z21060V308103,2018-07-26 14:37:27,5.01,55.698612,12.540441,55.698612,12.540441,102452,102452,99,100,I3,0.0,True,True,False,True


In [6]:
Graph_dict = {pd.Timestamp('2018-07-09 14:27:00'): (start_df_graph ,As)}
node_data = start_df_graph.set_index('car')

In [7]:
time_to_next = propegate_df.time.diff().shift(-1)
positive_time = (time_to_next > pd.Timedelta(0,'s'))

new_day = (propegate_df.time.dt.date.diff().shift(-1) > pd.Timedelta(0,'s'))

i = 0
for idx, next_row in tqdm(propegate_df.iterrows(), total = propegate_df.shape[0]):
    if next_row.action: # True is park
        # Get current locs
        locs = [[attr['leave_location_lat'], attr['leave_location_long']] for _, attr in node_data.iterrows()]
        
        # Add to node data
        node_data = node_data.append(next_row.rename(index = next_row['car']).iloc[1:16], verify_integrity = True)

        # Calculate new weights
        new_weights = haversine_add(locs, next_row, max_dist = 1500)

        # Add new weights to adjacency
        As = sparse.hstack([sparse.vstack([As,sparse.csr_matrix(new_weights)]).tocsc(), sparse.csc_matrix(new_weights+[1]).T]).tocsr()

    else:
        # Getindex
        idx_to_drop = np.where(node_data.index == next_row.car)[0][0]

        # Drop it
        As = delete_rc(As, idx_to_drop)

        # Drop from feature-matrix
        node_data.drop(index = next_row.car, inplace=True)
        
    # Save graph if new time 
    if positive_time[idx]:
        Graph_dict[next_row.time] = (node_data, As)

    # Save file every day on last obs
    if new_day[idx]:
        f_name = next_row.time.strftime('%Y%m%d')+'.pickle'
        with open(f_name, 'wb') as handle:
            pickle.dump(Graph_dict, handle, pickle.HIGHEST_PROTOCOL)

        # Clear memory
        Graph_dict = {}


    #i += 1

    #if i == 6100:
    #    break

  2%|▏         | 60920/2800137 [18:16<13:41:53, 55.55it/s]


IndexError: index 0 is out of bounds for axis 0 with size 0

In [30]:
sum(df[(df.car == 'WBY1Z21030V307832')].action.astype(int).diff().values[1:] == 0)

2

In [31]:
CarID_dict = dict(iter(df.groupby('car')))

tmp = pd.Series({car: sum(dataf.action.astype(int).diff().values[1:] == 0) for car,dataf in CarID_dict.items()})

In [36]:
tmp.sort_values(ascending=False).iloc[:30]

WBY1Z21030V308267    2
WMWXR3108KTK54607    2
WBY1Z21080V307857    2
WBA1R5104J5K58190    2
WBY1Z21080V307874    2
WBY1Z21080V307969    2
WBY1Z21090V307852    2
WBY1Z21000V308257    2
WBY1Z21090V307897    2
WBY8P2101K7D91101    2
WBY8P2100K7D77190    2
WBY1Z21030V308074    2
WBY1Z21030V307832    2
WBY1Z21060V308005    2
WBY1Z21010V308249    2
WMWXR3103KTK54014    2
WBY1Z21030V308205    2
WBY1Z210X0V307746    2
WBY1Z21090V308077    2
WBA1R5103J7B13388    2
WBY1Z21060V308067    2
WBY1Z21000V307884    2
WBY1Z21050V308030    2
WBY8P2107K7E73561    1
WBY8P2104K7D70324    0
WBY8P2103K7E72617    0
WBY8P2103K7E74366    0
WBY8P2104K7D73451    0
WBY8P2104K7D71294    0
WBY8P2104K7D72350    0
dtype: int64

In [37]:
CarID_dict['WBY8P2107K7E73561']

Unnamed: 0,car,time,time_to_reservation,park_location_lat,park_location_long,leave_location_lat,leave_location_long,park_zone,leave_zone,park_fuel,leave_fuel,engine,moved,prev_customer,next_customer,movedTF,action
3065889,WBY8P2107K7E73561,2019-10-31 11:29:55,5.358889,55.706799,12.580781,55.706799,12.580781,102343,102343,100,100,I3 120,0.0,False,True,False,True
3067568,WBY8P2107K7E73561,2019-10-31 16:51:27,5.358889,55.706799,12.580781,55.706799,12.580781,102343,102343,100,100,I3 120,0.0,False,True,False,False
3073887,WBY8P2107K7E73561,2019-11-01 17:01:09,0.013889,55.650612,12.549394,55.650612,12.549394,102851,102851,100,97,I3 120,0.0,True,True,False,True
3073894,WBY8P2107K7E73561,2019-11-01 17:01:59,0.013889,55.650612,12.549394,55.650612,12.549394,102851,102851,100,97,I3 120,0.0,True,True,False,False
3073998,WBY8P2107K7E73561,2019-11-01 17:14:24,0.852222,55.631989,12.579280,55.631989,12.579280,103291,103291,94,94,I3 120,0.0,True,True,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3286072,WBY8P2107K7E73561,2019-12-09 01:27:07,4.719722,55.721752,12.536680,55.721752,12.536680,102521,102521,73,73,I3 120,0.0,True,True,False,False
3286091,WBY8P2107K7E73561,2019-12-09 01:53:16,1.631111,55.732315,12.438447,55.732315,12.438447,163021,163021,54,54,I3 120,0.0,True,True,False,True
3286125,WBY8P2107K7E73561,2019-12-09 03:31:08,1.631111,55.732315,12.438447,55.732315,12.438447,163021,163021,54,54,I3 120,0.0,True,True,False,False
3292521,WBY8P2107K7E73561,2019-12-10 10:34:47,0.000000,55.732315,12.438447,55.732315,12.438447,163021,163021,27,27,I3 120,0.0,True,False,False,False


In [17]:
df[(df.car == 'WBY1Z21030V307832') & (df.time > pd.Timestamp('2018-07-23 14:37:33'))].iloc[:25]

Unnamed: 0,car,time,time_to_reservation,park_location_lat,park_location_long,leave_location_lat,leave_location_long,park_zone,leave_zone,park_fuel,leave_fuel,engine,moved,prev_customer,next_customer,movedTF,action
652425,WBY1Z21030V307832,2018-07-23 15:12:58,1.824167,55.71254,12.561591,55.71254,12.561591,102412,102412,45,44,I3,0.0,True,True,False,False
652461,WBY1Z21030V307832,2018-07-23 15:22:51,2.206667,55.722,12.54404,55.722,12.54404,102521,102521,40,40,I3,0.0,True,True,False,True
652993,WBY1Z21030V307832,2018-07-23 17:35:15,2.206667,55.722,12.54404,55.722,12.54404,102521,102521,40,40,I3,0.0,True,True,False,False
653108,WBY1Z21030V307832,2018-07-23 18:02:48,6.955,55.691692,12.46519,55.691692,12.46519,175032,175032,33,19,I3,0.0,True,True,False,True
654096,WBY1Z21030V307832,2018-07-24 01:00:06,6.955,55.691692,12.46519,55.691692,12.46519,175032,175032,33,19,I3,0.0,True,True,False,False
654104,WBY1Z21030V307832,2018-07-24 01:13:14,6.844444,55.677921,12.531002,55.677921,12.531002,147132,147132,100,100,I3,0.0,True,True,False,True
654472,WBY1Z21030V307832,2018-07-24 08:03:54,6.844444,55.677921,12.531002,55.677921,12.531002,147132,147132,100,100,I3,0.0,True,True,False,False
654543,WBY1Z21030V307832,2018-07-24 08:25:20,0.188889,55.684677,12.539448,55.684677,12.539448,147121,147121,100,100,I3,0.0,True,True,False,True
654587,WBY1Z21030V307832,2018-07-24 08:36:40,0.188889,55.684677,12.539448,55.684677,12.539448,147121,147121,100,100,I3,0.0,True,True,False,False
654669,WBY1Z21030V307832,2018-07-24 08:54:29,5.909444,55.670692,12.58224,55.670692,12.58224,103142,103142,97,97,I3,0.0,True,True,False,True


In [11]:
np.where(node_data.index == next_row.car)

(array([], dtype=int64),)

In [9]:
next_row

car                      WBY1Z21030V307832
time                   2018-07-26 14:37:33
time_to_reservation                    0.0
park_location_lat                55.654072
park_location_long               12.610818
leave_location_lat               55.654072
leave_location_long              12.610818
park_zone                           103222
leave_zone                          103222
park_fuel                               45
leave_fuel                              43
engine                                  I3
moved                                  0.0
prev_customer                        False
next_customer                         True
movedTF                              False
action                               False
Name: 662569, dtype: object

## To PTG

In [17]:
with open('20180710.pickle', 'rb') as f:
    tenth = pickle.load(f)

tmp = tenth[pd.Timestamp('2018-07-10 00:01:27')]

In [35]:
attr, adj = tmp

In [27]:
attr[["prev_customer", "next_customer"]] = attr[["prev_customer", "next_customer"]].astype(int)

In [43]:
pd.get_dummies(attr, columns = ['park_zone', 'leave_zone', 'engine'], prefix = ['pz', 'lz', 'eng'])

Unnamed: 0_level_0,time,time_to_reservation,park_location_lat,park_location_long,leave_location_lat,leave_location_long,park_fuel,leave_fuel,moved,prev_customer,...,lz_185132,lz_185141,lz_185142,lz_185143,lz_185154,lz_185203,eng_118I,eng_COOPER,eng_I3,eng_I3 94
car,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
WBA1R5100J5K57795,2018-07-02 20:03:12,204.759171,55.621643,12.606266,55.621643,12.606266,76,75,0.000000,0,...,0,0,0,0,1,0,1,0,0,0
WBA1R5100J7B13946,2018-07-08 21:59:56,45.506668,55.733349,12.439386,55.733349,12.439386,96,96,0.000000,1,...,0,0,0,0,0,0,1,0,0,0
WBA1R5101J5K57921,2018-07-09 13:36:27,65.890274,55.636936,12.601698,55.636936,12.601698,92,92,0.000000,1,...,0,0,1,0,0,0,1,0,0,0
WBA1R5101J5K57983,2018-07-09 12:32:48,11.879167,55.630409,12.648902,55.630409,12.648902,78,78,0.000000,1,...,0,0,0,0,0,1,1,0,0,0
WBA1R5101J7B13938,2018-07-06 13:08:54,1255.367554,55.693050,12.565165,55.620831,12.607430,46,15,8456.951172,1,...,0,0,0,0,1,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WBY1Z21050V308173,2018-07-09 23:58:43,2.127778,55.685040,12.500325,55.685040,12.500325,59,58,0.000000,0,...,0,0,0,0,0,0,0,0,1,0
WMWXR3106KTK53875,2018-07-09 23:59:12,4.238333,55.654102,12.591456,55.654102,12.591456,65,65,0.000000,1,...,0,0,0,0,0,0,0,1,0,0
WBY1Z21080V307857,2018-07-09 23:59:13,9.128056,55.702923,12.545516,55.702923,12.545516,100,100,0.000000,0,...,0,0,0,0,0,0,0,0,1,0
WBY1Z21020V307966,2018-07-09 23:59:30,6.495555,55.660591,12.605568,55.660591,12.605568,66,92,0.000000,1,...,0,0,0,0,0,0,0,0,1,0


In [36]:
edge_index, edge_weight = utils.convert.from_scipy_sparse_matrix(adj)

In [64]:
all_park_zones = df.park_zone.unique()
all_leave_zones = df.leave_zone.unique()

In [88]:
from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer, label_binarize
OH_park_zones = OneHotEncoder(categories = all_park_zones)
MLB_park_zones = MultiLabelBinarizer(classes = all_park_zones)

In [91]:
LB_park_zones = label_binarize(start_df_graph.park_zone, classes = all_park_zones)
sum(LB_park_zones)

array([ 7,  1,  3,  0,  9,  0,  0,  2,  0,  4,  0,  4,  2,  3,  7,  2,  1,
        1,  2,  2,  1,  2,  2,  1,  1,  0,  2,  1,  1,  1,  0,  1,  1,  3,
        2,  1,  1,  0,  3,  0,  2,  4,  0,  4,  0,  3,  3,  1,  3,  1,  0,
        2,  1,  2,  0,  1,  1,  1,  1,  2,  1,  1,  0,  4,  1,  6,  0,  4,
        0,  1,  0,  1,  1,  0,  2,  1,  0,  0,  2,  5,  0,  1,  6,  1,  3,
        2,  6,  2,  7,  3,  0,  1,  0, 11,  1,  0,  1,  0,  1,  1,  0, 12,
        0,  4,  0,  0,  1,  1,  3,  1,  2,  1,  3, 15,  1,  0,  0,  1,  0,
        0,  0,  0,  2,  0, 24,  2,  1,  1,  0,  1,  0,  3,  0,  0,  3,  0,
        0,  0,  2,  0,  0,  9,  1,  3,  0,  0,  2,  2,  1,  0,  0,  0,  3,
        3,  0,  2,  3,  7,  0,  1,  1,  0,  0,  4,  0,  1,  1,  0,  7,  2,
        1,  7,  0,  0,  4,  1,  0,  0,  0,  2,  2,  0,  0,  0,  0,  0,  1,
        0,  3,  1,  0,  0,  1,  0,  0,  1,  2,  1,  4,  0,  2,  3,  0,  2,
        2,  1,  1,  2,  3,  4,  1,  0,  0,  1,  2,  3,  2,  3,  2,  2,  2,
        1,  1,  0,  0,  0

In [None]:
mlb.fit_transform(df['label']),columns=mlb.classes_, index=df.index