In [273]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import glob
import numpy as np
import networkx as nx
import tqdm
import datetime
import geopandas as gpd
import rtree

In [2]:
files = glob.glob("data/raw/SNData/*.csv")

dfs = []
for f in tqdm.tqdm(files):
    dfs.append(pd.read_csv(f, header=0, sep=";"))

Full_data = pd.concat(dfs,ignore_index=True) # Save this to interim
Full_data.to_csv('data/interim/Full_data.csv')

100%|██████████| 53/53 [00:07<00:00,  7.04it/s]


In [3]:
# Drop 53 rows with na values
df = Full_data.dropna()

# Rename Columns to English
df. columns = ['Customer_Group', 'CustomerID', 'CarID', 'Engine', 'Rental_flag', 'RentalID', 'Rental_Usage_Type', 'Reservation_Time', 'End_Time', 'Revenue', 'Distance', 'Drives', 'Reservation_Minutes','Fuel_Start','Fuel_End','Start_Lat', 'Start_Long', 'End_Lat', 'End_Long']

# Fix type
df = df.astype({'CustomerID': 'int32', 'RentalID': 'int64'})

# Drop drives as it has no info (only ones)
df = df.drop(columns = 'Drives')
df

Unnamed: 0,Customer_Group,CustomerID,CarID,Engine,Rental_flag,RentalID,Rental_Usage_Type,Reservation_Time,End_Time,Revenue,Distance,Reservation_Minutes,Fuel_Start,Fuel_End,Start_Lat,Start_Long,End_Lat,End_Long
0,Non_Customer,793639.0,WBY1Z21080V307924,I3,No,9.335872e+09,Private,24.03.2016 11:48:43,02.04.2016 10:00:19,0.00,0,0,0,0,55.678763,12.552853,0.000000,0.000000
1,Non_Customer,1035973.0,WBY1Z21080V307857,I3,No,9.336114e+09,Private,30.03.2016 15:37:39,01.04.2016 00:40:38,0.00,0,0,62,47,55.770626,12.519300,55.770389,12.518839
2,Non_Customer,998095.0,WBY1Z21020V307904,I3,No,9.336154e+09,Private,31.03.2016 13:08:16,05.04.2016 08:32:25,0.00,2,1,85,79,55.621588,12.606951,55.621532,12.606279
3,Non_Customer,999604.0,WBY1Z21010V307926,I3,No,9.336158e+09,Private,31.03.2016 14:43:00,01.04.2016 07:10:00,0.00,0,1,0,71,55.770077,12.518914,55.769746,12.519123
4,Non_Customer,1035969.0,WBY1Z21070V308210,,No,9.336160e+09,Private,31.03.2016 15:21:36,01.04.2016 14:24:17,0.00,0,1,53,52,55.770623,12.519791,55.770439,12.518937
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2633979,Customer,1070662.0,WBY1Z21010V308185,I3,No,9.345011e+09,Private,30.09.2016 23:39:05,30.09.2016 23:50:54,5.16,4,10,46,41,55.694700,12.553776,55.678740,12.587144
2633980,Customer,1041705.0,WBY1Z21080V308250,I3,No,9.345011e+09,Private,30.09.2016 23:42:18,30.09.2016 23:52:14,3.44,6,8,59,52,55.648401,12.542945,55.641310,12.615295
2633981,Customer,2112471.0,WBY1Z21020V308261,I3,No,9.345011e+09,Private,30.09.2016 23:33:39,30.09.2016 23:52:03,8.17,9,3,39,30,55.664744,12.580875,55.719856,12.540863
2633982,Customer,440147.0,WBY1Z21060V307954,I3,Yes,9.345011e+09,Private,30.09.2016 23:41:56,30.09.2016 23:57:30,6.88,9,4,44,35,55.710676,12.566043,55.667453,12.619987


In [4]:
# Remove all rows with a CarID as it can not be used
df = df[df.CarID != '0']

In [5]:
# Engine has two types of missing values that is alligned
df["Engine"].replace({" ": '0'}, inplace=True)

In [6]:
# If a CarID already has an engine type assign that to the missing ones
Engine_dict = {c: df[df.CarID == c].Engine.nunique() for c in df[df.Engine == '0'].CarID.unique()}
for car, engine in Engine_dict.items():
    if engine == 1:
        continue
    True_Engine = [x for x in df[df.CarID == car].Engine.unique() if x!= '0'][0]
    df.loc[(df.CarID == car) & (df.Engine == '0'), 'Engine'] = True_Engine

# Populate the rest manual based on ID
df.loc[(df.CarID == 'WBA1R5104J7B14310') & (df.Engine == '0'), 'Engine'] = '118I'
df.loc[(df.CarID == 'WBA1R5104J5K58061') & (df.Engine == '0'), 'Engine'] = '118I'
df.loc[(df.CarID == 'WBA1R5103K7D66678') & (df.Engine == '0'), 'Engine'] = '118I'
df.loc[(df.CarID == 'WBY8P2105K7D70350') & (df.Engine == '0'), 'Engine'] = 'I3 120'
df.loc[(df.CarID == 'WBY8P2102K7D70287') & (df.Engine == '0'), 'Engine'] = 'I3 120'

## Times

In [7]:
df['Reservation_Time'] = pd.to_datetime(df['Reservation_Time'], format="%d.%m.%Y %H:%M:%S")
df['End_Time'] = pd.to_datetime(df['End_Time'], format="%d.%m.%Y %H:%M:%S")

## Fix trips where same user use same car

In [None]:
# Split data on Car level
CarID_dict = dict(iter(df.groupby('CarID')))

def fix_merges(dataframe, max_time_diff = 60):
    # Get index where same customer uses the same car back to back
    diff0_iloc = [dataframe.index.get_loc(x) for x in dataframe.index[(dataframe.CustomerID.diff() == 0).tolist()]]

    # Find paris to be merged
    merge_pairs = [(idx-1,idx) for idx in diff0_iloc if dataframe.iloc[idx-1].End_Time+pd.to_timedelta(max_time_diff+dataframe.iloc[idx].Reservation_Minutes,'m') > dataframe.iloc[idx].Reservation_Time]

    # Model as graph to get cc
    graph_model = nx.Graph(merge_pairs)
    groups = [(min(cc),max(cc)) for cc in list(nx.connected_components(graph_model))]

    # Populate 
    for pair in groups:
        dataframe.loc[dataframe.index[pair[0]],['End_Time', 'Fuel_End', 'End_Lat', 'End_Long']] = dataframe.loc[dataframe.index[pair[1]],['End_Time', 'Fuel_End', 'End_Lat', 'End_Long']]


    # Delete now unwanted rows
    rows_to_delete = [x[1] for x in merge_pairs]
    dataframe.drop(index = [dataframe.index[x] for x in rows_to_delete], inplace = True)

    # Return fixed dataframe
    return dataframe

# Merge new datasets
dfs = []
for sub_df in tqdm.tqdm(CarID_dict.values()):
    dfs.append(fix_merges(sub_df))

df = pd.concat(dfs,ignore_index=True)

## Fix 0,0 locations

We also accept the other ones outside Copenhagen as the cars must have been there. They can be removed in the vacancy dataset

In [8]:
for i, row in df[(df.Start_Lat < 5)].iterrows():
    # Skip if first instance as it will unaffect vacancy
    sub_df = df[df.CarID == row.CarID].sort_values('RentalID')
    err_index = sub_df.index.get_loc(i)
    if err_index == 0:
        continue

    # Populate based on previous end 
    df.loc[i, ['Start_Lat', 'Start_Long']] = sub_df.iloc[err_index-1].loc[['End_Lat','End_Long']].values

In [156]:
for i, row in df[(df.End_Lat < 5)].iterrows():
    sub_df = df[df.CarID == row.CarID].sort_values('RentalID')
    err_index = sub_df.index.get_loc(i)

    # Will fail if last index
    try:
        df.loc[i, ['End_Lat', 'End_Long']] = sub_df.iloc[err_index+1].loc[['Start_Lat','Start_Long']].values
    except:
        continue
    

## Add zones

In [157]:
# Load shapefile and set projection
shapefile = gpd.read_file("../Zonekort/LTM_Zone3/zones_level3.shp")
shapefile = shapefile.to_crs(epsg=4326)

In [158]:
# Create a geoDF with geometry as starting point
gdf_start = gpd.GeoDataFrame(df, geometry= gpd.points_from_xy(df.Start_Long, df.Start_Lat))

# Set projection
gdf_start = gdf_start.set_crs(epsg=4326)

In [159]:
# Populate zones based on which zone they are within
gdpj_start  = gpd.sjoin(gdf_start, shapefile, op='within')
df['Start_Zone'] = gdpj_start.zoneid

In [160]:
# Populate the rest based on which zone they are closest too
Start_zone_filler = {x: shapefile.zoneid[shapefile.distance(df.loc[x].geometry).sort_values().index[0]] for x in df.index[df['Start_Zone'].isna()]}
df['Start_Zone'] = df['Start_Zone'].fillna(Start_zone_filler)


  Start_zone_filler = {x: shapefile.zoneid[shapefile.distance(df.loc[x].geometry).sort_values().index[0]] for x in df.index[df['Start_Zone'].isna()]}


In [161]:
# Create a geoDF with geometry as end point
gdf_end = gpd.GeoDataFrame(df, geometry= gpd.points_from_xy(df.End_Long, df.End_Lat))

# Set projection
gdf_end = gdf_end.set_crs(epsg=4326)

In [162]:
# Populate zones based on which zone they are within
gdpj_end  = gpd.sjoin(gdf_end, shapefile, op='within')
df['End_Zone'] = gdpj_end.zoneid

In [163]:
# Populate the rest based on which zone they are closest too
End_zone_filler = {x: shapefile.zoneid[shapefile.distance(df.loc[x].geometry).sort_values().index[0]] for x in df.index[df['End_Zone'].isna()]}
df['End_Zone'] = df['End_Zone'].fillna(End_zone_filler)


  End_zone_filler = {x: shapefile.zoneid[shapefile.distance(df.loc[x].geometry).sort_values().index[0]] for x in df.index[df['End_Zone'].isna()]}


In [164]:
# Remove geomery type and make IDs int columns
df = df.drop(columns = 'geometry')
df = df.astype({'CustomerID': 'int32', 'RentalID': 'int64', 'Start_Zone': 'int32','End_Zone': 'int32'})

In [165]:
# Check types
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2630477 entries, 0 to 2633983
Data columns (total 21 columns):
 #   Column               Dtype         
---  ------               -----         
 0   Customer_Group       object        
 1   CustomerID           int32         
 2   CarID                object        
 3   Engine               object        
 4   Rental_flag          object        
 5   RentalID             int64         
 6   Rental_Usage_Type    object        
 7   Reservation_Time     datetime64[ns]
 8   End_Time             datetime64[ns]
 9   Revenue              float64       
 10  Distance             int64         
 11  Reservation_Minutes  int64         
 12  Fuel_Start           int64         
 13  Fuel_End             int64         
 14  Start_Lat            float64       
 15  Start_Long           float64       
 16  End_Lat              float64       
 17  End_Long             float64       
 18  Start_Time           datetime64[ns]
 19  Start_Zone           

In [11]:
# Sweden and Bornholm
#df[df.Start_Long > 13].sort_values(by = 'Reservation_Time')

In [186]:
# Jutland
df[(df.Start_Long < 11) & (df.Start_Long > 0) & (df.Customer_Group == 'Customer')]

Unnamed: 0,Customer_Group,CustomerID,CarID,Engine,Rental_flag,RentalID,Rental_Usage_Type,Reservation_Time,End_Time,Revenue,Distance,Reservation_Minutes,Fuel_Start,Fuel_End,Start_Lat,Start_Long,End_Lat,End_Long
1670333,Customer,3603143.0,WBA31AA08L3H87538,X1 SDRIVE18I,No,9427403000.0,Private,2019-10-22 11:32:19,2019-10-22 17:28:16,0.0,391,1,18,84,56.234743,8.202535,55.696808,12.52799
1723321,Customer,898121.0,WBA1R510XJ5K57982,118I,No,9401443000.0,Private,2019-04-10 15:54:48,2019-04-10 15:57:12,0.0,0,1,53,53,56.137908,10.198739,56.137908,10.198739
2520877,Customer,2323038.0,WBA1R5102J5K58091,118I,No,9391161000.0,Private,2018-12-22 15:08:17,2018-12-22 18:19:38,10.0,168,76,69,36,55.400041,10.392545,55.70964,12.5646


In [213]:
#df[df.CarID == 'WBA31AA08L3H87538'].iloc[-260:-250]
#df[df.CarID == 'WBA1R510XJ5K57982'].sort_values(by='RentalID').iloc[1560:1570]
df[df.CarID == 'WBA1R5102J5K58091'].sort_values(by='RentalID').iloc[1030:1040]

Unnamed: 0,Customer_Group,CustomerID,CarID,Engine,Rental_flag,RentalID,Rental_Usage_Type,Reservation_Time,End_Time,Revenue,Distance,Reservation_Minutes,Fuel_Start,Fuel_End,Start_Lat,Start_Long,End_Lat,End_Long
2518534,Customer,2112930.0,WBA1R5102J5K58091,118I,No,9391107000.0,Private,2018-12-21 18:18:27,2018-12-21 18:42:39,6.4,5,11,96,94,55.668159,12.545845,55.676779,12.598227
2519316,Customer,1074502.0,WBA1R5102J5K58091,118I,No,9391123000.0,Private,2018-12-21 22:55:51,2018-12-21 23:31:29,44.0,8,14,94,94,55.676779,12.598227,55.632555,12.574592
2519372,Customer,2111583.0,WBA1R5102J5K58091,118I,No,9391124000.0,Private,2018-12-21 23:33:59,2018-12-21 23:49:39,38.4,5,5,94,94,55.632555,12.574592,55.65162,12.550483
2519794,Customer,2594312.0,WBA1R5102J5K58091,118I,No,9391132000.0,Private,2018-12-22 08:01:20,2018-12-22 08:24:49,33.6,8,10,94,94,55.65162,12.550483,55.702349,12.553621
2519938,Customer,2787952.0,WBA1R5102J5K58091,118I,No,9391136000.0,Private,2018-12-22 09:19:15,2018-12-22 09:49:24,48.0,7,15,94,94,55.702349,12.553621,55.730999,12.568196
2520590,Customer,2323038.0,WBA1R5102J5K58091,118I,No,9391156000.0,Private,2018-12-22 10:53:05,2018-12-22 14:34:27,0.0,176,10,92,69,55.730999,12.568196,55.400041,10.392545
2520877,Customer,2323038.0,WBA1R5102J5K58091,118I,No,9391161000.0,Private,2018-12-22 15:08:17,2018-12-22 18:19:38,10.0,168,76,69,36,55.400041,10.392545,55.70964,12.5646
2521299,Customer,809426.0,WBA1R5102J5K58091,118I,No,9391169000.0,Business,2018-12-22 18:29:34,2018-12-22 19:08:53,67.2,4,19,36,34,55.70964,12.5646,55.688064,12.532981
2521371,Customer,3068156.0,WBA1R5102J5K58091,118I,No,9391171000.0,Private,2018-12-22 19:14:02,2018-12-23 01:09:54,320.0,103,3,34,15,55.688064,12.532981,55.695059,12.538034
2522188,Customer,2824769.0,WBA1R5102J5K58091,118I,No,9391189000.0,Private,2018-12-23 07:54:09,2018-12-23 08:10:18,44.8,4,3,13,15,55.695059,12.538034,55.667226,12.549202


In [9]:
#df[df.Start_Lat>0].sort_values(by = 'Start_Lat')

In [12]:
# Car in Germany in the middle of the data..
#df[df.CarID == 'WBY1Z21040V308181'].sort_values(by = 'Reservation_Time').iloc[-30:-20]

## Weird times

In [21]:
# Winter Time
WinterTimeIndex = df[(df.Reservation_Time > df.End_Time) & (df.End_Time.apply(lambda x: x.month) == 10) & (tmp.End_Time.apply(lambda x: x.hour) < 4)].index
WinterTimeIndexBack = [2179859, 2179865, 1683947, 1683948]
WinterTimeIndexForward = [x for x in WinterTimeIndex if x not in WinterTimeIndexBack]
df.loc[WinterTimeIndexBack, 'Reservation_Time'] = df.loc[WinterTimeIndexBack, 'Reservation_Time'] - pd.to_timedelta(1,'h')
df.loc[WinterTimeIndexForward, 'End_Time'] = df.loc[WinterTimeIndexForward, 'End_Time'] + pd.to_timedelta(1,'h')

In [56]:
# Remove observations of cars going out of service
OutOfServiceIndex = df[(df.Reservation_Time > df.End_Time) & (df.End_Time.apply(lambda x: x.hour) == 0) & (df.End_Time.apply(lambda x: x.minute) == 0)].index
df = df.drop(index = OutOfServiceIndex)

In [122]:
df[df.Reservation_Time > df.End_Time].sort_values(by = 'RentalID') # I would drop them since they will not introduce more vacancy time. 39 unique from 41 obs

Unnamed: 0,Customer_Group,CustomerID,CarID,Engine,Rental_flag,RentalID,Rental_Usage_Type,Reservation_Time,End_Time,Revenue,Distance,Reservation_Minutes,Fuel_Start,Fuel_End,Start_Lat,Start_Long,End_Lat,End_Long
1596912,Non_Customer,819723.0,WBY1Z21000V307870,I3,No,9327687000.0,Private,2015-09-15 08:29:39,2015-09-15 08:17:29,0.0,4,4070,62,56,55.675709,12.543872,55.696982,12.528448
1597105,Non_Customer,819723.0,WBY1Z21000V307870,I3,No,9327709000.0,Private,2015-09-15 15:28:32,2015-09-15 15:23:47,0.0,1,359,51,49,55.674612,12.560151,55.676148,12.559185
1598329,Non_Customer,793639.0,WBY1Z21080V308202,I3,No,9327795000.0,Private,2015-09-17 12:07:09,2015-09-17 07:25:39,0.0,0,1219,82,82,55.666756,12.546134,55.66673,12.546229
1599057,Non_Customer,803953.0,WBY1Z210X0V308041,I3,No,9327850000.0,Private,2015-09-18 13:55:04,2015-09-18 13:53:03,0.0,0,305,49,48,55.653059,12.615527,55.653404,12.61534
1601096,Non_Customer,793639.0,WBY1Z21000V307898,I3,No,9327944000.0,Private,2015-09-20 19:14:44,2015-09-20 18:48:50,0.0,21,2834,47,28,55.676246,12.559212,55.668684,12.543248
1602012,Non_Customer,803953.0,WBY1Z21050V308092,I3,No,9328031000.0,Private,2015-09-22 14:21:29,2015-09-22 14:20:31,0.0,1,6,38,36,55.683347,12.504722,55.68337,12.504774
1605308,Customer,823507.0,WBY1Z21080V307969,I3,No,9328244000.0,Private,2015-09-27 12:46:41,2015-09-27 07:46:58,0.0,14,952,38,11,55.631229,12.602787,55.63086,12.603807
1605310,Customer,837034.0,WBY1Z21010V308218,I3,No,9328244000.0,Private,2015-09-27 12:47:35,2015-09-27 07:46:56,6.62,9,1010,44,34,55.674052,12.503899,55.708211,12.57512
1605311,Customer,812736.0,WBY1Z21090V307995,I3,No,9328244000.0,Private,2015-09-27 12:47:17,2015-09-27 07:46:56,0.0,18,0,42,6,55.672627,12.485135,55.67203,12.495505
1605716,Non_Customer,793639.0,WBY1Z21050V307847,I3,No,9328261000.0,Private,2015-09-27 17:39:53,2015-09-27 16:59:07,0.0,10,272,21,12,55.706551,12.517922,55.696049,12.570738


### Back-to-back from same customer

In [334]:
df = df.astype({'CustomerID': 'int32', 'RentalID': 'int64'})


In [338]:
# Split data on Car level
CarID_dict = dict(iter(df.groupby('CarID')))

def fix_merges(dataframe, max_time_diff = 60):
    # Get index where same customer uses the same car back to back
    diff0_iloc = [dataframe.index.get_loc(x) for x in dataframe.index[(dataframe.CustomerID.diff() == 0).tolist()]]

    # Find paris to be merged
    merge_pairs = [(idx-1,idx) for idx in diff0_iloc if dataframe.iloc[idx-1].End_Time+pd.to_timedelta(max_time_diff+dataframe.iloc[idx].Reservation_Minutes,'m') > dataframe.iloc[idx].Reservation_Time]

    # Model as graph to get cc
    graph_model = nx.Graph(merge_pairs)
    groups = [(min(cc),max(cc)) for cc in list(nx.connected_components(graph_model))]

    # Populate 
    for pair in groups:
        dataframe.loc[dataframe.index[pair[0]],['End_Time', 'Fuel_End', 'End_Lat', 'End_Long']] = dataframe.loc[dataframe.index[pair[1]],['End_Time', 'Fuel_End', 'End_Lat', 'End_Long']]


    # Delete now unwanted rows
    rows_to_delete = [x[1] for x in merge_pairs]
    dataframe.drop(index = [dataframe.index[x] for x in rows_to_delete], inplace = True)

    # Return fixed dataframe
    return dataframe

# Merge new datasets
dfs = []
for sub_df in tqdm.tqdm(CarID_dict.values()):
    dfs.append(fix_merges(sub_df))

df = pd.concat(dfs,ignore_index=True)

In [340]:
dfs = []
for sub_df in tqdm.tqdm(CarID_dict.values()):
    dfs.append(fix_merges(sub_df))

df = pd.concat(dfs,ignore_index=True)

100%|██████████| 1021/1021 [01:02<00:00, 16.28it/s]


In [22]:
# How should we treat: 1h threshold (make varialbe)
df.loc[[2186270,2186286,2186449]]

Unnamed: 0,Customer_Group,CustomerID,CarID,Engine,Rental_flag,RentalID,Rental_Usage_Type,Reservation_Time,End_Time,Revenue,Distance,Reservation_Minutes,Fuel_Start,Fuel_End,Start_Lat,Start_Long,End_Lat,End_Long
2186270,Customer,2146082.0,WMWXU7108KTM90583,COOPER,No,9387938000.0,Private,2018-10-30 14:29:13,2018-10-30 14:33:13,3.2,0,4,0,0,55.662179,12.534083,55.662179,12.534083
2186286,Customer,2146082.0,WMWXU7108KTM90583,COOPER,No,9387938000.0,Business,2018-10-30 14:33:53,2018-10-30 14:42:56,13.87,2,1,30,30,55.662179,12.534083,55.669684,12.546812
2186449,Customer,2146082.0,WMWXU7108KTM90583,COOPER,No,9387941000.0,Business,2018-10-30 15:27:20,2018-10-30 15:38:09,8.67,1,6,30,27,55.669684,12.546812,55.664356,12.538918


In [35]:
df.loc[[2608261,2608337,2608583]]

Unnamed: 0,Customer_Group,CustomerID,CarID,Engine,Rental_flag,RentalID,Rental_Usage_Type,Reservation_Time,End_Time,Revenue,Distance,Reservation_Minutes,Fuel_Start,Fuel_End,Start_Lat,Start_Long,End_Lat,End_Long
2608261,Customer,2982250.0,WMWXU7108KTM90583,COOPER,No,9380426000.0,Private,2018-06-29 19:03:33,2018-06-29 19:17:29,19.2,4,3,77,75,55.727936,12.340871,55.730169,12.355415
2608337,Customer,2982250.0,WMWXU7108KTM90583,COOPER,No,9380428000.0,Private,2018-06-29 19:31:20,2018-06-29 19:57:11,28.8,6,9,75,75,55.730169,12.355415,55.728185,12.340968
2608583,Customer,2982250.0,WMWXU7108KTM90583,COOPER,No,9380434000.0,Private,2018-06-29 21:17:30,2018-06-29 21:23:16,6.4,2,2,75,72,55.728185,12.340968,55.730071,12.355405


In [37]:
df.loc[[1510,1546]]

Unnamed: 0,Customer_Group,CustomerID,CarID,Engine,Rental_flag,RentalID,Rental_Usage_Type,Reservation_Time,End_Time,Revenue,Distance,Reservation_Minutes,Fuel_Start,Fuel_End,Start_Lat,Start_Long,End_Lat,End_Long
1510,Customer,1037492.0,WBY1Z21020V308146,I3,No,9336274000.0,Private,2016-04-02 20:08:20,2016-04-02 20:27:16,8.16,0,10,73,73,55.68253,12.581634,55.68253,12.581625
1546,Customer,1037492.0,WBY1Z21020V308146,I3,No,9336275000.0,Private,2016-04-02 20:34:12,2016-04-02 20:58:38,2.15,15,1,73,55,55.68253,12.581625,55.786269,12.521972


### What about start-time

In [123]:
testdf = df.copy(deep=True)
testdf['Start_Time'] = [row.Reservation_Time+datetime.timedelta(minutes=row.Reservation_Minutes) for _, row in testdf.iterrows()]

In [214]:
testdf[testdf.Start_Time-pd.to_timedelta(1,'m') > testdf.End_Time].sort_values(by = 'RentalID') # Using Reservation Time in the data should eleminate the risk of having trips at the same time

Unnamed: 0,Customer_Group,CustomerID,CarID,Engine,Rental_flag,RentalID,Rental_Usage_Type,Reservation_Time,End_Time,Revenue,Distance,Reservation_Minutes,Fuel_Start,Fuel_End,Start_Lat,Start_Long,End_Lat,End_Long,Start_Time
1969600,Non_Customer,793664.0,WBY1Z21030V308138,I3,No,9.326529e+09,Private,2015-08-19 14:52:45,2015-08-19 14:53:35,0.0,0,4,97,97,55.437567,11.814045,55.437079,11.816411,2015-08-19 14:56:45
1969601,Customer,790044.0,WBY1Z21010V307960,I3,No,9.326529e+09,Private,2015-08-19 15:12:24,2015-08-19 15:12:53,0.0,0,4,68,68,55.671158,12.583635,55.671159,12.583636,2015-08-19 15:16:24
1969652,Non_Customer,793664.0,WBY1Z21070V308224,I3,No,9.326806e+09,Private,2015-08-26 09:45:51,2015-08-26 09:48:27,0.0,0,6,85,84,55.691981,12.619608,55.691980,12.619609,2015-08-26 09:51:51
1969666,Non_Customer,793664.0,WBY1Z21090V308256,I3,No,9.326809e+09,Private,2015-08-26 10:37:22,2015-08-26 10:38:15,0.0,0,5,78,78,55.692095,12.619449,55.692095,12.619450,2015-08-26 10:42:22
1969680,Non_Customer,793664.0,WBY1Z21040V308214,I3,No,9.326815e+09,Private,2015-08-26 13:01:52,2015-08-26 13:06:38,0.0,0,9,85,85,55.692176,12.619752,55.691739,12.619268,2015-08-26 13:10:52
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1411464,Non_Customer,793639.0,WMWXU7106KTM91733,COOPER,No,9.430461e+09,Private,2019-12-14 12:18:02,2019-12-14 13:33:37,0.0,18,150,62,52,55.706162,12.534806,55.630225,12.648690,2019-12-14 14:48:02
1415068,Non_Customer,793639.0,WBY8P2107K7E72622,I3 120,No,9.430505e+09,Private,2019-12-15 14:21:42,2019-12-15 14:51:06,0.0,18,116,94,85,55.622686,12.615552,55.664729,12.540505,2019-12-15 16:17:42
1418806,Non_Customer,793639.0,WMWXR3102KTK54716,COOPER,No,9.430574e+09,Private,2019-12-16 18:45:19,2019-12-16 20:37:48,0.0,21,130,90,81,55.714542,12.564363,55.630140,12.649162,2019-12-16 20:55:19
1425507,Non_Customer,793639.0,WBA31AA04L3L04754,X1 SDRIVE18I,No,9.430690e+09,Private,2019-12-19 06:30:43,2019-12-19 08:07:53,0.0,0,101,98,98,55.634405,12.649611,55.634405,12.649611,2019-12-19 08:11:43


In [125]:
testdf_sorted = testdf.sort_values(by = 'RentalID').sort_values(by = ['CarID','RentalID'])

In [81]:
bad_idx = testdf[testdf.Start_Time > testdf.End_Time].sort_values(by = 'RentalID').index
bad_idx

Int64Index([1969584, 1969593, 1969597, 1969598, 1969600, 1969601, 1969649,
            1969652, 1969666, 1969680,
            ...
            1447696, 1449028, 1449260, 1453217, 1454055, 1454153, 1456301,
            1456890, 1457588, 1457729],
           dtype='int64', length=38612)

In [93]:
bad_iloc_idx = [testdf_sorted.index.get_loc(x) for x in bad_idx]
bad_iloc_idx[1000]

425716

In [94]:
testdf_sorted.iloc[(425716-1):(425716+2)]

Unnamed: 0,Customer_Group,CustomerID,CarID,Engine,Rental_flag,RentalID,Rental_Usage_Type,Reservation_Time,End_Time,Revenue,Distance,Reservation_Minutes,Fuel_Start,Fuel_End,Start_Lat,Start_Long,End_Lat,End_Long,Start_Time
1598740,Customer,824476.0,WBY1Z21000V308131,I3,No,9327820000.0,Private,2015-09-17 21:58:19,2015-09-17 22:13:25,6.78,3,8,99,95,55.673198,12.593009,55.675069,12.547551,2015-09-17 22:06:19
1599000,Customer,825449.0,WBY1Z21000V308131,I3,No,9327845000.0,Private,2015-09-18 12:24:54,2015-09-18 12:32:15,3.39,2,18,95,94,55.675069,12.547551,55.667909,12.542775,2015-09-18 12:42:54
1599991,Customer,812557.0,WBY1Z21000V308131,I3,No,9327903000.0,Private,2015-09-19 15:20:39,2015-09-19 15:36:15,6.78,4,0,94,89,55.667909,12.542775,55.677641,12.582207,2015-09-19 15:20:39


### Trips longer than 5 days

In [147]:
tmp2 = df[df.Reservation_Time+pd.to_timedelta(5,'d') < df.End_Time].sort_values(by = 'RentalID')
#tmp2[tmp2.Customer_Group == 'Customer'] # 1152167 should be 02-26 as reservation. Rest is good even though 601154 has a very close follow up
tmp2[tmp2.Customer_Group == 'Non_Customer']

Unnamed: 0,Customer_Group,CustomerID,CarID,Engine,Rental_flag,RentalID,Rental_Usage_Type,Reservation_Time,End_Time,Revenue,Distance,Reservation_Minutes,Fuel_Start,Fuel_End,Start_Lat,Start_Long,End_Lat,End_Long
1969590,Non_Customer,790031.0,WBY1Z21090V307818,I3,No,9.326442e+09,Private,2015-08-17 13:59:14,2015-08-27 14:48:58,0.0,334,0,16,99,55.726729,12.582682,55.671036,12.583519
1591238,Non_Customer,808838.0,WBY1Z210X0V307780,I3,No,9.327127e+09,Private,2015-09-02 16:28:12,2015-09-13 12:19:27,0.0,351,0,3,99,55.706360,12.529268,55.674361,12.560594
1593151,Non_Customer,817026.0,WBY1Z21060V308053,I3,No,9.327351e+09,Private,2015-09-07 17:30:34,2015-09-24 14:08:26,0.0,581,0,19,99,55.674417,12.560560,55.676246,12.559132
1596004,Non_Customer,808838.0,WBY1Z210X0V307780,I3,No,9.327605e+09,Private,2015-09-13 13:12:02,2015-09-29 09:57:05,0.0,499,0,9,99,55.674361,12.560594,55.737039,12.477291
1603325,Non_Customer,817026.0,WBY1Z21030V307927,I3,No,9.328125e+09,Private,2015-09-24 14:37:50,2015-09-30 11:47:42,0.0,266,0,87,55,55.676015,12.559150,55.630216,12.650906
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1878420,Non_Customer,893118.0,WBY8P2101K7E71739,I3 120,No,9.427740e+09,Private,2019-10-28 07:42:03,2019-11-04 14:19:04,0.0,1,2,59,55,55.621566,12.606274,55.621597,12.606271
1688993,Non_Customer,793639.0,WBY1Z21090V308094,I3,No,9.427801e+09,Private,2019-10-07 03:58:33,2019-10-29 16:50:51,0.0,0,31834,5,0,55.716240,12.567010,0.000000,0.000000
1689528,Non_Customer,793639.0,WBY1Z21020V308275,I3,No,9.427811e+09,Private,2019-10-21 03:19:49,2019-10-29 16:50:50,0.0,0,11899,0,0,55.716240,12.567011,0.000000,0.000000
1935517,Non_Customer,793639.0,WBY8P2105K7D87701,I3 120,No,9.429062e+09,Private,2019-11-12 11:22:34,2019-11-20 16:59:58,0.0,0,11361,0,0,55.663175,12.586320,0.000000,0.000000


In [148]:
subdf = df[df.CarID == 'WBY8P2101K7E71739'].sort_values(by = 'RentalID')
tidx = subdf.index.get_loc(1878420)
subdf.iloc[(tidx-5):(tidx+5)]

Unnamed: 0,Customer_Group,CustomerID,CarID,Engine,Rental_flag,RentalID,Rental_Usage_Type,Reservation_Time,End_Time,Revenue,Distance,Reservation_Minutes,Fuel_Start,Fuel_End,Start_Lat,Start_Long,End_Lat,End_Long
1683723,Non_Customer,1109469.0,WBY8P2101K7E71739,I3 120,No,9427687000.0,Private,2019-10-26 21:34:23,2019-10-26 23:31:17,0.0,1,112,38,82,55.645999,12.640459,55.6568,12.636684
1683912,Customer,838748.0,WBY8P2101K7E71739,I3 120,No,9427691000.0,Private,2019-10-27 01:23:33,2019-10-27 01:46:40,30.4,9,5,82,79,55.6568,12.636684,55.672651,12.543489
1684274,Customer,854489.0,WBY8P2101K7E71739,I3 120,No,9427697000.0,Private,2019-10-27 09:24:26,2019-10-27 12:11:03,240.0,42,17,79,64,55.672651,12.543489,55.672197,12.542612
1684914,Customer,2115086.0,WBY8P2101K7E71739,I3 120,No,9427708000.0,Private,2019-10-27 13:11:30,2019-10-27 13:24:01,16.13,1,6,64,63,55.672197,12.542612,55.676154,12.560266
1685315,Non_Customer,793639.0,WBY8P2101K7E71739,I3 120,No,9427714000.0,Private,2019-10-27 13:29:05,2019-10-27 15:54:13,0.0,9,126,63,59,55.676154,12.560266,55.621566,12.606274
1878420,Non_Customer,893118.0,WBY8P2101K7E71739,I3 120,No,9427740000.0,Private,2019-10-28 07:42:03,2019-11-04 14:19:04,0.0,1,2,59,55,55.621566,12.606274,55.621597,12.606271
1890748,Non_Customer,793639.0,WBY8P2101K7E71739,I3 120,No,9428188000.0,Private,2019-11-04 17:39:43,2019-11-04 19:24:13,0.0,4,94,54,51,55.621597,12.606271,55.630185,12.648978
1890850,Customer,3640148.0,WBY8P2101K7E71739,I3 120,No,9428190000.0,Private,2019-11-04 19:53:00,2019-11-04 20:24:37,17.56,39,1,51,33,55.630185,12.648978,55.718748,12.554637
1891356,Non_Customer,1109469.0,WBY8P2101K7E71739,I3 120,No,9428199000.0,Private,2019-11-04 21:41:14,2019-11-05 00:47:34,0.0,2,179,31,100,55.718748,12.554637,55.708983,12.550849
1891589,Customer,2288309.0,WBY8P2101K7E71739,I3 120,No,9428204000.0,Private,2019-11-05 07:09:55,2019-11-05 07:48:34,4.18,14,13,100,94,55.708983,12.550849,55.73331,12.444929


In [160]:
tmp2.iloc[:20]

Unnamed: 0,Customer_Group,CustomerID,CarID,Engine,Rental_flag,RentalID,Rental_Usage_Type,Reservation_Time,End_Time,Revenue,Distance,Reservation_Minutes,Fuel_Start,Fuel_End,Start_Lat,Start_Long,End_Lat,End_Long
1969590,Non_Customer,790031.0,WBY1Z21090V307818,I3,No,9326442000.0,Private,2015-08-17 13:59:14,2015-08-27 14:48:58,0.0,334,0,16,99,55.726729,12.582682,55.671036,12.583519
1591238,Non_Customer,808838.0,WBY1Z210X0V307780,I3,No,9327127000.0,Private,2015-09-02 16:28:12,2015-09-13 12:19:27,0.0,351,0,3,99,55.70636,12.529268,55.674361,12.560594
1593151,Non_Customer,817026.0,WBY1Z21060V308053,I3,No,9327351000.0,Private,2015-09-07 17:30:34,2015-09-24 14:08:26,0.0,581,0,19,99,55.674417,12.56056,55.676246,12.559132
1596004,Non_Customer,808838.0,WBY1Z210X0V307780,I3,No,9327605000.0,Private,2015-09-13 13:12:02,2015-09-29 09:57:05,0.0,499,0,9,99,55.674361,12.560594,55.737039,12.477291
1603325,Non_Customer,817026.0,WBY1Z21030V307927,I3,No,9328125000.0,Private,2015-09-24 14:37:50,2015-09-30 11:47:42,0.0,266,0,87,55,55.676015,12.55915,55.630216,12.650906
706352,Non_Customer,819723.0,WBY1Z21010V307859,I3,No,9328433000.0,Private,2015-09-25 12:55:42,2015-10-01 10:07:13,0.0,0,0,17,0,55.652858,12.613396,0.0,0.0
727799,Non_Customer,793639.0,WBY1Z21060V307811,I3,No,9330365000.0,Private,2015-11-14 16:45:08,2015-12-02 12:03:44,0.0,0,0,30,30,55.679698,12.609623,55.674791,12.605215
733006,Non_Customer,793639.0,WBY1Z21050V308058,I3,No,9331390000.0,Private,2015-12-08 09:29:30,2015-12-16 15:03:55,0.0,14,0,55,34,55.684804,12.537323,55.769769,12.519382
734244,Non_Customer,793639.0,WBY1Z21050V308111,I3,No,9331489000.0,Private,2015-12-10 13:47:52,2015-12-18 16:06:27,0.0,0,0,6,77,55.676097,12.56057,55.620474,12.607469
734806,Non_Customer,793639.0,WBY1Z21040V307953,I3,No,9331523000.0,Private,2015-12-11 09:52:45,2015-12-18 16:02:21,0.0,6,0,2,99,55.730663,12.361931,55.621929,12.604761


In [179]:
subdf = df[df.CarID == 'WBY1Z21060V308179'].sort_values(by = 'RentalID')
tidx = subdf.index.get_loc(1496325)
subdf.iloc[(tidx-3):(tidx+3)]

Unnamed: 0,Customer_Group,CustomerID,CarID,Engine,Rental_flag,RentalID,Rental_Usage_Type,Reservation_Time,End_Time,Revenue,Distance,Reservation_Minutes,Fuel_Start,Fuel_End,Start_Lat,Start_Long,End_Lat,End_Long
1799288,Customer,2476032.0,WBY1Z21060V308179,I3,No,9403198000.0,Private,2019-05-06 16:25:58,2019-05-06 16:39:34,41.6,5,1,86,81,55.675273,12.583606,55.650445,12.552424
1799467,Customer,2852842.0,WBY1Z21060V308179,I3,No,9403201000.0,Private,2019-05-06 17:01:06,2019-05-06 17:48:21,69.6,8,18,81,74,55.650445,12.552424,55.648361,12.469846
1800666,Non_Customer,1109469.0,WBY1Z21060V308179,I3,No,9403225000.0,Private,2019-05-06 20:38:41,2019-05-07 04:15:36,0.0,18,417,66,79,55.648361,12.469846,55.634538,12.648752
1496325,Non_Customer,793639.0,WBY1Z21060V308179,I3,No,9407713000.0,0,2019-06-13 11:24:22,2019-06-18 16:08:28,0.0,0,0,0,0,55.62993,12.650375,0.0,0.0
1496324,Non_Customer,793639.0,WBY1Z21060V308179,I3,No,9407713000.0,Private,2019-06-13 11:24:18,2019-06-18 16:08:28,0.0,0,0,0,0,55.62993,12.650375,0.0,0.0


In [176]:
colc = []
for i, row in tmp2.iterrows():
    subdf = df[df.CarID == row.CarID].sort_values(by = 'RentalID')
    tidx = subdf.index.get_loc(i)
    if tidx+1 == len(subdf):
        colc.append(False)
        continue
    try:
        colc.append(subdf.iloc[tidx]['End_Time']>=subdf.iloc[tidx+1]['Reservation_Time'])
    except:
        print(i)
    

In [177]:
tmp2[colc]

Unnamed: 0,Customer_Group,CustomerID,CarID,Engine,Rental_flag,RentalID,Rental_Usage_Type,Reservation_Time,End_Time,Revenue,Distance,Reservation_Minutes,Fuel_Start,Fuel_End,Start_Lat,Start_Long,End_Lat,End_Long
2582120,Non_Customer,793639.0,WBA1R5104J5K58092,118I,No,9379723000.0,Private,2018-06-09 12:13:53,2018-06-19 12:42:35,0.0,1,14420,75,75,55.675362,12.551718,55.651274,12.491368
1496325,Non_Customer,793639.0,WBY1Z21060V308179,I3,No,9407713000.0,0,2019-06-13 11:24:22,2019-06-18 16:08:28,0.0,0,0,0,0,55.62993,12.650375,0.0,0.0


## Other columns

In [29]:
df[df.Rental_flag == 'Yes']

Unnamed: 0,Customer_Group,CustomerID,CarID,Engine,Rental_flag,RentalID,Rental_Usage_Type,Reservation_Time,End_Time,Revenue,...,Reservation_Minutes,Fuel_Start,Fuel_End,Start_Lat,Start_Long,End_Lat,End_Long,Start_Time,Start_Zone,End_Zone
92,Customer,1041131,WBY1Z21030V307927,I3,Yes,9336190387,Private,2016-01-04 08:14:43,2016-01-04 08:32:31,8.80,...,8,57,50,55.671970,12.561281,55.654693,12.612746,2016-01-04 08:22:43,102182,103224
162,Customer,341960,WBY1Z210X0V308248,I3,Yes,9336195348,Private,2016-01-04 09:13:18,2016-01-04 09:26:30,6.01,...,3,100,90,55.666223,12.544056,55.632389,12.574830,2016-01-04 09:16:18,102821,103291
226,Customer,19617,WBY1Z21020V308048,I3,Yes,9336201372,Private,2016-01-04 11:08:34,2016-01-04 11:43:16,15.03,...,4,71,57,55.687231,12.549290,55.692560,12.546405,2016-01-04 11:12:34,102444,102453
227,Customer,24827,WBY1Z21040V307791,I3,Yes,9336201380,Private,2016-01-04 11:08:48,2016-01-04 11:49:28,18.68,...,9,62,51,55.632424,12.644685,55.681026,12.604476,2016-01-04 11:17:48,185125,103132
298,Customer,110192,WBY1Z210X0V308072,I3,Yes,9336207064,Private,2016-01-04 12:49:24,2016-01-04 13:12:59,11.38,...,9,55,49,55.708624,12.578704,55.671733,12.539777,2016-01-04 12:58:24,102343,147131
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2633885,Customer,440147,WBY1Z21060V307954,I3,Yes,9345006009,Private,2016-09-30 20:54:26,2016-09-30 21:16:58,9.88,...,8,55,44,55.662403,12.623958,55.710676,12.566043,2016-09-30 21:02:26,103151,102412
2633903,Customer,1032015,WBY1Z21010V307814,I3,Yes,9345007233,Private,2016-09-30 21:16:24,2016-09-30 22:02:05,16.12,...,11,83,97,55.662269,12.604593,55.685559,12.586609,2016-09-30 21:27:24,103172,102223
2633919,Customer,457665,WBY1Z21060V308070,I3,Yes,9345008207,Private,2016-09-30 21:52:27,2016-09-30 22:29:45,22.67,...,1,56,25,55.630208,12.648793,55.665015,12.556939,2016-09-30 21:53:27,185203,102812
2633966,Customer,427906,WBY1Z21080V307955,I3,Yes,9345010629,Private,2016-09-30 23:02:50,2016-09-30 23:13:13,5.37,...,1,97,92,55.665828,12.565080,55.682833,12.584502,2016-09-30 23:03:50,102181,102223


In [31]:
df.sort_values(by = 'Reservation_Minutes', ascending=False).iloc[:42]

Unnamed: 0,Customer_Group,CustomerID,CarID,Engine,Rental_flag,RentalID,Rental_Usage_Type,Reservation_Time,End_Time,Revenue,...,Reservation_Minutes,Fuel_Start,Fuel_End,Start_Lat,Start_Long,End_Lat,End_Long,Start_Time,Start_Zone,End_Zone
1493665,Non_Customer,793639,WBY1Z21050V307993,I3,No,9407597629,Private,2019-02-05 14:44:54,2019-06-18 16:08:50,0.0,...,59009,0,0,55.634966,12.650418,0.0,0.0,2019-03-18 14:13:54,185121,550062
1508476,Non_Customer,793639,WBY1Z21050V307766,I3,No,9408404530,Private,2019-05-13 14:02:03,2019-06-18 16:08:51,0.0,...,50140,0,0,55.635113,12.650348,0.0,0.0,2019-06-17 09:42:03,185121,550062
1508467,Non_Customer,793639,WBY1Z21060V307999,I3,No,9408400365,Private,2019-05-13 14:05:58,2019-06-18 16:08:50,0.0,...,50136,0,0,55.634987,12.650426,0.0,0.0,2019-06-17 09:41:58,185121,550062
1508474,Non_Customer,793639,WBY1Z21030V307989,I3,No,9408401227,Private,2019-05-13 14:06:03,2019-06-18 16:08:51,0.0,...,50135,0,0,55.635009,12.65049,0.0,0.0,2019-06-17 09:41:03,185121,550062
1499203,Non_Customer,793639,WBY1Z210X0V307911,I3,No,9408102693,Private,2019-05-13 13:22:19,2019-06-18 16:08:50,0.0,...,45960,0,0,55.634986,12.650334,0.0,0.0,2019-06-14 11:22:19,185121,550062
1499161,Non_Customer,793639,WBY1Z21080V308040,I3,No,9408101113,Private,2019-05-13 13:19:56,2019-06-18 16:08:51,0.0,...,45949,0,0,55.634991,12.650396,0.0,0.0,2019-06-14 11:08:56,185121,550062
1511323,Non_Customer,793639,WBY1Z21060V308098,I3,No,9408845467,Private,2019-05-27 00:54:05,2019-06-18 16:08:49,0.0,...,32225,0,0,55.634453,12.649101,0.0,0.0,2019-06-18 09:59:05,185121,550062
1688993,Non_Customer,793639,WBY1Z21090V308094,I3,No,9427800504,Private,2019-07-10 03:58:33,2019-10-29 16:50:51,0.0,...,31834,5,0,55.71624,12.56701,0.0,0.0,2019-08-01 06:32:33,102412,550062
1511307,Non_Customer,793639,WBY1Z210X0V308007,I3,No,9408845049,Private,2019-05-27 09:11:10,2019-06-18 16:08:49,0.0,...,31722,0,0,55.635099,12.650519,0.0,0.0,2019-06-18 09:53:10,185121,550062
1763108,Non_Customer,793639,WBA1R5104J5K58061,118I,No,9402424970,Private,2019-03-04 19:56:41,2019-04-25 15:36:02,0.0,...,31069,0,0,55.677168,12.580413,0.0,0.0,2019-03-26 09:45:41,102111,550062


In [33]:
df[df.CarID == 'WBY1Z21050V307993'].sort_values('Reservation_Time').iloc[-550:-540]
#df[df.CarID == 'WBY1Z21050V307993'].loc[1493665]

Unnamed: 0,Customer_Group,CustomerID,CarID,Engine,Rental_flag,RentalID,Rental_Usage_Type,Reservation_Time,End_Time,Revenue,...,Reservation_Minutes,Fuel_Start,Fuel_End,Start_Lat,Start_Long,End_Lat,End_Long,Start_Time,Start_Zone,End_Zone
1702579,Customer,3257292,WBY1Z21050V307993,I3,No,9400375651,Private,2019-02-04 18:27:14,2019-02-04 18:55:37,60.8,...,10,38,29,55.659379,12.630345,55.667792,12.545928,2019-02-04 18:37:14,103212,102821
1702760,Customer,2985859,WBY1Z21050V307993,I3,No,9400379233,Private,2019-02-04 19:16:57,2019-02-04 19:35:49,17.33,...,10,29,24,55.667792,12.545928,55.686779,12.536951,2019-02-04 19:26:57,102821,147161
1702938,Customer,828693,WBY1Z21050V307993,I3,No,9400382472,Private,2019-02-04 20:20:58,2019-02-04 20:51:08,64.0,...,11,5,100,55.686779,12.536951,55.654816,12.618666,2019-02-04 20:31:58,147161,103223
1783729,Non_Customer,1112124,WBY1Z21050V307993,I3,No,9402866073,Private,2019-02-05 00:23:35,2019-02-05 02:10:30,0.0,...,0,75,48,55.69442,12.550729,55.634472,12.648757,2019-02-05 00:23:35,102443,185121
1493665,Non_Customer,793639,WBY1Z21050V307993,I3,No,9407597629,Private,2019-02-05 14:44:54,2019-06-18 16:08:50,0.0,...,59009,0,0,55.634966,12.650418,0.0,0.0,2019-03-18 14:13:54,185121,550062
1113833,Customer,1015209,WBY1Z21050V307993,I3,No,9394090697,Private,2019-02-13 07:42:30,2019-02-13 08:13:58,22.53,...,19,58,51,55.713129,12.572044,55.701208,12.600449,2019-02-13 08:01:30,102324,102336
1114545,Customer,3219128,WBY1Z21050V307993,I3,No,9394109802,Business,2019-02-13 13:22:25,2019-02-13 13:58:53,80.0,...,2,51,39,55.701208,12.600449,55.63022,12.64893,2019-02-13 13:24:25,102336,185203
1114742,Customer,1053005,WBY1Z21050V307993,I3,No,9394115235,Private,2019-02-13 14:52:45,2019-02-13 15:23:13,43.07,...,4,39,19,55.63022,12.64893,55.679108,12.479413,2019-02-13 14:56:45,185203,102634
1115510,Customer,2658148,WBY1Z21050V307993,I3,No,9394130425,Private,2019-02-13 18:44:27,2019-02-13 19:24:02,51.2,...,8,19,10,55.679108,12.479413,55.674144,12.571492,2019-02-13 18:52:27,102634,102131
1116382,Customer,812023,WBY1Z21050V307993,I3,No,9394146782,Private,2019-02-14 05:20:48,2019-02-14 05:56:35,84.8,...,17,100,90,55.674144,12.571492,55.630231,12.64884,2019-02-14 05:37:48,102131,185203


## Start time

In [None]:
# Add start time based on Reservation minutes
df['Start_Time'] = [row.Reservation_Time+datetime.timedelta(minutes=row.Reservation_Minutes) for _, row in df.iterrows()]

# Create Vacancy

In [33]:
# Haversine function
def haversine(point1, point2):
    # convert decimal degrees to radians
    lat1, lon1 = map(np.radians, point1)
    lat2, lon2 = map(np.radians, point2)

    # Deltas
    delta_lon = lon2 - lon1 
    delta_lat = lat2 - lat1 
    
    # haversine formula 
    a = np.sin(delta_lat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(delta_lon/2)**2
    c = 2 * np.arcsin(np.sqrt(a)) 
    r = 6371000 # Radius of earth in m
    return c * r

In [34]:
df_sorted = df.sort_values("Reservation_Time")
df_sorted.CarID.nunique()

1021

In [35]:
data = []
for i, car in enumerate(df_sorted.CarID.unique()):
    if car == '-':
        continue
    car_sub_df = df_sorted[df_sorted.CarID == car]
    if not i%10:
        print(f'{i} cars processed')
    for (_, row1), (_, row2) in zip(car_sub_df[:-1].iterrows(),car_sub_df[1:].iterrows()):
        park_time = row1['End_Time']
        reservation_time = row2['Reservation_Time']
        start_time = row2['Start_Time']
        time_to_reservation = (row2['Reservation_Time']-row1['End_Time']).total_seconds()/3600
        time_to_start = (row2['Start_Time']-row1['End_Time']).total_seconds()/3600
        park_location_lat = row1['End_Lat']
        park_location_long = row1['End_Long']
        park_zone = row1['End_Zone']
        park_fuel = row1['Fuel_End']
        leave_fuel = row2['Fuel_Start']
        engine = row1['Engine']
        moved = haversine(row1.loc[['End_Lat','End_Long']].values, row2.loc[['Start_Lat','Start_Long']].values) 
        data.append([car, park_time,reservation_time, start_time, time_to_reservation, time_to_start, park_location_lat, park_location_long, park_zone, park_fuel, leave_fuel, engine, moved])

0 cars processed
10 cars processed
20 cars processed
30 cars processed
40 cars processed
50 cars processed
60 cars processed
70 cars processed
80 cars processed
90 cars processed
100 cars processed
110 cars processed
120 cars processed
130 cars processed
140 cars processed
150 cars processed
160 cars processed
170 cars processed
180 cars processed
190 cars processed
200 cars processed
210 cars processed
220 cars processed
230 cars processed
240 cars processed
250 cars processed
260 cars processed
270 cars processed
280 cars processed
290 cars processed
300 cars processed
310 cars processed
320 cars processed
330 cars processed
340 cars processed
350 cars processed
360 cars processed
370 cars processed
380 cars processed
390 cars processed
400 cars processed
410 cars processed
420 cars processed
430 cars processed
440 cars processed
450 cars processed
460 cars processed
470 cars processed
480 cars processed
490 cars processed
500 cars processed
510 cars processed
520 cars processed
530 

In [36]:
# Create new df
df_vacancy = pd.DataFrame(data = data, columns = ['car', 'park_time', 'reservation_time', 'start_time','time_to_reservation', 'time_to_start', 'park_location_lat', 'park_location_long', 'park_zone', 'park_fuel', 'leave_fuel', 'engine', 'moved'])

# Infer types
df_vacancy = df_vacancy.convert_dtypes()

# Save
df_vacancy.to_csv('data/processed/Vacancy_new.csv')

In [39]:
df_vacancy[df_vacancy.park_location_lat < 10]

Unnamed: 0,car,park_time,reservation_time,start_time,time_to_reservation,time_to_start,park_location_lat,park_location_long,park_zone,park_fuel,leave_fuel,engine,moved
6484,WBY1Z21010V307859,2019-09-23 16:36:47,2019-10-01 02:02:55,2019-10-01 02:04:55,177.435556,177.468889,0.0,0.0,550062,0,56,I3,6292768.460909
20262,WBY1Z21000V307884,2019-06-18 16:08:03,2019-07-01 06:25:50,2019-07-01 06:48:50,302.296389,302.679722,0.0,0.0,550062,0,100,I3,6295269.445084
25593,WBY1Z210X0V307858,2019-09-25 16:37:29,2019-10-01 16:50:39,2019-10-01 16:59:39,144.219444,144.369444,0.0,0.0,550062,0,77,I3,6291430.710595
31020,WBY1Z21030V308205,2019-01-10 16:39:24,2019-01-13 00:30:05,2019-01-13 00:40:05,55.844722,56.011389,0.0,0.0,550062,0,87,I3,6296310.222621
33636,WBY1Z21050V307976,2017-07-14 03:30:25,2017-08-01 15:15:53,2017-08-01 15:20:53,443.757778,443.841111,0.0,0.0,550062,98,59,I3,6295540.691409
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2449916,WBY8P2104K7D92193,2019-10-09 16:32:02,2019-11-05 06:58:03,2019-11-05 06:59:03,638.433611,638.450278,0.0,0.0,550062,0,62,I3 120,6291983.829059
2464979,WBY8P2104K7D89939,2019-08-29 16:28:31,2019-09-05 13:54:16,2019-09-05 13:56:16,165.429167,165.4625,0.0,0.0,550062,0,100,I3 120,6291369.414998
2552340,WBA1R5100K7D67738,2019-07-29 16:19:41,2019-08-06 00:31:12,2019-08-06 00:37:12,176.191944,176.291944,0.0,0.0,550062,0,94,118I,6298022.719552
2586727,WBA31AA05L3H87545,2019-05-12 12:30:42,2019-05-10 14:27:27,2019-05-10 14:42:27,-46.054167,-45.804167,0.0,0.0,550062,0,76,X1 SDRIVE18I,6294458.041067


## OSM

In [None]:
import osmnx

In [None]:
tmp = osmnx.geometries_from_place('Region Hovedstaden', {'railway': 'station'})

  aout[:] = out
  aout[:] = out


In [None]:
tmp.name.sort_values().values

array(['Aksel Møllers Have', 'Allerød', 'Amager Strand', 'Amagerbro',
       'Ballerup', 'Bella Center', 'Birkerød', 'Blovstrød H', 'Brandhøj',
       'Brødeskov', 'Buddinge', 'Christianshavn', 'DR Byen',
       'Dronningmølle', 'Dyssegård', 'Dyssekilde', 'Enghave Brygge',
       'Enghave Plads', 'Farum', 'Fasanvej', 'Femøren', 'Firhøj',
       'Flintholm', 'Flintholm', 'Forum', 'Fredensborg', 'Frederiksberg',
       'Frederiksberg Allé', 'Frederikssund', 'Frederiksværk', 'Fuglevad',
       'Gammel Strand', 'Gentofte', 'Gilleleje', 'Glostrup', 'Græsted',
       'Grønnehave', 'Gørløse', 'Hareskov', 'Havneholmen', 'Hedehusgård',
       'Hellebæk', 'Hellerup', 'Helsinge', 'Helsingør', 'Herlev',
       'Hillerød', 'Hillerød', 'Holte', 'Hornbæk', 'Hundested',
       'Hundested Havn', 'Høje Taastrup', 'Høje Taastrup', 'Højstrup',
       'Islands Brygge', 'Jægersborg', 'Kagerup', 'Kastrup',
       'Klampenborg', 'Klampenborg', 'Kongens Nytorv', 'Kregme',
       'København H', 'København H', '

In [None]:
tmp2 = osmnx.geometries_from_place('Region Hovedstaden', {'public_station': 'platform'})

  aout[:] = out


In [None]:
tmp2

Unnamed: 0,geometry
