In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import glob
import numpy as np
import networkx as nx
import tqdm
import datetime
import geopandas as gpd
import rtree

In [2]:
files = glob.glob("data/raw/SNData/*.csv")

dfs = []
for f in tqdm.tqdm(files):
    dfs.append(pd.read_csv(f, header=0, sep=";"))

Full_data = pd.concat(dfs,ignore_index=True) # Save this to interim
Full_data.to_csv('data/interim/Full_data.csv')

100%|██████████| 53/53 [00:07<00:00,  6.86it/s]


In [3]:
# Drop 53 rows with na values
df = Full_data.dropna()

# Rename Columns to English
df. columns = ['Customer_Group', 'CustomerID', 'CarID', 'Engine', 'Rental_flag', 'RentalID', 'Rental_Usage_Type', 'Reservation_Time', 'End_Time', 'Revenue', 'Distance', 'Drives', 'Reservation_Minutes','Fuel_Start','Fuel_End','Start_Lat', 'Start_Long', 'End_Lat', 'End_Long']

# Fix type
df = df.astype({'CustomerID': 'int32', 'RentalID': 'int64'})

# Drop drives as it has no info (only ones)
df.drop(columns = 'Drives', inplace=True)
df

Unnamed: 0,Customer_Group,CustomerID,CarID,Engine,Rental_flag,RentalID,Rental_Usage_Type,Reservation_Time,End_Time,Revenue,Distance,Reservation_Minutes,Fuel_Start,Fuel_End,Start_Lat,Start_Long,End_Lat,End_Long
0,Non_Customer,793639,WBY1Z21080V307924,I3,No,9335872135,Private,24.03.2016 11:48:43,02.04.2016 10:00:19,0.00,0,0,0,0,55.678763,12.552853,0.000000,0.000000
1,Non_Customer,1035973,WBY1Z21080V307857,I3,No,9336114126,Private,30.03.2016 15:37:39,01.04.2016 00:40:38,0.00,0,0,62,47,55.770626,12.519300,55.770389,12.518839
2,Non_Customer,998095,WBY1Z21020V307904,I3,No,9336153910,Private,31.03.2016 13:08:16,05.04.2016 08:32:25,0.00,2,1,85,79,55.621588,12.606951,55.621532,12.606279
3,Non_Customer,999604,WBY1Z21010V307926,I3,No,9336158303,Private,31.03.2016 14:43:00,01.04.2016 07:10:00,0.00,0,1,0,71,55.770077,12.518914,55.769746,12.519123
4,Non_Customer,1035969,WBY1Z21070V308210,,No,9336160465,Private,31.03.2016 15:21:36,01.04.2016 14:24:17,0.00,0,1,53,52,55.770623,12.519791,55.770439,12.518937
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2633979,Customer,1070662,WBY1Z21010V308185,I3,No,9345011102,Private,30.09.2016 23:39:05,30.09.2016 23:50:54,5.16,4,10,46,41,55.694700,12.553776,55.678740,12.587144
2633980,Customer,1041705,WBY1Z21080V308250,I3,No,9345011139,Private,30.09.2016 23:42:18,30.09.2016 23:52:14,3.44,6,8,59,52,55.648401,12.542945,55.641310,12.615295
2633981,Customer,2112471,WBY1Z21020V308261,I3,No,9345011311,Private,30.09.2016 23:33:39,30.09.2016 23:52:03,8.17,9,3,39,30,55.664744,12.580875,55.719856,12.540863
2633982,Customer,440147,WBY1Z21060V307954,I3,Yes,9345011420,Private,30.09.2016 23:41:56,30.09.2016 23:57:30,6.88,9,4,44,35,55.710676,12.566043,55.667453,12.619987


In [4]:
# Remove all rows with a CarID as it can not be used
df = df[df.CarID != '0']

In [5]:
# Engine has two types of missing values that is alligned
df["Engine"].replace({" ": '0'}, inplace=True)

In [6]:
# If a CarID already has an engine type assign that to the missing ones
Engine_dict = {c: df[df.CarID == c].Engine.nunique() for c in df[df.Engine == '0'].CarID.unique()}
for car, engine in Engine_dict.items():
    if engine == 1:
        continue
    True_Engine = [x for x in df[df.CarID == car].Engine.unique() if x!= '0'][0]
    df.loc[(df.CarID == car) & (df.Engine == '0'), 'Engine'] = True_Engine

# Populate the rest manual based on ID
df.loc[(df.CarID == 'WBA1R5104J7B14310') & (df.Engine == '0'), 'Engine'] = '118I'
df.loc[(df.CarID == 'WBA1R5104J5K58061') & (df.Engine == '0'), 'Engine'] = '118I'
df.loc[(df.CarID == 'WBA1R5103K7D66678') & (df.Engine == '0'), 'Engine'] = '118I'
df.loc[(df.CarID == 'WBY8P2105K7D70350') & (df.Engine == '0'), 'Engine'] = 'I3 120'
df.loc[(df.CarID == 'WBY8P2102K7D70287') & (df.Engine == '0'), 'Engine'] = 'I3 120'

## Times

In [7]:
df['Reservation_Time'] = pd.to_datetime(df['Reservation_Time'], format="%d.%m.%Y %H:%M:%S")
df['End_Time'] = pd.to_datetime(df['End_Time'], format="%d.%m.%Y %H:%M:%S")

## Fix trips where same user use same car

In [8]:
# Split data on Car level
CarID_dict = dict(iter(df.groupby('CarID')))

def fix_merges(dataframe, max_time_diff = 60):
    dataframe = dataframe.sort_values(by = 'Reservation_Time')
    # Get index where same customer uses the same car back to back
    diff0_iloc = [dataframe.index.get_loc(x) for x in dataframe.index[(dataframe.CustomerID.diff() == 0).tolist()]]

    # Find paris to be merged
    merge_pairs = [(idx-1,idx) for idx in diff0_iloc if dataframe.iloc[idx-1].End_Time+pd.to_timedelta(max_time_diff+dataframe.iloc[idx].Reservation_Minutes,'m') > dataframe.iloc[idx].Reservation_Time]

    # Model as graph to get cc
    graph_model = nx.Graph(merge_pairs)
    groups = [(min(cc),max(cc)) for cc in list(nx.connected_components(graph_model))]

    # Populate 
    for pair in groups:
        dataframe.loc[dataframe.index[pair[0]],['End_Time', 'Fuel_End', 'End_Lat', 'End_Long']] = dataframe.loc[dataframe.index[pair[1]],['End_Time', 'Fuel_End', 'End_Lat', 'End_Long']]


    # Delete now unwanted rows
    rows_to_delete = [x[1] for x in merge_pairs]
    dataframe.drop(index = [dataframe.index[x] for x in rows_to_delete], inplace = True)

    # Return fixed dataframe
    return dataframe

# Merge new datasets
dfs = []
for sub_df in tqdm.tqdm(CarID_dict.values()):
    dfs.append(fix_merges(sub_df))

df = pd.concat(dfs,ignore_index=False).sort_values(by = 'RentalID')

100%|██████████| 1021/1021 [07:14<00:00,  2.35it/s]


In [9]:
df.to_csv('data/interim/first_version.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2439290 entries, 1969578 to 1457879
Data columns (total 18 columns):
 #   Column               Dtype         
---  ------               -----         
 0   Customer_Group       object        
 1   CustomerID           int32         
 2   CarID                object        
 3   Engine               object        
 4   Rental_flag          object        
 5   RentalID             int64         
 6   Rental_Usage_Type    object        
 7   Reservation_Time     datetime64[ns]
 8   End_Time             datetime64[ns]
 9   Revenue              float64       
 10  Distance             int64         
 11  Reservation_Minutes  int64         
 12  Fuel_Start           int64         
 13  Fuel_End             int64         
 14  Start_Lat            float64       
 15  Start_Long           float64       
 16  End_Lat              float64       
 17  End_Long             float64       
dtypes: datetime64[ns](2), float64(5), int32(1), int64(5), ob

In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import glob
import numpy as np
import networkx as nx
import tqdm
import datetime
import geopandas as gpd
import rtree

df = pd.read_csv('data/interim/first_version.csv', index_col = 0)

In [2]:
df['Reservation_Time'] = pd.to_datetime(df['Reservation_Time'], format="%Y.%m.%d %H:%M:%S")
df['End_Time'] = pd.to_datetime(df['End_Time'], format="%Y.%m.%d %H:%M:%S")
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2439290 entries, 1969578 to 1457879
Data columns (total 18 columns):
 #   Column               Dtype         
---  ------               -----         
 0   Customer_Group       object        
 1   CustomerID           int64         
 2   CarID                object        
 3   Engine               object        
 4   Rental_flag          object        
 5   RentalID             int64         
 6   Rental_Usage_Type    object        
 7   Reservation_Time     datetime64[ns]
 8   End_Time             datetime64[ns]
 9   Revenue              float64       
 10  Distance             int64         
 11  Reservation_Minutes  int64         
 12  Fuel_Start           int64         
 13  Fuel_End             int64         
 14  Start_Lat            float64       
 15  Start_Long           float64       
 16  End_Lat              float64       
 17  End_Long             float64       
dtypes: datetime64[ns](2), float64(5), int64(6), object(5)
me

## Fix 0,0 locations

We also accept the other ones outside Copenhagen as the cars must have been there. They can be removed in the vacancy dataset

In [9]:
for i, row in df[(df.Start_Lat < 5)].iterrows():
    # Skip if first instance as it will unaffect vacancy
    sub_df = df[df.CarID == row.CarID].sort_values('RentalID')
    err_index = sub_df.index.get_loc(i)
    if err_index == 0:
        continue

    # Populate based on previous end 
    df.loc[i, ['Start_Lat', 'Start_Long']] = sub_df.iloc[err_index-1].loc[['End_Lat','End_Long']].values

In [10]:
for i, row in df[(df.End_Lat < 5)].iterrows():
    sub_df = df[df.CarID == row.CarID].sort_values('RentalID')
    err_index = sub_df.index.get_loc(i)

    # Will fail if last index
    try:
        df.loc[i, ['End_Lat', 'End_Long']] = sub_df.iloc[err_index+1].loc[['Start_Lat','Start_Long']].values
    except:
        continue
    

## Add zones

In [11]:
# Load shapefile and set projection
shapefile = gpd.read_file("../Zonekort/LTM_Zone3/zones_level3.shp")
shapefile = shapefile.to_crs(epsg=4326)

In [12]:
# Create a geoDF with geometry as starting point
gdf_start = gpd.GeoDataFrame(df, geometry= gpd.points_from_xy(df.Start_Long, df.Start_Lat))

# Set projection
gdf_start = gdf_start.set_crs(epsg=4326)

In [13]:
# Populate zones based on which zone they are within
gdpj_start  = gpd.sjoin(gdf_start, shapefile, op='within')
df['Start_Zone'] = gdpj_start.zoneid

In [14]:
# Populate the rest based on which zone they are closest too
Start_zone_filler = {x: shapefile.zoneid[shapefile.distance(df.loc[x].geometry).sort_values().index[0]] for x in df.index[df['Start_Zone'].isna()]}
df['Start_Zone'] = df['Start_Zone'].fillna(Start_zone_filler)


  Start_zone_filler = {x: shapefile.zoneid[shapefile.distance(df.loc[x].geometry).sort_values().index[0]] for x in df.index[df['Start_Zone'].isna()]}


In [15]:
# Create a geoDF with geometry as end point
gdf_end = gpd.GeoDataFrame(df, geometry= gpd.points_from_xy(df.End_Long, df.End_Lat))

# Set projection
gdf_end = gdf_end.set_crs(epsg=4326)

In [16]:
# Populate zones based on which zone they are within
gdpj_end  = gpd.sjoin(gdf_end, shapefile, op='within')
df['End_Zone'] = gdpj_end.zoneid

In [17]:
# Populate the rest based on which zone they are closest too
End_zone_filler = {x: shapefile.zoneid[shapefile.distance(df.loc[x].geometry).sort_values().index[0]] for x in df.index[df['End_Zone'].isna()]}
df['End_Zone'] = df['End_Zone'].fillna(End_zone_filler)


  End_zone_filler = {x: shapefile.zoneid[shapefile.distance(df.loc[x].geometry).sort_values().index[0]] for x in df.index[df['End_Zone'].isna()]}


In [18]:
# Remove geomery type and make IDs int columns
df.drop(columns = 'geometry', inplace = True)
df = df.astype({'CustomerID': 'int32', 'RentalID': 'int64', 'Start_Zone': 'int32','End_Zone': 'int32'})

In [19]:
# Check types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2439637 entries, 0 to 2439636
Data columns (total 20 columns):
 #   Column               Dtype         
---  ------               -----         
 0   Customer_Group       object        
 1   CustomerID           int32         
 2   CarID                object        
 3   Engine               object        
 4   Rental_flag          object        
 5   RentalID             int64         
 6   Rental_Usage_Type    object        
 7   Reservation_Time     datetime64[ns]
 8   End_Time             datetime64[ns]
 9   Revenue              float64       
 10  Distance             int64         
 11  Reservation_Minutes  int64         
 12  Fuel_Start           int64         
 13  Fuel_End             int64         
 14  Start_Lat            float64       
 15  Start_Long           float64       
 16  End_Lat              float64       
 17  End_Long             float64       
 18  Start_Zone           int32         
 19  End_Zone             

In [20]:
# Sweden and Bornholm
#df[df.Start_Long > 13].sort_values(by = 'Reservation_Time')

# Jutland
#df[(df.Start_Long < 11) & (df.Start_Long > 0) & (df.Customer_Group == 'Customer')]

# Car in Germany in the middle of the data..
#df[df.CarID == 'WBY1Z21040V308181'].sort_values(by = 'Reservation_Time').iloc[-30:-20]

## Weird times

In [18]:
# Winter Time
WinterTimeIndex = df[(df.Reservation_Time > df.End_Time) & (df.End_Time.apply(lambda x: x.month) == 10) & (df.End_Time.apply(lambda x: x.hour) < 4)].index
WinterTimeIndexBack = [2179859, 1683947, 1683948]
WinterTimeIndexForward = [x for x in WinterTimeIndex if x not in WinterTimeIndexBack]
df.loc[WinterTimeIndexBack, 'Reservation_Time'] = df.loc[WinterTimeIndexBack, 'Reservation_Time'] - pd.to_timedelta(1,'h')
df.loc[WinterTimeIndexForward, 'End_Time'] = df.loc[WinterTimeIndexForward, 'End_Time'] + pd.to_timedelta(1,'h')

In [19]:
# Remove remaining 50 observations as they will not introduce more vacancy time
df.drop(index = df[df.Reservation_Time > df.End_Time].index, inplace = True)

### What about start-time

In [123]:
testdf = df.copy(deep=True)
testdf['Start_Time'] = [row.Reservation_Time+datetime.timedelta(minutes=row.Reservation_Minutes) for _, row in testdf.iterrows()]

In [214]:
testdf[testdf.Start_Time-pd.to_timedelta(1,'m') > testdf.End_Time].sort_values(by = 'RentalID') # Using Reservation Time in the data should eleminate the risk of having trips at the same time

Unnamed: 0,Customer_Group,CustomerID,CarID,Engine,Rental_flag,RentalID,Rental_Usage_Type,Reservation_Time,End_Time,Revenue,Distance,Reservation_Minutes,Fuel_Start,Fuel_End,Start_Lat,Start_Long,End_Lat,End_Long,Start_Time
1969600,Non_Customer,793664.0,WBY1Z21030V308138,I3,No,9.326529e+09,Private,2015-08-19 14:52:45,2015-08-19 14:53:35,0.0,0,4,97,97,55.437567,11.814045,55.437079,11.816411,2015-08-19 14:56:45
1969601,Customer,790044.0,WBY1Z21010V307960,I3,No,9.326529e+09,Private,2015-08-19 15:12:24,2015-08-19 15:12:53,0.0,0,4,68,68,55.671158,12.583635,55.671159,12.583636,2015-08-19 15:16:24
1969652,Non_Customer,793664.0,WBY1Z21070V308224,I3,No,9.326806e+09,Private,2015-08-26 09:45:51,2015-08-26 09:48:27,0.0,0,6,85,84,55.691981,12.619608,55.691980,12.619609,2015-08-26 09:51:51
1969666,Non_Customer,793664.0,WBY1Z21090V308256,I3,No,9.326809e+09,Private,2015-08-26 10:37:22,2015-08-26 10:38:15,0.0,0,5,78,78,55.692095,12.619449,55.692095,12.619450,2015-08-26 10:42:22
1969680,Non_Customer,793664.0,WBY1Z21040V308214,I3,No,9.326815e+09,Private,2015-08-26 13:01:52,2015-08-26 13:06:38,0.0,0,9,85,85,55.692176,12.619752,55.691739,12.619268,2015-08-26 13:10:52
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1411464,Non_Customer,793639.0,WMWXU7106KTM91733,COOPER,No,9.430461e+09,Private,2019-12-14 12:18:02,2019-12-14 13:33:37,0.0,18,150,62,52,55.706162,12.534806,55.630225,12.648690,2019-12-14 14:48:02
1415068,Non_Customer,793639.0,WBY8P2107K7E72622,I3 120,No,9.430505e+09,Private,2019-12-15 14:21:42,2019-12-15 14:51:06,0.0,18,116,94,85,55.622686,12.615552,55.664729,12.540505,2019-12-15 16:17:42
1418806,Non_Customer,793639.0,WMWXR3102KTK54716,COOPER,No,9.430574e+09,Private,2019-12-16 18:45:19,2019-12-16 20:37:48,0.0,21,130,90,81,55.714542,12.564363,55.630140,12.649162,2019-12-16 20:55:19
1425507,Non_Customer,793639.0,WBA31AA04L3L04754,X1 SDRIVE18I,No,9.430690e+09,Private,2019-12-19 06:30:43,2019-12-19 08:07:53,0.0,0,101,98,98,55.634405,12.649611,55.634405,12.649611,2019-12-19 08:11:43


In [125]:
testdf_sorted = testdf.sort_values(by = 'RentalID').sort_values(by = ['CarID','RentalID'])

In [81]:
bad_idx = testdf[testdf.Start_Time > testdf.End_Time].sort_values(by = 'RentalID').index
bad_idx

Int64Index([1969584, 1969593, 1969597, 1969598, 1969600, 1969601, 1969649,
            1969652, 1969666, 1969680,
            ...
            1447696, 1449028, 1449260, 1453217, 1454055, 1454153, 1456301,
            1456890, 1457588, 1457729],
           dtype='int64', length=38612)

In [93]:
bad_iloc_idx = [testdf_sorted.index.get_loc(x) for x in bad_idx]
bad_iloc_idx[1000]

425716

In [94]:
testdf_sorted.iloc[(425716-1):(425716+2)]

Unnamed: 0,Customer_Group,CustomerID,CarID,Engine,Rental_flag,RentalID,Rental_Usage_Type,Reservation_Time,End_Time,Revenue,Distance,Reservation_Minutes,Fuel_Start,Fuel_End,Start_Lat,Start_Long,End_Lat,End_Long,Start_Time
1598740,Customer,824476.0,WBY1Z21000V308131,I3,No,9327820000.0,Private,2015-09-17 21:58:19,2015-09-17 22:13:25,6.78,3,8,99,95,55.673198,12.593009,55.675069,12.547551,2015-09-17 22:06:19
1599000,Customer,825449.0,WBY1Z21000V308131,I3,No,9327845000.0,Private,2015-09-18 12:24:54,2015-09-18 12:32:15,3.39,2,18,95,94,55.675069,12.547551,55.667909,12.542775,2015-09-18 12:42:54
1599991,Customer,812557.0,WBY1Z21000V308131,I3,No,9327903000.0,Private,2015-09-19 15:20:39,2015-09-19 15:36:15,6.78,4,0,94,89,55.667909,12.542775,55.677641,12.582207,2015-09-19 15:20:39


### Trips longer than 5 days

In [67]:
df.loc[1122794, 'Reservation_Time'] = pd.Timestamp('2019-02-26 09:02:56')

In [58]:
tmp2 = df[df.Reservation_Time+pd.to_timedelta(5,'d') < df.End_Time].sort_values(by = 'RentalID')
#tmp2[tmp2.Customer_Group == 'Customer'] # 1152167 should be 02-26 as reservation. Rest is good even though 601154 has a very close follow up
tmp2[tmp2.Customer_Group == 'Non_Customer']

Unnamed: 0,Customer_Group,CustomerID,CarID,Engine,Rental_flag,RentalID,Rental_Usage_Type,Reservation_Time,End_Time,Revenue,Distance,Reservation_Minutes,Fuel_Start,Fuel_End,Start_Lat,Start_Long,End_Lat,End_Long
1969590,Non_Customer,790031,WBY1Z21090V307818,I3,No,9326441577,Private,2015-08-17 13:59:14,2015-08-27 14:48:58,0.0,334,0,16,99,55.726729,12.582682,55.671036,12.583519
1591238,Non_Customer,808838,WBY1Z210X0V307780,I3,No,9327127066,Private,2015-09-02 16:28:12,2015-09-29 09:57:05,0.0,351,0,3,99,55.706360,12.529268,55.737039,12.477291
1593151,Non_Customer,817026,WBY1Z21060V308053,I3,No,9327351199,Private,2015-09-07 17:30:34,2015-09-24 14:08:26,0.0,581,0,19,99,55.674417,12.560560,55.676246,12.559132
1603325,Non_Customer,817026,WBY1Z21030V307927,I3,No,9328124511,Private,2015-09-24 14:37:50,2015-09-30 11:47:42,0.0,266,0,87,55,55.676015,12.559150,55.630216,12.650906
706352,Non_Customer,819723,WBY1Z21010V307859,I3,No,9328432815,Private,2015-09-25 12:55:42,2015-10-01 10:07:13,0.0,0,0,17,0,55.652858,12.613396,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1878420,Non_Customer,893118,WBY8P2101K7E71739,I3 120,No,9427740041,Private,2019-10-28 07:42:03,2019-11-04 14:19:04,0.0,1,2,59,55,55.621566,12.606274,55.621597,12.606271
1688993,Non_Customer,793639,WBY1Z21090V308094,I3,No,9427800504,Private,2019-10-07 03:58:33,2019-10-29 16:50:51,0.0,0,31834,5,0,55.716240,12.567010,0.000000,0.000000
1689528,Non_Customer,793639,WBY1Z21020V308275,I3,No,9427811091,Private,2019-10-21 03:19:49,2019-10-29 16:50:50,0.0,0,11899,0,0,55.716240,12.567011,0.000000,0.000000
1935517,Non_Customer,793639,WBY8P2105K7D87701,I3 120,No,9429062229,Private,2019-11-12 11:22:34,2019-11-20 16:59:58,0.0,0,11361,0,0,55.663175,12.586320,0.000000,0.000000


In [57]:
df[df.CarID =='WBA31AA08L3H87541'].iloc[:30]

Unnamed: 0,Customer_Group,CustomerID,CarID,Engine,Rental_flag,RentalID,Rental_Usage_Type,Reservation_Time,End_Time,Revenue,Distance,Reservation_Minutes,Fuel_Start,Fuel_End,Start_Lat,Start_Long,End_Lat,End_Long
592155,Non_Customer,1124378,WBA31AA08L3H87541,X1 SDRIVE18I,No,9422140722,Private,2019-09-20 12:22:28,2019-09-20 12:26:24,0.0,0,1,0,0,55.7167,12.566749,55.716238,12.567008
599305,Non_Customer,793639,WBA31AA08L3H87541,X1 SDRIVE18I,No,9422392769,Private,2019-09-22 17:00:44,2019-09-22 17:58:45,0.0,1,35,0,79,55.716238,12.567008,55.716257,12.566997
600108,Non_Customer,1109469,WBA31AA08L3H87541,X1 SDRIVE18I,No,9422408705,Private,2019-09-22 17:58:48,2019-09-23 03:06:18,0.0,8,524,94,86,55.716238,12.567008,55.685799,12.586786
600187,Customer,2331993,WBA31AA08L3H87541,X1 SDRIVE18I,No,9422410926,Business,2019-09-23 05:45:45,2019-09-23 06:41:38,96.0,16,36,86,84,55.685799,12.586786,55.630151,12.648795
600669,Customer,800699,WBA31AA08L3H87541,X1 SDRIVE18I,No,9422425504,Private,2019-09-23 09:32:09,2019-09-23 09:53:32,82.4,9,1,84,84,55.630151,12.648795,55.671314,12.573612
601154,Customer,3459231,WBA31AA08L3H87541,X1 SDRIVE18I,No,9422465124,Private,2019-09-23 12:26:59,2019-09-30 09:59:38,8079.8,2745,21,8,79,55.671314,12.573612,55.667254,12.421361
622215,Non_Customer,793639,WBA31AA08L3H87541,X1 SDRIVE18I,No,9424320729,Private,2019-09-30 09:59:44,2019-09-30 15:20:50,0.0,12,242,72,64,55.667254,12.421361,55.65042,12.506053
622907,Customer,3590591,WBA31AA08L3H87541,X1 SDRIVE18I,No,9424490161,Private,2019-09-30 17:40:26,2019-09-30 18:00:16,40.8,18,4,64,60,55.65042,12.506053,55.724318,12.386482
1607796,Customer,3259613,WBA31AA08L3H87541,X1 SDRIVE18I,No,9424510390,Private,2019-10-01 03:29:40,2019-10-01 04:14:54,96.0,36,14,62,56,55.724318,12.386482,55.630218,12.649082
1607843,Non_Customer,1124418,WBA31AA08L3H87541,X1 SDRIVE18I,No,9424511021,Private,2019-10-01 05:02:25,2019-10-01 05:36:55,0.0,36,3,43,84,55.630218,12.649082,55.780319,12.520276


In [148]:
subdf = df[df.CarID == 'WBY8P2101K7E71739'].sort_values(by = 'RentalID')
tidx = subdf.index.get_loc(1878420)
subdf.iloc[(tidx-5):(tidx+5)]

Unnamed: 0,Customer_Group,CustomerID,CarID,Engine,Rental_flag,RentalID,Rental_Usage_Type,Reservation_Time,End_Time,Revenue,Distance,Reservation_Minutes,Fuel_Start,Fuel_End,Start_Lat,Start_Long,End_Lat,End_Long
1683723,Non_Customer,1109469.0,WBY8P2101K7E71739,I3 120,No,9427687000.0,Private,2019-10-26 21:34:23,2019-10-26 23:31:17,0.0,1,112,38,82,55.645999,12.640459,55.6568,12.636684
1683912,Customer,838748.0,WBY8P2101K7E71739,I3 120,No,9427691000.0,Private,2019-10-27 01:23:33,2019-10-27 01:46:40,30.4,9,5,82,79,55.6568,12.636684,55.672651,12.543489
1684274,Customer,854489.0,WBY8P2101K7E71739,I3 120,No,9427697000.0,Private,2019-10-27 09:24:26,2019-10-27 12:11:03,240.0,42,17,79,64,55.672651,12.543489,55.672197,12.542612
1684914,Customer,2115086.0,WBY8P2101K7E71739,I3 120,No,9427708000.0,Private,2019-10-27 13:11:30,2019-10-27 13:24:01,16.13,1,6,64,63,55.672197,12.542612,55.676154,12.560266
1685315,Non_Customer,793639.0,WBY8P2101K7E71739,I3 120,No,9427714000.0,Private,2019-10-27 13:29:05,2019-10-27 15:54:13,0.0,9,126,63,59,55.676154,12.560266,55.621566,12.606274
1878420,Non_Customer,893118.0,WBY8P2101K7E71739,I3 120,No,9427740000.0,Private,2019-10-28 07:42:03,2019-11-04 14:19:04,0.0,1,2,59,55,55.621566,12.606274,55.621597,12.606271
1890748,Non_Customer,793639.0,WBY8P2101K7E71739,I3 120,No,9428188000.0,Private,2019-11-04 17:39:43,2019-11-04 19:24:13,0.0,4,94,54,51,55.621597,12.606271,55.630185,12.648978
1890850,Customer,3640148.0,WBY8P2101K7E71739,I3 120,No,9428190000.0,Private,2019-11-04 19:53:00,2019-11-04 20:24:37,17.56,39,1,51,33,55.630185,12.648978,55.718748,12.554637
1891356,Non_Customer,1109469.0,WBY8P2101K7E71739,I3 120,No,9428199000.0,Private,2019-11-04 21:41:14,2019-11-05 00:47:34,0.0,2,179,31,100,55.718748,12.554637,55.708983,12.550849
1891589,Customer,2288309.0,WBY8P2101K7E71739,I3 120,No,9428204000.0,Private,2019-11-05 07:09:55,2019-11-05 07:48:34,4.18,14,13,100,94,55.708983,12.550849,55.73331,12.444929


In [179]:
subdf = df[df.CarID == 'WBY1Z21060V308179'].sort_values(by = 'RentalID')
tidx = subdf.index.get_loc(1496325)
subdf.iloc[(tidx-3):(tidx+3)]

Unnamed: 0,Customer_Group,CustomerID,CarID,Engine,Rental_flag,RentalID,Rental_Usage_Type,Reservation_Time,End_Time,Revenue,Distance,Reservation_Minutes,Fuel_Start,Fuel_End,Start_Lat,Start_Long,End_Lat,End_Long
1799288,Customer,2476032.0,WBY1Z21060V308179,I3,No,9403198000.0,Private,2019-05-06 16:25:58,2019-05-06 16:39:34,41.6,5,1,86,81,55.675273,12.583606,55.650445,12.552424
1799467,Customer,2852842.0,WBY1Z21060V308179,I3,No,9403201000.0,Private,2019-05-06 17:01:06,2019-05-06 17:48:21,69.6,8,18,81,74,55.650445,12.552424,55.648361,12.469846
1800666,Non_Customer,1109469.0,WBY1Z21060V308179,I3,No,9403225000.0,Private,2019-05-06 20:38:41,2019-05-07 04:15:36,0.0,18,417,66,79,55.648361,12.469846,55.634538,12.648752
1496325,Non_Customer,793639.0,WBY1Z21060V308179,I3,No,9407713000.0,0,2019-06-13 11:24:22,2019-06-18 16:08:28,0.0,0,0,0,0,55.62993,12.650375,0.0,0.0
1496324,Non_Customer,793639.0,WBY1Z21060V308179,I3,No,9407713000.0,Private,2019-06-13 11:24:18,2019-06-18 16:08:28,0.0,0,0,0,0,55.62993,12.650375,0.0,0.0


## Merge Non_Customer

In [None]:
# Split data on Car level
CarID_dict = dict(iter(df.groupby('CarID')))

def fix_merges(dataframe, max_time_diff = 60):
    dataframe = dataframe.sort_values(by = 'Reservation_Time')
    # Get index where same customer uses the same car back to back
    diff0_iloc = [dataframe.index.get_loc(x) for x in dataframe.index[(dataframe.CustomerID.diff() == 0).tolist()]]

    # Find paris to be merged
    merge_pairs = [(idx-1,idx) for idx in diff0_iloc if dataframe.iloc[idx-1].End_Time+pd.to_timedelta(max_time_diff+dataframe.iloc[idx].Reservation_Minutes,'m') > dataframe.iloc[idx].Reservation_Time]

    # Model as graph to get cc
    graph_model = nx.Graph(merge_pairs)
    groups = [(min(cc),max(cc)) for cc in list(nx.connected_components(graph_model))]

    # Populate 
    for pair in groups:
        dataframe.loc[dataframe.index[pair[0]],['End_Time', 'Fuel_End', 'End_Lat', 'End_Long']] = dataframe.loc[dataframe.index[pair[1]],['End_Time', 'Fuel_End', 'End_Lat', 'End_Long']]


    # Delete now unwanted rows
    rows_to_delete = [x[1] for x in merge_pairs]
    dataframe.drop(index = [dataframe.index[x] for x in rows_to_delete], inplace = True)

    # Return fixed dataframe
    return dataframe

# Merge new datasets
dfs = []
for sub_df in tqdm.tqdm(CarID_dict.values()):
    dfs.append(fix_merges(sub_df))

df = pd.concat(dfs,ignore_index=False).sort_values(by = 'RentalID')

In [32]:
CarID_dict = dict(iter(df.groupby('CarID')))
dataframe = CarID_dict['WBY1Z21050V308092'].sort_values(by = 'Reservation_Time')

In [33]:
dataframe.Customer_Group

1969746    Non_Customer
1591353    Non_Customer
1591500        Customer
1591602        Customer
1591706    Non_Customer
               ...     
1806093        Customer
1806199        Customer
1809931    Non_Customer
1806747    Non_Customer
1499091    Non_Customer
Name: Customer_Group, Length: 4167, dtype: object

In [3]:
3+3

6

### Overlap

In [20]:
print(len(df))

2439240


In [42]:
print(len(df))
# Split data on Car level
CarID_dict = dict(iter(df.groupby('CarID')))

def remove_full_overlaps(dataframe):
    dataframe = dataframe.sort_values(by = 'Reservation_Time')

    # Get delta
    s = dataframe.End_Time.diff().sort_values() < pd.Timedelta(0,'s')

    # To be dropped
    drop_idx = list(s[s].index)

    # Remove those
    dataframe.drop(index = drop_idx, inplace = True)

    return dataframe

# Merge new datasets
dfs = []
for sub_df in CarID_dict.values():
    dfs.append(remove_full_overlaps(sub_df))

df = pd.concat(dfs,ignore_index=False)#.sort_values(by = 'RentalID')
print(len(df))

2433887
2433853


In [28]:
CarID_dict = dict(iter(df.groupby('CarID')))
dataframe = CarID_dict['WBY1Z21050V308092'].sort_values(by = 'Reservation_Time')
s = dataframe.End_Time.diff().sort_values() < pd.Timedelta(0,'s')
drop_idx = s[s].index
drop_idx

Int64Index([1806747, 2367204, 2394087, 786807], dtype='int64')

In [29]:
dataframe.index.get_loc(2394322)

2580

In [31]:
dataframe.iloc[2575:2585]

Unnamed: 0,Customer_Group,CustomerID,CarID,Engine,Rental_flag,RentalID,Rental_Usage_Type,Reservation_Time,End_Time,Revenue,Distance,Reservation_Minutes,Fuel_Start,Fuel_End,Start_Lat,Start_Long,End_Lat,End_Long
2610555,Customer,870565,WBY1Z21050V308092,I3,No,9380483408,Private,2018-06-30 16:08:48,2018-06-30 16:27:41,17.6,6,9,45,40,55.633761,12.607331,55.664944,12.625753
2610698,Customer,2447593,WBY1Z21050V308092,I3,No,9380486259,Private,2018-06-30 17:06:41,2018-06-30 17:20:31,22.4,2,7,40,37,55.664944,12.625753,55.654892,12.609471
2611070,Customer,2933673,WBY1Z21050V308092,I3,No,9380493411,Private,2018-06-30 19:01:33,2018-06-30 19:16:16,3.46,0,13,36,36,55.654892,12.609471,55.654922,12.609469
2396281,Non_Customer,1109469,WBY1Z21050V308092,I3,No,9380558479,Private,2018-06-30 20:49:38,2018-07-02 02:13:42,0.0,0,5,0,0,55.654922,12.609469,0.0,0.0
2394087,Non_Customer,1115900,WBY1Z21050V308092,I3,No,9380508298,Private,2018-07-01 01:47:01,2018-07-01 02:34:56,0.0,5,1,36,94,55.654922,12.609469,55.671113,12.564826
2394322,Customer,2886387,WBY1Z21050V308092,I3,Yes,9380512569,Private,2018-07-01 08:15:09,2018-07-01 08:37:51,78.4,13,6,94,83,55.671113,12.564826,55.630121,12.649174
2394455,Customer,2002000,WBY1Z21050V308092,I3,No,9380517046,Private,2018-07-01 10:09:44,2018-07-01 10:21:18,59.2,9,1,82,74,55.630121,12.649174,55.641956,12.554862
2394578,Customer,3006810,WBY1Z21050V308092,I3,No,9380521274,Private,2018-07-01 11:13:36,2018-07-01 11:43:24,54.4,10,12,74,62,55.641956,12.554862,55.630548,12.649464
2395968,Customer,802309,WBY1Z21050V308092,I3,No,9380552709,Private,2018-07-01 21:25:35,2018-07-01 22:13:53,78.4,14,14,61,49,55.630548,12.649464,55.696695,12.594223
2396028,Customer,896843,WBY1Z21050V308092,I3,No,9380554811,Private,2018-07-01 22:45:49,2018-07-01 23:02:27,26.4,3,7,48,45,55.696695,12.594223,55.691826,12.563208


In [27]:
new = dataframe.drop(index = drop_idx, inplace = False)
s = new.End_Time.diff().sort_values() < pd.Timedelta(0,'s')
drop_idx = s[s].index
drop_idx

Int64Index([2394322, 786957, 2368291], dtype='int64')

In [22]:
dataframe.index.get_loc(2394322)

2578

In [23]:
dataframe.iloc[2575:2581]

Unnamed: 0,Customer_Group,CustomerID,CarID,Engine,Rental_flag,RentalID,Rental_Usage_Type,Reservation_Time,End_Time,Revenue,Distance,Reservation_Minutes,Fuel_Start,Fuel_End,Start_Lat,Start_Long,End_Lat,End_Long
2610698,Customer,2447593,WBY1Z21050V308092,I3,No,9380486259,Private,2018-06-30 17:06:41,2018-06-30 17:20:31,22.4,2,7,40,37,55.664944,12.625753,55.654892,12.609471
2611070,Customer,2933673,WBY1Z21050V308092,I3,No,9380493411,Private,2018-06-30 19:01:33,2018-06-30 19:16:16,3.46,0,13,36,36,55.654892,12.609471,55.654922,12.609469
2396281,Non_Customer,1109469,WBY1Z21050V308092,I3,No,9380558479,Private,2018-06-30 20:49:38,2018-07-02 02:13:42,0.0,0,5,0,0,55.654922,12.609469,0.0,0.0
2394322,Customer,2886387,WBY1Z21050V308092,I3,Yes,9380512569,Private,2018-07-01 08:15:09,2018-07-01 08:37:51,78.4,13,6,94,83,55.671113,12.564826,55.630121,12.649174
2394455,Customer,2002000,WBY1Z21050V308092,I3,No,9380517046,Private,2018-07-01 10:09:44,2018-07-01 10:21:18,59.2,9,1,82,74,55.630121,12.649174,55.641956,12.554862
2394578,Customer,3006810,WBY1Z21050V308092,I3,No,9380521274,Private,2018-07-01 11:13:36,2018-07-01 11:43:24,54.4,10,12,74,62,55.641956,12.554862,55.630548,12.649464


In [19]:
CarID_dict = dict(iter(df.groupby('CarID')))
tat = []
endtat0 = []
endtat1 = []
endtat2 = []

for car,dataf in CarID_dict.items():
    dataf = dataf.sort_values(by = 'Reservation_Time')
    tap = list( zip( dataf.iloc[np.where(dataf.Reservation_Time.iloc[1:].values<dataf.End_Time.iloc[:-1].values)[0]].Customer_Group.values, dataf.iloc[np.where(dataf.Reservation_Time.iloc[1:].values<dataf.End_Time.iloc[:-1].values)[0]+1].Customer_Group.values ) )
    tat.extend( tap )
    if (('Customer', 'Non_Customer') in tap):
        print(car)

    endtat0.extend( dataf.iloc[np.where(dataf.Reservation_Time.iloc[1:].values<dataf.End_Time.iloc[:-1].values)[0]].index )
    endtat1.extend( dataf.iloc[np.where(dataf.Reservation_Time.iloc[1:].values<dataf.End_Time.iloc[:-1].values)[0]].Customer_Group )
    endtat2.extend( dataf.iloc[np.where(dataf.Reservation_Time.iloc[1:].values<dataf.End_Time.iloc[:-1].values)[0]].End_Lat )

pd.Series(tat).value_counts()

WBA1R5100J7B13946
WBA1R5101J5K57675
WBA1R5103J7B13388
WBA1R5104J5K57962
WBA1R5107J5K57440
WBY1Z21000V307934
WBY1Z21010V307912
WBY1Z21020V307871
WBY1Z21020V308132
WBY1Z21020V308177
WBY1Z21030V308270
WBY1Z21040V308021
WBY1Z21050V308092
WBY1Z21070V307946
WBY1Z21080V308054
WBY1Z21080V308135
WBY1Z21080V308233
WBY8P2102K7D87722
WBY8P2106K7D94138
WBY8P2109K7D77219
WBY8P2109K7E08744
WBY8P210XK7D70344
WMWXU710XKTM91735


(Non_Customer, Customer)        862
(Non_Customer, Non_Customer)    142
(Customer, Customer)             24
(Customer, Non_Customer)         23
dtype: int64

In [None]:
(Non_Customer, Non_Customer)    904
(Non_Customer, Customer)        169
(Customer, Customer)             26
(Customer, Non_Customer)         25

In [288]:
tmptmp = pd.DataFrame(data=[endtat0,endtat1,endtat2]).T
tmptmp[(tmptmp[1] == 'Non_Customer') & (tmptmp[2] > 1)]

Unnamed: 0,0,1,2
98,1605626,Non_Customer,55.674377
126,22088,Non_Customer,55.676731
129,414036,Non_Customer,55.630285
225,25915,Non_Customer,55.663199
267,661513,Non_Customer,55.648002
304,1237898,Non_Customer,55.648814
346,786692,Non_Customer,55.698582
368,817724,Non_Customer,55.648787
369,854799,Non_Customer,55.684344
370,854811,Non_Customer,55.664035


In [289]:
df.loc[1605626]

Customer_Group                Non_Customer
CustomerID                          793639
CarID                    WBY1Z21000V308050
Engine                                  I3
Rental_flag                             No
RentalID                        9328258295
Rental_Usage_Type                  Private
Reservation_Time       2015-09-27 11:43:11
End_Time               2015-09-27 15:34:44
Revenue                                0.0
Distance                                10
Reservation_Minutes                      0
Fuel_Start                              77
Fuel_End                                67
Start_Lat                        55.665622
Start_Long                       12.549845
End_Lat                          55.674377
End_Long                         12.575388
Name: 1605626, dtype: object

In [290]:
dataf = CarID_dict['WBY1Z21000V308050'].sort_values(by = 'Reservation_Time')

In [291]:
dataf.iloc[np.where(dataf.Reservation_Time.iloc[1:].values<dataf.End_Time.iloc[:-1].values)[0]]

Unnamed: 0,Customer_Group,CustomerID,CarID,Engine,Rental_flag,RentalID,Rental_Usage_Type,Reservation_Time,End_Time,Revenue,Distance,Reservation_Minutes,Fuel_Start,Fuel_End,Start_Lat,Start_Long,End_Lat,End_Long
1605626,Non_Customer,793639,WBY1Z21000V308050,I3,No,9328258295,Private,2015-09-27 11:43:11,2015-09-27 15:34:44,0.0,10,0,77,67,55.665622,12.549845,55.674377,12.575388
1709291,Non_Customer,1109469,WBY1Z21000V308050,I3,No,9400905360,Private,2019-04-03 21:09:44,2019-04-05 02:36:47,0.0,0,5,0,0,55.708753,12.564173,0.0,0.0


In [292]:
dataf.iloc[np.where(dataf.Reservation_Time.iloc[1:].values<dataf.End_Time.iloc[:-1].values)[0]+1]

Unnamed: 0,Customer_Group,CustomerID,CarID,Engine,Rental_flag,RentalID,Rental_Usage_Type,Reservation_Time,End_Time,Revenue,Distance,Reservation_Minutes,Fuel_Start,Fuel_End,Start_Lat,Start_Long,End_Lat,End_Long
1605412,Non_Customer,832939,WBY1Z21000V308050,I3,No,9328250872,Private,2015-09-27 11:43:17,2015-09-27 14:24:21,0.0,0,1,77,0,55.665622,12.549845,0.0,0.0
1706350,Non_Customer,1120958,WBY1Z21000V308050,I3,No,9400554120,Private,2019-04-04 02:09:00,2019-04-04 02:15:37,0.0,1,1,26,99,55.708753,12.564173,55.708206,12.575129


In [295]:
dataf[dataf.RentalID > 9328200000].sort_values(by = 'Reservation_Time').iloc[:10]

Unnamed: 0,Customer_Group,CustomerID,CarID,Engine,Rental_flag,RentalID,Rental_Usage_Type,Reservation_Time,End_Time,Revenue,Distance,Reservation_Minutes,Fuel_Start,Fuel_End,Start_Lat,Start_Long,End_Lat,End_Long
1604883,Customer,843784,WBY1Z21000V308050,I3,No,9328226349,Private,2015-09-26 16:16:21,2015-09-26 16:45:00,9.8,2,4,93,89,55.666312,12.509664,55.666355,12.509661
1605160,Customer,823739,WBY1Z21000V308050,I3,No,9328235142,Private,2015-09-26 20:45:06,2015-09-26 22:18:14,0.0,6,2,89,77,55.666355,12.509661,55.665622,12.549845
1605626,Non_Customer,793639,WBY1Z21000V308050,I3,No,9328258295,Private,2015-09-27 11:43:11,2015-09-27 15:34:44,0.0,10,0,77,67,55.665622,12.549845,55.674377,12.575388
1605412,Non_Customer,832939,WBY1Z21000V308050,I3,No,9328250872,Private,2015-09-27 11:43:17,2015-09-27 14:24:21,0.0,0,1,77,0,55.665622,12.549845,0.0,0.0
1605708,Customer,825381,WBY1Z21000V308050,I3,No,9328261209,Private,2015-09-27 17:08:16,2015-09-27 17:43:18,16.32,12,8,67,60,55.674377,12.575388,55.665616,12.536992
1606243,Customer,826273,WBY1Z21000V308050,I3,No,9328308193,Private,2015-09-28 16:11:34,2015-09-28 16:22:08,4.66,2,9,59,56,55.665607,12.536987,55.683574,12.53202
1606444,Customer,825606,WBY1Z21000V308050,I3,No,9328322735,Private,2015-09-28 21:51:06,2015-09-28 22:29:58,16.52,12,10,56,44,55.683574,12.53202,55.711597,12.577014
1606715,Customer,802577,WBY1Z21000V308050,I3,No,9328353555,Private,2015-09-29 15:35:08,2015-09-29 16:39:03,9.32,4,5,44,36,55.711597,12.577014,55.708408,12.573179
1607414,Customer,816606,WBY1Z21000V308050,I3,No,9328402714,Private,2015-09-30 16:44:31,2015-09-30 16:45:41,0.85,0,5,36,36,55.708408,12.573179,55.708411,12.57318
707917,Customer,812521,WBY1Z21000V308050,I3,No,9328531658,Private,2015-10-03 11:21:43,2015-10-03 11:36:58,6.78,4,1,36,31,55.708411,12.573181,55.699591,12.587339


In [236]:
dataf = CarID_dict['WBY1Z21050V308092'].sort_values(by = 'Reservation_Time')
dataf[dataf.RentalID > 9364300000].sort_values(by = 'RentalID').iloc[:10]

Unnamed: 0,Customer_Group,CustomerID,CarID,Engine,Rental_flag,RentalID,Rental_Usage_Type,Reservation_Time,End_Time,Revenue,Distance,Reservation_Minutes,Fuel_Start,Fuel_End,Start_Lat,Start_Long,End_Lat,End_Long
786108,Customer,825458,WBY1Z21050V308092,I3,No,9364306334,Private,2017-10-03 20:17:35,2017-10-03 20:47:55,16.0,2,6,65,61,55.677683,12.548381,55.679523,12.548036
786202,Customer,2447793,WBY1Z21050V308092,I3,No,9364309196,Private,2017-10-03 21:40:12,2017-10-03 21:55:57,24.0,5,6,61,57,55.679523,12.548036,55.707466,12.502664
786580,Customer,873998,WBY1Z21050V308092,I3,No,9364325052,Private,2017-10-04 08:37:06,2017-10-04 09:00:01,28.8,5,11,57,54,55.707466,12.502664,55.687243,12.530347
786688,Customer,818687,WBY1Z21050V308092,I3,No,9364333296,Private,2017-10-04 09:52:33,2017-10-05 03:01:23,48.0,4,65,54,48,55.687243,12.530347,55.705353,12.554032
786807,Customer,832589,WBY1Z21050V308092,I3,No,9364339979,Private,2017-10-04 12:43:56,2017-10-04 13:07:50,48.0,11,9,48,34,55.705353,12.554032,55.783894,12.520369
786957,Customer,2446893,WBY1Z21050V308092,I3,No,9364347209,Private,2017-10-04 14:50:58,2017-10-04 15:09:11,16.0,11,4,15,95,55.783894,12.520369,55.694751,12.552206
787637,Customer,2316609,WBY1Z21050V308092,I3,No,9364367849,Private,2017-10-04 20:56:46,2017-10-04 21:25:23,54.4,7,12,99,92,55.694751,12.552206,55.659729,12.604363
788028,Customer,997115,WBY1Z21050V308092,I3,No,9364383744,Private,2017-10-05 08:56:32,2017-10-05 09:07:55,9.6,2,6,91,90,55.659729,12.604363,55.668358,12.622162
788606,Customer,2265048,WBY1Z21050V308092,I3,No,9364411843,Private,2017-10-05 16:39:28,2017-10-05 16:53:37,35.2,4,3,90,86,55.668358,12.622162,55.670855,12.569195
788647,Customer,2112479,WBY1Z21050V308092,I3,No,9364413960,Private,2017-10-05 16:59:50,2017-10-05 17:17:43,19.2,2,12,86,84,55.670855,12.569195,55.65874,12.55805


### Check with start time

In [296]:
dfc = df.copy(deep = True)

In [297]:
dfc['Start_Time'] = [row.Reservation_Time+datetime.timedelta(minutes=row.Reservation_Minutes) for _, row in dfc.iterrows()]

In [298]:
CarID_dictc = dict(iter(dfc.groupby('CarID')))
tatc = []

for car,dataf in CarID_dictc.items():
    dataf = dataf.sort_values(by = 'Start_Time')
    tap = list( zip( dataf.iloc[np.where(dataf.Start_Time.iloc[1:].values<dataf.End_Time.iloc[:-1].values)[0]].Customer_Group.values, dataf.iloc[np.where(dataf.Start_Time.iloc[1:].values<dataf.End_Time.iloc[:-1].values)[0]+1].Customer_Group.values ) )
    tat.extend( tap )
    if (('Customer', 'Customer') in tap):
        print(car)

pd.Series(tat).value_counts()

WBA1R5103J7B14248
WBY1Z21000V308131
WBY1Z21000V308226
WBY1Z21010V307795
WBY1Z21010V307912
WBY1Z21010V308039
WBY1Z21010V308168
WBY1Z21010V308218
WBY1Z21020V307790
WBY1Z21020V308034
WBY1Z21020V308079
WBY1Z21020V308132
WBY1Z21020V308258
WBY1Z21030V307796
WBY1Z21030V307880
WBY1Z21030V308009
WBY1Z21030V308043
WBY1Z21030V308057
WBY1Z21030V308270
WBY1Z21040V307967
WBY1Z21050V307993
WBY1Z21050V308027
WBY1Z21050V308092
WBY1Z21060V307923
WBY1Z21060V307937
WBY1Z21060V308022
WBY1Z21070V308000
WBY1Z21080V308135
WBY1Z21090V307737
WBY1Z21090V307852
WBY1Z21090V307947
WBY1Z21090V308127
WBY1Z210X0V307908
WBY1Z210X0V308069
WBY1Z210X0V308251
WBY1Z6100HV939142
WMWXR3102KTK54716
WMWXR3108KTK54607


(Non_Customer, Non_Customer)    1915
(Customer, Non_Customer)         419
(Non_Customer, Customer)         347
(Customer, Customer)              64
dtype: int64

In [301]:
dataf

Unnamed: 0,Customer_Group,CustomerID,CarID,Engine,Rental_flag,RentalID,Rental_Usage_Type,Reservation_Time,End_Time,Revenue,Distance,Reservation_Minutes,Fuel_Start,Fuel_End,Start_Lat,Start_Long,End_Lat,End_Long,Start_Time
1969792,Non_Customer,793665,WBY1Z21010V308039,I3,No,9326900666,Private,2015-08-28 12:10:16,2015-08-28 12:11:36,0.00,0,1,79,79,55.437536,11.813132,55.437521,11.813179,2015-08-28 12:11:16
1591407,Non_Customer,793639,WBY1Z21010V308039,I3,No,9327141869,Private,2015-09-03 00:10:10,2015-09-03 00:20:42,0.00,4,1641,75,71,55.437521,11.813179,55.679583,12.599486,2015-09-04 03:31:10
1593681,Non_Customer,819052,WBY1Z21010V308039,I3,No,9327420878,Private,2015-09-09 09:58:59,2015-09-09 10:29:52,0.00,5,3,69,62,55.679700,12.599553,55.676218,12.569544,2015-09-09 10:01:59
1594040,Customer,810198,WBY1Z21010V308039,I3,No,9327448053,Private,2015-09-09 20:48:31,2015-09-09 20:56:26,3.39,3,1,61,56,55.676218,12.569544,55.665574,12.593041,2015-09-09 20:49:31
1594147,Customer,814344,WBY1Z21010V308039,I3,No,9327458481,Private,2015-09-10 08:15:42,2015-09-10 08:25:55,4.66,1,13,56,54,55.665574,12.593041,55.662421,12.612711,2015-09-10 08:28:42
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
585088,Customer,3262345,WBY1Z21010V308039,I3,No,9421751453,Private,2019-09-17 19:51:26,2019-09-17 20:22:53,28.00,6,18,39,32,55.663207,12.575142,55.658574,12.530379,2019-09-17 20:09:26
585549,Non_Customer,1109469,WBY1Z21010V308039,I3,No,9421760452,Private,2019-09-17 21:56:20,2019-09-18 00:55:40,0.00,2,173,30,100,55.658574,12.530379,55.652028,12.527000,2019-09-18 00:49:20
586207,Customer,2450295,WBY1Z21010V308039,I3,No,9421774904,Private,2019-09-18 10:06:05,2019-09-18 10:22:52,0.00,7,7,100,95,55.652028,12.527000,55.624996,12.575595,2019-09-18 10:13:05
588288,Non_Customer,1109469,WBY1Z21010V308039,I3,No,9422044615,Private,2019-09-18 21:04:19,2019-09-19 04:13:44,0.00,17,383,95,77,55.624996,12.575595,55.716443,12.563968,2019-09-19 03:27:19


In [299]:
dataf = CarID_dictc['WBY1Z21010V308039'].sort_values(by = 'Start_Time')
dataf.iloc[np.where(dataf.Start_Time.iloc[1:].values<dataf.End_Time.iloc[:-1].values)[0]]

Unnamed: 0,Customer_Group,CustomerID,CarID,Engine,Rental_flag,RentalID,Rental_Usage_Type,Reservation_Time,End_Time,Revenue,Distance,Reservation_Minutes,Fuel_Start,Fuel_End,Start_Lat,Start_Long,End_Lat,End_Long,Start_Time
402529,Customer,832528,WBY1Z21010V308039,I3,No,9330660292,Private,2015-11-21 14:03:51,2015-11-21 14:19:22,6.78,6,8,88,80,55.683434,12.528627,55.706441,12.486606,2015-11-21 14:11:51
383253,Customer,867823,WBY1Z21010V308039,I3,No,9333220122,Private,2016-01-23 21:51:14,2016-01-23 21:52:35,0.0,0,0,34,34,55.67106,12.564882,55.671059,12.564882,2016-01-23 21:51:14
349323,Non_Customer,999604,WBY1Z21010V308039,I3,No,9334936366,Private,2016-03-03 13:18:47,2016-03-03 14:33:09,0.0,0,0,56,58,55.770042,12.518862,55.770343,12.518876,2016-03-03 13:18:47
988116,Non_Customer,793639,WBY1Z21010V308039,I3,No,9353862302,Private,2017-03-29 15:05:00,2017-03-30 16:26:50,0.0,0,5,0,0,55.708533,12.478805,0.0,0.0,2017-03-29 15:10:00
2230718,Non_Customer,1109469,WBY1Z21010V308039,I3,No,9376268126,Private,2018-04-23 21:37:35,2018-04-25 02:09:11,0.0,0,5,0,0,55.6635,12.517549,0.0,0.0,2018-04-23 21:42:35
2571338,Non_Customer,1109469,WBY1Z21010V308039,I3,No,9379435024,Private,2018-06-13 22:11:54,2018-06-15 02:19:50,0.0,0,5,0,0,55.735886,12.351707,0.0,0.0,2018-06-13 22:16:54


In [300]:
dataf.iloc[np.where(dataf.Start_Time.iloc[1:].values<dataf.End_Time.iloc[:-1].values)[0]+1]

Unnamed: 0,Customer_Group,CustomerID,CarID,Engine,Rental_flag,RentalID,Rental_Usage_Type,Reservation_Time,End_Time,Revenue,Distance,Reservation_Minutes,Fuel_Start,Fuel_End,Start_Lat,Start_Long,End_Lat,End_Long,Start_Time
402484,Non_Customer,793639,WBY1Z21010V308039,I3,No,9330659140,Private,2015-11-21 13:29:44,2015-11-21 13:40:50,0.0,2,46,92,88,55.676084,12.560566,55.683434,12.528627,2015-11-21 14:15:44
383251,Customer,874495,WBY1Z21010V308039,I3,No,9333220029,Private,2016-01-23 21:41:22,2016-01-23 21:45:30,2.12,0,10,37,34,55.66817,12.559453,55.67106,12.564882,2016-01-23 21:51:22
349301,Non_Customer,793639,WBY1Z21010V308039,I3,No,9334935150,Private,2016-03-03 12:43:41,2016-03-03 12:58:07,0.0,10,53,70,57,55.726236,12.574211,55.770042,12.518862,2016-03-03 13:36:41
986972,Non_Customer,869885,WBY1Z21010V308039,I3,No,9353809112,Private,2017-03-29 15:56:35,2017-03-29 16:05:02,0.0,2,2,45,96,55.708533,12.478805,55.705183,12.498154,2017-03-29 15:58:35
2228963,Non_Customer,1113943,WBY1Z21010V308039,I3,No,9376208000,Private,2018-04-24 01:40:06,2018-04-24 01:43:24,0.0,0,1,49,98,55.6635,12.517549,55.662846,12.517828,2018-04-24 01:41:06
2569073,Non_Customer,1117595,WBY1Z21010V308039,I3,No,9379364717,Private,2018-06-14 01:52:58,2018-06-14 01:58:46,0.0,0,2,74,99,55.735886,12.351707,55.735886,12.351707,2018-06-14 01:54:58


In [229]:
dataf[dataf.RentalID > 9403500000].sort_values(by = 'RentalID').iloc[:20]

Unnamed: 0,Customer_Group,CustomerID,CarID,Engine,Rental_flag,RentalID,Rental_Usage_Type,Reservation_Time,End_Time,Revenue,Distance,Reservation_Minutes,Fuel_Start,Fuel_End,Start_Lat,Start_Long,End_Lat,End_Long,Start_Time
1813423,Customer,2300888,WBA1R5102J5K58060,118I,No,9403502836,Private,2019-05-10 23:57:28,2019-05-11 00:22:51,36.4,6,5,38,34,55.706899,12.587438,55.66678,12.557047,2019-05-11 00:02:28
1813752,Customer,3318166,WBA1R5102J5K58060,118I,No,9403509498,Private,2019-05-11 07:24:54,2019-05-11 23:56:48,560.0,139,13,25,96,55.66678,12.557047,55.684589,12.585798,2019-05-11 07:37:54
1816905,Customer,2508033,WBA1R5102J5K58060,118I,No,9403569080,Private,2019-05-12 00:25:50,2019-05-12 00:48:16,38.4,5,11,94,92,55.684589,12.585798,55.683218,12.615176,2019-05-12 00:36:50
1816911,Customer,1035652,WBA1R5102J5K58060,118I,No,9403569145,Private,2019-05-12 00:01:56,2019-05-12 00:22:51,9.6,0,19,94,94,55.684589,12.585798,55.684589,12.585798,2019-05-12 00:20:56
1817211,Customer,3287152,WBA1R5102J5K58060,118I,No,9403573761,Private,2019-05-12 06:20:56,2019-05-12 06:59:18,36.4,16,18,94,90,55.683218,12.615176,55.732443,12.443289,2019-05-12 06:38:56
1818605,Customer,3287152,WBA1R5102J5K58060,118I,No,9403599314,Private,2019-05-12 15:38:45,2019-05-12 16:08:41,43.33,17,6,90,86,55.732443,12.443289,55.684339,12.614014,2019-05-12 15:44:45
1819410,Customer,2449258,WBA1R5102J5K58060,118I,No,9403613814,Private,2019-05-12 20:05:08,2019-05-12 20:15:31,2.4,2,3,86,86,55.684339,12.614014,55.691529,12.612028,2019-05-12 20:08:08
1819550,Customer,804684,WBA1R5102J5K58060,118I,No,9403616244,Business,2019-05-12 20:41:54,2019-05-12 21:21:45,48.53,13,13,86,84,55.691529,12.612028,55.696796,12.553215,2019-05-12 20:54:54
1819920,Customer,803905,WBA1R5102J5K58060,118I,No,9403623244,Private,2019-05-13 01:13:35,2019-05-13 01:57:52,92.8,31,16,84,78,55.696796,12.553215,55.69533,12.552633,2019-05-13 01:29:35
1820646,Customer,2940802,WBA1R5102J5K58060,118I,No,9403646161,Private,2019-05-13 10:25:18,2019-05-13 15:19:31,320.0,76,7,78,67,55.69533,12.552633,55.709499,12.588269,2019-05-13 10:32:18


## Other columns

In [29]:
df[df.Rental_flag == 'Yes']

Unnamed: 0,Customer_Group,CustomerID,CarID,Engine,Rental_flag,RentalID,Rental_Usage_Type,Reservation_Time,End_Time,Revenue,...,Reservation_Minutes,Fuel_Start,Fuel_End,Start_Lat,Start_Long,End_Lat,End_Long,Start_Time,Start_Zone,End_Zone
92,Customer,1041131,WBY1Z21030V307927,I3,Yes,9336190387,Private,2016-01-04 08:14:43,2016-01-04 08:32:31,8.80,...,8,57,50,55.671970,12.561281,55.654693,12.612746,2016-01-04 08:22:43,102182,103224
162,Customer,341960,WBY1Z210X0V308248,I3,Yes,9336195348,Private,2016-01-04 09:13:18,2016-01-04 09:26:30,6.01,...,3,100,90,55.666223,12.544056,55.632389,12.574830,2016-01-04 09:16:18,102821,103291
226,Customer,19617,WBY1Z21020V308048,I3,Yes,9336201372,Private,2016-01-04 11:08:34,2016-01-04 11:43:16,15.03,...,4,71,57,55.687231,12.549290,55.692560,12.546405,2016-01-04 11:12:34,102444,102453
227,Customer,24827,WBY1Z21040V307791,I3,Yes,9336201380,Private,2016-01-04 11:08:48,2016-01-04 11:49:28,18.68,...,9,62,51,55.632424,12.644685,55.681026,12.604476,2016-01-04 11:17:48,185125,103132
298,Customer,110192,WBY1Z210X0V308072,I3,Yes,9336207064,Private,2016-01-04 12:49:24,2016-01-04 13:12:59,11.38,...,9,55,49,55.708624,12.578704,55.671733,12.539777,2016-01-04 12:58:24,102343,147131
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2633885,Customer,440147,WBY1Z21060V307954,I3,Yes,9345006009,Private,2016-09-30 20:54:26,2016-09-30 21:16:58,9.88,...,8,55,44,55.662403,12.623958,55.710676,12.566043,2016-09-30 21:02:26,103151,102412
2633903,Customer,1032015,WBY1Z21010V307814,I3,Yes,9345007233,Private,2016-09-30 21:16:24,2016-09-30 22:02:05,16.12,...,11,83,97,55.662269,12.604593,55.685559,12.586609,2016-09-30 21:27:24,103172,102223
2633919,Customer,457665,WBY1Z21060V308070,I3,Yes,9345008207,Private,2016-09-30 21:52:27,2016-09-30 22:29:45,22.67,...,1,56,25,55.630208,12.648793,55.665015,12.556939,2016-09-30 21:53:27,185203,102812
2633966,Customer,427906,WBY1Z21080V307955,I3,Yes,9345010629,Private,2016-09-30 23:02:50,2016-09-30 23:13:13,5.37,...,1,97,92,55.665828,12.565080,55.682833,12.584502,2016-09-30 23:03:50,102181,102223


In [31]:
df.sort_values(by = 'Reservation_Minutes', ascending=False).iloc[:42]

Unnamed: 0,Customer_Group,CustomerID,CarID,Engine,Rental_flag,RentalID,Rental_Usage_Type,Reservation_Time,End_Time,Revenue,...,Reservation_Minutes,Fuel_Start,Fuel_End,Start_Lat,Start_Long,End_Lat,End_Long,Start_Time,Start_Zone,End_Zone
1493665,Non_Customer,793639,WBY1Z21050V307993,I3,No,9407597629,Private,2019-02-05 14:44:54,2019-06-18 16:08:50,0.0,...,59009,0,0,55.634966,12.650418,0.0,0.0,2019-03-18 14:13:54,185121,550062
1508476,Non_Customer,793639,WBY1Z21050V307766,I3,No,9408404530,Private,2019-05-13 14:02:03,2019-06-18 16:08:51,0.0,...,50140,0,0,55.635113,12.650348,0.0,0.0,2019-06-17 09:42:03,185121,550062
1508467,Non_Customer,793639,WBY1Z21060V307999,I3,No,9408400365,Private,2019-05-13 14:05:58,2019-06-18 16:08:50,0.0,...,50136,0,0,55.634987,12.650426,0.0,0.0,2019-06-17 09:41:58,185121,550062
1508474,Non_Customer,793639,WBY1Z21030V307989,I3,No,9408401227,Private,2019-05-13 14:06:03,2019-06-18 16:08:51,0.0,...,50135,0,0,55.635009,12.65049,0.0,0.0,2019-06-17 09:41:03,185121,550062
1499203,Non_Customer,793639,WBY1Z210X0V307911,I3,No,9408102693,Private,2019-05-13 13:22:19,2019-06-18 16:08:50,0.0,...,45960,0,0,55.634986,12.650334,0.0,0.0,2019-06-14 11:22:19,185121,550062
1499161,Non_Customer,793639,WBY1Z21080V308040,I3,No,9408101113,Private,2019-05-13 13:19:56,2019-06-18 16:08:51,0.0,...,45949,0,0,55.634991,12.650396,0.0,0.0,2019-06-14 11:08:56,185121,550062
1511323,Non_Customer,793639,WBY1Z21060V308098,I3,No,9408845467,Private,2019-05-27 00:54:05,2019-06-18 16:08:49,0.0,...,32225,0,0,55.634453,12.649101,0.0,0.0,2019-06-18 09:59:05,185121,550062
1688993,Non_Customer,793639,WBY1Z21090V308094,I3,No,9427800504,Private,2019-07-10 03:58:33,2019-10-29 16:50:51,0.0,...,31834,5,0,55.71624,12.56701,0.0,0.0,2019-08-01 06:32:33,102412,550062
1511307,Non_Customer,793639,WBY1Z210X0V308007,I3,No,9408845049,Private,2019-05-27 09:11:10,2019-06-18 16:08:49,0.0,...,31722,0,0,55.635099,12.650519,0.0,0.0,2019-06-18 09:53:10,185121,550062
1763108,Non_Customer,793639,WBA1R5104J5K58061,118I,No,9402424970,Private,2019-03-04 19:56:41,2019-04-25 15:36:02,0.0,...,31069,0,0,55.677168,12.580413,0.0,0.0,2019-03-26 09:45:41,102111,550062


In [33]:
df[df.CarID == 'WBY1Z21050V307993'].sort_values('Reservation_Time').iloc[-550:-540]
#df[df.CarID == 'WBY1Z21050V307993'].loc[1493665]

Unnamed: 0,Customer_Group,CustomerID,CarID,Engine,Rental_flag,RentalID,Rental_Usage_Type,Reservation_Time,End_Time,Revenue,...,Reservation_Minutes,Fuel_Start,Fuel_End,Start_Lat,Start_Long,End_Lat,End_Long,Start_Time,Start_Zone,End_Zone
1702579,Customer,3257292,WBY1Z21050V307993,I3,No,9400375651,Private,2019-02-04 18:27:14,2019-02-04 18:55:37,60.8,...,10,38,29,55.659379,12.630345,55.667792,12.545928,2019-02-04 18:37:14,103212,102821
1702760,Customer,2985859,WBY1Z21050V307993,I3,No,9400379233,Private,2019-02-04 19:16:57,2019-02-04 19:35:49,17.33,...,10,29,24,55.667792,12.545928,55.686779,12.536951,2019-02-04 19:26:57,102821,147161
1702938,Customer,828693,WBY1Z21050V307993,I3,No,9400382472,Private,2019-02-04 20:20:58,2019-02-04 20:51:08,64.0,...,11,5,100,55.686779,12.536951,55.654816,12.618666,2019-02-04 20:31:58,147161,103223
1783729,Non_Customer,1112124,WBY1Z21050V307993,I3,No,9402866073,Private,2019-02-05 00:23:35,2019-02-05 02:10:30,0.0,...,0,75,48,55.69442,12.550729,55.634472,12.648757,2019-02-05 00:23:35,102443,185121
1493665,Non_Customer,793639,WBY1Z21050V307993,I3,No,9407597629,Private,2019-02-05 14:44:54,2019-06-18 16:08:50,0.0,...,59009,0,0,55.634966,12.650418,0.0,0.0,2019-03-18 14:13:54,185121,550062
1113833,Customer,1015209,WBY1Z21050V307993,I3,No,9394090697,Private,2019-02-13 07:42:30,2019-02-13 08:13:58,22.53,...,19,58,51,55.713129,12.572044,55.701208,12.600449,2019-02-13 08:01:30,102324,102336
1114545,Customer,3219128,WBY1Z21050V307993,I3,No,9394109802,Business,2019-02-13 13:22:25,2019-02-13 13:58:53,80.0,...,2,51,39,55.701208,12.600449,55.63022,12.64893,2019-02-13 13:24:25,102336,185203
1114742,Customer,1053005,WBY1Z21050V307993,I3,No,9394115235,Private,2019-02-13 14:52:45,2019-02-13 15:23:13,43.07,...,4,39,19,55.63022,12.64893,55.679108,12.479413,2019-02-13 14:56:45,185203,102634
1115510,Customer,2658148,WBY1Z21050V307993,I3,No,9394130425,Private,2019-02-13 18:44:27,2019-02-13 19:24:02,51.2,...,8,19,10,55.679108,12.479413,55.674144,12.571492,2019-02-13 18:52:27,102634,102131
1116382,Customer,812023,WBY1Z21050V307993,I3,No,9394146782,Private,2019-02-14 05:20:48,2019-02-14 05:56:35,84.8,...,17,100,90,55.674144,12.571492,55.630231,12.64884,2019-02-14 05:37:48,102131,185203


## Start time

In [None]:
# Add start time based on Reservation minutes
df['Start_Time'] = [row.Reservation_Time+datetime.timedelta(minutes=row.Reservation_Minutes) for _, row in df.iterrows()]

# Create Vacancy

In [33]:
# Haversine function
def haversine(point1, point2):
    # convert decimal degrees to radians
    lat1, lon1 = map(np.radians, point1)
    lat2, lon2 = map(np.radians, point2)

    # Deltas
    delta_lon = lon2 - lon1 
    delta_lat = lat2 - lat1 
    
    # haversine formula 
    a = np.sin(delta_lat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(delta_lon/2)**2
    c = 2 * np.arcsin(np.sqrt(a)) 
    r = 6371000 # Radius of earth in m
    return c * r

In [34]:
df_sorted = df.sort_values("Reservation_Time")
df_sorted.CarID.nunique()

1021

In [35]:
data = []
for i, car in enumerate(df_sorted.CarID.unique()):
    if car == '-':
        continue
    car_sub_df = df_sorted[df_sorted.CarID == car]
    if not i%50:
        print(f'{i} cars processed')
    for (_, row1), (_, row2) in zip(car_sub_df[:-1].iterrows(),car_sub_df[1:].iterrows()):
        park_time = row1['End_Time']
        reservation_time = row2['Reservation_Time']
        start_time = row2['Start_Time']
        time_to_reservation = (row2['Reservation_Time']-row1['End_Time']).total_seconds()/3600
        time_to_start = (row2['Start_Time']-row1['End_Time']).total_seconds()/3600
        park_location_lat = row1['End_Lat']
        park_location_long = row1['End_Long']
        park_zone = row1['End_Zone']
        park_fuel = row1['Fuel_End']
        leave_fuel = row2['Fuel_Start']
        engine = row1['Engine']
        moved = haversine(row1.loc[['End_Lat','End_Long']].values, row2.loc[['Start_Lat','Start_Long']].values) 
        data.append([car, park_time,reservation_time, start_time, time_to_reservation, time_to_start, park_location_lat, park_location_long, park_zone, park_fuel, leave_fuel, engine, moved])

0 cars processed
10 cars processed
20 cars processed
30 cars processed
40 cars processed
50 cars processed
60 cars processed
70 cars processed
80 cars processed
90 cars processed
100 cars processed
110 cars processed
120 cars processed
130 cars processed
140 cars processed
150 cars processed
160 cars processed
170 cars processed
180 cars processed
190 cars processed
200 cars processed
210 cars processed
220 cars processed
230 cars processed
240 cars processed
250 cars processed
260 cars processed
270 cars processed
280 cars processed
290 cars processed
300 cars processed
310 cars processed
320 cars processed
330 cars processed
340 cars processed
350 cars processed
360 cars processed
370 cars processed
380 cars processed
390 cars processed
400 cars processed
410 cars processed
420 cars processed
430 cars processed
440 cars processed
450 cars processed
460 cars processed
470 cars processed
480 cars processed
490 cars processed
500 cars processed
510 cars processed
520 cars processed
530 

In [36]:
# Create new df
df_vacancy = pd.DataFrame(data = data, columns = ['car', 'park_time', 'reservation_time', 'start_time','time_to_reservation', 'time_to_start', 'park_location_lat', 'park_location_long', 'park_zone', 'park_fuel', 'leave_fuel', 'engine', 'moved'])

# Infer types
df_vacancy = df_vacancy.convert_dtypes()

# Save
df_vacancy.to_csv('data/processed/Vacancy_new.csv')

In [39]:
df_vacancy[df_vacancy.park_location_lat < 10]

Unnamed: 0,car,park_time,reservation_time,start_time,time_to_reservation,time_to_start,park_location_lat,park_location_long,park_zone,park_fuel,leave_fuel,engine,moved
6484,WBY1Z21010V307859,2019-09-23 16:36:47,2019-10-01 02:02:55,2019-10-01 02:04:55,177.435556,177.468889,0.0,0.0,550062,0,56,I3,6292768.460909
20262,WBY1Z21000V307884,2019-06-18 16:08:03,2019-07-01 06:25:50,2019-07-01 06:48:50,302.296389,302.679722,0.0,0.0,550062,0,100,I3,6295269.445084
25593,WBY1Z210X0V307858,2019-09-25 16:37:29,2019-10-01 16:50:39,2019-10-01 16:59:39,144.219444,144.369444,0.0,0.0,550062,0,77,I3,6291430.710595
31020,WBY1Z21030V308205,2019-01-10 16:39:24,2019-01-13 00:30:05,2019-01-13 00:40:05,55.844722,56.011389,0.0,0.0,550062,0,87,I3,6296310.222621
33636,WBY1Z21050V307976,2017-07-14 03:30:25,2017-08-01 15:15:53,2017-08-01 15:20:53,443.757778,443.841111,0.0,0.0,550062,98,59,I3,6295540.691409
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2449916,WBY8P2104K7D92193,2019-10-09 16:32:02,2019-11-05 06:58:03,2019-11-05 06:59:03,638.433611,638.450278,0.0,0.0,550062,0,62,I3 120,6291983.829059
2464979,WBY8P2104K7D89939,2019-08-29 16:28:31,2019-09-05 13:54:16,2019-09-05 13:56:16,165.429167,165.4625,0.0,0.0,550062,0,100,I3 120,6291369.414998
2552340,WBA1R5100K7D67738,2019-07-29 16:19:41,2019-08-06 00:31:12,2019-08-06 00:37:12,176.191944,176.291944,0.0,0.0,550062,0,94,118I,6298022.719552
2586727,WBA31AA05L3H87545,2019-05-12 12:30:42,2019-05-10 14:27:27,2019-05-10 14:42:27,-46.054167,-45.804167,0.0,0.0,550062,0,76,X1 SDRIVE18I,6294458.041067
