In [172]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import glob
import numpy as np
import networkx as nx
import tqdm
import datetime
import geopandas as gpd
import rtree

In [173]:
# Read every raw semicolon-separated export and stack them into one frame.
# NOTE(review): glob order is filesystem-dependent, and later cells address rows
# by hard-coded index labels that come from this concatenation order — confirm
# before changing the order (e.g. by sorting `files`).
files = glob.glob("data/raw/SNData/*.csv")
dfs = [pd.read_csv(f, header=0, sep=";") for f in tqdm.tqdm(files)]

# Keep the combined raw data in the interim folder
Full_data = pd.concat(dfs, ignore_index=True)
Full_data.to_csv('data/interim/Full_data.csv')

100%|██████████| 53/53 [00:08<00:00,  6.16it/s]


In [174]:
# Discard incomplete rows (53 rows contain NA values)
df = Full_data.dropna()

# English column names, in the order of the raw export
df.columns = [
    'Customer_Group', 'CustomerID', 'CarID', 'Engine', 'Rental_flag', 'RentalID',
    'Rental_Usage_Type', 'Reservation_Time', 'End_Time', 'Revenue', 'Distance',
    'Drives', 'Reservation_Minutes', 'Fuel_Start', 'Fuel_End',
    'Start_Lat', 'Start_Long', 'End_Lat', 'End_Long',
]

# IDs are integers
df = df.astype({'CustomerID': 'int32', 'RentalID': 'int64'})

# 'Drives' carries no information (only ones), so it is removed
df = df.drop(columns='Drives')
df

Unnamed: 0,Customer_Group,CustomerID,CarID,Engine,Rental_flag,RentalID,Rental_Usage_Type,Reservation_Time,End_Time,Revenue,Distance,Reservation_Minutes,Fuel_Start,Fuel_End,Start_Lat,Start_Long,End_Lat,End_Long
0,Non_Customer,793639,WBY1Z21080V307924,I3,No,9335872135,Private,24.03.2016 11:48:43,02.04.2016 10:00:19,0.00,0,0,0,0,55.678763,12.552853,0.000000,0.000000
1,Non_Customer,1035973,WBY1Z21080V307857,I3,No,9336114126,Private,30.03.2016 15:37:39,01.04.2016 00:40:38,0.00,0,0,62,47,55.770626,12.519300,55.770389,12.518839
2,Non_Customer,998095,WBY1Z21020V307904,I3,No,9336153910,Private,31.03.2016 13:08:16,05.04.2016 08:32:25,0.00,2,1,85,79,55.621588,12.606951,55.621532,12.606279
3,Non_Customer,999604,WBY1Z21010V307926,I3,No,9336158303,Private,31.03.2016 14:43:00,01.04.2016 07:10:00,0.00,0,1,0,71,55.770077,12.518914,55.769746,12.519123
4,Non_Customer,1035969,WBY1Z21070V308210,,No,9336160465,Private,31.03.2016 15:21:36,01.04.2016 14:24:17,0.00,0,1,53,52,55.770623,12.519791,55.770439,12.518937
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2633979,Customer,1070662,WBY1Z21010V308185,I3,No,9345011102,Private,30.09.2016 23:39:05,30.09.2016 23:50:54,5.16,4,10,46,41,55.694700,12.553776,55.678740,12.587144
2633980,Customer,1041705,WBY1Z21080V308250,I3,No,9345011139,Private,30.09.2016 23:42:18,30.09.2016 23:52:14,3.44,6,8,59,52,55.648401,12.542945,55.641310,12.615295
2633981,Customer,2112471,WBY1Z21020V308261,I3,No,9345011311,Private,30.09.2016 23:33:39,30.09.2016 23:52:03,8.17,9,3,39,30,55.664744,12.580875,55.719856,12.540863
2633982,Customer,440147,WBY1Z21060V307954,I3,Yes,9345011420,Private,30.09.2016 23:41:56,30.09.2016 23:57:30,6.88,9,4,44,35,55.710676,12.566043,55.667453,12.619987


In [3]:
# Remove all rows without a usable CarID (stored as '0')
df = df.loc[df['CarID'] != '0']

# Strip 'DK' from CarID so the same car does not end up with two IDs
df['CarID'] = df['CarID'].str.replace('DK', '', regex=False)

In [176]:
# Engine has two encodings of a missing value (' ' and '0'); align both to '0'.
# The result is assigned back to the column: calling replace(inplace=True) on
# the selected column is not guaranteed to write through to `df`
# (it does not under pandas copy-on-write).
df["Engine"] = df["Engine"].replace({" ": '0'})

In [177]:
# Cars with at least one trip whose engine type is missing ('0')
cars_missing_engine = df.loc[df.Engine == '0', 'CarID'].unique()

# If a car has a known engine type on other trips, copy it to the missing ones
for car in cars_missing_engine:
    is_car = df.CarID == car
    known_engines = [e for e in df.loc[is_car, 'Engine'].unique() if e != '0']
    if not known_engines:
        # Engine is missing on every trip of this car: nothing to copy
        continue
    df.loc[is_car & (df.Engine == '0'), 'Engine'] = known_engines[0]

# Populate the rest manually based on CarID
manual_engines = {
    'WBA1R5104J7B14310': '118I',
    'WBA1R5104J5K58061': '118I',
    'WBA1R5103K7D66678': '118I',
    'WBY8P2105K7D70350': 'I3 120',
    'WBY8P2102K7D70287': 'I3 120',
}
for car, engine in manual_engines.items():
    df.loc[(df.CarID == car) & (df.Engine == '0'), 'Engine'] = engine

## Times

In [178]:
# Parse the day-first "dd.mm.yyyy HH:MM:SS" timestamps of the raw export
for time_col in ('Reservation_Time', 'End_Time'):
    df[time_col] = pd.to_datetime(df[time_col], format="%d.%m.%Y %H:%M:%S")

## Fix trips where the same user uses the same car back to back

In [179]:
# Split data on Car level
CarID_dict = dict(iter(df.groupby('CarID')))

def fix_merges(dataframe, max_time_diff = 60):
    """Merge back-to-back trips of the same customer in the same car.

    A trip is absorbed into the trip before it when both have the same
    CustomerID and the earlier trip's End_Time plus
    (max_time_diff + the later trip's Reservation_Minutes) minutes lies after
    the later trip's Reservation_Time. Chains of absorbed trips collapse into
    the first trip of the chain.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Trips of a single car. Reads 'CustomerID', 'Reservation_Time',
        'End_Time' and 'Reservation_Minutes'; rewrites 'End_Time', 'Fuel_End',
        'End_Lat' and 'End_Long'.
    max_time_diff : int, default 60
        Allowed gap in minutes, on top of the reservation minutes.

    Returns
    -------
    pandas.DataFrame
        Copy sorted by 'Reservation_Time' where the first trip of each chain
        carries the end time, end fuel and end position of the chain's last
        trip and the absorbed trips are removed. The input is not modified.
    """
    dataframe = dataframe.sort_values(by = 'Reservation_Time')

    # absorbed[i] is True when trip i is merged into trip i-1.
    # Column-wise arithmetic replaces one .iloc row lookup per candidate trip.
    same_customer = dataframe['CustomerID'].diff().eq(0)
    deadline = dataframe['End_Time'].shift(1) + pd.to_timedelta(
        max_time_diff + dataframe['Reservation_Minutes'], unit='m')
    absorbed = (same_customer & (deadline > dataframe['Reservation_Time'])).to_numpy()

    absorbed_pos = np.flatnonzero(absorbed)
    if absorbed_pos.size == 0:
        return dataframe

    # Neighbouring absorbed positions belong to the same chain; a gap starts a new one
    gaps = np.flatnonzero(np.diff(absorbed_pos) > 1)
    chain_first = absorbed_pos[np.r_[0, gaps + 1]] - 1
    chain_last = absorbed_pos[np.r_[gaps, absorbed_pos.size - 1]]

    # Populate: the surviving trip takes over the end state of the chain's last trip
    for column in ['End_Time', 'Fuel_End', 'End_Lat', 'End_Long']:
        column_pos = dataframe.columns.get_loc(column)
        dataframe.iloc[chain_first, column_pos] = dataframe.iloc[chain_last, column_pos].to_numpy()

    # Delete now unwanted rows (by position, so duplicate labels cannot interfere)
    return dataframe.iloc[~absorbed]

# Apply the fix car by car and stitch the per-car results back together
dfs = [fix_merges(sub_df) for sub_df in tqdm.tqdm(CarID_dict.values())]

df = pd.concat(dfs, ignore_index=False).sort_values(by='RentalID')

100%|██████████| 1021/1021 [09:10<00:00,  1.86it/s]


## Fix weird times

In [180]:
# Winter Time: trips that end before they were reserved, with the end falling
# in October before 04:00
ends_before_reservation = df.Reservation_Time > df.End_Time
october_early_hours = (df.End_Time.dt.month == 10) & (df.End_Time.dt.hour < 4)
WinterTimeIndex = df[ends_before_reservation & october_early_hours].index

# Hand-picked rows get their reservation moved one hour back; every other
# affected row gets its end moved one hour forward
WinterTimeIndexBack = [2179859, 1683947, 1683948]
WinterTimeIndexForward = [x for x in WinterTimeIndex if x not in WinterTimeIndexBack]
one_hour = pd.to_timedelta(1, 'h')
df.loc[WinterTimeIndexBack, 'Reservation_Time'] = df.loc[WinterTimeIndexBack, 'Reservation_Time'] - one_hour
df.loc[WinterTimeIndexForward, 'End_Time'] = df.loc[WinterTimeIndexForward, 'End_Time'] + one_hour

# Remove remaining 50 observations as they will not introduce more vacancy time
still_inverted = df.Reservation_Time > df.End_Time
df.drop(index=df[still_inverted].index, inplace=True)

## Merge Non-Customers

In [181]:
# Split data on Car level
CarID_dict = dict(iter(df.groupby('CarID')))

def merge_NC(dataframe):
    """Collapse each run of back-to-back Non_Customer trips of one car into a single trip.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Trips of a single car. Reads 'Customer_Group' and 'Reservation_Time';
        rewrites 'End_Time', 'Fuel_End', 'End_Lat' and 'End_Long'.

    Returns
    -------
    pandas.DataFrame
        Copy sorted by 'Reservation_Time' where the first trip of every run
        carries the end time, end fuel and end position of the run's last trip
        and the other trips of the run are removed. The input is not modified.
    """
    dataframe = dataframe.sort_values(by = 'Reservation_Time')

    # pair_first holds every position i where trips i and i+1 are both Non_Customer.
    # (Array arithmetic replaces Series.iteritems(), which pandas 2.0 removed.)
    is_NC = (dataframe.Customer_Group == 'Non_Customer').to_numpy()
    pair_first = np.flatnonzero(is_NC[:-1] & is_NC[1:])
    if pair_first.size == 0:
        return dataframe

    # Consecutive pairs chain into one run; a gap in pair_first starts a new run
    gaps = np.flatnonzero(np.diff(pair_first) > 1)
    run_first = pair_first[np.r_[0, gaps + 1]]
    run_last = pair_first[np.r_[gaps, pair_first.size - 1]] + 1

    # Populate: the surviving trip takes over the end state of the run's last trip
    for column in ['End_Time', 'Fuel_End', 'End_Lat', 'End_Long']:
        column_pos = dataframe.columns.get_loc(column)
        dataframe.iloc[run_first, column_pos] = dataframe.iloc[run_last, column_pos].to_numpy()

    # Delete now unwanted rows: the second trip of every pair
    absorbed = np.zeros(len(dataframe), dtype=bool)
    absorbed[pair_first + 1] = True
    return dataframe.iloc[~absorbed]

# Apply the merge car by car and stitch the per-car results back together
dfs = [merge_NC(sub_df) for sub_df in tqdm.tqdm(CarID_dict.values())]

df = pd.concat(dfs, ignore_index=False).sort_values(by='Reservation_Time')

100%|██████████| 1021/1021 [00:47<00:00, 21.70it/s]


## Fix overlap

In [182]:
CarID_dict = dict(iter(df.groupby('CarID')))
tat = []
endtat0 = []  # row label of the trip that overlaps the next one
endtat1 = []  # Customer_Group of that trip
endtat2 = []  # Customer_Group of the following trip
endtat3 = []  # End_Lat of the overlapping trip

for car, dataf in CarID_dict.items():
    dataf = dataf.sort_values(by = 'Reservation_Time')

    # Positions whose successor was reserved before this trip ended
    overlap_pos = np.where(dataf.Reservation_Time.iloc[1:].values < dataf.End_Time.iloc[:-1].values)[0]
    overlapping = dataf.iloc[overlap_pos]
    following = dataf.iloc[overlap_pos + 1]

    tap = list(zip(overlapping.Customer_Group.values, following.Customer_Group.values))
    tat.extend(tap)

    endtat0.extend(overlapping.index)
    endtat1.extend(overlapping.Customer_Group)
    endtat2.extend(following.Customer_Group)
    endtat3.extend(overlapping.End_Lat)

# One row per overlap; columns 0..3 correspond to endtat0..endtat3
overlap_df = pd.DataFrame(data=[endtat0, endtat1, endtat2, endtat3]).T

In [183]:
# Customer trips that overlap the next trip and have a bad end location
# (End_Lat < 1): end them one minute after the reservation
customer_bad_end = (overlap_df[1] == 'Customer') & (overlap_df[3] < 1)
fix_idx0 = overlap_df.loc[customer_bad_end, 0].values
df.loc[fix_idx0, 'End_Time'] = df.loc[fix_idx0, 'Reservation_Time'].values + pd.to_timedelta(1, 'm')

In [22]:
# Haversine function
def haversine(point1, point2):
    """Great-circle distance in metres between two (lat, lon) points in decimal degrees."""
    earth_radius_m = 6371000

    # Work in radians
    lat_a, lon_a = (np.radians(coord) for coord in point1)
    lat_b, lon_b = (np.radians(coord) for coord in point2)

    half_dlat = (lat_b - lat_a) / 2
    half_dlon = (lon_b - lon_a) / 2

    # Haversine formula
    a = np.sin(half_dlat)**2 + np.cos(lat_a) * np.cos(lat_b) * np.sin(half_dlon)**2
    return 2 * np.arcsin(np.sqrt(a)) * earth_radius_m

In [184]:
# Fix the other C-C overlaps (End_Lat > 1) to the midpoint of the two reservation times
customer_valid_end = (overlap_df[1] == 'Customer') & (overlap_df[3] > 1)
fix_idxP = overlap_df.loc[customer_valid_end, 0].values

for fix_idx in fix_idxP:
    # All trips of the same car, and the label of the trip right after this one
    tmp_car_df = df[df.CarID == df.loc[fix_idx, 'CarID']]
    fix_iloc = tmp_car_df.index.get_loc(fix_idx)
    next_idx = tmp_car_df.index[fix_iloc + 1]

    # Where this trip ended and where the next one started
    end_loc = tmp_car_df.loc[fix_idx, ['End_Lat', 'End_Long']].values
    start_loc = tmp_car_df.loc[next_idx, ['Start_Lat', 'Start_Long']].values

    # Only adjust when the car was picked up within 100 m of where it was parked
    if haversine(end_loc, start_loc) < 100:
        this_reservation = df.loc[fix_idx, 'Reservation_Time']
        next_reservation = df.loc[next_idx, 'Reservation_Time']
        df.loc[fix_idx, 'End_Time'] = this_reservation + (next_reservation - this_reservation) / 2

In [185]:
# Manual fixes / guesstimates for individual trips (row label -> corrected End_Time)
manual_end_times = {
    51903: "2016-11-03 20:00:00",
    661452: "2017-12-01 17:00:00",
    52806: "2016-11-05 08:00:10",
    661513: "2017-12-02 16:16:24",
    784104: "2017-10-04 12:20:10",
}
for row_label, end_time in manual_end_times.items():
    df.loc[row_label, 'End_Time'] = pd.Timestamp(end_time)
df.loc[2376045, 'Reservation_Time'] = pd.Timestamp("2016-08-05 12:49:38")

# Trips removed by hand
manual_drops = [22088, 25828, 809192, 664080, 1137264, 713741, 1604116, 2470015, 404202, 661521, 404308]
df.drop(index=manual_drops, inplace=True)

In [186]:
# Customer trip overlapped by a following Non_Customer trip: the customer row
# takes over the end state of the Non_Customer row, which is then removed
customer_then_nc = (overlap_df[1] == 'Customer') & (overlap_df[2] == 'Non_Customer')
fix_idxCNC = overlap_df.loc[customer_then_nc, 0].values
end_cols = ['End_Time', 'Fuel_End', 'End_Lat', 'End_Long']

for fix_idx in fix_idxCNC:
    # Label of the trip that follows within the same car
    tmp_car_df = df[df.CarID == df.loc[fix_idx, 'CarID']]
    next_idx = tmp_car_df.index[tmp_car_df.index.get_loc(fix_idx) + 1]

    # Replace values, then drop the old NC row
    df.loc[fix_idx, end_cols] = df.loc[next_idx, end_cols]
    df.drop(next_idx, inplace=True)

df.drop(index=888104, inplace=True)

In [187]:
# Non_Customer trips that overlap the next trip and have a bad end location (End_Lat < 1)
nc_bad_end = (overlap_df[1] == 'Non_Customer') & (overlap_df[3] < 1)
fix_idx_NC0 = overlap_df.loc[nc_bad_end, 0].values
to_drop = []

for fix_idx in fix_idx_NC0:
    # The row may already have been removed by an earlier fix; an explicit
    # membership test replaces the bare `except:` that hid every other error too
    if fix_idx not in df.index:
        continue

    # Get sub_df
    tmp_car_df = df[df.CarID == df.loc[fix_idx, 'CarID']].sort_values(by = 'Reservation_Time')

    # Get iloc in sub_df of to be fixed
    fix_iloc = tmp_car_df.index.get_loc(fix_idx)
    start_loc1 = tmp_car_df.loc[fix_idx, ['Start_Lat', 'Start_Long']].values

    # Compare with the previous and the next trip, where they exist. Without the
    # bounds check, position -1 would silently wrap around to the car's last trip.
    for neighbour_iloc in (fix_iloc - 1, fix_iloc + 1):
        if not 0 <= neighbour_iloc < len(tmp_car_df):
            continue
        neighbour_start = tmp_car_df.loc[tmp_car_df.index[neighbour_iloc], ['Start_Lat', 'Start_Long']].values

        # If the car left from the same spot (within 100 m), drop this trip once
        if haversine(neighbour_start, start_loc1) < 100:
            to_drop.append(fix_idx)
            break

df.drop(index = to_drop, inplace = True)

In [188]:
# Remaining Non_Customer overlaps with a bad end location (End_Lat < 1) that are
# still in df: end them Reservation_Minutes after the reservation
nc_bad_end = (overlap_df[1] == 'Non_Customer') & (overlap_df[3] < 1)
fix_idx_RM = [x for x in overlap_df.loc[nc_bad_end, 0].values if x in df.index]

for fix_idx in fix_idx_RM:
    reserved_for = pd.to_timedelta(df.loc[fix_idx, 'Reservation_Minutes'], 'm')
    df.loc[fix_idx, 'End_Time'] = df.loc[fix_idx, 'Reservation_Time'] + reserved_for

In [12]:
# Save interim
df.to_csv('data/interim/first_version.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2410931 entries, 1969578 to 1457879
Data columns (total 18 columns):
 #   Column               Dtype         
---  ------               -----         
 0   Customer_Group       object        
 1   CustomerID           int64         
 2   CarID                object        
 3   Engine               object        
 4   Rental_flag          object        
 5   RentalID             int64         
 6   Rental_Usage_Type    object        
 7   Reservation_Time     datetime64[ns]
 8   End_Time             datetime64[ns]
 9   Revenue              float64       
 10  Distance             int64         
 11  Reservation_Minutes  int64         
 12  Fuel_Start           int64         
 13  Fuel_End             int64         
 14  Start_Lat            float64       
 15  Start_Long           float64       
 16  End_Lat              float64       
 17  End_Long             float64       
dtypes: datetime64[ns](2), float64(5), int64(6), object(5)
me

In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import glob
import numpy as np
import networkx as nx
import tqdm
import datetime
import geopandas as gpd
import rtree

df = pd.read_csv('data/interim/first_version.csv', index_col = 0)

In [2]:
# Re-parse the timestamps after the CSV round trip. The datetime columns were
# written in ISO form ("YYYY-MM-DD HH:MM:SS"), which to_datetime recognises
# without an explicit format. The previous dot-separated format string did not
# describe that layout and is rejected by pandas versions that parse strictly.
for time_col in ('Reservation_Time', 'End_Time'):
    df[time_col] = pd.to_datetime(df[time_col])
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2410931 entries, 1969578 to 1457879
Data columns (total 18 columns):
 #   Column               Dtype         
---  ------               -----         
 0   Customer_Group       object        
 1   CustomerID           int64         
 2   CarID                object        
 3   Engine               object        
 4   Rental_flag          object        
 5   RentalID             int64         
 6   Rental_Usage_Type    object        
 7   Reservation_Time     datetime64[ns]
 8   End_Time             datetime64[ns]
 9   Revenue              float64       
 10  Distance             int64         
 11  Reservation_Minutes  int64         
 12  Fuel_Start           int64         
 13  Fuel_End             int64         
 14  Start_Lat            float64       
 15  Start_Long           float64       
 16  End_Lat              float64       
 17  End_Long             float64       
dtypes: datetime64[ns](2), float64(5), int64(6), object(5)
me

## Fix 0,0 locations

Only locations recorded as (0, 0) are repaired here. Other locations outside Copenhagen are accepted as they are, since the cars must actually have been there; they can be removed later in the vacancy dataset.

In [None]:
# A start latitude below 5 marks a missing start location
for i, car in df.loc[df.Start_Lat < 5, 'CarID'].items():
    sub_df = df[df.CarID == car].sort_values('RentalID')
    err_index = sub_df.index.get_loc(i)

    # The car's first trip has no predecessor; leaving it does not affect vacancy
    if err_index > 0:
        # Populate based on where the previous trip ended
        df.loc[i, ['Start_Lat', 'Start_Long']] = sub_df.iloc[err_index-1].loc[['End_Lat','End_Long']].values

In [None]:
# An end latitude below 5 marks a missing end location
for i, row in df[(df.End_Lat < 5)].iterrows():
    sub_df = df[df.CarID == row.CarID].sort_values('RentalID')
    err_index = sub_df.index.get_loc(i)

    # The car's last trip has no successor to borrow a start location from.
    # An explicit check replaces the bare `except:` that also hid unrelated errors.
    if err_index + 1 >= len(sub_df):
        continue

    # Populate based on where the next trip started
    df.loc[i, ['End_Lat', 'End_Long']] = sub_df.iloc[err_index+1].loc[['Start_Lat','Start_Long']].values
    

## Add zones

In [None]:
# Load shapefile and set projection
shapefile = gpd.read_file("../Zonekort/LTM_Zone3/zones_level3.shp")
shapefile = shapefile.to_crs(epsg=4326)

In [None]:
# Create a geoDF with geometry as starting point
gdf_start = gpd.GeoDataFrame(df, geometry= gpd.points_from_xy(df.Start_Long, df.Start_Lat))

# Set projection
gdf_start = gdf_start.set_crs(epsg=4326)

In [None]:
# Populate zones based on which zone they are within
# NOTE(review): newer geopandas releases rename `op=` to `predicate=` — confirm
# against the installed version before re-running.
gdpj_start  = gpd.sjoin(gdf_start, shapefile, op='within')
# Assignment aligns on the row index; rows missing from the join result get NaN
# and are filled in the next cell
df['Start_Zone'] = gdpj_start.zoneid

In [None]:
# Populate the rest based on which zone they are closest to.
# The point is read from gdf_start (built from the start coordinates) instead of
# df, so this does not depend on the GeoDataFrame constructor having added a
# 'geometry' column to df as a side effect.
missing_start_zone = df.index[df['Start_Zone'].isna()]
Start_zone_filler = {
    x: shapefile.zoneid[shapefile.distance(gdf_start.geometry.loc[x]).idxmin()]
    for x in missing_start_zone
}
df['Start_Zone'] = df['Start_Zone'].fillna(Start_zone_filler)

In [None]:
# Create a geoDF with geometry as end point
gdf_end = gpd.GeoDataFrame(df, geometry= gpd.points_from_xy(df.End_Long, df.End_Lat))

# Set projection
gdf_end = gdf_end.set_crs(epsg=4326)

In [None]:
# Populate zones based on which zone they are within
# NOTE(review): newer geopandas releases rename `op=` to `predicate=` — confirm
# against the installed version before re-running.
gdpj_end  = gpd.sjoin(gdf_end, shapefile, op='within')
# Assignment aligns on the row index; rows missing from the join result get NaN
# and are filled in the next cell
df['End_Zone'] = gdpj_end.zoneid

In [None]:
# Populate the rest based on which zone they are closest to.
# The point is read from gdf_end (built from the end coordinates) instead of df,
# so this does not depend on the GeoDataFrame constructor having added a
# 'geometry' column to df as a side effect.
missing_end_zone = df.index[df['End_Zone'].isna()]
End_zone_filler = {
    x: shapefile.zoneid[shapefile.distance(gdf_end.geometry.loc[x]).idxmin()]
    for x in missing_end_zone
}
df['End_Zone'] = df['End_Zone'].fillna(End_zone_filler)

In [None]:
# Remove the geometry column and store IDs and zones as integers
id_dtypes = {'CustomerID': 'int32', 'RentalID': 'int64', 'Start_Zone': 'int32', 'End_Zone': 'int32'}
df = df.drop(columns='geometry').astype(id_dtypes)

In [None]:
# Check types
df.info()

In [None]:
# Sweden and Bornholm
#df[df.Start_Long > 13].sort_values(by = 'Reservation_Time')

# Jutland
#df[(df.Start_Long < 11) & (df.Start_Long > 0) & (df.Customer_Group == 'Customer')]

# Car in Germany in the middle of the data..
#df[df.CarID == 'WBY1Z21040V308181'].sort_values(by = 'Reservation_Time').iloc[-30:-20]

In [10]:
# (scratch cell: the bare name `x` is not defined anywhere in this notebook and
# raised NameError on Restart & Run All, so the expression was removed)

## Fuel Status

In [114]:
# Per car: pairs of positions (in time order) of two directly consecutive trips
# that both start with fuel 0
CarID_dict = dict(iter(df.groupby('CarID')))
fuel_car0_dict = {cid: [] for cid in df.CarID.unique()}
for car, sub_df in CarID_dict.items():
    sub_df = sub_df.sort_values(by = 'Reservation_Time')

    zero_idx = np.where(sub_df.Fuel_Start == 0)[0]
    neighbours = zip(zero_idx[:-1], zero_idx[1:])
    fuel_car0_dict[car].extend(idxes for idxes in neighbours if idxes[1] - idxes[0] == 1)
    

In [4]:
oldfs = df.Fuel_Start
sum(oldfs == 0)

147696

In [6]:
# Split data on Car level
CarID_dict = dict(iter(df.groupby('CarID')))

def fuel_fixer(dataframe):
    """Fill missing start fuel levels from the previous trip of the same car.

    For every trip with Fuel_Start <= 0 that has a predecessor (in
    Reservation_Time order) whose Fuel_End is >= 0, Fuel_Start is set to that
    Fuel_End.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Trips of a single car with 'Reservation_Time', 'Fuel_Start' and
        'Fuel_End' columns.

    Returns
    -------
    pandas.DataFrame
        Copy sorted by 'Reservation_Time' with the repaired 'Fuel_Start'.
        The input is not modified.
    """
    dataframe = dataframe.sort_values(by = 'Reservation_Time')

    # Work only on the frame passed in. The earlier version read the notebook
    # globals `sub_df` and `df`, so it only worked inside the driver loop.
    previous_fuel_end = dataframe['Fuel_End'].shift(1)

    # The first trip has no predecessor: its shifted value is NaN and fails the >= 0 test
    needs_fix = ((dataframe['Fuel_Start'] <= 0) & (previous_fuel_end >= 0)).to_numpy()
    if needs_fix.any():
        fill_values = previous_fuel_end.to_numpy()[needs_fix].astype(dataframe['Fuel_Start'].dtype)
        dataframe.loc[needs_fix, 'Fuel_Start'] = fill_values

    # Return fixed dataframe
    return dataframe

# Apply the fix car by car and stitch the per-car results back together
dfs = [fuel_fixer(sub_df) for sub_df in tqdm.tqdm(CarID_dict.values())]

df = pd.concat(dfs, ignore_index=False).sort_values(by='Reservation_Time')

100%|██████████| 993/993 [00:33<00:00, 29.43it/s] 


In [8]:
newfs = df.Fuel_Start
sum(newfs == 0)

117903

In [19]:
df[df.Fuel_Start == 0]

Unnamed: 0,Customer_Group,CustomerID,CarID,Engine,Rental_flag,RentalID,Rental_Usage_Type,Reservation_Time,End_Time,Revenue,Distance,Reservation_Minutes,Fuel_Start,Fuel_End,Start_Lat,Start_Long,End_Lat,End_Long
1969957,Non_Customer,793661,WBY1Z21090V307866,I3,No,9326991963,Private,2015-08-30 16:22:48,2015-09-19 17:40:28,0.00,11,5,0,99,55.692449,12.618301,55.693900,12.544552
1591207,Non_Customer,793664,WBY1Z21010V307859,I3,No,9327079965,Private,2015-09-01 15:02:03,2015-09-01 15:02:04,0.00,0,7512,0,0,55.691913,12.619063,0.000000,0.000000
1591426,Non_Customer,793639,WBY1Z21080V307924,I3,No,9327142226,Private,2015-09-03 00:54:05,2015-09-03 16:15:05,0.00,0,1681,0,65,55.692111,12.618713,55.696055,12.570749
395380,Non_Customer,793639,WBY1Z21020V308275,I3,No,9330199975,Private,2015-11-10 20:03:34,2015-11-12 15:03:57,0.00,0,294,0,98,55.706438,12.518067,55.702786,12.491789
399656,Customer,824100,WBY1Z21050V308027,I3,No,9330468627,Private,2015-11-17 08:29:39,2015-11-17 08:30:39,0.13,0,1,0,0,55.696048,12.580039,55.696048,12.580039
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1440280,Customer,3067562,WBY8P2101K7D94144,I3 120,No,9431179088,Private,2019-12-24 17:17:31,2019-12-24 17:18:31,0.11,0,1,0,0,55.678388,12.579626,55.678388,12.579626
1449566,Non_Customer,793639,WBA1R5102K7D99249,118I,No,9434665052,Private,2019-12-29 07:24:35,2019-12-29 10:43:39,0.00,15,161,0,96,55.669981,12.391269,55.663517,12.503193
1451041,Customer,2661937,WBA1R5100J5K57960,118I,No,9434722739,Private,2019-12-29 18:54:38,2019-12-29 19:06:38,1.29,0,12,0,0,55.630506,12.648929,55.630506,12.648929
1451141,Customer,2056719,WBA1R5100J5K57960,118I,No,9434745604,Private,2019-12-29 19:48:23,2019-12-29 19:52:23,0.43,0,4,0,0,55.630506,12.648929,55.630506,12.648929


In [45]:
CarID_dict = dict(iter(df.groupby('CarID')))
fuel0frac = pd.Series({k:len(v[v.Fuel_Start == 0])/len(v) for k,v in CarID_dict.items() } )

In [48]:
fuel0frac.sort_values()

WBY8P2109K7E69947    0.000000
WBY8P2106K7D91112    0.000000
WBY8P2106K7D93202    0.000000
WBY8P2106K7D94138    0.000000
WBY8P2106K7D95239    0.000000
                       ...   
WBY1Z21060V308084    0.201395
WBY1Z21080V308233    0.206660
WBY1Z21060V308134    0.210974
WBY1Z210X0V307973    0.214286
WBY1Z210X0V307939    0.215866
Length: 993, dtype: float64

In [63]:
tmp = CarID_dict['WBY1Z210X0V307939']
[tmp.index.get_loc(x) for x in tmp[tmp.Fuel_Start == 0].index]

[949,
 950,
 951,
 952,
 953,
 955,
 956,
 957,
 958,
 959,
 960,
 961,
 962,
 964,
 967,
 968,
 969,
 970,
 971,
 974,
 975,
 976,
 977,
 978,
 983,
 984,
 985,
 986,
 987,
 988,
 989,
 990,
 993,
 994,
 995,
 996,
 997,
 998,
 999,
 1000,
 1006,
 1007,
 1008,
 1009,
 1010,
 1013,
 1016,
 1017,
 1018,
 1019,
 1020,
 1025,
 1026,
 1029,
 1030,
 1031,
 1033,
 1034,
 1035,
 1036,
 1037,
 1042,
 1043,
 1044,
 1045,
 1046,
 1047,
 1050,
 1051,
 1054,
 1055,
 1056,
 1057,
 1058,
 1059,
 1060,
 1061,
 1062,
 1063,
 1064,
 1065,
 1066,
 1069,
 1073,
 1074,
 1075,
 1078,
 1079,
 1080,
 1081,
 1083,
 1084,
 1085,
 1086,
 1087,
 1088,
 1089,
 1090,
 1091,
 1092,
 1093,
 1094,
 1095,
 1098,
 1099,
 1100,
 1101,
 1102,
 1104,
 1105,
 1106,
 1107,
 1108,
 1109,
 1110,
 1111,
 1112,
 1113,
 1114,
 1115,
 1116,
 1117,
 1118,
 1120,
 1121,
 1122,
 1123,
 1124,
 1125,
 1126,
 1127,
 1130,
 1131,
 1132,
 1133,
 1134,
 1137,
 1140,
 1141,
 1142,
 1143,
 1144,
 1145,
 1146,
 1147,
 1148,
 1149,
 1150,
 11

In [65]:
# 2017-03-07 22:22:54
tmp.iloc[945:1000]

Unnamed: 0,Customer_Group,CustomerID,CarID,Engine,Rental_flag,RentalID,Rental_Usage_Type,Reservation_Time,End_Time,Revenue,Distance,Reservation_Minutes,Fuel_Start,Fuel_End,Start_Lat,Start_Long,End_Lat,End_Long,TripDist
957936,Customer,796501,WBY1Z210X0V307939,I3,No,9352333779,Business,2017-02-28 15:47:00,2017-02-28 15:58:04,32.0,3,1,37,31,55.688088,12.597065,55.671014,12.575508,2330.450788
958197,Customer,811967,WBY1Z210X0V307939,I3,No,9352343775,Private,2017-02-28 18:50:28,2017-02-28 19:36:55,116.8,6,13,31,16,55.671014,12.575508,55.678855,12.590234,1269.934031
958288,Customer,2013076,WBY1Z210X0V307939,I3,No,9352347335,Private,2017-02-28 20:10:18,2017-02-28 20:51:01,99.2,11,11,16,1,55.678855,12.590234,55.632424,12.63408,5849.875104
958820,Non_Customer,793639,WBY1Z210X0V307939,I3,No,9352376792,Private,2017-02-28 20:51:09,2017-03-07 19:12:51,0.0,0,967,1,0,55.632424,12.63408,55.630171,12.648962,967.172148
965728,Customer,1050031,WBY1Z210X0V307939,I3,No,9352706652,Private,2017-03-07 22:22:54,2017-03-07 23:08:00,86.8,14,21,0,0,55.630171,12.648962,55.706993,12.566365,9989.977758
966158,Customer,2363769,WBY1Z210X0V307939,I3,No,9352732409,Private,2017-03-08 13:51:06,2017-03-08 14:04:10,13.98,2,7,0,0,55.706993,12.566365,55.693466,12.558338,1586.003175
966732,Customer,2363769,WBY1Z210X0V307939,I3,No,9352756394,Private,2017-03-08 21:56:23,2017-03-08 22:18:43,21.97,4,12,0,0,55.693466,12.558338,55.707134,12.56714,1616.794865
966803,Customer,1066571,WBY1Z210X0V307939,I3,No,9352762254,Private,2017-03-09 06:16:47,2017-03-09 06:34:07,43.2,14,4,0,0,55.707134,12.56714,55.732397,12.442631,8288.487053
967738,Customer,1038047,WBY1Z210X0V307939,I3,No,9352807264,Private,2017-03-09 21:46:47,2017-03-09 22:12:55,33.6,25,5,0,0,55.732397,12.442631,55.625284,12.576157,14558.120827
968717,Non_Customer,793639,WBY1Z210X0V307939,I3,No,9352864969,Private,2017-03-10 19:33:32,2017-03-11 08:34:50,0.0,5,98,15,0,55.625284,12.576157,55.618892,12.572289,751.103335


## Cut 2018-2019

In [302]:
# Trips reserved from 2018 onwards. An explicit copy makes df1819 an independent
# frame, so the column added below is not an assignment onto a slice of df.
df1819 = df[df.Reservation_Time >= pd.Timestamp("2018-01-01")].copy()

# Straight-line distance in metres between start and end of each trip.
# haversine works element-wise, so it is called once on whole columns instead
# of once per row through apply.
df1819['TripDist'] = haversine(
    [df1819['Start_Lat'], df1819['Start_Long']],
    [df1819['End_Lat'], df1819['End_Long']],
)

In [303]:
# Manuel -1 fixes. Remaining -1 are start so no prob there
df1819.loc[516417,'Fuel_End'], df1819.loc[516674,'Fuel_Start'] = 78,78
df1819.loc[1423849,'Fuel_End'], df1819.loc[1424064,'Fuel_Start'] = 92,92

In [304]:
# Repair isolated Fuel_Start == 0 readings: the zero and the previous trip's
# Fuel_End are both set to the floor-average of the surrounding known levels
CarID_dict = dict(iter(df1819.groupby('CarID')))
for sub_df in tqdm.tqdm(CarID_dict.values()):
    sub_df = sub_df.sort_values(by = 'Reservation_Time')
    zero_positions = set(np.flatnonzero(sub_df.Fuel_Start.to_numpy() == 0))

    # Ensure the zeros are lone, and skip a zero on the car's first trip: it has
    # no previous trip, and position -1 would wrap around to the car's LAST trip
    idx_fix_start = sorted(
        pos for pos in zero_positions
        if pos > 0 and pos - 1 not in zero_positions and pos + 1 not in zero_positions
    )

    for idx in idx_fix_start:
        # Get average of this trip's end level and the previous trip's start level
        replace_val = (sub_df['Fuel_End'].iloc[idx] + sub_df['Fuel_Start'].iloc[idx-1])//2

        # Replace
        df1819.loc[sub_df.index[idx], 'Fuel_Start'] = replace_val
        df1819.loc[sub_df.index[idx-1], 'Fuel_End'] = replace_val

100%|██████████| 927/927 [00:09<00:00, 99.96it/s]


In [305]:
# Drop trips that start with fuel 0 and either did not move (TripDist <= 0.1 m)
# or have an implausible distance (TripDist > 5000000 m)
zero_fuel_start = df1819.Fuel_Start == 0
implausible_distance = (df1819.TripDist <= 0.1) | (df1819.TripDist > 5000000)
df1819.drop(index = df1819[zero_fuel_start & implausible_distance].index, inplace = True)

# Manual fixes
df1819.drop(index = [221640, 223431, 224544, 227363, 263231], inplace = True)

# Row label -> corrected Fuel_Start
manual_fuel_start = {
    2108824: 48,
    241387: 74,
    241859: 69,
    1853849: 100,
    1614655: 100,
    # Car WBY1Z21020V307871
    2194695: 79,
    2195546: 73,
    2195720: 67,
    2195864: 54,
    2195905: 51,
    2195934: 48,
    2196040: 46,
    2196073: 44,
    # Car WBY8P2105K7D70350
    1810440: 100,
    1811631: 84,
    1812237: 73,
    1813020: 70,
    1814957: 68,
    1815464: 63,
    1818056: 61,
    1818503: 55,
    1818835: 54,
    1821416: 51,
    1822755: 50,
}
# Row label -> corrected Fuel_End
manual_fuel_end = {
    208967: 74,
    241387: 69,
    241859: 62,
}

for row_label, fuel in manual_fuel_start.items():
    df1819.loc[row_label, 'Fuel_Start'] = fuel
for row_label, fuel in manual_fuel_end.items():
    df1819.loc[row_label, 'Fuel_End'] = fuel

In [350]:
df1819[df1819.Fuel_Start == 0].CarID.value_counts().sort_values()

WBY1Z21060V307923    1
WMWXU7100KTM90318    1
WBY1Z21080V308183    1
WBY1Z210X0V308038    1
WBY1Z21020V307871    1
WBA1R5102J7B13379    1
WBA1R5101K5N19311    1
WBY8P2100K7D69154    1
WBY8P2105K7D69151    1
WBY8P2109K7D77205    1
Name: CarID, dtype: int64

In [345]:
tmp = df1819[df1819.CarID == 'WBY8P2104K7E14094']
idx = [tmp.index.get_loc(x) for x in tmp[tmp.Fuel_Start <= 0].index][0]
print(idx)
tmp.iloc[(idx-0):(idx+3)]
# WBA1R5107J7B13376, WBA1R5104J5K57959 er anderledes
# WBY8P2104K7E14094 er den første

0


Unnamed: 0,Customer_Group,CustomerID,CarID,Engine,Rental_flag,RentalID,Rental_Usage_Type,Reservation_Time,End_Time,Revenue,Distance,Reservation_Minutes,Fuel_Start,Fuel_End,Start_Lat,Start_Long,End_Lat,End_Long,TripDist
1614655,Non_Customer,793639,WBY8P2104K7E14094,I3 120,No,9424820837,Private,2019-10-03 10:03:05,2019-10-03 10:10:01,0.0,1,2,0,0,55.716348,12.567579,55.715927,12.575498,498.213637
1616646,Customer,2944285,WBY8P2104K7E14094,I3 120,No,9424872988,Private,2019-10-03 21:57:37,2019-10-03 22:28:55,0.0,7,12,100,97,55.715927,12.575498,55.672489,12.54966,5094.290254
1616781,Customer,3300835,WBY8P2104K7E14094,I3 120,No,9424875622,Private,2019-10-03 23:01:54,2019-10-03 23:23:39,51.2,5,7,97,95,55.672489,12.54966,55.680402,12.582048,2213.124656


## Start time

In [None]:
# Add start time: the reservation time plus the reserved minutes.
# Computed column-wise; building the column with iterrows() materialises a
# Series for every row of the frame.
df['Start_Time'] = df['Reservation_Time'] + pd.to_timedelta(df['Reservation_Minutes'], unit='m')

# Create Vacancy

In [None]:
# Haversine function (same definition as in the overlap section; repeated here
# so the vacancy section can be run on its own)
def haversine(point1, point2):
    """Return the great-circle distance in metres between two (lat, lon) points in degrees."""
    # Convert decimal degrees to radians
    (lat1, lon1), (lat2, lon2) = (
        [np.radians(value) for value in point]
        for point in (point1, point2)
    )

    # Haversine formula
    sin_half_dlat = np.sin((lat2 - lat1) / 2)
    sin_half_dlon = np.sin((lon2 - lon1) / 2)
    a = sin_half_dlat**2 + np.cos(lat1) * np.cos(lat2) * sin_half_dlon**2
    c = 2 * np.arcsin(np.sqrt(a))

    # Radius of earth in m
    return c * 6371000

In [None]:
df_sorted = df.sort_values("Reservation_Time")
df_sorted.CarID.nunique()

In [None]:
# One record per gap between two consecutive trips of the same car
data = []
for i, car in enumerate(df_sorted.CarID.unique()):
    # Rows whose CarID is '-' are skipped
    if car == '-':
        continue
    car_sub_df = df_sorted[df_sorted.CarID == car]
    if i % 50 == 0:
        print(f'{i} cars processed')

    # row1 is the trip that parks the car, row2 the trip that picks it up again
    consecutive_trips = zip(car_sub_df[:-1].iterrows(), car_sub_df[1:].iterrows())
    for (_, row1), (_, row2) in consecutive_trips:
        # Waiting times in hours, measured from the moment the car was parked
        time_to_reservation = (row2['Reservation_Time'] - row1['End_Time']).total_seconds() / 3600
        time_to_start = (row2['Start_Time'] - row1['End_Time']).total_seconds() / 3600

        # Distance in metres between the parking spot and the next pick-up spot
        moved = haversine(row1.loc[['End_Lat', 'End_Long']].values, row2.loc[['Start_Lat', 'Start_Long']].values)

        data.append([
            car,
            row1['End_Time'],           # park_time
            row2['Reservation_Time'],   # reservation_time
            row2['Start_Time'],         # start_time
            time_to_reservation,
            time_to_start,
            row1['End_Lat'],            # park_location_lat
            row1['End_Long'],           # park_location_long
            row1['End_Zone'],           # park_zone
            row1['Fuel_End'],           # park_fuel
            row2['Fuel_Start'],         # leave_fuel
            row1['Engine'],             # engine
            moved,
        ])

In [None]:
# Create new df with one named column per field collected above
vacancy_columns = [
    'car', 'park_time', 'reservation_time', 'start_time',
    'time_to_reservation', 'time_to_start',
    'park_location_lat', 'park_location_long', 'park_zone',
    'park_fuel', 'leave_fuel', 'engine', 'moved',
]
df_vacancy = pd.DataFrame(data=data, columns=vacancy_columns)

# Infer types
df_vacancy = df_vacancy.convert_dtypes()

# Save
df_vacancy.to_csv('data/processed/Vacancy_new.csv')

In [None]:
df_vacancy[df_vacancy.park_location_lat < 10]