In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import glob
import numpy as np
import networkx as nx
import tqdm
import datetime
import geopandas as gpd
import rtree

In [2]:
files = glob.glob("data/raw/SNData/*.csv")

dfs = []
for f in files:
    dfs.append(pd.read_csv(f, header=0, sep=";"))

Full_data = pd.concat(dfs,ignore_index=True) # Save this to interim
Full_data.to_csv('data/interim/Full_data.csv')

In [3]:
# Drop 53 rows with na values
df = Full_data.dropna()

# Rename Columns to English
df. columns = ['Customer_Group', 'CustomerID', 'CarID', 'Engine', 'Rental_flag', 'RentalID', 'Rental_Usage_Type', 'Reservation_Time', 'End_Time', 'Revenue', 'Distance', 'Drives', 'Reservation_Minutes','Fuel_Start','Fuel_End','Start_Lat', 'Start_Long', 'End_Lat', 'End_Long']

# Fix type
df = df.astype({'CustomerID': 'int32', 'RentalID': 'int64'})

# Drop drives as it has no info (only ones)
df.drop(columns = 'Drives', inplace=True)
df

Unnamed: 0,Customer_Group,CustomerID,CarID,Engine,Rental_flag,RentalID,Rental_Usage_Type,Reservation_Time,End_Time,Revenue,Distance,Reservation_Minutes,Fuel_Start,Fuel_End,Start_Lat,Start_Long,End_Lat,End_Long
0,Non_Customer,793639,WBY1Z21080V307924,I3,No,9335872135,Private,24.03.2016 11:48:43,02.04.2016 10:00:19,0.00,0,0,0,0,55.678763,12.552853,0.000000,0.000000
1,Non_Customer,1035973,WBY1Z21080V307857,I3,No,9336114126,Private,30.03.2016 15:37:39,01.04.2016 00:40:38,0.00,0,0,62,47,55.770626,12.519300,55.770389,12.518839
2,Non_Customer,998095,WBY1Z21020V307904,I3,No,9336153910,Private,31.03.2016 13:08:16,05.04.2016 08:32:25,0.00,2,1,85,79,55.621588,12.606951,55.621532,12.606279
3,Non_Customer,999604,WBY1Z21010V307926,I3,No,9336158303,Private,31.03.2016 14:43:00,01.04.2016 07:10:00,0.00,0,1,0,71,55.770077,12.518914,55.769746,12.519123
4,Non_Customer,1035969,WBY1Z21070V308210,,No,9336160465,Private,31.03.2016 15:21:36,01.04.2016 14:24:17,0.00,0,1,53,52,55.770623,12.519791,55.770439,12.518937
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2633979,Customer,1070662,WBY1Z21010V308185,I3,No,9345011102,Private,30.09.2016 23:39:05,30.09.2016 23:50:54,5.16,4,10,46,41,55.694700,12.553776,55.678740,12.587144
2633980,Customer,1041705,WBY1Z21080V308250,I3,No,9345011139,Private,30.09.2016 23:42:18,30.09.2016 23:52:14,3.44,6,8,59,52,55.648401,12.542945,55.641310,12.615295
2633981,Customer,2112471,WBY1Z21020V308261,I3,No,9345011311,Private,30.09.2016 23:33:39,30.09.2016 23:52:03,8.17,9,3,39,30,55.664744,12.580875,55.719856,12.540863
2633982,Customer,440147,WBY1Z21060V307954,I3,Yes,9345011420,Private,30.09.2016 23:41:56,30.09.2016 23:57:30,6.88,9,4,44,35,55.710676,12.566043,55.667453,12.619987


In [4]:
# Remove all rows with a CarID as it can not be used
df = df[df.CarID != '0']

# Remoce DK from CarID so the same car does not have two id's
df['CarID'] = df['CarID'].str.replace('DK','')

In [5]:
# Engine has two types of missing values that is alligned
df["Engine"].replace({" ": '0'}, inplace=True)

In [6]:
# If a CarID already has an engine type assign that to the missing ones
Engine_dict = {c: df[df.CarID == c].Engine.nunique() for c in df[df.Engine == '0'].CarID.unique()}
for car, engine in Engine_dict.items():
    if engine == 1:
        continue
    True_Engine = [x for x in df[df.CarID == car].Engine.unique() if x!= '0'][0]
    df.loc[(df.CarID == car) & (df.Engine == '0'), 'Engine'] = True_Engine

# Populate the rest manual based on ID
df.loc[(df.CarID == 'WBA1R5104J7B14310') & (df.Engine == '0'), 'Engine'] = '118I'
df.loc[(df.CarID == 'WBA1R5104J5K58061') & (df.Engine == '0'), 'Engine'] = '118I'
df.loc[(df.CarID == 'WBA1R5103K7D66678') & (df.Engine == '0'), 'Engine'] = '118I'
df.loc[(df.CarID == 'WBY8P2105K7D70350') & (df.Engine == '0'), 'Engine'] = 'I3 120'
df.loc[(df.CarID == 'WBY8P2102K7D70287') & (df.Engine == '0'), 'Engine'] = 'I3 120'

## Times

In [7]:
df['Reservation_Time'] = pd.to_datetime(df['Reservation_Time'], format="%d.%m.%Y %H:%M:%S")
df['End_Time'] = pd.to_datetime(df['End_Time'], format="%d.%m.%Y %H:%M:%S")

## Fix trips where same user use same car

In [8]:
# Split data on Car level
CarID_dict = dict(iter(df.groupby('CarID')))

def fix_merges(dataframe, max_time_diff = 60):
    dataframe = dataframe.sort_values(by = 'Reservation_Time')
    # Get index where same customer uses the same car back to back
    diff0_iloc = [dataframe.index.get_loc(x) for x in dataframe.index[(dataframe.CustomerID.diff() == 0).tolist()]]

    # Find paris to be merged
    merge_pairs = [(idx-1,idx) for idx in diff0_iloc if dataframe.iloc[idx-1].End_Time+pd.to_timedelta(max_time_diff+dataframe.iloc[idx].Reservation_Minutes,'m') > dataframe.iloc[idx].Reservation_Time]

    # Model as graph to get cc
    graph_model = nx.Graph(merge_pairs)
    groups = [(min(cc),max(cc)) for cc in list(nx.connected_components(graph_model))]

    # Populate 
    for pair in groups:
        dataframe.loc[dataframe.index[pair[0]],['End_Time', 'Fuel_End', 'End_Lat', 'End_Long']] = dataframe.loc[dataframe.index[pair[1]],['End_Time', 'Fuel_End', 'End_Lat', 'End_Long']]


    # Delete now unwanted rows
    rows_to_delete = [x[1] for x in merge_pairs]
    dataframe.drop(index = [dataframe.index[x] for x in rows_to_delete], inplace = True)

    # Return fixed dataframe
    return dataframe

# Merge new datasets
dfs = []
for sub_df in tqdm.tqdm(CarID_dict.values()):
    dfs.append(fix_merges(sub_df))

df = pd.concat(dfs,ignore_index=False).sort_values(by = 'RentalID')

100%|██████████| 993/993 [07:39<00:00,  2.16it/s]


## Fix wierd times

In [9]:
# Winter Time
WinterTimeIndex = df[(df.Reservation_Time > df.End_Time) & (df.End_Time.apply(lambda x: x.month) == 10) & (df.End_Time.apply(lambda x: x.hour) < 4)].index
WinterTimeIndexBack = [2179859, 1683947, 1683948]
WinterTimeIndexForward = [x for x in WinterTimeIndex if x not in WinterTimeIndexBack]
df.loc[WinterTimeIndexBack, 'Reservation_Time'] = df.loc[WinterTimeIndexBack, 'Reservation_Time'] - pd.to_timedelta(1,'h')
df.loc[WinterTimeIndexForward, 'End_Time'] = df.loc[WinterTimeIndexForward, 'End_Time'] + pd.to_timedelta(1,'h')

# Remove remaining 50 observations as they will not introduce more vacancy time
df.drop(index = df[df.Reservation_Time > df.End_Time].index, inplace = True)

## Merge Non-Customers

In [10]:
# Split data on Car level
CarID_dict = dict(iter(df.groupby('CarID')))

def merge_NC(dataframe):
    dataframe = dataframe.sort_values(by = 'Reservation_Time')
    # Get index where non_customer
    is_NC = dataframe.Customer_Group == 'Non_Customer'

    # Find paris to be merged
    merge_pairs = [(is_NC.index.get_loc(k1),is_NC.index.get_loc(k2)) for (k1, v1),(k2,v2) in zip(is_NC.iloc[:-1].iteritems(),is_NC.iloc[1:].iteritems()) if v1&v2]

    # Model as graph to get cc
    graph_model = nx.Graph(merge_pairs)
    groups = [(min(cc),max(cc)) for cc in list(nx.connected_components(graph_model))]

    # Populate 
    for pair in groups:
        dataframe.loc[dataframe.index[pair[0]],['End_Time', 'Fuel_End', 'End_Lat', 'End_Long']] = dataframe.loc[dataframe.index[pair[1]],['End_Time', 'Fuel_End', 'End_Lat', 'End_Long']]

    # Delete now unwanted rows
    rows_to_delete = [x[1] for x in merge_pairs]
    dataframe.drop(index = [dataframe.index[x] for x in rows_to_delete], inplace = True)

    # Return fixed dataframe
    return dataframe

# Merge new datasets
dfs = []
for sub_df in tqdm.tqdm(CarID_dict.values()):
    dfs.append(merge_NC(sub_df))

df = pd.concat(dfs,ignore_index=False).sort_values(by = 'Reservation_Time')

100%|██████████| 993/993 [00:43<00:00, 22.67it/s]


## Fix overlap

In [11]:
CarID_dict = dict(iter(df.groupby('CarID')))
tat = []
endtat0 = []
endtat1 = []
endtat2 = []
endtat3 = []

for car,dataf in CarID_dict.items():
    dataf = dataf.sort_values(by = 'Reservation_Time')
    tap = list( zip( dataf.iloc[np.where(dataf.Reservation_Time.iloc[1:].values<dataf.End_Time.iloc[:-1].values)[0]].Customer_Group.values, dataf.iloc[np.where(dataf.Reservation_Time.iloc[1:].values<dataf.End_Time.iloc[:-1].values)[0]+1].Customer_Group.values ) )
    tat.extend( tap )

    endtat0.extend( dataf.iloc[np.where(dataf.Reservation_Time.iloc[1:].values<dataf.End_Time.iloc[:-1].values)[0]].index )
    endtat1.extend( dataf.iloc[np.where(dataf.Reservation_Time.iloc[1:].values<dataf.End_Time.iloc[:-1].values)[0]].Customer_Group )
    endtat2.extend( dataf.iloc[np.where(dataf.Reservation_Time.iloc[1:].values<dataf.End_Time.iloc[:-1].values)[0]+1].Customer_Group )
    endtat3.extend( dataf.iloc[np.where(dataf.Reservation_Time.iloc[1:].values<dataf.End_Time.iloc[:-1].values)[0]].End_Lat )

overlap_df = pd.DataFrame(data=[endtat0,endtat1,endtat2,endtat3]).T

In [12]:
overlap_df

Unnamed: 0,0,1,2,3
0,1055312,Non_Customer,Customer,0.0
1,1167781,Customer,Non_Customer,55.781131
2,1656178,Customer,Non_Customer,55.634312
3,1052599,Non_Customer,Customer,0.0
4,1444470,Non_Customer,Customer,0.0
...,...,...,...,...
215,1813952,Non_Customer,Customer,0.0
216,1822676,Non_Customer,Customer,0.0
217,1304046,Non_Customer,Customer,0.0
218,1359243,Customer,Non_Customer,55.685669


In [15]:
# Fix those with bad end_loc
fix_idx0 = overlap_df[(overlap_df[1] == 'Customer') & (overlap_df[3] < 1)][0].values
df.loc[fix_idx0, 'End_Time'] = df.loc[fix_idx0, 'Reservation_Time'].values + pd.to_timedelta(1,'m')

In [19]:
# Haversine function
def haversine(point1, point2):
    # convert decimal degrees to radians
    lat1, lon1 = map(np.radians, point1)
    lat2, lon2 = map(np.radians, point2)

    # Deltas
    delta_lon = lon2 - lon1 
    delta_lat = lat2 - lat1 
    
    # haversine formula 
    a = np.sin(delta_lat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(delta_lon/2)**2
    c = 2 * np.arcsin(np.sqrt(a)) 
    r = 6371000 # Radius of earth in m
    return c * r

In [29]:
# Fix the other C-C to average of the two reservation times
fix_idxP = overlap_df[(overlap_df[1] == 'Customer') & (overlap_df[3] > 1)][0].values
    
for fix_idx in fix_idxP:
    # Get sub_df
    tmp_car_df = df[df.CarID == df.loc[fix_idx].CarID]
    
    # Get iloc in sub_df of to be fixed
    fix_iloc = tmp_car_df.index.get_loc(fix_idx)

    # Get end loc of curent and start of next
    end_loc = tmp_car_df.loc[fix_idx, ['End_Lat', 'End_Long']].values
    start_loc = tmp_car_df.loc[tmp_car_df.index[fix_iloc+1], ['Start_Lat', 'Start_Long']].values

    # If parked at same place adjust
    if haversine(end_loc, start_loc) < 100:
        avg_time = df.loc[fix_idx,'Reservation_Time'] + (df.loc[tmp_car_df.index[fix_iloc+1],'Reservation_Time'] - df.loc[fix_idx,'Reservation_Time']) / 2
        df.loc[fix_idx,'End_Time'] = avg_time

In [30]:
# Manual fixes/guestimates
df.loc[51903,'End_Time'] = pd.Timestamp("2016-11-03 20:00:00")
df.loc[661452,'End_Time'] = pd.Timestamp("2017-12-01 17:00:00")
df.loc[52806,'End_Time'] = pd.Timestamp("2016-11-05 08:00:10")
df.loc[2376045,'Reservation_Time'] = pd.Timestamp("2016-08-05 12:49:38")
df.loc[661513,'End_Time'] = pd.Timestamp("2017-12-02 16:16:24")
df.loc[784104,'End_Time'] = pd.Timestamp("2017-10-04 12:20:10")

df.drop(index = [22088, 25828, 809192, 664080, 1137264, 713741, 1604116, 2470015, 404202, 661521, 404308], inplace = True)

In [16]:
# Save interim
df.to_csv('data/interim/s_version.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2411071 entries, 1969578 to 1457863
Data columns (total 18 columns):
 #   Column               Dtype         
---  ------               -----         
 0   Customer_Group       object        
 1   CustomerID           int32         
 2   CarID                object        
 3   Engine               object        
 4   Rental_flag          object        
 5   RentalID             int64         
 6   Rental_Usage_Type    object        
 7   Reservation_Time     datetime64[ns]
 8   End_Time             datetime64[ns]
 9   Revenue              float64       
 10  Distance             int64         
 11  Reservation_Minutes  int64         
 12  Fuel_Start           int64         
 13  Fuel_End             int64         
 14  Start_Lat            float64       
 15  Start_Long           float64       
 16  End_Lat              float64       
 17  End_Long             float64       
dtypes: datetime64[ns](2), float64(5), int32(1), int64(5), ob

In [32]:
fix_idxCNC = overlap_df[(overlap_df[1]=='Customer') & (overlap_df[2]=='Non_Customer')][0].values
for fix_idx in fix_idxCNC:
    # Get sub_df
    tmp_car_df = df[df.CarID == df.loc[fix_idx].CarID]
    
    # Get iloc in sub_df of to be fixed
    fix_iloc = tmp_car_df.index.get_loc(fix_idx)

    # Replace values
    df.loc[fix_idx,['End_Time', 'Fuel_End', 'End_Lat', 'End_Long']] = df.loc[tmp_car_df.index[fix_iloc+1],['End_Time', 'Fuel_End', 'End_Lat', 'End_Long']]

    # Dtop the old NC row
    df.drop(tmp_car_df.index[fix_iloc+1], inplace = True)

#df.drop(index = 888104, inplace = True)

In [33]:
fix_idx_NC0 = overlap_df[(overlap_df[1] ==  'Non_Customer') & (overlap_df[3] < 1)][0].values
fix_idx_NC0

array([1055312, 1052599, 1444470, 995186, 2433923, 615309, 2540107,
       2397136, 1642807, 1619773, 2281113, 1211776, 1072166, 1279786,
       1279809, 982965, 440041, 2196102, 1250112, 1697536, 1716174,
       1783544, 1791055, 1475190, 442945, 449190, 586202, 600000, 960571,
       818752, 172717, 871418, 785515, 943031, 893511, 2272533, 2057969,
       2067461, 1136197, 1564453, 1280699, 850099, 785555, 2407154,
       854183, 986614, 2560234, 2160001, 987355, 1589134, 1578822,
       2367379, 986247, 438413, 579926, 625661, 2274667, 791672, 765940,
       248843, 2361115, 1138887, 654041, 2098815, 2132711, 975945,
       1138900, 2212041, 410530, 2139234, 2002473, 804400, 1773133,
       2367374, 1234799, 2012836, 1352766, 1061131, 2368344, 1565559,
       496290, 413857, 941872, 983015, 802628, 2220918, 672193, 2032504,
       966044, 632821, 763558, 2366647, 2128399, 868299, 1341035, 2407172,
       1229655, 662657, 1997594, 1259575, 2527189, 2533837, 1016940,
       1140421, 1

In [18]:
fix_idx_NC0 = overlap_df[(overlap_df[1] ==  'Non_Customer') & (overlap_df[3] < 1)][0].values
to_drop = []

for fix_idx in fix_idx_NC0:
    # Get sub_df
    try:
        tmp_car_df = df[df.CarID == df.loc[fix_idx].CarID].sort_values(by = 'Reservation_Time')
    except:
        continue
    
    # Get iloc in sub_df of to be fixed
    fix_iloc = tmp_car_df.index.get_loc(fix_idx)

    # Get the two start locs
    start_loc0 = tmp_car_df.loc[tmp_car_df.index[fix_iloc-1], ['Start_Lat', 'Start_Long']].values
    start_loc1 = tmp_car_df.loc[fix_idx, ['Start_Lat', 'Start_Long']].values
    start_loc2 = tmp_car_df.loc[tmp_car_df.index[fix_iloc+1], ['Start_Lat', 'Start_Long']].values

    # If left same spot then drop
    if haversine(start_loc0, start_loc1) < 100:
        to_drop.append(fix_idx)
    if haversine(start_loc1, start_loc2) < 100:
        to_drop.append(fix_idx)
        
df.drop(index = to_drop, inplace = True)

In [19]:
fix_idx_RM = [x for x in overlap_df[(overlap_df[1] == 'Non_Customer' ) & (overlap_df[3] < 1)][0].values if x in df.index]

for fix_idx in fix_idx_RM:
    df.loc[fix_idx, 'End_Time'] = df.loc[fix_idx,'Reservation_Time']+pd.to_timedelta(df.loc[fix_idx,'Reservation_Minutes'], 'm')

In [37]:
df[df.Reservation_Time >= pd.Timestamp("2018-01-01")].CarID.nunique()

927

## Cut out 2018 and 2019 and fix fuel

In [20]:
df1819 = df[df.Reservation_Time >= pd.Timestamp("2018-01-01")]
df1819['TripDist'] = df1819.apply(lambda x: haversine([x['Start_Lat'], x['Start_Long']], [x['End_Lat'], x['End_Long']]), axis = 1)

In [21]:
# Manuel -1 fixes. Remaining -1 are start so no prob there
df1819.loc[516417,'Fuel_End'], df1819.loc[516674,'Fuel_Start'] = 78,78
df1819.loc[1423849,'Fuel_End'], df1819.loc[1424064,'Fuel_Start'] = 92,92

In [22]:
# Fix single missing
CarID_dict = CarID_dict = dict(iter(df1819.groupby('CarID')))
for sub_df in tqdm.tqdm(CarID_dict.values()):
    sub_df = sub_df.sort_values(by = 'Reservation_Time')
    idx_fix_start = [sub_df.index.get_loc(x) for x in sub_df[sub_df.Fuel_Start == 0].index]

    if len(idx_fix_start) == 0:
        continue

    # Ensure the ones are lone
    idx_fix_start = [x for x in idx_fix_start if x-1 not in idx_fix_start and x+1 not in idx_fix_start]
    #idx_fix_end =[x-1 for x in idx_fix_start]

    for idx in idx_fix_start:
        # Get average
        replace_val = (sub_df.iloc[idx].Fuel_End+sub_df.iloc[idx-1].Fuel_Start)//2

        # Replace
        df1819.loc[sub_df.index[idx], 'Fuel_Start'] = replace_val
        df1819.loc[sub_df.index[idx-1], 'Fuel_End'] = replace_val

100%|██████████| 927/927 [02:01<00:00,  7.64it/s]


In [23]:
df1819.drop(index = df1819[(df1819.Fuel_Start <= 0) & ((df1819.TripDist <= 0.1) | ((df1819.TripDist > 5000000)))].index, inplace = True)
# Manual fixes
df1819.drop(index = [221640, 223431, 224544, 227363], inplace = True)
df1819.loc[2108824,'Fuel_Start'] = 48
df1819.loc[208967,'Fuel_End'] = 74
df1819.loc[241387,'Fuel_Start'] = 74
df1819.loc[241387,'Fuel_End'] = 69
df1819.loc[241859,'Fuel_Start'] = 69
df1819.loc[241859,'Fuel_End'] = 62
df1819.loc[1853849,'Fuel_Start'] = 100

df1819.loc[1614655,'Fuel_Start'] = 100

# Car WBY1Z21020V307871
df1819.loc[2194695, 'Fuel_Start'] = 79
df1819.loc[2195546, 'Fuel_Start'] = 73
df1819.loc[2195720, 'Fuel_Start'] = 67
df1819.loc[2195864, 'Fuel_Start'] = 54
df1819.loc[2195905, 'Fuel_Start'] = 51
df1819.loc[2195934, 'Fuel_Start'] = 48
df1819.loc[2196040, 'Fuel_Start'] = 46
df1819.loc[2196073, 'Fuel_Start'] = 44

# Car WBY8P2105K7D70350
df1819.loc[1810440, 'Fuel_Start'] = 100
df1819.loc[1811631, 'Fuel_Start'] = 84
df1819.loc[1812237, 'Fuel_Start'] = 73
df1819.loc[1813020, 'Fuel_Start'] = 70
df1819.loc[1814957, 'Fuel_Start'] = 68
df1819.loc[1815464, 'Fuel_Start'] = 63
df1819.loc[1818056, 'Fuel_Start'] = 61
df1819.loc[1818503, 'Fuel_Start'] = 55
df1819.loc[1818835, 'Fuel_Start'] = 54
df1819.loc[1821416, 'Fuel_Start'] = 51
df1819.loc[1822755, 'Fuel_Start'] = 50

df1819.loc[1993339, 'Fuel_Start'] = 15

In [29]:
print(len(df1819))
# Merge NC again due to some being dropped
# Split data on Car level
CarID_dict = dict(iter(df1819.groupby('CarID')))

def merge_NC(dataframe):
    dataframe = dataframe.sort_values(by = 'Reservation_Time')
    # Get index where non_customer
    is_NC = dataframe.Customer_Group == 'Non_Customer'

    # Find paris to be merged
    merge_pairs = [(is_NC.index.get_loc(k1),is_NC.index.get_loc(k2)) for (k1, v1),(k2,v2) in zip(is_NC.iloc[:-1].iteritems(),is_NC.iloc[1:].iteritems()) if v1&v2]

    # Model as graph to get cc
    graph_model = nx.Graph(merge_pairs)
    groups = [(min(cc),max(cc)) for cc in list(nx.connected_components(graph_model))]

    # Populate 
    for pair in groups:
        dataframe.loc[dataframe.index[pair[0]],['End_Time', 'Fuel_End', 'End_Lat', 'End_Long']] = dataframe.loc[dataframe.index[pair[1]],['End_Time', 'Fuel_End', 'End_Lat', 'End_Long']]

    # Delete now unwanted rows
    rows_to_delete = [x[1] for x in merge_pairs]
    dataframe.drop(index = [dataframe.index[x] for x in rows_to_delete], inplace = True)

    # Return fixed dataframe
    return dataframe

# Merge new datasets
dfs = []
for sub_df in CarID_dict.values():
    dfs.append(merge_NC(sub_df))

df1819 = pd.concat(dfs,ignore_index=False).sort_values(by = 'Reservation_Time')
print(len(df1819))

1702133
1702120


In [30]:
df1819.to_csv('data/interim/t_version.csv')

In [31]:
# Fix missing where start is available
CarID_dict = CarID_dict = dict(iter(df1819.groupby('CarID')))
for sub_df in tqdm.tqdm(CarID_dict.values()):
    sub_df = sub_df.sort_values(by = 'Reservation_Time')
    idx_fix_start = [sub_df.index.get_loc(x) for x in sub_df[sub_df.Fuel_Start <= 0].index if x > 0]

    if len(idx_fix_start) == 0:
        continue

    idx_fix_start = [x for x in idx_fix_start if sub_df.loc[sub_df.index[x-1],'Fuel_End']>0]

    for idx in idx_fix_start:
        df1819.loc[sub_df.index[idx], 'Fuel_Start'] = df1819.loc[sub_df.index[idx-1], 'Fuel_End']

100%|██████████| 927/927 [00:17<00:00, 53.69it/s] 


In [32]:
for car in df1819[df1819.Fuel_Start <= 0].CarID.value_counts().keys():
    sub_df = df1819[df1819.CarID == car]
    idx_fix = [sub_df.index.get_loc(x) for x in sub_df[sub_df.Fuel_Start <= 0].index][0]
    
    if df1819.loc[sub_df.index[idx_fix],'Start_Lat'] == df1819.loc[sub_df.index[idx_fix-1],'Start_Lat']:
        df1819.loc[sub_df.index[idx_fix],'Fuel_Start'] = df1819.loc[sub_df.index[idx_fix-1],'Fuel_Start']

        df1819.drop(index = sub_df.index[idx_fix-1], inplace=True)

In [33]:
# LAst manual fixes
df1819.loc[2096334, 'Fuel_Start'] = 88
df1819.loc[2013446, 'Fuel_Start'] = 100
df1819.loc[2544201, 'Fuel_Start'] = 15
df1819.loc[192176, 'Fuel_Start'] = 95
df1819.loc[2111646, 'Fuel_Start'] = 22
df1819.loc[2184762, 'Fuel_Start'] = 10
df1819.loc[2474381, 'Fuel_Start'] = 5
df1819.loc[2499013, 'Fuel_Start'] = 10
df1819.loc[1798819, 'Fuel_Start'] = 6
df1819.loc[518233, 'Fuel_Start'] = 97
df1819.loc[585544, 'Fuel_Start'] = 90

## Fix 0,0 locations

We also accept the other ones outside Copenhagen as the cars must have been there. They can be removed in the vacancy dataset

In [49]:
for i, row in df1819[(df1819.Start_Lat < 5)].iterrows():
    # Skip if first instance as it will unaffect vacancy
    sub_df = df1819[df1819.CarID == row.CarID].sort_values('RentalID')
    err_index = sub_df.index.get_loc(i)
    if err_index == 0:
        continue

    # Populate based on previous end 
    df1819.loc[i, ['Start_Lat', 'Start_Long']] = sub_df.iloc[err_index-1].loc[['End_Lat','End_Long']].values

In [50]:
for i, row in df1819[(df1819.End_Lat < 5)].iterrows():
    sub_df = df1819[df1819.CarID == row.CarID].sort_values('RentalID')
    err_index = sub_df.index.get_loc(i)

    # Will fail if last index
    try:
        df1819.loc[i, ['End_Lat', 'End_Long']] = sub_df.iloc[err_index+1].loc[['Start_Lat','Start_Long']].values
    except:
        continue
    

## Add zones

In [51]:
# Load shapefile and set projection
shapefile = gpd.read_file("../Zonekort/LTM_Zone3/zones_level3.shp")
shapefile = shapefile.to_crs(epsg=4326)

In [53]:
# Create a geoDF with geometry as starting point
gdf_start = gpd.GeoDataFrame(df1819, geometry= gpd.points_from_xy(df1819.Start_Long, df1819.Start_Lat))

# Set projection
gdf_start = gdf_start.set_crs(epsg=4326)

In [54]:
# Populate zones based on which zone they are within
gdpj_start  = gpd.sjoin(gdf_start, shapefile, op='within')
df1819['Start_Zone'] = gdpj_start.zoneid

In [55]:
# Populate the rest based on which zone they are closest too
Start_zone_filler = {x: shapefile.zoneid[shapefile.distance(df1819.loc[x].geometry).sort_values().index[0]] for x in df1819.index[df1819['Start_Zone'].isna()]}
df1819['Start_Zone'] = df1819['Start_Zone'].fillna(Start_zone_filler)


  Start_zone_filler = {x: shapefile.zoneid[shapefile.distance(df1819.loc[x].geometry).sort_values().index[0]] for x in df1819.index[df1819['Start_Zone'].isna()]}


In [56]:
# Create a geoDF with geometry as end point
gdf_end = gpd.GeoDataFrame(df1819, geometry= gpd.points_from_xy(df1819.End_Long, df1819.End_Lat))

# Set projection
gdf_end = gdf_end.set_crs(epsg=4326)

In [57]:
# Populate zones based on which zone they are within
gdpj_end  = gpd.sjoin(gdf_end, shapefile, op='within')
df1819['End_Zone'] = gdpj_end.zoneid

In [59]:
# Populate the rest based on which zone they are closest too
End_zone_filler = {x: shapefile.zoneid[shapefile.distance(df1819.loc[x].geometry).sort_values().index[0]] for x in df1819.index[df1819['End_Zone'].isna()]}
df1819['End_Zone'] = df1819['End_Zone'].fillna(End_zone_filler)


  End_zone_filler = {x: shapefile.zoneid[shapefile.distance(df1819.loc[x].geometry).sort_values().index[0]] for x in df1819.index[df1819['End_Zone'].isna()]}


In [60]:
# Remove geomery type and make IDs int columns
df1819.drop(columns = 'geometry', inplace = True)
df1819 = df1819.astype({'CustomerID': 'int32', 'RentalID': 'int64', 'Start_Zone': 'int32','End_Zone': 'int32'})

In [61]:
# Check types
df1819.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1702109 entries, 2062415 to 1457863
Data columns (total 21 columns):
 #   Column               Dtype         
---  ------               -----         
 0   Customer_Group       object        
 1   CustomerID           int32         
 2   CarID                object        
 3   Engine               object        
 4   Rental_flag          object        
 5   RentalID             int64         
 6   Rental_Usage_Type    object        
 7   Reservation_Time     datetime64[ns]
 8   End_Time             datetime64[ns]
 9   Revenue              float64       
 10  Distance             int64         
 11  Reservation_Minutes  int64         
 12  Fuel_Start           int64         
 13  Fuel_End             int64         
 14  Start_Lat            float64       
 15  Start_Long           float64       
 16  End_Lat              float64       
 17  End_Long             float64       
 18  TripDist             float64       
 19  Start_Zone     

In [62]:
# Sweden and Bornholm
#df[df.Start_Long > 13].sort_values(by = 'Reservation_Time')

# Jutland
#df[(df.Start_Long < 11) & (df.Start_Long > 0) & (df.Customer_Group == 'Customer')]

# Car in Germany in the middle of the data..
#df[df.CarID == 'WBY1Z21040V308181'].sort_values(by = 'Reservation_Time').iloc[-30:-20]

## Start time

In [63]:
# Add start time based on Reservation minutes
df['Start_Time'] = [row.Reservation_Time+datetime.timedelta(minutes=row.Reservation_Minutes) for _, row in df.iterrows()]

# Create Vacancy

In [64]:
# Haversine function
def haversine(point1, point2):
    # convert decimal degrees to radians
    lat1, lon1 = map(np.radians, point1)
    lat2, lon2 = map(np.radians, point2)

    # Deltas
    delta_lon = lon2 - lon1 
    delta_lat = lat2 - lat1 
    
    # haversine formula 
    a = np.sin(delta_lat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(delta_lon/2)**2
    c = 2 * np.arcsin(np.sqrt(a)) 
    r = 6371000 # Radius of earth in m
    return c * r

In [65]:
df_sorted = df1819.sort_values("Reservation_Time")
df_sorted.CarID.nunique()

927

In [71]:
df_sorted.to_csv('data/processed/Full_data_set.csv')

In [69]:
CarID_dict = CarID_dict = dict(iter(df1819.groupby('CarID')))

data = []
for car, sub_df in tqdm.tqdm(CarID_dict.items()):
    for (_, row1), (_, row2) in zip(sub_df[:-1].iterrows(),sub_df[1:].iterrows()):
        park_time = row1['End_Time']
        reservation_time = row2['Reservation_Time']
        #start_time = row2['Start_Time']
        time_to_reservation = (row2['Reservation_Time']-row1['End_Time']).total_seconds()/3600
        #time_to_start = (row2['Start_Time']-row1['End_Time']).total_seconds()/3600
        park_location_lat = row1['End_Lat']
        park_location_long = row1['End_Long']
        leave_location_lat = row2['Start_Lat']
        leave_location_long = row2['Start_Long']
        park_zone = row1['End_Zone']
        leave_zone = row2['Start_Zone']
        park_fuel = row1['Fuel_End']
        leave_fuel = row2['Fuel_Start']
        engine = row1['Engine']
        moved = haversine(row1.loc[['End_Lat','End_Long']].values, row2.loc[['Start_Lat','Start_Long']].values) 
        prev_customer = row1['Customer_Group']
        next_customer = row2['Customer_Group']
        data.append([car, park_time,reservation_time, time_to_reservation, park_location_lat, park_location_long, leave_location_lat, leave_location_long, park_zone, leave_zone, park_fuel, leave_fuel, engine, moved, prev_customer, next_customer])

100%|██████████| 927/927 [21:40<00:00,  1.40s/it]


In [70]:
# Create new df
df_vacancy = pd.DataFrame(data = data, columns = ['car', 'park_time', 'reservation_time', 'time_to_reservation', 'park_location_lat', 'park_location_long', 'leave_location_lat', 'leave_location_long', 'park_zone', 'leave_zone', 'park_fuel', 'leave_fuel', 'engine', 'moved', 'prev_customer', 'next_customer'])

# Infer types
df_vacancy = df_vacancy.convert_dtypes()

# Save
df_vacancy.to_csv('data/processed/Vacancy_new.csv')

In [None]:
df_vacancy[df_vacancy.park_location_lat < 10]