- What share of our owners’ revenue would potentially be affected by the feature?
- How many rentals would be affected by the feature depending on the threshold and scope we choose?
- How often are drivers late for the next check-in? How does it impact the next driver?
- How many problematic cases will it solve depending on the chosen threshold and scope?

In [64]:
import pandas as pd
import plotly.express as px

# Load the rental records from the Excel workbook into a DataFrame.
dataset = pd.read_excel("get_around_delay_analysis.xlsx")
# Summary statistics for every column (numeric and non-numeric), one row per column.
dataset.describe(include="all").T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
rental_id,21310.0,,,,549712.880338,13863.446964,504806.0,540613.25,550350.0,560468.5,576401.0
car_id,21310.0,,,,350030.603426,58206.249765,159250.0,317639.0,368717.0,394928.0,417675.0
checkin_type,21310.0,2.0,mobile,17003.0,,,,,,,
state,21310.0,2.0,ended,18045.0,,,,,,,
delay_at_checkout_in_minutes,16346.0,,,,59.701517,1002.561635,-22433.0,-36.0,9.0,67.0,71084.0
previous_ended_rental_id,1841.0,,,,550127.411733,13184.023111,505628.0,540896.0,550567.0,560823.0,575053.0
time_delta_with_previous_rental_in_minutes,1841.0,,,,279.28843,254.594486,0.0,60.0,180.0,540.0,720.0


In [119]:
# Keep every rental whose state is not "canceled". The explicit .copy() makes this an
# independent frame, so adding a column below does not raise SettingWithCopyWarning.
df_no_canceled = dataset.loc[dataset['state'] != "canceled"].copy()
# Lateness in minutes: negative checkout delays are floored to 0, missing delays stay NaN.
df_no_canceled['late_by'] = df_no_canceled['delay_at_checkout_in_minutes'].clip(lower=0)

# Frequency table with one row per distinct lateness value. The columns are named
# explicitly so the 'late_by' / 'count' layout does not depend on the pandas version.
latencies = (
    df_no_canceled['late_by']
    .value_counts()
    .rename_axis('late_by')
    .reset_index(name='count')
)

# Number of non-canceled rentals with a known checkout delay (NaN excluded);
# used as the denominator for every percentage below.
nb_with_known_delay = df_no_canceled['late_by'].count()

# One row per threshold of 0..5 hours: how many rentals were late by strictly more
# than that threshold, as an absolute count and as a percentage.
overtime_rows = []
for delay in range(6):
    filtered_latencies = latencies.loc[latencies["late_by"] > (60 * delay)]
    nb_late = int(filtered_latencies['count'].sum())
    overtime_rows.append({
        "nb_late": nb_late,
        "percent_late": round(100 * nb_late / nb_with_known_delay, 1),
    })
# Built from records so both columns get numeric dtypes; the index is 0..5 (hours).
overtime = pd.DataFrame(overtime_rows, columns=["nb_late", "percent_late"])

fig = px.area(overtime, y="nb_late")
fig.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [129]:
# Area chart of the `percent_late` column of `overtime`, plotted against its index.
fig = px.area(data_frame=overtime, y="percent_late")
fig.show()

In [127]:
# Rows of df_no_canceled that have a value in time_delta_with_previous_rental_in_minutes.
# The explicit .copy() avoids SettingWithCopyWarning when 'real_delta' is added below.
df_no_canceled_short_delay = df_no_canceled.loc[
    df_no_canceled['time_delta_with_previous_rental_in_minutes'].notna()
].copy()

# real_delta = time delta with the previous rental minus late_by; a negative value means
# the lateness exceeds the time delta.
# NOTE(review): late_by here is this row's own checkout delay. If the goal is to measure
# how late the *previous* rental was returned, late_by should presumably be looked up on
# the row whose rental_id equals previous_ended_rental_id. Confirm the intent.
df_no_canceled_short_delay['real_delta'] = (
    df_no_canceled_short_delay['time_delta_with_previous_rental_in_minutes']
    - df_no_canceled_short_delay['late_by']
)

# Frequency table with one row per distinct real_delta value. The columns are named
# explicitly so the 'real_delta' / 'count' layout does not depend on the pandas version.
real_latencies = (
    df_no_canceled_short_delay['real_delta']
    .value_counts()
    .rename_axis('real_delta')
    .reset_index(name='count')
)

# Number of rows with a known real_delta (NaN excluded); denominator for all percentages.
nb_with_real_delta = df_no_canceled_short_delay['real_delta'].count()

# Rows whose real_delta lies strictly between -180 and 0 minutes.
real_filtered_latencies = real_latencies.loc[
    (real_latencies["real_delta"] < 0) & (real_latencies["real_delta"] > -180)
]
nb_in_window = real_filtered_latencies['count'].sum()
print(nb_in_window, f"({round(100* nb_in_window / nb_with_real_delta, 2)} %)")

# For each threshold of 0..5 hours, count rows whose real_delta is below minus that
# threshold and store the count and percentage as new columns of `overtime`.
# `real_filtered_latencies` is rebound on every iteration, so after the loop it holds
# the subset for the last threshold (real_delta < -300).
for delay in range(6):
    real_filtered_latencies = real_latencies.loc[real_latencies["real_delta"] < (-60 * delay)]
    nb_real_late = real_filtered_latencies['count'].sum()
    overtime.loc[delay, "nb_real_late"] = nb_real_late
    overtime.loc[delay, "percent_real_late"] = round(100 * nb_real_late / nb_with_real_delta, 1)

fig = px.area(overtime, y="nb_real_late")
fig.show()


201 (13.27 %)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [128]:
# Area chart of the `percent_real_late` column of `overtime`, plotted against its index.
fig = px.area(data_frame=overtime, y="percent_real_late")
fig.show()

In [110]:
# Horizontal histogram (20 bins requested) of the real_delta values in real_filtered_latencies.
# NOTE(review): real_filtered_latencies looks like a frequency table (one row per distinct
# value), so each value is binned once regardless of its 'count', and its contents depend
# on which filter was applied last. Confirm this is the intended input.
px.histogram(data_frame=real_filtered_latencies, y="real_delta", nbins=20)

In [111]:
# Horizontal histogram (20 bins requested) of the time delta with the previous rental.
px.histogram(
    data_frame=df_no_canceled_short_delay,
    y="time_delta_with_previous_rental_in_minutes",
    nbins=20,
)

In [131]:
# Keep the rows whose time delta with the previous rental is below 300 minutes.
gap_in_minutes = df_no_canceled_short_delay['time_delta_with_previous_rental_in_minutes']
df_no_canceled_hours_delay = df_no_canceled_short_delay.loc[gap_in_minutes < 300]

# Number of those rows with a known real_delta, and their share (in %) of all
# non-canceled rentals that have a known late_by.
nb_short_gap = df_no_canceled_hours_delay['real_delta'].count()
nb_reference = df_no_canceled['late_by'].count()
print(nb_short_gap, f"({round(100* nb_short_gap / nb_reference, 2)} %)")

# Horizontal histogram (6 bins requested) of the retained time deltas.
px.histogram(
    data_frame=df_no_canceled_hours_delay,
    y="time_delta_with_previous_rental_in_minutes",
    nbins=6,
)

921 (5.63 %)


In [5]:
# Frequency of each value in the 'state' and 'checkin_type' columns, as a 2-tuple of Series.
tuple(dataset[column].value_counts() for column in ("state", "checkin_type"))

(state
 ended       18045
 canceled     3265
 Name: count, dtype: int64,
 checkin_type
 mobile     17003
 connect     4307
 Name: count, dtype: int64)