# Import libraries

In [385]:
import pandas as pd
import numpy as np

import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px

# Import data

In [386]:
# Load the rental records from the Excel workbook shipped with the project
DATA_PATH = 'src/get_around_delay_analysis.xlsx'
df = pd.read_excel(DATA_PATH)

# Basic Statistics

In [387]:
# Summary statistics for every column (numeric and non-numeric alike)
summary_stats = df.describe(include='all')
display(summary_stats)

# Share of missing values per column, expressed as a percentage of all rows
missing_pct = df.isnull().sum()/len(df)*100
print("Missing value percentage :\n")
print(missing_pct)

Unnamed: 0,rental_id,car_id,checkin_type,state,delay_at_checkout_in_minutes,previous_ended_rental_id,time_delta_with_previous_rental_in_minutes
count,21310.0,21310.0,21310,21310,16346.0,1841.0,1841.0
unique,,,2,2,,,
top,,,mobile,ended,,,
freq,,,17003,18045,,,
mean,549712.880338,350030.603426,,,59.701517,550127.411733,279.28843
std,13863.446964,58206.249765,,,1002.561635,13184.023111,254.594486
min,504806.0,159250.0,,,-22433.0,505628.0,0.0
25%,540613.25,317639.0,,,-36.0,540896.0,60.0
50%,550350.0,368717.0,,,9.0,550567.0,180.0
75%,560468.5,394928.0,,,67.0,560823.0,540.0


Missing value percentage :

rental_id                                      0.000000
car_id                                         0.000000
checkin_type                                   0.000000
state                                          0.000000
delay_at_checkout_in_minutes                  23.294228
previous_ended_rental_id                      91.360863
time_delta_with_previous_rental_in_minutes    91.360863
dtype: float64


# EDA

Feature description :

- **rental_id**: Unique identifier for each rental.
- **car_id**: Identifier for the car being rented.
- **checkin_type**: Either "mobile" or "connect", referring to how the check-in process was handled.
- **state**: The state of the rental (e.g., "ended", "canceled").
- **delay_at_checkout_in_minutes**: The delay in minutes at checkout (how late the car was returned).
- **previous_ended_rental_id**: The rental ID of the previous rental for that car.
- **time_delta_with_previous_rental_in_minutes**: The time difference between the end of the previous rental and the start of the current one (in minutes).

## Rate of late users

In [388]:
# A row counts as late when its checkout delay is strictly positive.
# Rows with a missing delay compare as False, so they are not counted as late,
# yet they remain in the denominator because it is the full row count.
is_late = df['delay_at_checkout_in_minutes'] > 0
late_users = int(is_late.sum())
total_users = df.shape[0]
late_user_ratio = late_users / total_users * 100

for summary_line in (
    f"late users: {late_users}",
    f"total users: {total_users}",
    f"Users ratio late for checkout : {late_user_ratio:0.1f} %",
):
    print(summary_line)


late users: 9404
total users: 21310
Users ratio late for checkout : 44.1 %


Out of 21,310 users, **9,404** were **late**.
<br>This represents **44.1%** of all users.

## Delay at checkout

In [389]:
# Descriptive statistics of the checkout delay column (missing values are excluded by describe())
delay_summary = df['delay_at_checkout_in_minutes'].describe()

print("Delay at checkout informations")
print(delay_summary)

Delay at checkout informations
count    16346.000000
mean        59.701517
std       1002.561635
min     -22433.000000
25%        -36.000000
50%          9.000000
75%         67.000000
max      71084.000000
Name: delay_at_checkout_in_minutes, dtype: float64


Out of the dataset, there are 16,346 rentals with a valid delay time (not NaN).
<br>But some of them are **negative**, so we can assume that those values correspond to people who **returned their vehicles earlier** than expected.

The average delay is about **60** minutes, but this statistic is **skewed** by **extreme values** (with a minimum delay of -22,433 minutes and a maximum of 71,084 minutes).

The median delay is only **9** minutes, meaning that half of the rentals with a recorded delay were returned at most 9 minutes late (or early).

## Check-in Type and State

In [390]:
# Number of rentals for each (check-in type, state) pair.
# Computed once and stored, then printed from the stored series instead of
# running the same groupby a second time.
checkin_type_and_state = df.groupby(['checkin_type', 'state']).size()

print("Count of rentals based on checkin type and state")
print(checkin_type_and_state)

Count of rentals based on checkin type and state
checkin_type  state   
connect       canceled      798
              ended        3509
mobile        canceled     2467
              ended       14536
dtype: int64


In [391]:
# Side-by-side bar charts of rental state counts, one panel per check-in type.
fig = make_subplots(rows=1, cols=2, shared_yaxes=True, subplot_titles=('Connect users',  'Mobile users'))

# The x labels are taken from each series' own index rather than from
# df['state'].unique(): the groupby result is ordered by sorted state, whereas
# unique() follows order of first appearance in the data, so the two can
# disagree and attach a count to the wrong state label.
for panel_col, (checkin_type, trace_name) in enumerate([('connect', 'Connect'), ('mobile', 'Mobile')], start=1):
    state_counts = checkin_type_and_state[checkin_type]
    fig.add_trace(go.Bar(x=state_counts.index,
                         y=state_counts,
                         name=trace_name,
                         text=state_counts,
                         textposition='auto',
                        ),
                  row=1,
                  col=panel_col)

fig.update_layout(height=800,
                  width=800,
                  title_text='Check-in type and state distribution',
                )
fig.show()

In [392]:
# Rows whose check-in was handled through Connect
is_connect = df['checkin_type'] == 'connect'
connect_users = df.loc[is_connect]

# Share of Connect rows among all rows; total_users is defined in an earlier cell
connect_users_ratio = len(connect_users)/total_users*100
print(f"Connect users represents {connect_users_ratio:0.1f} % of the total users")

Connect users represents 20.2 % of the total users


Connect users represent only 20% of the total, and are therefore in the minority.

In [393]:
# Late rows (strictly positive checkout delay) whose check-in went through Connect
late_and_connect = (df['delay_at_checkout_in_minutes'] > 0) & (df['checkin_type'] == 'connect')
late_connect_users = df.loc[late_and_connect]

# Share of Connect rows among all late rows; late_users is defined in an earlier cell.
# The bare `late_connect_users_ratio` expression that used to sit here was a no-op
# (only a cell's last expression is displayed), so it has been removed.
late_connect_users_ratio = len(late_connect_users)/late_users*100

print(f"late Connect users ratio: {late_connect_users_ratio:0.1f} %")

late Connect users ratio: 15.5 %


**Connect users** represent only **15.5%** of late users.
<br>Applying the feature to only a portion of users could be an interesting way to reduce the impact on owners' shares.

In [395]:
# Analyzing checkin types associated with multi-rental cars

def _has_several_rentals(car_rentals):
    """Return True when the group of rows for one car holds more than one rental."""
    return len(car_rentals) > 1

# Keep only the rows belonging to cars that appear more than once in the data
rentals_by_car = df.groupby('car_id')
multi_rental_cars = rentals_by_car.filter(_has_several_rentals)

# Count how many of those rows fall under each check-in type
multi_rental_checkin_type_distribution = multi_rental_cars['checkin_type'].value_counts()

multi_rental_checkin_type_distribution


checkin_type
mobile     13418
connect     4233
Name: count, dtype: int64

## Time Delta Between Rentals:

1,841 rentals have a record of time differences with previous rentals.
<br>The average time delta between the end of one rental and the start of the next is about 279 minutes (~4.6 hours).
<br>The median time delta is 180 minutes (~3 hours), indicating a common buffer between rentals.

## Impact of a threshold buffer time on the number of affected rentals based on the type of check-in ("connect" or "mobile")

In [473]:
# Candidate buffer thresholds, in minutes: 0 to 720 inclusive, in 30-minute steps
thresholds = list(range(0, 750, 30))

In [474]:
# Columns used for every threshold, looked up once
time_delta = df['time_delta_with_previous_rental_in_minutes']
checkin_type = df['checkin_type']
is_mobile = checkin_type == 'mobile'
is_connect = checkin_type == 'connect'

results = []

for threshold in thresholds:
    # A rental is affected when it has a recorded gap with the previous rental
    # and that gap is strictly shorter than the threshold
    is_affected = time_delta.notna() & (time_delta < threshold)

    # One record per threshold: overall count, then the split by check-in type
    results.append({
        'Threshold (minutes)': threshold,
        'Total Affected Rentals': int(is_affected.sum()),
        'Mobile Affected Rentals': int((is_affected & is_mobile).sum()),
        'Connect Affected Rentals': int((is_affected & is_connect).sum())
    })

df_results = pd.DataFrame(results)

In [475]:
# Line trace: total number of affected rentals for each threshold
line_trace = go.Scatter(
    x=thresholds,
    y=df_results['Total Affected Rentals'],
    name='Total Affected Rentals',
    mode='lines+markers',
    line={'color': 'blue', 'width': 2},
)

# Bar traces: the same count split by check-in type, drawn with a common bar width
bar_width = 12
bar_mobile = go.Bar(
    x=thresholds,
    y=df_results['Mobile Affected Rentals'],
    name='Mobile Affected Rentals',
    marker_color='orange',
    width=bar_width,
)
bar_connect = go.Bar(
    x=thresholds,
    y=df_results['Connect Affected Rentals'],
    name='Connect Affected Rentals',
    marker_color='green',
    width=bar_width,
)

# Assemble the figure with the line first, then the two bar series
fig = go.Figure(data=[line_trace, bar_mobile, bar_connect])

# Grouped bars, axis titles, and a horizontal legend
fig.update_layout(
    title='Threshold impact on rentals',
    xaxis_title='Threshold (minutes)',
    yaxis_title='Affected Rentals',
    barmode='group',
    legend={'x': 0.1, 'y': 1.1, 'orientation': 'h'},
    width=900,
    height=600,
)

# Render the figure
fig.show()


The analysis shows how many rentals are impacted based on different buffer thresholds:

- At **30** minutes, **279** **rentals** would be affected, with **148** **mobile** rentals and **131** **connect** rentals.
- At **60** minutes, **401** **rentals** would be impacted, with **220** **mobile** and **181** **connect** rentals.
- At **180** minutes, **870** **rentals** would be impacted, with **498** **mobile** and **372** **connect** rentals.

This plot can help guide decisions on setting the buffer threshold and whether it should apply to all cars or only "connect" cars.
​