# Libraries

In [1]:
import pandas as pd 

import plotly.express as px 
from plotly.subplots import make_subplots
import plotly.graph_objects as go

Dashboard Link: https://fmendes13-getaround-dashboard.hf.space

In [4]:
# Pour le deploiement sur hugging face - besoin de transformer la base en CSV
# df=pd.read_excel('DATA ANALYSIS get_around_delay_analysis.xlsx')
#df.to_csv("getaround_data.csv", index=False)

# Data Analysis

## Source Check

In [2]:
get_around_data = pd.read_excel('DATA ANALYSIS get_around_delay_analysis.xlsx')
get_around_data.head()

Unnamed: 0,rental_id,car_id,checkin_type,state,delay_at_checkout_in_minutes,previous_ended_rental_id,time_delta_with_previous_rental_in_minutes
0,505000,363965,mobile,canceled,,,
1,507750,269550,mobile,ended,-81.0,,
2,508131,359049,connect,ended,70.0,,
3,508865,299063,connect,canceled,,,
4,511440,313932,mobile,ended,,,


In [3]:
get_around_data.tail()

Unnamed: 0,rental_id,car_id,checkin_type,state,delay_at_checkout_in_minutes,previous_ended_rental_id,time_delta_with_previous_rental_in_minutes
21305,573446,380069,mobile,ended,,573429.0,300.0
21306,573790,341965,mobile,ended,-337.0,,
21307,573791,364890,mobile,ended,144.0,,
21308,574852,362531,connect,ended,-76.0,,
21309,575056,351549,connect,ended,35.0,,


In [4]:
print('Dataset shape:')
display(get_around_data.shape)
print('\n')

print('Basics statistics:')
display(get_around_data.info())
print('\n')
display(get_around_data.describe(include='all'))
print('\n')

print('Percentage of missing values:')
display(100 * get_around_data.isnull().sum() / get_around_data.shape[0])

Dataset shape:


(21310, 7)



Basics statistics:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21310 entries, 0 to 21309
Data columns (total 7 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   rental_id                                   21310 non-null  int64  
 1   car_id                                      21310 non-null  int64  
 2   checkin_type                                21310 non-null  object 
 3   state                                       21310 non-null  object 
 4   delay_at_checkout_in_minutes                16346 non-null  float64
 5   previous_ended_rental_id                    1841 non-null   float64
 6   time_delta_with_previous_rental_in_minutes  1841 non-null   float64
dtypes: float64(3), int64(2), object(2)
memory usage: 1.1+ MB


None





Unnamed: 0,rental_id,car_id,checkin_type,state,delay_at_checkout_in_minutes,previous_ended_rental_id,time_delta_with_previous_rental_in_minutes
count,21310.0,21310.0,21310,21310,16346.0,1841.0,1841.0
unique,,,2,2,,,
top,,,mobile,ended,,,
freq,,,17003,18045,,,
mean,549712.880338,350030.603426,,,59.701517,550127.411733,279.28843
std,13863.446964,58206.249765,,,1002.561635,13184.023111,254.594486
min,504806.0,159250.0,,,-22433.0,505628.0,0.0
25%,540613.25,317639.0,,,-36.0,540896.0,60.0
50%,550350.0,368717.0,,,9.0,550567.0,180.0
75%,560468.5,394928.0,,,67.0,560823.0,540.0




Percentage of missing values:


rental_id                                      0.000000
car_id                                         0.000000
checkin_type                                   0.000000
state                                          0.000000
delay_at_checkout_in_minutes                  23.294228
previous_ended_rental_id                      91.360863
time_delta_with_previous_rental_in_minutes    91.360863
dtype: float64

In [5]:
#Drop
#rental_id - identifiant unique / pas pertinent pour le machine learning
#previous_ended_rental_id / pourcentage de données manquante trop élevé pour être pertinente 
#time_delta_with_previous_rental_in_minutes / pourcentage de données manquante trop élevé pour être pertinente 

## EDA

In [6]:
get_around_data.head()

Unnamed: 0,rental_id,car_id,checkin_type,state,delay_at_checkout_in_minutes,previous_ended_rental_id,time_delta_with_previous_rental_in_minutes
0,505000,363965,mobile,canceled,,,
1,507750,269550,mobile,ended,-81.0,,
2,508131,359049,connect,ended,70.0,,
3,508865,299063,connect,canceled,,,
4,511440,313932,mobile,ended,,,


In [7]:
fig = make_subplots(rows=2,cols=1)
fig.add_trace(
    go.Histogram(
        x = get_around_data['checkin_type'],
        name = 'Checkin Type'),
    row=1,
    col=1)

fig.add_trace(
    go.Histogram(
        x = get_around_data['state'],
        name = 'State'),
    row=2,
    col=1)

fig.update_layout(width=700,height=1000)
fig.show()

In [8]:
fig = px.histogram(get_around_data, x='delay_at_checkout_in_minutes', color='checkin_type', nbins=30)
fig.show()

In [9]:
filtered_data = get_around_data[get_around_data['checkin_type'] == 'connect']
fig=px.histogram(data_frame=filtered_data,x='delay_at_checkout_in_minutes',marginal='violin')
fig.show()

In [10]:
filtered_data = get_around_data[get_around_data['checkin_type'] == 'mobile']
fig=px.histogram(data_frame=filtered_data,x='delay_at_checkout_in_minutes',marginal='violin')
fig.show()

## Which share of our owner’s revenue would potentially be affected by the feature?

In [11]:
get_around_data.head()

Unnamed: 0,rental_id,car_id,checkin_type,state,delay_at_checkout_in_minutes,previous_ended_rental_id,time_delta_with_previous_rental_in_minutes
0,505000,363965,mobile,canceled,,,
1,507750,269550,mobile,ended,-81.0,,
2,508131,359049,connect,ended,70.0,,
3,508865,299063,connect,canceled,,,
4,511440,313932,mobile,ended,,,


In [12]:
# Identifier la part des locations potentiellement impactés par un [temps de min entre deux locations] - entre ceux qui rendent en retard et ceux qui sont trop en avance.
threshold = 60

In [13]:
get_around_data_ended=get_around_data[get_around_data['state']=='ended']
get_around_data_ended['affected_by_threshold']=get_around_data_ended['time_delta_with_previous_rental_in_minutes']<threshold



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [18]:
get_around_data_ended.head()

Unnamed: 0,rental_id,car_id,checkin_type,state,delay_at_checkout_in_minutes,previous_ended_rental_id,time_delta_with_previous_rental_in_minutes,affected_by_threshold,late_impact_next_driver
1,507750,269550,mobile,ended,-81.0,,,False,False
2,508131,359049,connect,ended,70.0,,,False,False
4,511440,313932,mobile,ended,,,,False,False
5,511626,398802,mobile,ended,-203.0,,,False,False
6,511639,370585,connect,ended,-15.0,563782.0,570.0,False,False


In [15]:
share_affected=(get_around_data_ended['affected_by_threshold'].sum()/len(get_around_data_ended['affected_by_threshold']))*100
print(f'Pourcentage de locations touchées {round(share_affected,2)} %')

Pourcentage de locations touchées 1.98 %


## How many rentals would be affected by the feature depending on the threshold and scope we choose?

In [16]:
nb_affected=get_around_data_ended['affected_by_threshold'].sum()
print(f'Le nombre de locations touchées est de {nb_affected}')

GA_connect=get_around_data_ended[get_around_data_ended['checkin_type']=='connect']
GA_connect_threshold=GA_connect['affected_by_threshold'].sum()
print(f'Le nombre de locations CONNNECT touchées est de {GA_connect_threshold} - Le nombre de locations MOBILE touchées est de {nb_affected-GA_connect_threshold}')


Le nombre de locations touchées est de 358
Le nombre de locations CONNNECT touchées est de 156 - Le nombre de locations MOBILE touchées est de 202


## How often are drivers late for the next check-in? How does it impact the next driver?

In [17]:
# on doit déterminer combien de fois le retard au check out est supérieur l'écart de temps avec la location précédente. 
get_around_data_ended['late_impact_next_driver']=(get_around_data_ended['delay_at_checkout_in_minutes']>get_around_data_ended['time_delta_with_previous_rental_in_minutes'])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [29]:
late_impact_next_driver=get_around_data_ended['late_impact_next_driver'].sum()
share_late_impact=late_impact_next_driver/len(get_around_data_ended['late_impact_next_driver'])*100

get_around_data_ended_late=get_around_data_ended[get_around_data_ended['late_impact_next_driver']]
late_impact_in_minute = (get_around_data_ended_late['delay_at_checkout_in_minutes'] - get_around_data_ended_late['time_delta_with_previous_rental_in_minutes']).mean()

print(f'Pourcentage de locations touchées par un retard de checkout précédent leur location est de {round(share_late_impact,2)} %, soit {late_impact_next_driver} retards, pour une moyenne de {round(late_impact_in_minute,2)} minutes.')

Pourcentage de locations touchées par un retard de checkout précédent leur location est de 1.5 %, soit 270 retards, pour une moyenne de 289.2 minutes.


## How many problematic cases will it solve depending on the chosen threshold and scope?

In [30]:
# Dans le cas où on a un retard de check out supérieur au temps fixé avant la prochaine location,
# on va faire le point sur le nombre de location avec un temps delta qui est inférieur au threshold fixé
# cela donnera le nombre de cas problématique qu'on évite avec cette nouvelle méthodologie

get_around_data_ended_late.head()

Unnamed: 0,rental_id,car_id,checkin_type,state,delay_at_checkout_in_minutes,previous_ended_rental_id,time_delta_with_previous_rental_in_minutes,affected_by_threshold,late_impact_next_driver
90,535770,352436,mobile,ended,74.0,524703.0,60.0,False,True
107,537576,397470,mobile,ended,18.0,539005.0,0.0,True,True
148,540479,374684,mobile,ended,12.0,539751.0,0.0,True,True
164,541862,382364,mobile,ended,125.0,540607.0,0.0,True,True
206,543808,369230,mobile,ended,75.0,536315.0,60.0,False,True


In [35]:
get_around_data_ended_late['location_problem avoided'] = get_around_data_ended_late['time_delta_with_previous_rental_in_minutes']<threshold
avoided=get_around_data_ended_late['location_problem avoided'].sum()
print(f'La solution threshold permettrait d/éviter {avoided} problèmes de locations')

La solution threshold permettrait d/éviter 176 problèmes de locations




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [37]:
get_around_data_ended_late_connect=get_around_data_ended_late[get_around_data_ended_late['checkin_type']=='connect']
avoided_connect=get_around_data_ended_late_connect['location_problem avoided'].sum()
print(f'La solution threshold permettrait d/éviter {avoided_connect} problèmes de locations pour les CONNECT et {avoided-avoided_connect} pour les MOBILE.')

La solution threshold permettrait d/éviter 63 problèmes de locations pour les CONNECT et 113 pour les MOBILE.
