## Uber Prediction for Analyzing New Drivers

### By Kento Morita, Vincent Perez, Aliel Liang, Phuong Duong

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.impute import SimpleImputer

### Data Wrangling
#### We created several new variables in Excel beforehand (saved as driver_signup_processed.csv). The columns created in that Excel step are listed below:
- days_to_bgc: Number of days between signup_date and bgc_date
- days_to_vehicle_add: Number of days between signup_date and vehicle_added_date
- days_to_first_trip: Number of days between signup_date and first_completed_date
- started_driving: Binary indicator: 1 if first_completed_date has a value, otherwise 0.

In [9]:
# Load the sign-up dataset that already contains the columns derived in Excel.
data = pd.read_csv(filepath_or_buffer='./dataset/driver_signup_processed.csv')

In [10]:
# Explore the dataset: print each column's dtype and non-null count, then
# display the frame's (rows, columns) shape as the cell output.
data.info()
data.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54681 entries, 0 to 54680
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    54681 non-null  int64  
 1   city_name             54681 non-null  object 
 2   signup_os             47824 non-null  object 
 3   signup_channel        54681 non-null  object 
 4   signup_date           54681 non-null  object 
 5   bgc_date              32896 non-null  object 
 6   vehicle_added_date    13134 non-null  object 
 7   vehicle_make          13223 non-null  object 
 8   vehicle_model         13223 non-null  object 
 9   vehicle_year          13223 non-null  float64
 10  first_completed_date  6137 non-null   object 
 11  days_to_bgc           32896 non-null  float64
 12  days_to_vehicle_add   13134 non-null  float64
 13  days_to_first_trip    6137 non-null   float64
 14  started_driving       54681 non-null  int64  
dtypes: float64(4), int64(2), object(9)

(54681, 15)

In [11]:
# Count the missing (NaN) entries in every column.
data.isna().sum(axis=0)

id                          0
city_name                   0
signup_os                6857
signup_channel              0
signup_date                 0
bgc_date                21785
vehicle_added_date      41547
vehicle_make            41458
vehicle_model           41458
vehicle_year            41458
first_completed_date    48544
days_to_bgc             21785
days_to_vehicle_add     41547
days_to_first_trip      48544
started_driving             0
dtype: int64

In [12]:
# Impute missing values.
# Day-gap columns and categorical columns are handled differently:
#   * numeric day gaps get the sentinel -1, so a missing step is not confused
#     with a real day count (assumes real gaps are never negative -- TODO confirm);
#   * categorical columns get the sentinel 0, which later cells filter on
#     (e.g. data['signup_os'] != 0). Note this mixes an int into string columns.
# days_to_first_trip is not imputed in this cell.
# SimpleImputer is imported in the notebook's top import cell.
MISSING_DAYS = -1
MISSING_CATEGORY = 0

num_cols = ['days_to_bgc', 'days_to_vehicle_add']
cat_cols = ['signup_os', 'signup_channel', 'city_name']

data[num_cols] = SimpleImputer(strategy='constant', fill_value=MISSING_DAYS).fit_transform(data[num_cols])
data[cat_cols] = SimpleImputer(strategy='constant', fill_value=MISSING_CATEGORY).fit_transform(data[cat_cols])

# Sanity check: every imputed column must now be free of nulls.
assert not data[num_cols + cat_cols].isna().any().any(), "imputation left missing values"

In [15]:
# Number of new drivers for each (sign-up device, sign-up channel) pair,
# skipping rows whose signup_os is the placeholder value 0.
(
    data.loc[data['signup_os'].ne(0)]
    .groupby(['signup_os', 'signup_channel'])
    .size()
    .to_frame(name='number_drivers')
)

Unnamed: 0_level_0,Unnamed: 1_level_0,number_drivers
signup_os,signup_channel,Unnamed: 2_level_1
android web,Organic,2786
android web,Paid,7871
android web,Referral,4287
ios web,Organic,3527
ios web,Paid,7723
ios web,Referral,5382
mac,Organic,1796
mac,Paid,2615
mac,Referral,1413
other,Organic,925
