In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff
import plotly.io as pio
import haversine as hs
import scipy.stats as st

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from sklearn.neighbors import LocalOutlierFactor
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from faker import Faker
from faker.providers import DynamicProvider

# Use the "notebook" plotly renderer as the default for every fig.show() call.
pio.renderers.default = "notebook"

## Suppress scientific notation
# Floats in displayed DataFrames are formatted with four decimal places.
pd.options.display.float_format = '{:.4f}'.format

# NOTE(review): this silences *all* warnings for the whole session, including
# deprecation warnings from pandas/numpy/plotly — consider narrowing the filter.
import warnings
warnings.filterwarnings("ignore")

In [47]:
# Read the event log from a CSV that sits next to the notebook and preview it.
DATASET_PATH = './dataset.csv'
df = pd.read_csv(DATASET_PATH)
df.head()

Unnamed: 0,DriverId,EventName,Latitude,Longitude,Speed km/h,ts
0,0,Timed Event,34.1866,-118.0881,64.0,2017-11-01 00:00:02.430
1,0,Distance Event,34.1861,-118.0892,53.0,2017-11-01 00:00:05.600
2,0,Distance Event,34.1864,-118.0896,34.0,2017-11-01 00:00:13.640
3,0,Distance Event,34.1875,-118.0889,33.0,2017-11-01 00:00:26.070
4,0,Distance Event,34.1887,-118.0865,32.0,2017-11-01 00:00:35.090


In [48]:
# Convert the `ts` column to pandas datetime values; the column is overwritten
# on the existing `df` object.
df['ts'] = pd.to_datetime(df['ts'])
df.head()

Unnamed: 0,DriverId,EventName,Latitude,Longitude,Speed km/h,ts
0,0,Timed Event,34.1866,-118.0881,64.0,2017-11-01 00:00:02.430
1,0,Distance Event,34.1861,-118.0892,53.0,2017-11-01 00:00:05.600
2,0,Distance Event,34.1864,-118.0896,34.0,2017-11-01 00:00:13.640
3,0,Distance Event,34.1875,-118.0889,33.0,2017-11-01 00:00:26.070
4,0,Distance Event,34.1887,-118.0865,32.0,2017-11-01 00:00:35.090


In [3]:
# Seed the shared Faker generator so the synthetic identities are reproducible.
Faker.seed(0)

fake = Faker()

# Number of synthetic identities to generate.
# NOTE(review): presumably one per driver in the dataset — TODO confirm.
N_PEOPLE = 89

# One dict per generated identity. Previously the generated values were
# discarded on every iteration and this list stayed empty.
person = []

for i in range(N_PEOPLE):

    # Keep the order of these calls fixed: with a seeded generator the values
    # produced depend on the sequence of calls.
    first_name = fake.unique.first_name()
    last_name = fake.unique.last_name()
    address = fake.address()
    license_plate = fake.license_plate()
    birth_date = fake.date()

    person.append({
        "first_name": first_name,
        "last_name": last_name,
        "address": address,
        "license_plate": license_plate,
        "birth_date": birth_date,
    })

    

['Megan',
 'Katherine',
 'Robert',
 'Jonathan',
 'William',
 'Richard',
 'Kristen',
 'Kevin',
 'Thomas',
 'Brandy',
 'Rebecca',
 'Juan',
 'Katelyn',
 'Christine',
 'John',
 'Renee',
 'Tonya',
 'Lisa',
 'Rachel',
 'Kyle',
 'Jessica',
 'Gabriella',
 'Craig',
 'Ryan',
 'Cheryl',
 'Robin',
 'Tammy',
 'Michelle',
 'Jorge',
 'Linda',
 'Ana',
 'Alexis',
 'Jennifer',
 'Raymond',
 'Mallory',
 'Elizabeth',
 'Aaron',
 'Tristan',
 'Mikayla',
 'Mark',
 'Monica',
 'James',
 'Ashley',
 'Seth',
 'Christopher',
 'Luke',
 'Sarah',
 'Daniel',
 'Andrea',
 'Jasmine',
 'Amy',
 'Amber',
 'Tamara',
 'Carrie',
 'Briana',
 'Sean',
 'Caitlyn',
 'Paul',
 'Barbara',
 'Kelly',
 'Jaime',
 'Jacob',
 'Chloe',
 'Diane',
 'Nancy',
 'Sandra',
 'Rachael',
 'Peter',
 'Janet',
 'Joshua',
 'Melissa',
 'Patty',
 'Brittany',
 'Stephanie',
 'Bethany',
 'Shannon',
 'Donna',
 'Wendy',
 'Sabrina',
 'Laura',
 'Tom',
 'Crystal',
 'Dave',
 'Jeffrey',
 'Marvin',
 'Dustin',
 'Heidi',
 'Brandon',
 'Audrey']

In [49]:
# Number of (non-null) EventName records per driver, in a single column
# named `EventName_count`.
eventsPerDriver = df.groupby('DriverId').agg(EventName_count=('EventName', 'count'))

# Take the mean of the named column directly. The previous
# `np.mean(eventsPerDriver).values[0]` relied on a DataFrame-wide mean returning
# a Series, which is not the case on pandas versions where `axis=None` reduces
# over both axes and yields a scalar.
averageNoEvents = eventsPerDriver['EventName_count'].mean().round(1)
averageNoEvents

10108.0

In [50]:
# Order drivers from the highest to the lowest event count and preview the top rows.
eventsPerDriver = eventsPerDriver.sort_values('EventName_count', ascending=False)
eventsPerDriver.head()

Unnamed: 0_level_0,EventName_count
DriverId,Unnamed: 1_level_1
49,31942
77,30420
45,30319
51,26917
5,24152


In [51]:
# Largest per-driver event count.
eventsPerDriver['EventName_count'].max()

31942

In [52]:
# Smallest per-driver event count.
eventsPerDriver['EventName_count'].min()

239

In [53]:
# Count how often each event type occurs, then list the rarest types first.
eventTypeCounts = df.groupby('EventName')['EventName'].count().rename('EventName_Count')
dfReasonHist = eventTypeCounts.reset_index().sort_values('EventName_Count', ascending=True)
dfReasonHist.head()

Unnamed: 0,EventName,EventName_Count
8,Harsh Turn Left (motion based),805
4,Harsh Acceleration (motion based),808
9,Harsh Turn Right (motion based),853
6,Harsh Braking (motion based),1838
7,Harsh Turn (motion based),2298


In [56]:
# Horizontal bar chart of how many times each event type occurs.
# `y` must be given explicitly: without it the bars are plotted against the
# DataFrame's row index instead of the event names the axis title promises.
fig = px.bar(
    dfReasonHist,
    orientation='h',
    x='EventName_Count',
    y='EventName',
    barmode='group',
)

fig.update_layout(
    xaxis_title="Number of Events",
    yaxis_title="Event Name",
    title="Bar Chart of Event Distribution",
    template="plotly_white",
    # Set once; the legend title was previously specified twice.
    legend_title_text="EventName",
)

fig.show()

Removal of non-behavioral events

In [57]:
# Event types this notebook treats as non-behavioral and excludes from the count.
EXCLUDED_EVENTS = ['Distance Event', 'Timed Event', 'Network Event', 'System Event']

# Vectorised membership test instead of a Python-level loop over every row.
keepMask = ~df['EventName'].isin(EXCLUDED_EVENTS)

# NOTE: despite its name, this list holds the EventName of every row that is
# NOT one of the excluded types (one entry per row, duplicates included).
# The name and contents are kept as they were so later cells keep working.
non_behavioral_events = df.loc[keepMask, 'EventName'].tolist()
newCount = int(keepMask.sum())

# Share of rows removed; guarded so an empty frame does not divide by zero.
totalCount = len(df)
reduction = (totalCount - newCount) / totalCount if totalCount else 0.0

print("# of events before removing {}: {}, After removing: {}.\nReduction of {:0.2%} of events. Current number of events: {}".format(", ".join(EXCLUDED_EVENTS), totalCount, newCount, reduction, newCount))

# of events before removing Distance Event, Timed Event, Network Event: 899611, After removing: 54242.
Reduction of 0.94 events. Current number of events: 54242


In [58]:
# Event names kept for modeling; every other EventName is dropped by prepData.
behavioral_events = ['Harsh Acceleration', 'Reached max speed', 'Out of max speed',
       'Harsh Braking', 'Harsh Turn (motion based)',
       'Harsh Braking (motion based)', 'Harsh Acceleration (motion based)',
       'Harsh Turn Left (motion based)', 'Harsh Turn Right (motion based)']


def prepData(df, minRecordsPerSubscriber = 50):
    """Filter an event DataFrame down to the rows used for modeling.

    Steps, in order (the row count is printed after each one):
      1. Move the current index into a column via ``reset_index()``.
      2. Drop rows containing any NA value.
      3. Keep only rows whose ``EventName`` is in ``behavioral_events``.
      4. Keep only drivers that have strictly more than
         ``minRecordsPerSubscriber`` remaining rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``EventName`` and ``DriverId`` columns. It is not modified.
    minRecordsPerSubscriber : int, default 50
        Drivers with this many remaining rows or fewer are removed.

    Returns
    -------
    pandas.DataFrame
        The filtered rows, including the column added by ``reset_index()``.
    """
    # Non-inplace reset: the result still carries the old index as a column,
    # but the caller's frame is left untouched.
    df = df.reset_index()
    print("*** Starting data prep. Length:", len(df),"***")

    # Remove NAs. The dropped frame is carried forward; previously it was
    # assigned to an unused variable, so NA rows were never actually removed.
    df = df.dropna()
    print("Removed NAs. Length:",len(df))

    ## Filter out unwanted events
    df = df[df.EventName.isin(behavioral_events)]
    print("Keeping only events that are relevant for modeling. Length:",len(df))

    ## Filter out users with too few samples
    eventCountPerDriver = df.groupby('DriverId')['DriverId'].agg('count')
    driversWithManyRecords = eventCountPerDriver[eventCountPerDriver > minRecordsPerSubscriber]
    df = df[df.DriverId.isin(driversWithManyRecords.index)]
    print("Filtering users with too few samples. Length:",len(df))

    print("*** Done. ***")
    return df

# Rebinds `df` to the filtered result, so the unfiltered frame is no longer
# reachable under this name. Re-running this cell feeds the already-filtered
# frame back into prepData.
df = prepData(df)

df.head()

*** Starting data prep. Length: 899611 ***
Removed NAs. Length: 899611
Keeping only events that are relevant for modeling. Length: 36468
Filtering users with too few samples. Length: 36262
*** Done. ***


Unnamed: 0,index,DriverId,EventName,Latitude,Longitude,Speed km/h,ts
41,41,0,Harsh Acceleration,34.1899,-118.0828,49.0,2017-11-01 14:30:12.120
112,112,0,Reached max speed,34.1802,-118.1362,115.0,2017-11-01 14:35:26.830
130,130,0,Out of max speed,34.1689,-118.144,69.0,2017-11-01 14:36:19.710
149,149,0,Harsh Braking,34.1591,-118.1414,98.0,2017-11-01 14:38:01.930
186,186,0,Reached max speed,34.1375,-118.1473,122.0,2017-11-01 14:39:59.440
