In [1]:
import pandas as pd

In [2]:
# Load the raw track export, then show the number of rows next to the column names.
df = pd.read_csv('data/track.csv')
len(df), df.columns

(11021,
 Index(['id', 'start_date', 'avg_speed', 'end_date', 'length', 'max_speed',
        'norm_fuel_consumed', 'type', 'end_address', 'start_address', 'points',
        'tracker_id'],
       dtype='object'))

In [3]:
# Derive every track's duration in hours from its timestamps, then recompute
# the average speed as length / duration.
track_start = pd.to_datetime(df['start_date'])
track_end = pd.to_datetime(df['end_date'])
df['duration'] = (track_end - track_start).dt.total_seconds() / 3600
df['avg_speed_calculated'] = df['length'] / df['duration']

In [4]:
# Sanity check: the longest track (in hours) should stay below 24.
df.duration.max()

5.948888888888889

In [5]:
# Check for mistakes in the start/end dates: list the tracks that end at or
# before the moment they start.
# The timestamps are parsed before comparing so that the check is chronological
# instead of a character-by-character comparison of the raw strings.
df[pd.to_datetime(df['end_date']) <= pd.to_datetime(df['start_date'])]

Unnamed: 0,id,start_date,avg_speed,end_date,length,max_speed,norm_fuel_consumed,type,end_address,start_address,points,tracker_id,duration,avg_speed_calculated


In [6]:
# Check whether track ids are unique: list the ids that occur more than once
# together with their tracker ids, to see if repeats belong to different trackers.
repeated_id = df.duplicated(subset='id', keep=False)
df.loc[repeated_id, ['id', 'tracker_id']].sort_values(by='id').head(6)

Unnamed: 0,id,tracker_id
4751,3468.0,3036047
5905,3468.0,3036049
5906,3469.0,3036049
4752,3469.0,3036047
4753,3470.0,3036047
5907,3470.0,3036049


Track ids are reused across different trackers, but they are still usable together with the tracker id: as the next check shows, no (id, tracker_id) pair is duplicated.

In [7]:
# List any (id, tracker_id) pair that occurs more than once.
repeated_pair = df.duplicated(subset=['id', 'tracker_id'], keep=False)
df.loc[repeated_pair, ['id', 'tracker_id']].sort_values(by='id')

Unnamed: 0,id,tracker_id


Only 2 trackers have information on fuel consumed. Still, it would be interesting to have this information visualised.

In [8]:
# Count, per tracker, the tracks that carry a fuel-consumption value.
df.loc[df['norm_fuel_consumed'].notna(), 'tracker_id'].value_counts()

tracker_id
877766    1100
877767     487
Name: count, dtype: int64

For each tracker and each day, we want to get statistics on the number of active and finished tracks, the distance traveled, the duration (active and off-track time), and the average speed.

In [9]:
# Store the calendar date on which each track was finished.
finished_at = pd.to_datetime(df['end_date'])
df['track_finished_date'] = finished_at.dt.date

In [10]:
# Select the tracks that start on one calendar date and end on another;
# each of them has to be split in 2 parts (one for each day).
# Full calendar dates are compared (timestamps truncated to midnight) rather
# than day-of-month numbers, so the selection does not hinge on the day number.
start_day = pd.to_datetime(df['start_date']).dt.normalize()
end_day = pd.to_datetime(df['end_date']).dt.normalize()
crosses_midnight = start_day != end_day

# `start` will hold the part on the starting date, `end` the part on the ending date.
start = df[crosses_midnight].copy()
end = df[crosses_midnight].copy()

# The first 11 characters of a timestamp string are reused as the date prefix;
# this relies on the 'YYYY-MM-DD HH:MM:SS' layout of the date columns.
# NOTE: closing the first part at 23:59:59 leaves the final second before
# midnight out of both parts.
start['end_date'] = start['start_date'].str[:11] + '23:59:59'
end['start_date'] = end['end_date'].str[:11] + '00:00:00'

In [11]:

def _rescale_to_split(part):
    """Recompute duration, length and fuel for track parts with clipped timestamps.

    `part` holds tracks whose start_date or end_date has already been replaced
    by a midnight boundary. A new frame is returned in which `duration` and
    `norm_fuel_consumed` are recomputed and the full-track `length` is replaced
    by the length attributed to this part (as the last column).
    """
    part = part.copy()
    clipped_start = pd.to_datetime(part['start_date'])
    clipped_end = pd.to_datetime(part['end_date'])
    part['duration'] = (clipped_end - clipped_start).dt.total_seconds() / 3600

    # Distance attributed to this part: clipped duration times the average
    # speed calculated for the whole track.
    part['length_split'] = round(part['duration'] * part['avg_speed_calculated'], 2)

    # Fuel is scaled by the share of the full-track length covered by this part.
    part['norm_fuel_consumed'] = round(
        part['norm_fuel_consumed'] * part['length_split'] / part['length'], 2)

    # The original (full) track length is not needed any more.
    return part.drop(columns=['length']).rename(columns={'length_split': 'length'})


# Both halves are rescaled the same way. The results are approximate, because
# the vehicle is assumed to move at the same average speed (and to burn fuel
# proportionally to distance) during the whole track.
start = _rescale_to_split(start)
end = _rescale_to_split(end)

In [12]:
# Start the stats dataframe: for every finish date and tracker, count the
# tracks (non-null ids) that were completed.
finished_per_day = (df.groupby(['track_finished_date', 'tracker_id'])
                    .agg(count_finished=('id', 'count')))
stats = (finished_per_day
         .reset_index()
         .rename(columns={'track_finished_date': 'date'}))

In [13]:
# Show the current column order of df; the split frames are re-ordered to this
# same list of columns in the next step.
df.columns

Index(['id', 'start_date', 'avg_speed', 'end_date', 'length', 'max_speed',
       'norm_fuel_consumed', 'type', 'end_address', 'start_address', 'points',
       'tracker_id', 'duration', 'avg_speed_calculated',
       'track_finished_date'],
      dtype='object')

In [14]:
# Put the columns of both split frames into the same order as df
# (the recomputed `length` was appended as their last column).
track_columns = [
    'id', 'start_date', 'avg_speed', 'end_date', 'length', 'max_speed',
    'norm_fuel_consumed', 'type', 'end_address', 'start_address', 'points',
    'tracker_id', 'duration', 'avg_speed_calculated',
    'track_finished_date',
]
start = start[track_columns]
end = end[track_columns]

In [15]:
# Replace the entries of tracks that span two days with two separate entries:
# one for the start day and another for the end day.
# Same-day tracks are identified by comparing full calendar dates (timestamps
# truncated to midnight) rather than day-of-month numbers.
starts_on = pd.to_datetime(df['start_date']).dt.normalize()
ends_on = pd.to_datetime(df['end_date']).dt.normalize()
df = pd.concat([df[starts_on == ends_on], start, end]).reset_index(drop=True)



In [16]:
# Add the number of active tracks to the stats df: a track (or track part) is
# counted on the calendar date of its start_date.
df['track_active_date'] = pd.to_datetime(df['start_date']).dt.date
active_per_day = (df.groupby(['track_active_date', 'tracker_id'])
                  .agg(count_active=('id', 'count'))
                  .reset_index()
                  .rename(columns={'track_active_date': 'date'}))
stats = stats.merge(active_per_day, on=['date', 'tracker_id'], how='outer')

In [17]:
# Add the per-day totals of length and duration, then derive the off-track
# time and the average speed from them.
HOURS_PER_DAY = 24

daily_groups = df.groupby(['track_active_date', 'tracker_id'])
for metric in ('length', 'duration'):
    daily_total = (daily_groups
                   .agg({metric: 'sum'})
                   .reset_index()
                   .rename(columns={'track_active_date': 'date'}))
    stats = stats.merge(daily_total, on=['date', 'tracker_id'], how='outer')

# Hours of the day not covered by any track.
stats['offtrack_time'] = HOURS_PER_DAY - stats['duration']
# Daily average speed: total length divided by total duration.
stats['avg_speed'] = stats['length'] / stats['duration']

In [18]:
# Display the per-day, per-tracker statistics assembled above.
stats

Unnamed: 0,date,tracker_id,count_finished,count_active,length,duration,offtrack_time,avg_speed
0,2024-06-24,877766,47,47,739.75,16.796389,7.203611,44.042205
1,2024-06-24,877767,20,21,856.61,19.289444,4.710556,44.408226
2,2024-06-24,877768,68,69,829.97,19.137222,4.862778,43.369408
3,2024-06-24,3036043,53,54,306.53,7.116389,16.883611,43.073812
4,2024-06-24,3036045,1,1,301.00,2.974722,21.025278,101.185918
...,...,...,...,...,...,...,...,...
367,2024-07-23,3036043,14,14,73.94,1.783056,22.216944,41.468141
368,2024-07-23,3036056,13,13,92.33,2.113611,21.886389,43.683533
369,2024-07-23,3036057,1,1,64.96,0.665556,23.334444,97.602671
370,2024-07-23,3036068,13,13,92.23,2.125556,21.874444,43.391009


Let's compare the aggregated track distances with the daily mileage obtained from the mileage endpoint.

In [19]:
# Join the daily mileage values with the aggregated track lengths on
# (tracker_id, date); dates are cast to strings on both sides so the keys match.
mileage_daily = (pd.read_csv('data/mileage.csv')
                 .sort_values(by='date')
                 .reset_index(drop=True)
                 .astype({'date': 'str'}))
track_daily = (stats[['date', 'tracker_id', 'length', 'count_finished', 'count_active']]
               .sort_values(by='date')
               .reset_index(drop=True)
               .astype({'date': 'str'}))
dfm = mileage_daily.merge(track_daily, on=['tracker_id', 'date'], how='outer')

# Absolute gap between the mileage value and the summed track lengths;
# show the ten largest gaps above the threshold of 2.
dfm['length_diff'] = (dfm['mileage'] - dfm['length']).abs()
dfm[dfm['length_diff'] > 2].sort_values(by='length_diff', ascending=False).head(10)

Unnamed: 0,tracker_id,date,mileage,length,count_finished,count_active,length_diff
271,3036057,2024-06-25,1363.25,1111.88,3.0,3.0,251.37
294,3036057,2024-07-18,1385.05,1144.22,4.0,4.0,240.83
278,3036057,2024-07-02,1330.81,1091.56,3.0,3.0,239.25
414,3036069,2024-07-18,1190.73,987.96,6.0,6.0,202.77
412,3036069,2024-07-16,1235.33,1069.83,6.0,6.0,165.5
404,3036069,2024-07-08,1075.85,923.74,4.0,4.0,152.11
288,3036057,2024-07-12,1143.88,993.86,3.0,3.0,150.02
408,3036069,2024-07-12,1040.31,903.6,4.0,4.0,136.71
172,3036045,2024-07-16,549.5,429.11,2.0,2.0,120.39
284,3036057,2024-07-08,1531.56,1428.01,4.0,4.0,103.55


For some trackers and days, the aggregated daily distance traveled differs from the daily mileage. The number of trackers affected:

In [20]:
# Number of distinct trackers with at least one day where the gap exceeds 2.
dfm.loc[dfm['length_diff'] > 2, 'tracker_id'].nunique()

14

When examining the tracks for a day with a discrepancy between the mileage and the sum of track lengths, we see that the end address of one track and the start address of the next track don't always coincide. This suggests that some vehicle movement occurs without being recorded as a track.

In [21]:
# Show the tracks of tracker 3036057 whose start_date contains 2024-07-18,
# with the display limits raised so the long address strings are not truncated.
shown_columns = ['id', 'start_date', 'end_date', 'length', 'start_address', 'end_address']
day_mask = df['start_date'].str.contains('2024-07-18') & (df['tracker_id'] == 3036057)
with pd.option_context("display.max_rows", 1200, "display.max_columns", 1000, 'display.max_colwidth', 1000):
    display(df.loc[day_mask, shown_columns])

Unnamed: 0,id,start_date,end_date,length,start_address,end_address
7252,360.0,2024-07-18 05:15:26,2024-07-18 09:31:29,411.83,"Carretera Matehuala-San Luis Potosí, San Luis Potosí City, San Luis Potosí, Mexico, 78319","Avenida Constituyentes - General José Montesinos, Avenida Constituyentes, Mexico City, Miguel Hidalgo, Mexico, 11100"
7253,361.0,2024-07-18 09:31:34,2024-07-18 09:53:48,27.33,"Avenida Constituyentes - General José Montesinos, Avenida Constituyentes, Mexico City, Miguel Hidalgo, Mexico, 11100","Viaducto Bicentenario, Tlalnepantla, State of Mexico, Mexico, 54015"
7254,362.0,2024-07-18 11:06:02,2024-07-18 14:10:19,297.18,"Carretera Querétaro - San Luis Potosí, Punto Blanco Dos, Guanajuato, Mexico, 37914","Avenida Constituyentes - General José Montesinos, Avenida Constituyentes, Mexico City, Miguel Hidalgo, Mexico, 11100"
7255,363.0,2024-07-18 14:10:24,2024-07-18 18:21:12,407.88,"Avenida Constituyentes - General José Montesinos, Avenida Constituyentes, Mexico City, Miguel Hidalgo, Mexico, 11100","Calle José María Mercado, San Luis Potosí City, San Luis Potosí, Mexico, 78319"


In [22]:
# Show the tracks of tracker 3036069 whose start_date contains 2024-07-08,
# with the display limits raised so the long address strings are not truncated.
shown_columns = ['id', 'start_date', 'end_date', 'length', 'start_address', 'end_address']
day_mask = df['start_date'].str.contains('2024-07-08') & (df['tracker_id'] == 3036069)
with pd.option_context("display.max_rows", 1200, "display.max_columns", 1000, 'display.max_colwidth', 1000):
    display(df.loc[day_mask, shown_columns])

Unnamed: 0,id,start_date,end_date,length,start_address,end_address
9203,905.0,2024-07-08 07:41:26,2024-07-08 10:47:26,246.69,"A 3, Kitzingen, Bavaria, Germany, 97318","A 3, Dernbach, Puderbach, Landkreis Neuwied, Rhineland-Palatinate, Germany, 56307"
9204,906.0,2024-07-08 11:33:03,2024-07-08 14:34:30,235.62,"A 3, Bischbrunn, Verwaltungsgemeinschaft Marktheidenfeld, Bavaria, Germany, 97836","A 3, Rösrath, North Rhine-Westphalia, Germany, 51503"
9205,907.0,2024-07-08 14:39:00,2024-07-08 14:49:41,1.7,"A 3, Rösrath, North Rhine-Westphalia, Germany, 51503","A 3, Rösrath, North Rhine-Westphalia, Germany, 51503"
9206,908.0,2024-07-08 14:50:21,2024-07-08 19:43:52,439.73,"A 3, Rösrath, North Rhine-Westphalia, Germany, 51503","A 3, Kitzingen, Bavaria, Germany, 97318"


In [23]:
# Write the per-day, per-tracker statistics to CSV, without the row index.
stats.to_csv('data/stats.csv', index=False)