In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install geopandas



In [3]:
from datetime import datetime, timedelta

In [4]:
"""GPS datapreprocessing & Trip extraction

Bus trip extraction from GPS data

Importing python libraries
"""

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime,date
import geopandas as gpd
from geopandas import GeoDataFrame as gdf
from google.colab import files
import folium
from folium import Choropleth, Circle, Marker
from folium.plugins import HeatMap, MarkerCluster

import glob
import os

path_raw_data = '/content/drive/MyDrive/Data_sets/Raw-GPS-data-Kandy-Buses/digana_2021_10.csv'
path_bus_terminals = '/content/drive/MyDrive/Data_sets/Raw-GPS-data-Kandy-Buses/more/bus_terminals_654.csv'
def download_csv(data,filename):

  """
    To download output as CSV files

    Args:
        data (pd.DataFrame): DataFrame Object.
        filename (str): Name of the file has to be faved.

    Returns:
        None
  """

  filename= filename + '.csv'
  data.to_csv(filename, encoding = 'utf-8-sig',index= False)
  files.download(filename)

def get_data_from_drive(path):

  """
    Get csv file from given file path.

    Args:
        path (str): Location for the file.

    Returns:
        data (pd.DataFrame): A DataFrame Object of given file path.
    """

  data = pd.read_csv(path)
  return data

# raw_data = pd.read_csv(path_raw_data)
raw_data = get_data_from_drive(path_raw_data)
# bus_terminals= pd.read_csv(path_bus_terminals)
bus_terminals = get_data_from_drive(path_bus_terminals)

def raw_data_cleaning(raw_data):

  """
    Removal of records with error records.
    Remove data with zero values for longitude and latitude columns.
    Sort data by time and device.

    Args:
        raw_data (pd.DataFrame): Crude raw GPS data filtered out from the server for the required time window.

    Returns:
        gps_data (pd.DataFrame): A cleaned dataframe object of GPS data.
    """

  #raw_data = raw_data.drop(drop_columns, axis = 1)

  gps_data = raw_data[raw_data.latitude != 0]
  gps_data = gps_data[gps_data.longitude != 0] #cleaning zero values for latitude & longitude

  gps_data['date'] = pd.to_datetime(gps_data['devicetime']).dt.date #split date and time separately into datetime variables
  gps_data['time'] = pd.to_datetime(gps_data['devicetime']).dt.time

  gps_data = gps_data.sort_values(['deviceid', 'date', 'time']) #sorting dataset by time and device

  return gps_data

#The additional unwanted columns from the dataset are found to be deleted(Optional Step)
additional_columns = ['servertime','fixtime','address','routeid']

#drop_columns = ['servertime','fixtime','address','routeid']
gps_data= raw_data_cleaning(raw_data)
# Remove columns
gps_data=gps_data.drop(additional_columns, axis=1)
# download_csv(gps_data,'clearn')

def trip_ends(gps_data,bus_terminals,end_buffer):

  """
    To extract trip ends dataframe with given buffer range.
    Filter the records within terminals selected buffer range.
    Within the filtered records get entry & exit to terminals.


    Args:
        gps_data (pd.DataFrame): Cleaned gps data filtered out from the server for the required time window.
        bus_terminals (pd.DataFrame): End and start terminals for the trip.
        end_buffer (int):  Radius of the buffer area to represent terminals.

    Returns:
        trip_ends (pd.DataFrame): Trip data with extracted terminals.
  """

  #converting to GeoDataframe with Coordinate Reference system 4326
  gps_data = gpd.GeoDataFrame(gps_data, geometry=gpd.points_from_xy(gps_data.longitude,gps_data.latitude),crs='EPSG:4326')
  bus_terminals = gpd.GeoDataFrame(bus_terminals, geometry=gpd.points_from_xy(bus_terminals.longitude,bus_terminals.latitude),crs='EPSG:4326')

  #project them in local cordinate system
  gps_data = gps_data.to_crs('EPSG:5234')
  bus_terminals = bus_terminals.to_crs('EPSG:5234')

  #creating buffer area to extract records around bus terminals
  bus_terminals_buffer = gpd.GeoDataFrame(bus_terminals, geometry = bus_terminals.geometry.buffer(end_buffer))

  #filtering coordinates within bus terminals end buffer
  gps_data['bus_stop'] = pd.Series(dtype='object') #create a new column in gps data set
  gps_data.reset_index(drop = True, inplace = True) #reset indices to run a for loop

  for i in range(len(gps_data)):
    for stop in range(len(bus_terminals)):
      if bus_terminals_buffer.iloc[stop].geometry.contains(gps_data.iloc[i].geometry):
        gps_data.at[i,'bus_stop'] = bus_terminals.at[stop,'terminal_id']

  trip_ends = gps_data.dropna() #filter records within terminal buffer

  #EXTRACT TRIP ENDS

  #grouping the filtered records of one bus terminal and one date
  trip_ends['grouped_ends'] = ((trip_ends['bus_stop'].shift() != trip_ends['bus_stop']) | (trip_ends['date'].shift() != trip_ends['date'])).cumsum()

  #find the entry or exit record only of the terminals
  #Early records is the entry(1) to the terminal and last record as the exit(0) to the end terminal
  trip_ends['entry/exit'] = pd.Series(dtype='object')
  trip_ends = trip_ends.reset_index(drop=True)

  for name, group in trip_ends.groupby('grouped_ends'):
    #if 0 in group['speed'].values:
    for index, row in group.iterrows():
      if row['devicetime'] == group['devicetime'].max():
        trip_ends.at[index,'entry/exit'] = '0'
      elif row['devicetime'] == group['devicetime'].min():
        trip_ends.at[index,'entry/exit'] = '1'

  trip_ends = trip_ends.dropna() #filter terminal entry/exit records only

  trip_ends = trip_ends.reset_index(drop=True)

  #Providing unique trip id for trips which have entry / exit values within the 2 bus end terminals
  trip = 0
  for i in range(len(trip_ends)-1):
    if (trip_ends.at[i,'bus_stop'] != trip_ends.at[i+1,'bus_stop']) & (trip_ends.at[i,'date'] == trip_ends.at[i+1,'date']):
      trip= trip+1
      trip_ends.at[i,'trip_id'] = trip
      trip_ends.at[i+1,'trip_id'] = trip

  trip_ends = trip_ends.dropna()

  trip_ends = trip_ends.groupby('trip_id').filter(lambda x : len(x)>1)    #remove outliers where no defined 2 trip ends for a trip
  trip_ends = trip_ends.reset_index(drop=True)

  return trip_ends

end_buffer = 100
trip_ends = trip_ends(gps_data,bus_terminals,end_buffer)
# download_csv(trip_ends,'trip_ends')


def trip_extraction(trip_ends):

  """
    To extract bus trips with derived columns.
    Create end_time, end_terminal for a bus trip.
    Create features of duration, duration_in_mins, day_of_the_week, hour_of_day

    Args:
        trip_ends (pd.DataFrame): Filtered bus trip data with terminals.

    Returns:
        bus_trips (pd.DataFrame): Bus trip terminals data with derived features.
  """

  bus_trips = trip_ends.copy()
  bus_trips[['end_time','end_terminal']] = bus_trips[['time','bus_stop']].shift(-1)
  bus_trips = bus_trips.iloc[::2]

  bus_trips = bus_trips.drop(['id','devicetime','latitude','longitude','speed','geometry','grouped_ends','entry/exit'],axis=1)
  bus_trips.insert(0,'trip_id',bus_trips.pop('trip_id'))
  bus_trips.rename(columns = {'time':'start_time','bus_stop': 'start_terminal'}, inplace =True)

  conditions = [(bus_trips['start_terminal'] == 'BT01'),
              (bus_trips['start_terminal'] == 'BT02')]
  values = [1,2]

  bus_trips['direction'] = np.select(conditions, values)

  bus_trips = bus_trips[['trip_id','deviceid','date','start_terminal','end_terminal','direction','start_time','end_time']]
  bus_trips=bus_trips.reset_index(drop = True)

  #Calculate trip duration
  bus_trips['duration'] = pd.Series(dtype='object')
  for i in range(len(bus_trips)):
    bus_trips.at[i,'duration'] = datetime.combine(date.min,bus_trips.at[i,'end_time']) - datetime.combine(date.min,bus_trips.at[i,'start_time'])

  bus_trips['duration_in_mins'] = bus_trips['duration']/np.timedelta64(1,'m')

  bus_trips['day_of_week'] = pd.to_datetime(bus_trips['date']).dt.weekday
  bus_trips['hour_of_day'] = list(map(lambda  x: x.hour, (bus_trips['start_time'])))

  return bus_trips

bus_trips = trip_extraction(trip_ends)
# download_csv(bus_trips,'bus_trips')



def map_visualization(gps_data,city_location,bus_terminals,bus_terminals_buffer):

  """
    Using a  GPS data visualization package of Folium, project the coordinates on
    Open Street Map (OSM) to explore how the records are spread and to gain some insights and overview.

    Args:
        gps_data (pd.DataFrame): GPS data with selected device ID.
        city_location (arr): Longtitude and lattitude of a city
        bus_terminals (GeoDataFrame) : Bus terminal data with geometry column
        bus_terminals_buffer (GeoDataFrame) :  Bus terminal data with geometry column buffer range

    Returns:
        map (MapObject): A visualizable Map Object.
  """

  gps_data = gpd.GeoDataFrame(gps_data, geometry=gpd.points_from_xy(gps_data.longitude,gps_data.latitude),crs='EPSG:4326')  #converting to GeoDataframe with Coordinate Reference system 4326
  map =  folium.Map(location=city_location, tiles='openstreetmap', zoom_start=14)
  for idx, row in gps_data.iterrows():
    Marker([row['latitude'], row['longitude']]).add_to(map)

  bus_terminals = gpd.GeoDataFrame(bus_terminals, geometry=gpd.points_from_xy(bus_terminals.longitude,bus_terminals.latitude),crs='EPSG:4326')
  for idx, row in bus_terminals.iterrows():
    Marker([row['latitude'], row['longitude']]).add_to(map)

  folium.GeoJson(bus_terminals_buffer.to_crs(epsg=4326)).add_to(map)
  map
  return map



# bus_terminals = gpd.GeoDataFrame(bus_terminals, geometry=gpd.points_from_xy(bus_terminals.longitude,bus_terminals.latitude),crs='EPSG:4326')
# bus_terminals = bus_terminals.to_crs('EPSG:5234')
# bus_terminals_buffer = gpd.GeoDataFrame(bus_terminals, geometry = bus_terminals.geometry.buffer(end_buffer))

# gps_data['deviceid'].value_counts()

# data84 = gps_data[gps_data['deviceid']==84]

# city_location = [7.2906,80.6337]  #Kandy city location
# map = map_visualization(data84,city_location,bus_terminals,bus_terminals_buffer)

# map



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [5]:
def filter_all(gps_data,trip_ends):
  """

    To extract all correct data raw point
    from start point to ent point.
    Args:
        gps_data (pd.DataFrame): Cleaned gps data filtered out from the server for the required time window.
        trip_ends (pd.DataFrame): Filtered bus trip data with terminals.

    Returns:
      row_bus_data (pd.DataFrame): all correct data raw point from start point to ent point..
  """
  pointer = 0
  new_gps_data = []
  new_columns = ['id','deviceid','devicetime','latitude','longitude','speed','date','time','trip_id']
  index_num = 0
  for index,row in gps_data.iterrows():
    print(index_num,":",row['deviceid'],":",pointer)
    if datetime.strptime(trip_ends.loc[pointer,'devicetime'], "%Y-%m-%d %H:%M:%S")<=datetime.strptime(row['devicetime'], "%Y-%m-%d %H:%M:%S")<=datetime.strptime(trip_ends.loc[pointer+1,'devicetime'], "%Y-%m-%d %H:%M:%S"):
      new_row = [row['id'],row['deviceid'],row['devicetime'],row['latitude'],row['longitude'],row['speed'],row['date'],row['time'],trip_ends.loc[pointer,'trip_id']]
      new_gps_data.append(new_row)
      # print(row['id'],":",row['id'] == trip_ends.loc[pointer+1,'id']," ",pointer+2)
      if row['id'] == trip_ends.loc[pointer+1,'id']:
        pointer+=2
        if(pointer == len(trip_ends)):
          break
        # print("pointer:",pointer+2)
        # print(trip_ends.loc[pointer,'id'])
    index_num+=1
  new_gps_data = pd.DataFrame(new_gps_data, columns=new_columns)
  return new_gps_data

In [6]:
def get_max_speed_in_trip(gps_data_2,bus_trips):
  """

    To extract all correct data raw point
    from start point to ent point.
    Args:
        gps_data_2 (pd.DataFrame): Cleaned gps data filtered out from the server for the required time window.
        bus_trips (pd.DataFrame): Bus trip terminals data with derived features.

    Returns:
      bus_trip_with_max_speed (pd.DataFrame): Bus trip terminals data with derived features with max speed in each trip
  """
  bus_trips_copy = bus_trips.copy()
  bus_trips_copy.loc[:,'trip_id']
  max_speed_list = []
  for trip_id in bus_trips_copy.loc[:,'trip_id']:
    max_speed = gps_data_2[gps_data_2['trip_id']==trip_id]['speed'].max()
    max_speed_list.append(max_speed)
  bus_trips_copy['max_speed'] = max_speed_list
  return bus_trips_copy

In [7]:
gps_data.head()

Unnamed: 0,id,deviceid,devicetime,latitude,longitude,speed,date,time
281253,574073556,116,2021-10-15 11:37:14,7.293917,80.736137,0.0,2021-10-15,11:37:14
281400,574078368,116,2021-10-15 11:39:26,7.294845,80.735427,0.0,2021-10-15,11:39:26
281492,574082837,116,2021-10-15 11:39:41,7.294825,80.73547,0.0,2021-10-15,11:39:41
281493,574082838,116,2021-10-15 11:39:56,7.294817,80.735472,0.0,2021-10-15,11:39:56
281494,574082840,116,2021-10-15 11:40:11,7.294813,80.73547,0.0,2021-10-15,11:40:11


In [8]:
# download_csv(trip_ends,'trip_ends')
trip_ends.head()

Unnamed: 0,id,deviceid,devicetime,latitude,longitude,speed,date,time,geometry,bus_stop,grouped_ends,entry/exit,trip_id
0,574670748,116,2021-10-16 07:08:31,7.299052,80.73441,7.01944,2021-10-16,07:08:31,POINT (195659.523 232979.733),BT02,2,0,1.0
1,574721062,116,2021-10-16 07:53:04,7.29171,80.635112,5.93953,2021-10-16,07:53:04,POINT (184695.391 232169.994),BT01,3,1,1.0
2,574733098,116,2021-10-16 08:03:04,7.293092,80.635573,9.17927,2021-10-16,08:03:04,POINT (184746.416 232322.760),BT01,3,0,2.0
3,574787724,116,2021-10-16 08:53:48,7.299068,80.73435,4.85961,2021-10-16,08:53:48,POINT (195652.887 232981.580),BT02,4,1,2.0
4,574907780,116,2021-10-16 10:50:19,7.298947,80.734155,8.09935,2021-10-16,10:50:19,POINT (195631.367 232968.124),BT02,4,0,3.0


In [9]:
bus_trips.head()
# len(bus_trips)

Unnamed: 0,trip_id,deviceid,date,start_terminal,end_terminal,direction,start_time,end_time,duration,duration_in_mins,day_of_week,hour_of_day
0,1.0,116,2021-10-16,BT02,BT01,2,07:08:31,07:53:04,0:44:33,44.55,5,7
1,2.0,116,2021-10-16,BT01,BT02,1,08:03:04,08:53:48,0:50:44,50.733333,5,8
2,3.0,116,2021-10-16,BT02,BT01,2,10:50:19,11:44:43,0:54:24,54.4,5,10
3,4.0,116,2021-10-16,BT01,BT02,1,12:20:45,13:18:33,0:57:48,57.8,5,12
4,5.0,116,2021-10-16,BT02,BT01,2,14:14:36,15:07:05,0:52:29,52.483333,5,14


In [10]:
gps_data_2 =filter_all(gps_data,trip_ends)
download_csv(gps_data_2,'clearn_gps_data_with_trip_id')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
646794 : 1377 : 3382
646795 : 1377 : 3382
646796 : 1377 : 3382
646797 : 1377 : 3382
646798 : 1377 : 3382
646799 : 1377 : 3382
646800 : 1377 : 3382
646801 : 1377 : 3382
646802 : 1377 : 3382
646803 : 1377 : 3382
646804 : 1377 : 3382
646805 : 1377 : 3382
646806 : 1377 : 3382
646807 : 1377 : 3382
646808 : 1377 : 3382
646809 : 1377 : 3382
646810 : 1377 : 3382
646811 : 1377 : 3382
646812 : 1377 : 3382
646813 : 1377 : 3382
646814 : 1377 : 3382
646815 : 1377 : 3382
646816 : 1377 : 3382
646817 : 1377 : 3382
646818 : 1377 : 3382
646819 : 1377 : 3382
646820 : 1377 : 3382
646821 : 1377 : 3382
646822 : 1377 : 3382
646823 : 1377 : 3382
646824 : 1377 : 3382
646825 : 1377 : 3382
646826 : 1377 : 3382
646827 : 1377 : 3382
646828 : 1377 : 3382
646829 : 1377 : 3382
646830 : 1377 : 3382
646831 : 1377 : 3382
646832 : 1377 : 3382
646833 : 1377 : 3382
646834 : 1377 : 3382
646835 : 1377 : 3382
646836 : 1377 : 3382
646837 : 1377 : 3382
646838 : 13

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [11]:
# trip_ends.loc[0,'trip_id'] read trip_id column
gps_data_2.head()

Unnamed: 0,id,deviceid,devicetime,latitude,longitude,speed,date,time,trip_id
0,574670748,116,2021-10-16 07:08:31,7.299052,80.73441,7.01944,2021-10-16,07:08:31,1.0
1,574670749,116,2021-10-16 07:08:46,7.298598,80.733327,19.4385,2021-10-16,07:08:46,1.0
2,574670750,116,2021-10-16 07:09:01,7.297437,80.732405,26.4579,2021-10-16,07:09:01,1.0
3,574670751,116,2021-10-16 07:09:07,7.297405,80.731912,5.93953,2021-10-16,07:09:07,1.0
4,574670752,116,2021-10-16 07:09:22,7.29742,80.73176,11.3391,2021-10-16,07:09:22,1.0


In [12]:
bus_trip_with_max_speed = get_max_speed_in_trip(gps_data_2,bus_trips)
download_csv(bus_trip_with_max_speed,'bus_trip_with_max_speed')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [13]:
bus_trip_with_max_speed.head()

Unnamed: 0,trip_id,deviceid,date,start_terminal,end_terminal,direction,start_time,end_time,duration,duration_in_mins,day_of_week,hour_of_day,max_speed
0,1.0,116,2021-10-16,BT02,BT01,2,07:08:31,07:53:04,0:44:33,44.55,5,7,35.6372
1,2.0,116,2021-10-16,BT01,BT02,1,08:03:04,08:53:48,0:50:44,50.733333,5,8,21.5983
2,3.0,116,2021-10-16,BT02,BT01,2,10:50:19,11:44:43,0:54:24,54.4,5,10,24.2981
3,4.0,116,2021-10-16,BT01,BT02,1,12:20:45,13:18:33,0:57:48,57.8,5,12,22.6782
4,5.0,116,2021-10-16,BT02,BT01,2,14:14:36,15:07:05,0:52:29,52.483333,5,14,31.3175
