In [87]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [88]:
!pip install geopandas



In [89]:
from datetime import datetime, timedelta
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime,date
import geopandas as gpd
from geopandas import GeoDataFrame as gdf
from google.colab import files
import folium
from folium import Choropleth, Circle, Marker
from folium.plugins import HeatMap, MarkerCluster
import glob
import os

In [90]:
file_label = '_2022_10'
path_raw_data = '/content/drive/MyDrive/Data_sets/Raw-GPS-data-Kandy-Buses/digana'+file_label+'.csv'
path_bus_terminals = '/content/drive/MyDrive/Data_sets/Raw-GPS-data-Kandy-Buses/more/bus_terminals_654.csv'

# path_trip_ends = '/content/drive/MyDrive/Data_sets/Raw-GPS-data-Kandy-Buses/preprocessed-data-digana_2021_10/trip_ends.csv'
# path_bus_trips = '/content/drive/MyDrive/Data_sets/Raw-GPS-data-Kandy-Buses/preprocessed-data-digana_2021_10/bus_trips.csv'
path_bus_stops = '/content/drive/MyDrive/Data_sets/Raw-GPS-data-Kandy-Buses/more/bus_stops_654.csv'

In [91]:
def download_csv(data,filename):

  """
    To download output as CSV files

    Args:
        data (pd.DataFrame): DataFrame Object.
        filename (str): Name of the file has to be faved.

    Returns:
        None
  """

  filename= filename + file_label + '.csv'
  data.to_csv(filename, encoding = 'utf-8-sig',index= False)
  files.download(filename)

def get_data_from_drive(path):

  """
    Get csv file from given file path.

    Args:
        path (str): Location for the file.

    Returns:
        data (pd.DataFrame): A DataFrame Object of given file path.
    """

  data = pd.read_csv(path)
  return data

In [92]:
# raw_data = pd.read_csv(path_raw_data)
raw_data = get_data_from_drive(path_raw_data)
# bus_terminals= pd.read_csv(path_bus_terminals)
bus_terminals = get_data_from_drive(path_bus_terminals)
# bus_stops= pd.read_csv(path_bus_stops)
bus_stops = get_data_from_drive(path_bus_stops)

In [93]:
def raw_data_cleaning(raw_data):

  """
    Removal of records with error records.
    Remove data with zero values for longitude and latitude columns.
    Sort data by time and device.

    Args:
        raw_data (pd.DataFrame): Crude raw GPS data filtered out from the server for the required time window.

    Returns:
        gps_data (pd.DataFrame): A cleaned dataframe object of GPS data.
    """

  #raw_data = raw_data.drop(drop_columns, axis = 1)

  gps_data = raw_data[raw_data.latitude != 0]
  gps_data = gps_data[gps_data.longitude != 0] #cleaning zero values for latitude & longitude

  gps_data['date'] = pd.to_datetime(gps_data['devicetime']).dt.date #split date and time separately into datetime variables
  gps_data['time'] = pd.to_datetime(gps_data['devicetime']).dt.time

  gps_data = gps_data.sort_values(['deviceid', 'date', 'time']) #sorting dataset by time and device

  return gps_data

#The additional unwanted columns from the dataset are found to be deleted(Optional Step)
# additional_columns = ['servertime','fixtime','address','routeid']

gps_data= raw_data_cleaning(raw_data)
# Remove columns
# gps_data=gps_data.drop(additional_columns, axis=1)

In [94]:
def trip_ends(gps_data,bus_terminals,end_buffer):

  """
    To extract trip ends dataframe with given buffer range.
    Filter the records within terminals selected buffer range.
    Within the filtered records get entry & exit to terminals.


    Args:
        gps_data (pd.DataFrame): Cleaned gps data filtered out from the server for the required time window.
        bus_terminals (pd.DataFrame): End and start terminals for the trip.
        end_buffer (int):  Radius of the buffer area to represent terminals.

    Returns:
        trip_ends (pd.DataFrame): Trip data with extracted terminals.
  """

  #converting to GeoDataframe with Coordinate Reference system 4326
  gps_data = gpd.GeoDataFrame(gps_data, geometry=gpd.points_from_xy(gps_data.longitude,gps_data.latitude),crs='EPSG:4326')
  bus_terminals = gpd.GeoDataFrame(bus_terminals, geometry=gpd.points_from_xy(bus_terminals.longitude,bus_terminals.latitude),crs='EPSG:4326')

  #project them in local cordinate system
  gps_data = gps_data.to_crs('EPSG:5234')
  bus_terminals = bus_terminals.to_crs('EPSG:5234')

  #creating buffer area to extract records around bus terminals
  bus_terminals_buffer = gpd.GeoDataFrame(bus_terminals, geometry = bus_terminals.geometry.buffer(end_buffer))

  #filtering coordinates within bus terminals end buffer
  gps_data['bus_stop'] = pd.Series(dtype='object') #create a new column in gps data set
  gps_data.reset_index(drop = True, inplace = True) #reset indices to run a for loop


  for i in range(len(gps_data)):
    for stop in range(len(bus_terminals)):
      if bus_terminals_buffer.iloc[stop].geometry.contains(gps_data.iloc[i].geometry):
        gps_data.at[i,'bus_stop'] = bus_terminals.at[stop,'terminal_id']

  # Drop the rows where at least one element is missing(droping the middile points)
  trip_ends = gps_data.dropna() #filter records within terminal buffer

  # #EXTRACT TRIP ENDS

  #1)TODO
  #grouping the filtered records of one bus terminal and one date
  trip_ends['grouped_ends'] = ((trip_ends['bus_stop'].shift() != trip_ends['bus_stop']) | (trip_ends['date'].shift() != trip_ends['date'])).cumsum()

  #find the entry or exit record only of the terminals
  #Early records is the entry(1) to the terminal and last record as the exit(0) to the end terminal
  trip_ends['entry/exit'] = pd.Series(dtype='object')
  trip_ends = trip_ends.reset_index(drop=True)

  # 2)TODO
  for name, group in trip_ends.groupby('grouped_ends'):
    #if 0 in group['speed'].values:
    for index, row in group.iterrows():
      if row['devicetime'] == group['devicetime'].max():
        trip_ends.at[index,'entry/exit'] = '0'
      elif row['devicetime'] == group['devicetime'].min():
        trip_ends.at[index,'entry/exit'] = '1'

  trip_ends = trip_ends.dropna() #filter terminal entry/exit records only

  trip_ends = trip_ends.reset_index(drop=True)

  #Providing unique trip id for trips which have entry / exit values within the 2 bus end terminals
  trip = 0
  for i in range(len(trip_ends)-1):
    if (trip_ends.at[i,'bus_stop'] != trip_ends.at[i+1,'bus_stop']) & (trip_ends.at[i,'date'] == trip_ends.at[i+1,'date']):
      trip= trip+1
      trip_ends.at[i,'trip_id'] = trip
      trip_ends.at[i+1,'trip_id'] = trip

  trip_ends = trip_ends.dropna()

  #3)TODO
  trip_ends = trip_ends.groupby('trip_id').filter(lambda x : len(x)>1)    #remove outliers where no defined 2 trip ends for a trip
  trip_ends = trip_ends.reset_index(drop=True)

  return trip_ends

end_buffer = 100
trip_ends = trip_ends(gps_data,bus_terminals,end_buffer)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [95]:
def trip_extraction(trip_ends):

  """
    To extract bus trips with derived columns.
    Create end_time, end_terminal for a bus trip.
    Create features of duration, duration_in_mins, day_of_the_week, hour_of_day

    Args:
        trip_ends (pd.DataFrame): Filtered bus trip data with terminals.

    Returns:
        bus_trips (pd.DataFrame): Bus trip terminals data with derived features.
  """

  bus_trips = trip_ends.copy()
  bus_trips[['end_time','end_terminal']] = bus_trips[['time','bus_stop']].shift(-1)
  bus_trips = bus_trips.iloc[::2]

  bus_trips = bus_trips.drop(['id','devicetime','latitude','longitude','speed','geometry','grouped_ends','entry/exit'],axis=1)
  bus_trips.insert(0,'trip_id',bus_trips.pop('trip_id'))
  bus_trips.rename(columns = {'time':'start_time','bus_stop': 'start_terminal'}, inplace =True)

  conditions = [(bus_trips['start_terminal'] == 'BT01'),
              (bus_trips['start_terminal'] == 'BT02')]
  values = [1,2]

  bus_trips['direction'] = np.select(conditions, values)

  bus_trips = bus_trips[['trip_id','deviceid','date','start_terminal','end_terminal','direction','start_time','end_time']]
  bus_trips=bus_trips.reset_index(drop = True)

  #Calculate trip duration
  bus_trips['duration'] = pd.Series(dtype='object')
  for i in range(len(bus_trips)):
    bus_trips.at[i,'duration'] = datetime.combine(date.min,bus_trips.at[i,'end_time']) - datetime.combine(date.min,bus_trips.at[i,'start_time'])

  bus_trips['duration_in_mins'] = bus_trips['duration']/np.timedelta64(1,'m')

  bus_trips['day_of_week'] = pd.to_datetime(bus_trips['date']).dt.weekday
  bus_trips['hour_of_day'] = list(map(lambda  x: x.hour, (bus_trips['start_time'])))

  return bus_trips

bus_trips = trip_extraction(trip_ends)

In [96]:
#developing geo-buffer rings around every bus stops
def bus_stop_buffer_create(gps_data,bus_stops,stop_buffer,extra_buffer):

  """

    Buffer and additional buffer  created  to accomodate points if they were missed in standard stop buffer.

    Args:
        gps_data (pd.DataFrame): Cleaned gps data filtered out from the server for the required time window.
        bus_stops (pd.DataFrame) : Bus stops data for the trip route
        stop_buffer (int):  Radius of the buffer area to represent bus stops
        extra_buffer (int):  Extended radius of the buffer area to represent bus stops.

    Returns:
        bus_stops_buffer1 (GeoDataFrame) : Buffer created for filtered  Kandy-Digana direction.
        bus_stops_buffer2 (GeoDataFrame) : Buffer created for filtered  Digana-Kandy direction
        gps_data (GeoDataFrame) :  GPS data as GeoDataFrame with projected corrdinates.
        bus_stops_buffer1_add (GeoDataFrame) : Additional buffer created for filtered  Kandy-Digana direction.
        bus_stops_buffer2_add (GeoDataFrame) : Additional buffer created for filtered  Digana-Kandy direction.
  """

  #Create Geodataframe of GPS data and bus stops data
  gps_data = gpd.GeoDataFrame(gps_data, geometry=gpd.points_from_xy(gps_data.longitude,gps_data.latitude),crs='EPSG:4326')
  bus_stops = gpd.GeoDataFrame(bus_stops, geometry=gpd.points_from_xy(bus_stops.longitude,bus_stops.latitude),crs='EPSG:4326')

  #project the corrdinates in Local coordinate system
  bus_stops = bus_stops.to_crs('EPSG:5234')
  gps_data = gps_data.to_crs('EPSG:5234')

  #split bus stops dataframe into two based on route direction
  bus_stops_direction1 = bus_stops[bus_stops['direction']=='Kandy-Digana']
  bus_stops_direction2 = bus_stops[bus_stops['direction']=='Digana-Kandy']

  bus_stops_direction2.reset_index(drop = True, inplace = True)

  #proximity analysis
  #creating a buffer
  bus_stops_buffer1 = gpd.GeoDataFrame(bus_stops_direction1, geometry = bus_stops_direction1.geometry.buffer(stop_buffer))
  bus_stops_buffer2 = gpd.GeoDataFrame(bus_stops_direction2, geometry = bus_stops_direction2.geometry.buffer(stop_buffer))

  #creating additional extra buffer to accomodate points if they were missed in standard stop buffer
  bus_stops_buffer1_add = gpd.GeoDataFrame(bus_stops_direction1, geometry = bus_stops_direction1.geometry.buffer(extra_buffer))
  bus_stops_buffer2_add = gpd.GeoDataFrame(bus_stops_direction2, geometry = bus_stops_direction2.geometry.buffer(extra_buffer))

  return bus_stops_buffer1, bus_stops_buffer2,gps_data,bus_stops_buffer1_add,bus_stops_buffer2_add

stop_buffer = 50
extra_buffer = 100
bus_stops_buffer1, bus_stops_buffer2,gps_data,bus_stops_buffer1_add,bus_stops_buffer2_add = bus_stop_buffer_create(gps_data,bus_stops,stop_buffer,extra_buffer)

#splitting trajectories
def bus_trajectory(gps_data,trip_ends,bus_trips):

  """
    Create bus trajectory data of sequence of bus stops with direction of trip.

    Args:
        gps_data (GeoDataFrame): Bus trips GPS data
        trip_ends (pd.DataFrame) : Splitted trip data from bus_trip_extraction.py
        bus_trips (pd.DataFrame) : Bus trips data

    Returns:
        bus_trajectory (pd.DataFrame): Sequence of bus trip trajectory data
  """

  #gps records that are matched with end terminals, are merged with whole GPS records
  trip_ends = trip_ends[['id','bus_stop','trip_id']]
  bus_trajectory = pd.merge(left = gps_data, right  = trip_ends,how = 'outer',left_on ='id', right_on= 'id')

  #gps records that are not associated with the terminals are asssigned as trip id = 0
  bus_trajectory["trip_id"].fillna(0, inplace = True)

  #run a loop to assign trip_id to records that are in between the terminals
  bus_trajectory.reset_index(drop = True, inplace = True)

  trip =1
  for i in range(len(bus_trajectory)-1):
    if (bus_trajectory.at[i,'trip_id']==trip) & (bus_trajectory.at[i+1, 'trip_id'] == 0):
      bus_trajectory.at[i+1,'trip_id'] = trip
    elif (bus_trajectory.at[i,'trip_id']==trip) & (bus_trajectory.at[i+1, 'trip_id'] == trip):
      trip = trip + 1

  bus_trajectory.drop(bus_trajectory[bus_trajectory['trip_id']==0].index, inplace = True ) #drop records that are not identified as a bus trip

  #Identify the directions of each bus trajectories using bus trips extracted data
  directions= bus_trips.set_index('trip_id').to_dict()['direction']
  bus_trajectory['direction'] = list(map(lambda x: directions[x]   ,bus_trajectory['trip_id']))

  return bus_trajectory

bus_trajectory = bus_trajectory(gps_data,trip_ends,bus_trips)



def stop_buffer_filter(bus_trajectory,bus_stops_buffer1,bus_stops_buffer2,bus_stops_buffer1_add,bus_stops_buffer2_add):


  """

    Filter bus trip data of two buffer ranges with all the bus points, only bus stops points.

    Args:
        bus_trajectory (pd.DataFrame): Sequence of bus trip trajectory data
        bus_stops_buffer1 (GeoDataFrame) : Buffer created for filtered  Kandy-Digana direction.
        bus_stops_buffer2 (GeoDataFrame) : Buffer created for filtered  Digana-Kandy direction
        bus_stops_buffer1_add (GeoDataFrame) : Additional buffer created for filtered  Kandy-Digana direction.
        bus_stops_buffer2_add (GeoDataFrame) : Additional buffer created for filtered  Digana-Kandy direction.

    Returns:
        bus_trip_all_points (pd.DataFrame): Bus trip data with all points including null for bus_stop
        bus_stop_all_points (pd.DataFrame): Bus trip data with only bus_stops points

  """

  #project to local coordinate system before buffer filtering
  bus_trajectory = bus_trajectory.to_crs('EPSG:5234')

  #split trajectories by direction
  trajectory_dir_1 = bus_trajectory[bus_trajectory['direction'] == 1]
  trajectory_dir_2 = bus_trajectory[bus_trajectory['direction'] == 2]

  #reset index before for loop
  trajectory_dir_1.reset_index(drop = True, inplace = True)
  trajectory_dir_2.reset_index(drop = True, inplace = True)

  #filter records within bus stops buffer of both directions
  for i in range(len(trajectory_dir_1)):
    for stop in range(len(bus_stops_buffer1)):
      if bus_stops_buffer1.iloc[stop].geometry.contains(trajectory_dir_1.iloc[i].geometry):
        trajectory_dir_1.at[i,'bus_stop'] = bus_stops_buffer1.at[stop,'stop_id']
      else:
        if bus_stops_buffer1_add.iloc[stop].geometry.contains(trajectory_dir_1.iloc[i].geometry):
          trajectory_dir_1.at[i,'bus_stop'] = bus_stops_buffer1_add.at[stop,'stop_id']

  for i in range(len(trajectory_dir_2)):
    for stop in range(len(bus_stops_buffer2)):
      if bus_stops_buffer2.iloc[stop].geometry.contains(trajectory_dir_2.iloc[i].geometry):
        trajectory_dir_2.at[i,'bus_stop'] = bus_stops_buffer2.at[stop,'stop_id']
      else:
        if bus_stops_buffer2_add.iloc[stop].geometry.contains(trajectory_dir_2.iloc[i].geometry):
          trajectory_dir_2.at[i,'bus_stop'] = bus_stops_buffer2_add.at[stop,'stop_id']
  #concatenate dataframes of both directions and keep only records filtered within bus stops
  bus_trip_all_points = pd.concat([trajectory_dir_1,trajectory_dir_2])
  bus_stop_all_points = bus_trip_all_points.dropna()

  return bus_trip_all_points , bus_stop_all_points

bus_trip_all_points, bus_stop_all_points = stop_buffer_filter(bus_trajectory,bus_stops_buffer1,bus_stops_buffer2,bus_stops_buffer1_add,bus_stops_buffer2_add)

In [97]:
def define_zone(bus_trip_all_points,bus_stop_all_points):
  trip_id_list = bus_trip_all_points['trip_id'].unique().tolist()
  bus_trip_all_points = bus_trip_all_points.reset_index(drop=True)
  bus_trip_all_points_copy =bus_trip_all_points.copy()
  bus_trip_all_points_copy['zone']=bus_trip_all_points_copy['bus_stop'].copy()
  for trip_id in trip_id_list:
    trip = bus_trip_all_points_copy[bus_trip_all_points_copy['trip_id']==trip_id].copy()
    # trip.reset_index(drop = True, inplace = True)
    stops = bus_stop_all_points[bus_stop_all_points['trip_id']==trip_id].copy()
    # stops.reset_index(drop = True, inplace = True)
    # print(trip.iloc[0]['direction'])
    if(trip.iloc[0]['direction']==1):
      trip['bus_stop']=trip['bus_stop'].replace("BT01",'100')
      trip['bus_stop']= trip['bus_stop'].replace("BT02",'115')
      stops['bus_stop']=stops['bus_stop'].replace("BT01",'100')
      stops['bus_stop']= stops['bus_stop'].replace("BT02",'115')
    else:
      trip['bus_stop']=trip['bus_stop'].replace("BT02",'200')
      trip['bus_stop']= trip['bus_stop'].replace("BT01",'214')
      stops['bus_stop']=stops['bus_stop'].replace("BT02",'200')
      stops['bus_stop']= stops['bus_stop'].replace("BT01",'214')

    # print(trip_id)

    stops_list = stops['bus_stop'].unique()
    for i in range(len(stops_list)):
      if(i!=len(stops_list)-1):
        start = trip[trip['bus_stop']==stops_list[i]].index.max()+1
        end = trip[trip['bus_stop']==stops_list[i+1]].index.min()-1
        bus_trip_all_points_copy.loc[start:end,'zone']=float(stops_list[i])+0.5
        # print(trip.iloc[start:end,5].tolist())
        # data[float(stops_list[i])+0.5].append(trip.iloc[start:end,5].mean())
  bus_trip_all_points_copy.loc[(bus_trip_all_points_copy['direction']==1)&(bus_trip_all_points_copy['zone']=='BT01'),'zone']='100'
  bus_trip_all_points_copy.loc[(bus_trip_all_points_copy['direction']==1)&(bus_trip_all_points_copy['zone']=='BT02'),'zone']='115'
  bus_trip_all_points_copy.loc[(bus_trip_all_points_copy['direction']==2)&(bus_trip_all_points_copy['zone']=='BT01'),'zone']='214'
  bus_trip_all_points_copy.loc[(bus_trip_all_points_copy['direction']==2)&(bus_trip_all_points_copy['zone']=='BT02'),'zone']='200'
  return bus_trip_all_points_copy
bus_trip_zone_wise_points = define_zone(bus_trip_all_points,bus_stop_all_points)

In [98]:
# bus_trip_all_points = pd.read_csv('/content/drive/MyDrive/Data_sets/Raw-GPS-data-Kandy-Buses/preprocessed-data-digana/2022_01/bus_trip_all_points_2022_01.csv')
# bus_stop_all_points = pd.read_csv('/content/drive/MyDrive/Data_sets/Raw-GPS-data-Kandy-Buses/preprocessed-data-digana/2022_01/bus_stop_all_points_2022_01.csv')

In [99]:
bus_trip_zone_wise_points

Unnamed: 0,id,deviceid,devicetime,latitude,longitude,speed,date,time,geometry,bus_stop,trip_id,direction,zone
0,1454563267,116,2022-10-01 09:50:21,7.293147,80.635247,4.31966,2022-10-01,09:50:21,POINT (184710.345 232328.853),BT01,2.0,1,100
1,1454563270,116,2022-10-01 09:50:36,7.293152,80.635695,7.01944,2022-10-01,09:50:36,POINT (184759.855 232329.391),,2.0,1,100.5
2,1454563271,116,2022-10-01 09:50:51,7.293175,80.635822,0.00000,2022-10-01,09:50:51,POINT (184773.835 232331.974),,2.0,1,100.5
3,1454569242,116,2022-10-01 09:51:06,7.293093,80.635853,0.00000,2022-10-01,09:51:06,POINT (184777.332 232322.938),,2.0,1,100.5
4,1454569246,116,2022-10-01 09:51:21,7.293085,80.636310,8.09935,2022-10-01,09:51:21,POINT (184827.759 232322.005),,2.0,1,100.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
554020,1549812956,1377,2022-11-01 18:32:10,7.290812,80.638093,11.87910,2022-11-01,18:32:10,POINT (185024.587 232070.549),,1973.0,2,213.5
554021,1549812957,1377,2022-11-01 18:32:25,7.291148,80.637472,9.71923,2022-11-01,18:32:25,POINT (184955.953 232107.802),,1973.0,2,213.5
554022,1549812958,1377,2022-11-01 18:32:40,7.291362,80.636612,17.81860,2022-11-01,18:32:40,POINT (184861.003 232131.418),,1973.0,2,213.5
554023,1549812959,1377,2022-11-01 18:32:55,7.291580,80.635345,15.65880,2022-11-01,18:32:55,POINT (184721.158 232155.611),,1973.0,2,213.5


In [100]:
download_csv(trip_ends,'trip_ends')
download_csv(bus_trips,'bus_trips')
download_csv(bus_trip_all_points,'bus_trip_all_points')
download_csv(bus_stop_all_points,'bus_stop_all_points')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [101]:
download_csv(bus_trip_zone_wise_points,'bus_trip_zone_wise_points')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>