In [1]:
import pandas as pd
import numpy as np
import requests
from requests.exceptions import Timeout
import time
import os

In [2]:
os.listdir("../../data/Raw-GPS-data-Kandy-Buses/preprocessed-data-digana_2021_10/")

['bus_stop_all_points.csv',
 'bus_stop_all_points.xlsx',
 'bus_trips.csv',
 'bus_trip_all_points.csv',
 'bus_trip_all_points.xlsx',
 'bus_trip_all_zone_points.csv',
 'bus_trip_all_zone_points.xlsx',
 'bus_trip_with_max_speed.csv',
 'clearn.csv',
 'clearn_gps_data_with_trip_id.csv',
 'trip_ends.csv']

In [10]:
def get_directions(origin, destination, timeout=5, max_retries=3):
    base_url = "https://maps.googleapis.com/maps/api/directions/json"

    params = {
        'origin': f"{origin[0]},{origin[1]}",
        'destination': f"{destination[0]},{destination[1]}",
        'key': "AIzaSyCz5uw3SrNct_Dqw6S6_D6AokxUVp0_hAg",
    }

    for retry in range(max_retries):
        try:
            response = requests.get(base_url, params=params, timeout=timeout)
            response.raise_for_status()  # Raise an HTTPError for bad responses

            # Parse and return the JSON response
            return response.json()
        except Timeout:
            print(f"Timeout error. Retrying... ({retry + 1}/{max_retries})")
        except requests.exceptions.RequestException as e:
            print(f"Error: {e}")

    # If all retries fail, return None or handle as needed
    print(f"Failed after {max_retries} retries. Returning None.")
    return None

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

# this is the initial segmentation component with this component we can segment all the trips into segments
class TripSegmenterByDistance( BaseEstimator, TransformerMixin):
  def fit(self, X, y=None):
    return self

  def transform(self, X):
    gps_data_ts, bus_trips_ts = X
    new_records = []  # List to store the split records
    gps_data_ts['d_segment_id'] = None
    segment_id = 1

    total_distance = 16800   # is the distance in meters from Digana to Kandy
    rng = 1000

    start_point =

    for distance in range(rng, total_distance//rng, rng):
      for index, row in bus_trips_ts.iterrows():
        max_index = np.max(np.where(gps_data_ts['trip_id'] == row['trip_id']))
        min_index = np.min(np.where(gps_data_ts['trip_id'] == row['trip_id']))

        margin = min_index

        while(margin < max_index):
          margin, distance = self.binary_search(gps_data_ts, min_index, max_index, 1000);
          # print(f"margin: {margin}, distance: {distance}")
          # break
          gps_data_ts.loc[min_index:margin, 'd_segment_id'] = segment_id

          min_index = margin + 1
          segment_id+=1
    return gps_data_ts


  def binary_search(self, gps_data, starting_ind, max_ind, seg_distance):
    low, high = starting_ind, max_ind
    early_mid, early_distance = None, None
    rounds = 0
    while low<=high:
      if rounds>0:
        early_mid, early_distance = mid, distance
      mid = (low + high)//2
      # take the response from google maps API
      origin = (gps_data.iloc[starting_ind]['latitude'], gps_data.iloc[starting_ind]['longitude'])
      destination = (gps_data.iloc[mid]['latitude'], gps_data.iloc[mid]['longitude'])
      res  = get_directions(origin, destination)
      distance = res['routes'][0]['legs'][0]['distance']['value']
      # print(f"Distance: {distance}   Mid: {mid}")

      if distance>= seg_distance - 20 and distance <= seg_distance + 20:
        # print(res['routes'][0]['legs'][0])
        return mid, distance
      elif distance < seg_distance - 20:
        low = mid + 1
      elif distance > seg_distance + 20:
        high = mid - 1
      rounds+=1
    # print(f"Er dis: {early_distance}  er_mid: {early_mid}")
    if early_mid!=None and abs(seg_distance-early_distance) <= abs(seg_distance-distance):
      return early_mid, early_distance
    else:
      return mid, distance


In [3]:
bus_trips = pd.read_csv("../../data/Raw-GPS-data-Kandy-Buses/preprocessed-data-digana_2021_10/bus_trips.csv")
gps_data = pd.read_csv("../../data/Raw-GPS-data-Kandy-Buses/preprocessed-data-digana_2021_10/clearn_gps_data_with_trip_id.csv")

In [58]:
from sklearn.base import BaseEstimator, TransformerMixin
from geopy.distance import geodesic

# this is the secondary segmentation component which creates the DataFrame of route points along the track
class TripSegmenterByDistance( BaseEstimator, TransformerMixin):

  def __init__(self, month_pointer, path_to_temp, previous_segment_max,path_to_terminals, precision=0.01, seg_pointer = '1000M', ):
    self.month_pointer = month_pointer
    self.path_to_temp = path_to_temp
    self.previous_segment_max = previous_segment_max
    self.seg_distance = int(seg_pointer[:-1])
    self.seg_pointer = seg_pointer
    self.path_to_terminals = path_to_terminals
    self.precision = precision

  def fit(self, X, y=None):
    return self
  
  def calculate_split_df(self, gps_data_ts, bus_trips_ts):
    # load terminals so that starting location and ending location is known
    terminals = pd.read_csv(self.path_to_terminals)

    # load starting point and ending points from the terminals df
    starting_point = (terminals.loc[0]['latitude'], terminals.iloc[0]['longitude'])
    ending_point  = (terminals.loc[1]['latitude'], terminals.iloc[1]['longitude'])

    processing_point = starting_point

    # select trips that are driven to the selected direction
    bus_trips_dir_1 = bus_trips_ts[(bus_trips_ts['start_terminal']==terminals.loc[0]['terminal_id']) & (bus_trips_ts['end_terminal']==terminals.loc[1]['terminal_id'])]

    # splitting points that would identify will store here
    split_points = []

    split_point_id = 1

    # run until complete route is segmented
    # get the distance from currently processing point to ending point and compare with segment length
    while(get_directions(processing_point, ending_point)['routes'][0]['legs'][0]['distance']['value']>self.seg_distance):
      print("Processing: ",split_point_id)
      # assign a ID to the split point
      split_point = {
        'split_point_id': split_point_id
      }
      # iterate through the bus trips df for the selected trips
      for index, row in bus_trips_dir_1.iterrows():
        max_index = np.max(np.where(gps_data_ts['trip_id'] == row['trip_id']))
        min_index = np.min(np.where(gps_data_ts['trip_id'] == row['trip_id']))

        margin, distance = self.binary_search(gps_data_ts,processing_point, min_index, max_index, ending_point)
        print(f"margin: {margin}, distance: {distance}")
        if abs(self.seg_distance - distance)/self.seg_distance <= self.precision:
          split_point['latitude'] = gps_data_ts.iloc[margin]['latitude']
          split_point['longitude'] = gps_data_ts.iloc[margin]['longitude']
          break

      # append the split points to the output list
      split_points.append(split_point)
      
      # increment the id
      split_point_id+=1

      processing_point = (split_point['latitude'], split_point['longitude'])
    
    # take splitting points into a dataframe
    split_points_df = pd.DataFrame(split_points)

    return split_points_df

  def transform(self, X):
    # splits data into gps data and the bus trips
    gps_data_ts, bus_trips_ts = X

    split_points_file_path = "./segment_split_points.csv"
    if os.path.exists(split_points_file_path):
      split_points_df = pd.read_csv(split_points_file_path)
      print("Loaded splitting points from cache")
    else:
      split_points_df = self.calculate_split_df(gps_data_ts, bus_trips_ts)
    
    # initialize segment ids of gps data to None
    gps_data_ts['segment_id'] = np.nan

    # segment_id for the gps_data ans segments df
    segment_id = self.previous_segment_max + 1

    for index, row in bus_trips_ts.iterrows():
      print(f"Processing trip-ID: {index+1} with dir: {row['direction']}")
      # get the maximum and minimum indexes of trip's gps points
      max_index = np.max(np.where(gps_data_ts['trip_id'] == row['trip_id']))
      min_index = np.min(np.where(gps_data_ts['trip_id'] == row['trip_id']))

      # take a copy from split points df to store buffer points
      if row['direction']==1:
        buff = split_points_df[::].copy()
      else:
        buff = split_points_df[::-1].reset_index(drop=True).copy()
    
      # initialize the gps_data_index to nan
      buff['gps_data_index'] = np.nan

      # filling out gps_data_ts's index to split points
      for ind, split_row in buff.iterrows():
        split_point = (split_row['latitude'], split_row['longitude'])
        for i in range(min_index, max_index+1):
          gps_point = (gps_data_ts.iloc[i]['latitude'], gps_data_ts.iloc[i]['longitude'])

          dist = geodesic(split_point, gps_point).meters
          if dist<=100:
            buff.loc[ind,'gps_data_index'] = i
            break
      
      # assign gps_data_ts the segment_ids
      for i in range(0, len(buff)):
        start = min_index if i==0 else buff.loc[i-1]['gps_data_index']
        end = buff.loc[i]['gps_data_index']

        if (not start) or (not end):
          print("-----------------------------------")
        gps_data_ts.loc[start:end,'segment_id'] = segment_id
        segment_id+=1

        if i== len(buff)-1 :
          gps_data_ts.loc[end:max_index,'segment_id'] = segment_id
          segment_id += 1
    
    return gps_data_ts
      
      

  def binary_search(self, gps_data, origin,starting_ind, max_ind, terminal_location):
    # low and high is for binary search since there are needed to calculate mid
    low, high = starting_ind, max_ind

    # early distance and mid is stored since it is useful when returning the optimal mid and distance
    early_mid, early_distance = None, None

    # tracking rounds since to store early mid and early distance, rounds > 0
    rounds = 0

    while low<=high:
      if rounds>0:
        early_mid, early_distance = mid, distance
      mid = (low + high)//2

      # assigning destination
      destination = (gps_data.iloc[mid]['latitude'], gps_data.iloc[mid]['longitude'])

      origin_to_terminal = get_directions(origin, terminal_location)['routes'][0]['legs'][0]['distance']['value']
      destination_to_teminal = get_directions(destination, terminal_location)['routes'][0]['legs'][0]['distance']['value']

      if origin_to_terminal<=destination_to_teminal:
        low = mid+1
        continue

      # calling the API
      res  = get_directions(origin, destination)

      # extract out distance
      distance = res['routes'][0]['legs'][0]['distance']['value']

      if distance>= self.seg_distance - 20 and distance <= self.seg_distance + 20:
        # print(res['routes'][0]['legs'][0])
        return mid, distance
      elif distance < self.seg_distance - 20:
        low = mid + 1
      elif distance > self.seg_distance + 20:
        high = mid - 1
      rounds+=1
    # print(f"Er dis: {early_distance}  er_mid: {early_mid}")
    if early_mid!=None and abs(self.seg_distance-early_distance) <= abs(self.seg_distance-distance):
      return early_mid, early_distance
    else:
      return mid, distance


In [59]:
comp = TripSegmenterByDistance("Dinana_2021_10", "tempPath", 0 , "../../data/Raw-GPS-data-Kandy-Buses/more/bus_terminals_654.csv",0.01, "1000M")
result_df = comp.transform((gps_data, bus_trips))

Loaded splitting points from cache
Processing trip-ID: 1 with dir: 2
-----------------------------------
Processing trip-ID: 2 with dir: 1
Processing trip-ID: 3 with dir: 2
Processing trip-ID: 4 with dir: 1
Processing trip-ID: 5 with dir: 2
Processing trip-ID: 6 with dir: 1
Processing trip-ID: 7 with dir: 2
Processing trip-ID: 8 with dir: 1
Processing trip-ID: 9 with dir: 2
Processing trip-ID: 10 with dir: 1
Processing trip-ID: 11 with dir: 2
Processing trip-ID: 12 with dir: 1
Processing trip-ID: 13 with dir: 2
Processing trip-ID: 14 with dir: 1
Processing trip-ID: 15 with dir: 2
Processing trip-ID: 16 with dir: 1
Processing trip-ID: 17 with dir: 2
Processing trip-ID: 18 with dir: 1
Processing trip-ID: 19 with dir: 2
Processing trip-ID: 20 with dir: 1
Processing trip-ID: 21 with dir: 2
Processing trip-ID: 22 with dir: 1
Processing trip-ID: 23 with dir: 2
Processing trip-ID: 24 with dir: 1
Processing trip-ID: 25 with dir: 2
Processing trip-ID: 26 with dir: 1
Processing trip-ID: 27 with 

In [26]:
result_df[result_df['trip_id']>1706]

Unnamed: 0,id,deviceid,devicetime,latitude,longitude,speed,date,time,trip_id,segment_id
392612,582673730,1377,2021-10-25 13:20:59,7.298842,80.734025,7.55940,2021-10-25,13:20:59,1707.0,26993.0
392613,582673731,1377,2021-10-25 13:21:14,7.298725,80.733523,9.17927,2021-10-25,13:21:14,1707.0,26993.0
392614,582673732,1377,2021-10-25 13:21:29,7.298230,80.733130,5.93953,2021-10-25,13:21:29,1707.0,26993.0
392615,582673733,1377,2021-10-25 13:21:44,7.297807,80.732997,6.47948,2021-10-25,13:21:44,1707.0,26993.0
392616,582673734,1377,2021-10-25 13:21:59,7.297535,80.732432,9.71923,2021-10-25,13:21:59,1707.0,26993.0
...,...,...,...,...,...,...,...,...,...,...
396733,585350956,1377,2021-10-27 19:02:53,7.290715,80.638227,5.39957,2021-10-27,19:02:53,1725.0,27296.0
396734,585350957,1377,2021-10-27 19:03:08,7.291095,80.637743,8.09935,2021-10-27,19:03:08,1725.0,27296.0
396735,585351802,1377,2021-10-27 19:03:23,7.291273,80.637273,5.39957,2021-10-27,19:03:23,1725.0,27296.0
396736,585351803,1377,2021-10-27 19:03:38,7.291202,80.636218,18.89850,2021-10-27,19:03:38,1725.0,27296.0


In [60]:
result_df.to_csv("./gps_data_segmented_1000M.csv", index = False)

In [44]:
result_df[result_df['segment_id'] == 3665.0]

Unnamed: 0,id,deviceid,devicetime,latitude,longitude,speed,date,time,trip_id,segment_id
51345,571476016,123,2021-10-12 15:02:30,7.299112,80.734622,5.39957,2021-10-12,15:02:30,230.0,3665.0
51346,571478006,123,2021-10-12 15:05:31,7.295168,80.735662,0.00000,2021-10-12,15:05:31,230.0,3665.0
51347,571478355,123,2021-10-12 15:05:54,7.295817,80.735328,3.23974,2021-10-12,15:05:54,230.0,3665.0
51348,571478356,123,2021-10-12 15:05:55,7.295790,80.735305,5.39957,2021-10-12,15:05:55,230.0,3665.0
51349,571479164,123,2021-10-12 15:06:10,7.295165,80.734663,7.55940,2021-10-12,15:06:10,230.0,3665.0
...,...,...,...,...,...,...,...,...,...,...
51446,571636557,123,2021-10-12 17:33:41,7.282172,80.722955,2.69978,2021-10-12,17:33:41,230.0,3665.0
51447,571636621,123,2021-10-12 17:33:45,7.282228,80.722912,0.00000,2021-10-12,17:33:45,230.0,3665.0
51448,571636716,123,2021-10-12 17:33:50,7.282302,80.722858,3.23974,2021-10-12,17:33:50,230.0,3665.0
51449,571639882,123,2021-10-12 17:34:05,7.283058,80.722237,14.03890,2021-10-12,17:34:05,230.0,3665.0


In [62]:
groupby_res = result_df.groupby('segment_id').count()
groupby_res[groupby_res['id']>100]

Unnamed: 0_level_0,id,deviceid,devicetime,latitude,longitude,speed,date,time,trip_id
segment_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
3792.0,128,128,128,128,128,128,128,128,128
3795.0,333,333,333,333,333,333,333,333,333
3894.0,106,106,106,106,106,106,106,106,106
3897.0,273,273,273,273,273,273,273,273,273
3999.0,338,338,338,338,338,338,338,338,338
...,...,...,...,...,...,...,...,...,...
21723.0,108,108,108,108,108,108,108,108,108
23325.0,154,154,154,154,154,154,154,154,154
23328.0,119,119,119,119,119,119,119,119,119
24566.0,179,179,179,179,179,179,179,179,179


In [63]:
result_df[result_df['segment_id'] == 23328.0]

Unnamed: 0,id,deviceid,devicetime,latitude,longitude,speed,date,time,trip_id,segment_id
346754,560361273,1250,2021-10-01 08:05:37,7.290413,80.639330,11.33910,2021-10-01,08:05:37,1498.0,25143.0
346755,560361276,1250,2021-10-01 08:05:52,7.290585,80.638353,16.73870,2021-10-01,08:05:52,1498.0,25143.0
346756,560367996,1250,2021-10-01 08:06:07,7.291150,80.637575,7.01944,2021-10-01,08:06:07,1498.0,25143.0
346757,560367997,1250,2021-10-01 08:06:22,7.291328,80.636963,13.49890,2021-10-01,08:06:22,1498.0,25143.0
346758,560367998,1250,2021-10-01 08:06:37,7.291303,80.636090,11.87910,2021-10-01,08:06:37,1498.0,25143.0
...,...,...,...,...,...,...,...,...,...,...
346973,560437022,1358,2021-10-01 08:44:39,7.290845,80.638165,10.79910,2021-10-01,08:44:39,1498.0,25143.0
346974,560444210,1358,2021-10-01 08:44:54,7.291197,80.637432,8.63931,2021-10-01,08:44:54,1498.0,25143.0
346975,560444211,1358,2021-10-01 08:45:09,7.291322,80.636848,15.11880,2021-10-01,08:45:09,1498.0,25143.0
346976,560444212,1358,2021-10-01 08:45:24,7.291487,80.635713,17.27860,2021-10-01,08:45:24,1498.0,25143.0


In [71]:
import folium
df = pd.read_csv("./segment_split_points.csv")
segment = result_df[(result_df['segment_id'] == 17) | (result_df['segment_id'] == 16)]

# Create a folium map centered around the mean of latitudes and longitudes
map_center = [df['latitude'].mean(), df['longitude'].mean()]
mymap = folium.Map(location=map_center, zoom_start=14)

# Add markers for each row in the DataFrame
for index, row in df.iterrows():
    folium.Marker(location=[row['latitude'], row['longitude']]).add_to(mymap)

# Add markers for each row in the DataFrame
for index, row in segment.iterrows():
    folium.Marker(location=[row['latitude'], row['longitude']], icon=folium.Icon(color="green")).add_to(mymap)

# Save the map as an HTML file
mymap.save("map.html")


In [61]:
result_df.to_csv("./segment_split_points.csv", index=False)

In [None]:
import datetime
parsed_date = "2023-12-05 10:00:00"
start_time = pd.Timestamp(parsed_date)

# Generate a range of 3-minute intervals for the current record
time_intervals = pd.date_range(start=start_time, periods=10, freq='3T')
print(time_intervals)

DatetimeIndex(['2023-12-05 10:00:00', '2023-12-05 10:03:00',
               '2023-12-05 10:06:00', '2023-12-05 10:09:00',
               '2023-12-05 10:12:00', '2023-12-05 10:15:00',
               '2023-12-05 10:18:00', '2023-12-05 10:21:00',
               '2023-12-05 10:24:00', '2023-12-05 10:27:00'],
              dtype='datetime64[ns]', freq='3T')


In [78]:
for index, row in bus_trips_ts.iterrows():
      print(f"Processing trip-ID: {index+1}")
      # get the maximum and minimum indexes of trip's gps points
      max_index = np.max(np.where(gps_data_ts['trip_id'] == row['trip_id']))
      min_index = np.min(np.where(gps_data_ts['trip_id'] == row['trip_id']))

      # intialize lower pointer so I can use iloc with range
      lower_ptr = min_index

      # this is the pointer that points to splitting point
      split_points_ptr = 0

      prev_dist = None
      consec = 0
      # loop through entire trip's gps points
      for ind in range(min_index, max_index+1):

        if len(split_points_df) == split_points_ptr:
          gps_data_ts.loc[lower_ptr:max_index, "segment_id"] = segment_id
          segment_id +=1
          break
        gps_cordinate = (gps_data_ts.iloc[ind]['latitude'], gps_data_ts.iloc[ind]['longitude'])
        split_point_cordinate = (split_points_df.iloc[split_points_ptr]['latitude'], split_points_df.iloc[split_points_ptr]['longitude'])
        dist = geodesic(gps_cordinate, split_point_cordinate).meters

        if prev_dist and dist> prev_dist:
          consec+=1
          if consec>3:
            print("Something has missing")

        if dist<=100:
          gps_data_ts.loc[lower_ptr:ind, "segment_id"] = segment_id
          segment_id +=1
          lower_ptr = ind + 1
          split_points_ptr += 1

          # aditional consec thing
          prev_dist = None
          consec = 0

        prev_dist = dist
    return gps_data_ts


IndentationError: unindent does not match any outer indentation level (<tokenize>, line 42)