In [1]:
import pandas as pd
import os
from matplotlib import pyplot as plt
import glob
import numpy as np
from tqdm import tqdm
from datetime import datetime, timezone, timedelta
import math

In [2]:
def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometers between two points.

    All four arguments are coordinates in decimal degrees. The result uses
    the haversine formula on a sphere of radius 6371 km. NaN inputs are not
    rejected; they propagate and yield a NaN distance.
    """
    earth_radius_km = 6371.0

    # Half of the latitude / longitude separation, in radians.
    half_dlat = math.radians(lat2 - lat1) / 2
    half_dlon = math.radians(lon2 - lon1) / 2

    # Haversine of the central angle, split into its two additive terms.
    lat_term = math.sin(half_dlat) ** 2
    lon_term = (math.cos(math.radians(lat1)) * math.cos(math.radians(lat2)) *
                math.sin(half_dlon) ** 2)
    hav = lat_term + lon_term

    # atan2 form of the inverse haversine gives the central angle in radians.
    central_angle = 2 * math.atan2(math.sqrt(hav), math.sqrt(1 - hav))

    return earth_radius_km * central_angle

In [None]:
# Per-driver trip segmentation.
# For every raw trace file matching data_raw/new_*.txt, read the
# space-separated records, order them by time, split them into trips and
# write the result to processed_<driver>.csv under data_path.
data_path = './../data_cabspotting/'

# A new trip starts whenever two consecutive records are further apart than
# either of these thresholds.
MAX_GAP_SECONDS = 100  # elapsed time since the previous record, in seconds
MAX_JUMP_KM = 2.5      # haversine distance from the previous record, in km

file_paths = glob.glob(os.path.join(data_path, 'data_raw', 'new_*.txt'))

dfs = []  # not populated in this cell; kept in case other cells reference it
for file_path in tqdm(file_paths):
    if not os.path.exists(file_path):
        continue

    # Columns are positional (no header): 0 = latitude, 1 = longitude,
    # 3 = Unix timestamp. Column 2 is presumably an occupancy flag -- TODO
    # confirm; here it is only used to detect a change between records.
    df_temp = pd.read_table(file_path, sep=" ", header=None)

    # NOTE(review): a fixed UTC-8 offset ignores daylight saving time --
    # confirm this is the intended local time for the data set.
    df_temp[3] = df_temp[3].apply(lambda x: datetime.fromtimestamp(x, tz=timezone.utc).astimezone(timezone(timedelta(hours=-8))))

    df_temp = df_temp.sort_values(by=3)
    df_temp['prev_latitude'] = df_temp[0].shift(1)
    df_temp['prev_longitude'] = df_temp[1].shift(1)

    # Distance from the previous record; NaN on the first row (no predecessor).
    df_temp['distance_km'] = df_temp.apply(
        lambda row: haversine(row[0], row[1],
                              row['prev_latitude'], row['prev_longitude']),
        axis=1)

    df_temp['prev_time'] = df_temp[3].shift(1)
    # total_seconds() gives the full elapsed time. `.dt.seconds` would return
    # only the seconds component (0-86399) and silently drop whole days, so a
    # gap of "1 day + 30 s" would look like 30 s and fail to split the trip.
    df_temp['time_diff'] = (df_temp[3] - df_temp['prev_time']).dt.total_seconds()

    # Transition if the gap in time or space is too large ...
    df_temp['flag_transition_time'] = np.where(
        (df_temp['time_diff'] > MAX_GAP_SECONDS) | (df_temp['distance_km'] > MAX_JUMP_KM), 1, 0)
    # ... or if column 2 changed relative to the previous record.
    df_temp['flag_transition_bool'] = df_temp[2].diff().abs()

    # The first row has NaN in flag_transition_bool; NaN > 0.5 is False, so
    # the first trip gets id 0 and each later transition increments the id.
    df_temp['flag_transition'] = np.where(
        df_temp['flag_transition_time'] + df_temp['flag_transition_bool'] > 0.5, 1, 0)
    df_temp['trip_id'] = (df_temp['flag_transition'] == 1).cumsum()

    df_temp = df_temp.drop(
        columns=['prev_latitude', 'prev_longitude',
                 'prev_time', 'flag_transition_time',
                 'flag_transition_bool', 'flag_transition'])

    # Remaining NaNs (first-row distance_km / time_diff) become 0.
    df_temp = df_temp.fillna(0)

    # Driver id is the file name without the 'new_' prefix and '.txt' suffix.
    file_name = os.path.basename(file_path).replace('new_', '').replace('.txt', '')
    df_temp['driver'] = file_name

    output_path = os.path.join(data_path, f"processed_{file_name}.csv")
    df_temp.to_csv(output_path, index=False)

 24%|██▍       | 131/536 [01:13<03:47,  1.78it/s]