In [45]:
# Imports for the analysis of the taxi_6211.txt dataset
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# The path to the dataset is set in the next cell


In [46]:
dataset_path = 'taxi_6211.txt'
# Check if the dataset exists.
# Raise a descriptive exception instead of calling sys.exit(): inside a notebook
# SystemExit does not produce a useful traceback, whereas FileNotFoundError
# halts the run and names the missing file.
if not os.path.exists(dataset_path):
    raise FileNotFoundError(f"Dataset {dataset_path} not found.")

In [47]:
# Load the data.
# Read from dataset_path (the path whose existence was checked above) instead of
# repeating the file name, so the checked path and the loaded path cannot diverge.
df = pd.read_csv(dataset_path, sep='\t')
df['timestamp'] = pd.to_datetime(df['timestamp'])
# Use a stable sort: rows sharing the same timestamp keep their original file order,
# which makes every later "consecutive rows" step reproducible.
df = df.sort_values('timestamp', kind='mergesort').reset_index(drop=True)

# For consecutive rows, if latitude and longitude are the same, keep only the first one.
# The first row is always kept because shift() yields NaN, which never compares equal.
same_as_previous = (
    df['latitude'].eq(df['latitude'].shift())
    & df['longitude'].eq(df['longitude'].shift())
)
df = df.loc[~same_as_previous].reset_index(drop=True)

In [48]:
from geopy.distance import geodesic

def distance_km(lat1, lon1, lat2, lon2):
    """Return the geodesic distance, in kilometres, between two points.

    Each point is given as a latitude/longitude pair: (lat1, lon1) is the
    first point and (lat2, lon2) is the second.
    """
    start = (lat1, lon1)
    end = (lat2, lon2)
    separation = geodesic(start, end)
    return separation.km

spike_remove = []
threshold_km = 0.5  # adjust as needed for your data (e.g., 0.5 km)

# Pull the coordinates out once as plain Python floats; building a row Series with
# df.iloc[i] three times per iteration is far slower than list indexing.
lats = df['latitude'].tolist()
lons = df['longitude'].tolist()

# step_km[k] is the distance between row k and row k + 1. Computing it once avoids
# measuring every consecutive pair twice (as "curr -> next" and later as "prev -> curr").
step_km = [
    distance_km(lats[k], lons[k], lats[k + 1], lons[k + 1])
    for k in range(len(lats) - 1)
]

# A spike is a row that is far from both neighbours while the neighbours themselves
# are close to each other (sudden jump and return). First and last rows are skipped
# because they have only one neighbour.
for i in range(1, len(lats) - 1):
    far_from_prev = step_km[i - 1] > threshold_km
    far_from_next = step_km[i] > threshold_km
    # The prev -> next distance is only needed (and only computed) when curr is
    # already far from both neighbours.
    if (
        far_from_prev
        and far_from_next
        and distance_km(lats[i - 1], lons[i - 1], lats[i + 1], lons[i + 1]) < threshold_km
    ):
        spike_remove.append(i)

# df has a fresh 0..n-1 index, so the positions collected above are also its labels.
df_spike_cleaned = df.drop(index=spike_remove).reset_index(drop=True)
print(f"Removed {len(spike_remove)} spike rows (sudden jump and return).")
df_spike_cleaned.to_csv('taxi_6211_spike_cleaned.txt', sep='\t', index=False)

Removed 47 spike rows (sudden jump and return).


In [49]:
# Show the positions (in df) of the rows that were flagged as spikes and dropped
print(spike_remove)

[443, 453, 2219, 2234, 2379, 3359, 3758, 3839, 4676, 5456, 6519, 6905, 6988, 7952, 9077, 9590, 10208, 10679, 10796, 10889, 11224, 11792, 12235, 12634, 12999, 13774, 15709, 16060, 16101, 16288, 18212, 18295, 19020, 20175, 20210, 21154, 23487, 24931, 25068, 25945, 26978, 27092, 28183, 30207, 30262, 30266, 30944]


In [50]:
# Print one of the flagged spike rows from the original dataframe.
# 443 is the first index listed in spike_remove; the label now matches the row
# that is actually displayed (it previously said "Row 75").
spike_row_index = 443
print(f"Row {spike_row_index} from original dataframe:")
print(df.iloc[spike_row_index])

Row 75 from original dataframe:
taxiId                      6211
timestamp    2008-02-02 14:18:31
longitude              108.94395
latitude                28.46873
Name: 443, dtype: object


In [51]:
def is_possible_move(lat1, lon1, t1, lat2, lon2, t2, max_speed_kmh=200):
    """Tell whether travelling between two timestamped points is physically plausible.

    Parameters:
        lat1, lon1, t1: latitude, longitude and timestamp of the first point.
        lat2, lon2, t2: latitude, longitude and timestamp of the second point.
        max_speed_kmh: highest speed, in km/h, still regarded as possible.

    Returns:
        True when the implied speed does not exceed max_speed_kmh. When both
        timestamps are identical, the move is possible only if the distance
        between the points is exactly zero.
    """
    travelled_km = geodesic((lat1, lon1), (lat2, lon2)).km
    # The order of the two timestamps does not matter: use the absolute gap.
    elapsed_hours = abs((t2 - t1).total_seconds()) / 3600
    if elapsed_hours != 0:
        return travelled_km / elapsed_hours <= max_speed_kmh
    # No time elapsed: any displacement at all would require infinite speed.
    return travelled_km == 0



# Scan the spike-cleaned track for consecutive rows whose implied speed is still
# impossible. Each hit is recorded as the pair of positions (earlier, later).
track_points = list(
    df_spike_cleaned[['latitude', 'longitude', 'timestamp']].itertuples(index=False, name=None)
)
impossible_moves = [
    (pos, pos + 1)
    for pos, ((lat_a, lon_a, time_a), (lat_b, lon_b, time_b))
    in enumerate(zip(track_points, track_points[1:]))
    if not is_possible_move(lat_a, lon_a, time_a, lat_b, lon_b, time_b)
]


In [52]:
# Each tuple is a pair of consecutive positions (i - 1, i) in df_spike_cleaned
# for which is_possible_move returned False
print(impossible_moves)

[(4602, 4603), (6221, 6222), (9757, 9758), (9758, 9759), (11990, 11991), (11991, 11992), (12069, 12070), (13325, 13326), (17183, 17184), (17184, 17185), (17276, 17277), (17278, 17279), (17280, 17281), (22676, 22677), (22677, 22678), (22698, 22699), (22793, 22794), (22795, 22796), (29693, 29694), (29694, 29695), (29781, 29782), (29783, 29784), (29785, 29786), (30978, 30979), (30979, 30980)]


In [53]:
# Inspect the earlier row of the first flagged pair, (4602, 4603), in df_spike_cleaned
df_spike_cleaned.iloc[4602]

taxiId                      6211
timestamp    2008-02-03 13:00:12
longitude              116.26726
latitude                40.08606
Name: 4602, dtype: object