In [27]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [28]:
def find_conflicting_coordinates(pickle_file_path):
    """
    Find rows whose (taxi_id, timestamp) key maps to more than one coordinate.

    The pickle file is loaded into a DataFrame, rows are grouped by taxi_id and
    timestamp, and every row that belongs to a group with more than one distinct
    latitude or more than one distinct longitude is returned. Missing (NaN)
    coordinates are not counted as distinct values.

    :param pickle_file_path: Path to the pickle file
    :return: DataFrame with all rows of the conflicting groups (fresh 0..n-1 index),
             or an empty DataFrame with the input's columns and dtypes if none found
    :raises ValueError: If taxi_id, timestamp, latitude or longitude is missing
                        from the loaded data
    """
    key_cols = ['taxi_id', 'timestamp']
    coord_cols = ['latitude', 'longitude']

    # Load data.
    # NOTE: unpickling can execute arbitrary code; only point this at trusted files.
    df = pd.read_pickle(pickle_file_path)

    # Check required columns and report exactly which ones are absent
    missing = sorted(set(key_cols + coord_cols) - set(df.columns))
    if missing:
        raise ValueError(f"Missing required columns: {missing}")

    # Count distinct latitude / longitude values per (taxi_id, timestamp) group
    coord_counts = df.groupby(key_cols)[coord_cols].nunique().reset_index()

    # A group conflicts as soon as either coordinate takes more than one value
    has_conflict = (coord_counts[coord_cols] > 1).any(axis=1)
    conflicting_keys = coord_counts.loc[has_conflict, key_cols]

    if conflicting_keys.empty:
        # Zero-row slice keeps the input's columns *and* dtypes, so downstream
        # code sees the same schema whether or not conflicts exist.
        return df.iloc[0:0].copy()

    # Join back to the original data to show all rows of the conflicting groups
    return df.merge(conflicting_keys, on=key_cols, how='inner')


In [2]:
# Run the conflict check on merged_cleaned_data.pkl. A message is printed only
# when nothing conflicts; otherwise `conflicts` holds the offending rows.
conflicts = find_conflicting_coordinates("merged_cleaned_data.pkl")
if conflicts.empty:
    print("No conflicting coordinates found.")


In [3]:
# Preview the first rows of the conflicting records.
conflicts.head()

Unnamed: 0,taxi_id,timestamp,longitude,latitude,trip_status
0,3741,2008-02-02 13:34:50,116.39968,39.97224,active
1,3741,2008-02-02 13:34:50,116.34231,39.97224,active
2,1957,2008-02-02 13:37:45,117.02122,40.35947,active
3,1957,2008-02-02 13:37:45,117.02121,40.35947,active
4,1824,2008-02-02 13:38:28,116.30483,39.88109,active


In [5]:
# (rows, columns) of the conflicting-records frame.
conflicts.shape

(3851, 5)

In [7]:
# How many distinct taxis have at least one conflicting row.
taxi_id_values = conflicts['taxi_id']
conflicted_taxi_ids = taxi_id_values.nunique()
print(f"Number of conflicted taxi IDs: {conflicted_taxi_ids}")

Number of conflicted taxi IDs: 889


In [9]:
# Number of conflicting rows recorded for each taxi.
conflict_counts_pertaxi = (
    conflicts
    .groupby('taxi_id')
    .size()
    .reset_index(name='conflict_row_count')
)

In [11]:
# Largest number of conflicting rows held by a single taxi.
conflict_counts_pertaxi['conflict_row_count'].max()

np.int64(130)

In [15]:
# Taxi(s) holding the largest number of conflicting rows. The threshold is
# derived from the data instead of being copied from an earlier cell's output,
# so the cell stays correct when the input file changes.
max_conflict_rows = conflict_counts_pertaxi['conflict_row_count'].max()
conflict_counts_pertaxi.loc[
    conflict_counts_pertaxi['conflict_row_count'] == max_conflict_rows,
    'taxi_id',
]

526    3557
Name: taxi_id, dtype: int64

In [20]:
conflict_counts = conflicts.groupby('taxi_id').size().reset_index(name='conflict_row_count')

# NOTE(review): the output saved under this cell is a per-timestamp count Series,
# which a bare assignment cannot produce, so the cell was edited after it last
# ran. The expression below is a reconstruction -- presumably the taxi with the
# most conflicting rows, broken down by timestamp. Re-run and confirm it matches.
busiest_taxi_id = conflict_counts.loc[
    conflict_counts['conflict_row_count'].idxmax(), 'taxi_id'
]
conflicts.loc[conflicts['taxi_id'] == busiest_taxi_id].groupby('timestamp')['taxi_id'].count()

timestamp
2008-02-03 11:46:19    2
2008-02-03 12:37:02    2
2008-02-03 17:42:01    2
2008-02-03 18:01:39    2
2008-02-03 19:12:28    2
                      ..
2008-02-08 13:26:05    2
2008-02-08 14:03:38    2
2008-02-08 17:00:05    2
2008-02-08 17:00:19    2
2008-02-08 17:10:52    2
Name: taxi_id, Length: 65, dtype: int64

In [21]:
# Number of conflicting rows for each (taxi_id, timestamp) key.
conflict_counts_pertime = (
    conflicts
    .groupby(['taxi_id','timestamp'])
    .size()
    .reset_index(name='conflict_row_count')
)

In [23]:
# Largest number of rows sharing one (taxi_id, timestamp) key.
conflict_counts_pertime['conflict_row_count'].max()

np.int64(3)

In [29]:
# Order rows by taxi, then by time, so each group's rows are adjacent
sort_keys = ['taxi_id', 'timestamp']
conflicts_df = conflicts.sort_values(by=sort_keys)

# Per-group helper: applied below to each (taxi_id, timestamp) group
def compute_diffs(group):
    """
    Measure how far apart the coordinates within one group are.

    The spread is max - min over *all* rows of the group, so groups with more
    than two rows are fully covered; for a two-row group this equals the
    absolute difference between the two rows. A missing coordinate makes the
    corresponding spread NaN.

    :param group: DataFrame holding the rows of one (taxi_id, timestamp) group;
                  must contain latitude, longitude, taxi_id and timestamp columns
    :return: Series with lat_diff, lon_diff, taxi_id and timestamp, or None when
             the group has fewer than two rows
    """
    if len(group) < 2:
        return None

    latitudes = group['latitude']
    longitudes = group['longitude']
    first_row = group.iloc[0]  # key columns are constant within a group

    return pd.Series({
        # skipna=False keeps NaN contagious, as a plain subtraction would
        'lat_diff': latitudes.max(skipna=False) - latitudes.min(skipna=False),
        'lon_diff': longitudes.max(skipna=False) - longitudes.min(skipna=False),
        'taxi_id': first_row['taxi_id'],
        'timestamp': first_row['timestamp']
    })

# One row of coordinate differences per (taxi_id, timestamp) group
rows_per_key = conflicts_df.groupby(['taxi_id', 'timestamp'])
coord_diffs = (
    rows_per_key
    .apply(compute_diffs)
    .dropna()
    .reset_index(drop=True)
)

In [34]:
# Headline statistics of the latitude differences, rounded to 3 decimals
lat_diff = coord_diffs.lat_diff
for label, statistic in (
    ("Maximum latitude difference:", lat_diff.max),
    ("Average latitude difference:", lat_diff.mean),
    ("Minimum latitude difference:", lat_diff.min),
):
    print(label, round(statistic(), 3))

Maximum latitude difference: 40.071
Average latitude difference: 0.885
Minimum latitude difference: 0.0


In [36]:
# Full distribution summary of the latitude differences.
coord_diffs['lat_diff'].describe()

count    1922.000000
mean        0.884561
std         3.891852
min         0.000000
25%         0.000000
50%         0.000010
75%         0.087922
max        40.070980
Name: lat_diff, dtype: float64

In [35]:
# Headline statistics of the longitude differences, rounded to 3 decimals
lon_diff = coord_diffs.lon_diff
for label, statistic in (
    ("Maximum longitude difference:", lon_diff.max),
    ("Average longitude difference:", lon_diff.mean),
    ("Minimum longitude difference:", lon_diff.min),
):
    print(label, round(statistic(), 3))

Maximum longitude difference: 116.977
Average longitude difference: 1.218
Minimum longitude difference: 0.0


In [37]:
# Full distribution summary of the longitude differences.
coord_diffs['lon_diff'].describe()

count    1922.000000
mean        1.218494
std         8.375901
min         0.000000
25%         0.000010
50%         0.030065
75%         0.179290
max       116.977020
Name: lon_diff, dtype: float64