In [38]:
import pandas as pd

In [54]:
data = pd.read_pickle(r"C:\Users\User\bd24_project_m8_b\volumes\kafka-producer_volume\taxi_data.pkl")

In [55]:
data.shape

(17673320, 4)

In [59]:
data.drop_duplicates(inplace=True)
data.shape

(16335823, 4)

In [60]:
end_of_trip = (data['latitude'] == -1) & (data['longitude'] == -1)
data_filtered = data[~end_of_trip]

In [65]:
data[end_of_trip].shape

(10336, 4)

In [61]:
duplicates = data_filtered[data_filtered.duplicated(subset=['taxi_id', 'timestamp'], keep=False)]

In [68]:
duplicates.taxi_id.value_counts()

6275    204
3557    130
3579    110
3015    106
9754     80
       ... 
262       2
5906      2
6088      2
3039      2
9190      2
Name: taxi_id, Length: 1876, dtype: int64

In [71]:
duplicates[duplicates['taxi_id']==6275].timestamp.value_counts()

2008-02-02 13:44:33    2
2008-02-05 18:44:48    2
2008-02-06 15:05:38    2
2008-02-06 14:34:29    2
2008-02-06 14:17:44    2
                      ..
2008-02-04 14:13:07    2
2008-02-04 13:32:03    2
2008-02-04 12:52:06    2
2008-02-04 12:30:47    2
2008-02-08 17:14:31    2
Name: timestamp, Length: 102, dtype: int64

In [5]:
data.shape

(16335823, 4)

In [6]:
data.head(5)

Unnamed: 0,taxi_id,timestamp,longitude,latitude
10329518,6275,2008-02-02 13:30:44,116.36838,39.90484
4120760,3015,2008-02-02 13:30:44,116.41036,39.89171
5236504,3579,2008-02-02 13:30:44,116.40048,39.9035
13109925,7659,2008-02-02 13:30:45,116.35203,39.84086
4120761,3015,2008-02-02 13:30:45,116.41028,39.8917


In [7]:
df = data.copy()

In [8]:
import pandas as pd
from math import radians, sin, cos, sqrt, asin

# Define haversine formula
def haversine(lat1, lon1, lat2, lon2):
    radius_earth = 6372.8

    dlat = radians(lat2 - lat1)
    dlon = radians(lon2 - lon1)
    lat1 = radians(lat1)
    lat2 = radians(lat2)

    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    c = 2 * asin(sqrt(a))

    return radius_earth * c

# Forbidden City coordinates
forbidden_city_coords = (39.9163447, 116.3971556)

df['distance_from_forbidden_city'] = df.apply(lambda row: haversine(forbidden_city_coords[0], forbidden_city_coords[1], row['latitude'], row['longitude']), axis=1)
df_filtered = df[df['distance_from_forbidden_city'] <= 15]

In [9]:
df_filtered.distance_from_forbidden_city.max()

14.999994003106663

In [10]:
df_filtered.shape

(11509944, 5)

In [11]:
# Convert timestamp to datetime
df_filtered['timestamp'] = pd.to_datetime(df_filtered['timestamp'])

# Sort by taxi_id and timestamp
df_filtered = df_filtered.sort_values(by=['taxi_id', 'timestamp'])

# Calculate distance between consecutive points for each taxi
df_filtered['previous_latitude'] = df_filtered.groupby('taxi_id')['latitude'].shift(1)
df_filtered['previous_longitude'] = df_filtered.groupby('taxi_id')['longitude'].shift(1)
df_filtered['previous_timestamp'] = df_filtered.groupby('taxi_id')['timestamp'].shift(1)

# Calculate distance between consecutive points
df_filtered['distance_between_points'] = df_filtered.apply(lambda row: haversine(row['latitude'], row['longitude'], row['previous_latitude'], row['previous_longitude']), axis=1)

# Define abnormal jumps (customize as needed, here considering > 15 km as abnormal for demonstration)
abnormal_jumps = df_filtered[df_filtered['distance_between_points'] > 15]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['timestamp'] = pd.to_datetime(df_filtered['timestamp'])


In [12]:
abnormal_jumps.shape

(9076, 9)

In [13]:
abnormal_jumps.distance_between_points.max()

29.295733179836752

In [14]:
# Calculate time difference in hours
df_filtered['time_diff_hours'] = df_filtered['timestamp'] - df_filtered['previous_timestamp']
df_filtered['time_diff_hours'] = df_filtered['time_diff_hours'].dt.total_seconds() / 3600.0

# Calculate speed (km/h)
df_filtered['speed'] = df_filtered['distance_between_points'] / df_filtered['time_diff_hours']

# Identify instances with speed > 50 km/h
high_speed_instances = df_filtered[df_filtered['speed'] > 50]

In [15]:
high_speed_instances[high_speed_instances['speed']>150].shape

(46548, 11)

In [17]:
# Initialize variables to track total distance and results
total_distance = 0.0
distance_over_time = []

# Iterate through each record to calculate real-time total distance traveled
for i in range(1, len(df_filtered)):
    # Current and previous record
    current = df_filtered.iloc[i]
    previous = df_filtered.iloc[i - 1]
    
    # Check if the same taxi and consecutive records
    if current['taxi_id'] == previous['taxi_id']:
        distance = haversine(current['latitude'], current['longitude'], previous['latitude'], previous['longitude'])
        total_distance += distance
    
    # Track total distance at the current timestamp
    distance_over_time.append((current['timestamp'], total_distance))

# Convert to DataFrame for better visualization
distance_over_time_df = pd.DataFrame(distance_over_time, columns=['timestamp', 'total_distance'])

In [None]:
# distance_over_time_df.to_csv("distance.csv", index=False)