In [2]:
import pandas as pd
import numpy as np

# Load the raw position reports and parse the 'timestamp' strings into pandas
# datetimes in a single chained expression.
df = (
    pd.read_csv('sample_data.csv')
    .assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp']))
)

In [3]:
# Preview the loaded frame; the rendered output below shows only the first and
# last rows, with the middle elided.
df

Unnamed: 0,mmsi,timestamp,lat,lon
0,565761000,2023-03-15 00:27:44+00:00,1.268780,103.758270
1,538008084,2023-03-19 23:30:00+00:00,43.559620,10.294040
2,564654000,2023-03-12 08:22:53+00:00,1.237250,103.891350
3,529123000,2023-03-05 16:47:42+00:00,29.443670,48.930660
4,564780000,2023-03-11 06:35:20+00:00,1.277550,103.610260
...,...,...,...,...
13496,218719092,2023-03-21 08:30:00+00:00,44.168871,9.104404
13497,564654000,2023-03-13 22:42:16+00:00,1.257010,103.841010
13498,564654000,2023-03-05 10:15:11+00:00,1.280430,103.907730
13499,565761000,2023-03-19 07:30:00+00:00,1.302624,103.951899


In [4]:
# Per-column flag: True if the column contains at least one missing value.
df.isna().any(axis=0)

mmsi         False
timestamp    False
lat          False
lon          False
dtype: bool

In [5]:
# Remove rows that are exact duplicates across all columns. The result is
# rebound to `df` instead of using inplace=True, so the frame displayed by the
# earlier cells is not mutated behind the reader's back.
df = df.drop_duplicates()

In [6]:
# (rows, columns) of the frame after the duplicate rows were dropped.
df.shape

(13489, 4)

In [7]:
# Maximum separation at which two vessels count as being "in proximity".
# It is compared against the output of haversine_formula, so it is expressed in
# the same unit as that function's Earth radius (3440.065, which matches the
# Earth's mean radius in nautical miles; confirm the intended unit).
proximity_threshold = 0.5

def haversine_formula(lat1, lon1, lat2, lon2, radius=3440.065):
    """Great-circle distance between two points on a sphere (haversine formula).

    All coordinate arguments may be scalars or array-likes of matching
    (broadcastable) shape, in which case the distance is computed element-wise.

    Parameters
    ----------
    lat1, lon1 : float or array-like
        Latitude and longitude of the first point(s), in decimal degrees.
    lat2, lon2 : float or array-like
        Latitude and longitude of the second point(s), in decimal degrees.
    radius : float, default 3440.065
        Radius of the sphere. The result is expressed in the same unit as this
        value; the default is the Earth's mean radius in nautical miles.

    Returns
    -------
    float or numpy.ndarray
        Distance(s) along the surface of the sphere, in the unit of ``radius``.
    """
    # Convert degrees to radians
    lat1, lon1, lat2, lon2 = (np.radians(v) for v in (lat1, lon1, lat2, lon2))

    dlat = lat2 - lat1
    dlon = lon2 - lon1

    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt(1 - a) return NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    return radius * c

In [8]:
def calculate_proximity_vectorized(df, proximity_threshold):
    """List, for each vessel, the other vessels reported within a given distance.

    Position reports are paired up by a self-join on ``timestamp``, so two
    vessels are only ever compared when they share an *identical* timestamp.
    Distances come from ``haversine_formula`` and are therefore in that
    function's unit.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``mmsi``, ``timestamp``, ``lat`` and ``lon``.
    proximity_threshold : float
        Pairs whose distance is less than or equal to this value are kept.

    Returns
    -------
    pandas.DataFrame
        One row per vessel that had at least one close neighbour, with columns:

        * ``mmsi`` - the vessel.
        * ``vessel_proximity`` - list of the distinct other MMSIs that were
          within the threshold at any shared timestamp (order not guaranteed).
        * ``timestamp`` - the timestamp of the first close pair for that vessel
          in join order, which is not necessarily the earliest one.
    """
    # Only the columns used below take part in the self-join, which keeps the
    # (potentially large) pair table as narrow as possible.
    positions = df[['mmsi', 'timestamp', 'lat', 'lon']]

    # Self-join on timestamp: one row per ordered pair of reports that share a
    # timestamp (both (a, b) and (b, a) appear, which the per-vessel grouping
    # below relies on).
    pairs = positions.merge(positions, on='timestamp', suffixes=('_x', '_y'))

    # Filter out comparisons of the same vessel (same MMSI)
    pairs = pairs[pairs['mmsi_x'] != pairs['mmsi_y']]

    # Calculate distances using the Haversine formula
    distances = haversine_formula(
        pairs['lat_x'].values,
        pairs['lon_x'].values,
        pairs['lat_y'].values,
        pairs['lon_y'].values
    )

    # Select the close pairs with a boolean mask rather than writing a
    # 'distance' column into the already-filtered frame; that assignment is
    # what raised SettingWithCopyWarning, and the column was never returned.
    close_pairs = pairs[distances <= proximity_threshold]

    # Group by MMSI to aggregate proximity events
    proximity_df = (
        close_pairs
        .groupby('mmsi_x')
        .agg({
            'mmsi_y': lambda x: list(set(x)),
            'timestamp': 'first'
        })
        .reset_index()
    )

    # Rename columns for clarity
    proximity_df.columns = ['mmsi', 'vessel_proximity', 'timestamp']

    return proximity_df

In [10]:
# Run the proximity search over the de-duplicated position reports.
proximity_vessels = calculate_proximity_vectorized(
    df=df,
    proximity_threshold=proximity_threshold,
)

In [11]:
# Display the result: one row per vessel with the list of vessels found close
# to it and the timestamp kept by the aggregation.
proximity_vessels

Unnamed: 0,mmsi,vessel_proximity,timestamp
0,218719092,"[232006548, 232345740, 875832716, 889799564]",2023-03-22 17:30:00+00:00
1,232006548,"[218719092, 232345740, 875832716, 889799564]",2023-03-22 10:30:00+00:00
2,232345740,"[218719092, 232006548, 875832716, 889799564]",2023-03-17 02:30:00+00:00
3,352002300,"[564780000, 563014650, 565761000, 563078430]",2023-03-13 20:30:00+00:00
4,352656000,[538008064],2023-03-14 13:30:00+00:00
5,538008064,[352656000],2023-03-14 13:30:00+00:00
6,563014650,"[564780000, 565761000, 564654000, 352002300, 5...",2023-03-05 16:30:00+00:00
7,563078430,"[564780000, 563014650, 352002300, 565761000]",2023-03-05 16:30:00+00:00
8,564654000,[563014650],2023-03-06 02:30:00+00:00
9,564780000,"[565761000, 563014650, 352002300, 563078430]",2023-03-05 16:30:00+00:00
