In [5]:
import pandas as pd

# Input list of MDR locations and the output file for the cleaned harbors
mdr_location_file = 'MDR_Location.csv'
cleaned_harbors_file = 'Cleaned_MDR_Location.csv'

# Load the raw locations, then keep only rows whose coordinates are present
# and parse as numbers: drop missing values, coerce the rest to numeric
# (unparseable entries become NaN), and drop whatever failed to convert.
mdr_df = (
    pd.read_csv(mdr_location_file, encoding='unicode_escape')
    .dropna(subset=['Latitude', 'Longitude'])
    .assign(
        Latitude=lambda frame: pd.to_numeric(frame['Latitude'], errors='coerce'),
        Longitude=lambda frame: pd.to_numeric(frame['Longitude'], errors='coerce'),
    )
    .dropna(subset=['Latitude', 'Longitude'])
)

# A location counts as a harbor when at least one of its port flags is 'Y'
port_flags = mdr_df[['FishingPort', 'LandingPlace', 'CommercialPort']]
valid_harbors_df = mdr_df[port_flags.eq('Y').any(axis=1)]

# Write the harbor subset out without the DataFrame index
valid_harbors_df.to_csv(cleaned_harbors_file, index=False)

print(f'Cleaned harbor locations saved to {cleaned_harbors_file}')


import pandas as pd
import os
from geopy.distance import geodesic
from scipy.spatial import KDTree

# Load the cleaned harbor list and pull its coordinates out as an
# (n_harbors, 2) array of [Latitude, Longitude] rows
cleaned_harbors_df = pd.read_csv('Cleaned_MDR_Location.csv')
harbors = cleaned_harbors_df.loc[:, ['Latitude', 'Longitude']].to_numpy()

# Spatial index over the harbor coordinates (in degrees) for fast lookups
harbor_tree = KDTree(harbors)

# Define a function to check if a point is within 500m of any harbor
def is_in_harbor(lat, lon, tree, harbors, max_distance=500):
    """Return True if (lat, lon) lies within ``max_distance`` metres of a harbor.

    Args:
        lat: Latitude of the point, in decimal degrees.
        lon: Longitude of the point, in decimal degrees.
        tree: KDTree built over ``harbors`` (coordinates in degrees).
        harbors: Array of ``[latitude, longitude]`` rows, indexed the same
            way as the points stored in ``tree``.
        max_distance: Distance threshold in metres (geodesic).

    Returns:
        bool: True when at least one harbor is within ``max_distance``.

    Notes:
        The tree measures Euclidean distance in *degrees*, which is not
        proportional to distance on the ground: away from the equator a
        degree of longitude is shorter than a degree of latitude. The single
        nearest neighbour in degree space is therefore not necessarily the
        geodesically nearest harbor. Instead of trusting ``k=1``, every
        harbor inside a conservative degree radius is collected and checked
        with the geodesic distance.

        The degree radius is an approximation intended for thresholds that
        are small compared to the size of the Earth; it does not handle
        wrap-around at the +/-180 degree meridian.
    """
    import math

    # Metres covered by one degree of longitude shrink with cos(latitude).
    # Clamp the cosine so the radius stays finite very close to the poles.
    cos_lat = max(math.cos(math.radians(lat)), 0.01)

    # One degree is always more than 110 km of latitude and more than
    # 110 km * cos(lat) of longitude, so this radius (with a 10 % margin)
    # over-estimates the degree distance of any harbor within max_distance.
    search_radius_deg = 1.1 * max_distance / (110_000.0 * cos_lat)

    # Candidate harbors by degree distance; the exact test is geodesic.
    candidates = tree.query_ball_point([lat, lon], r=search_radius_deg)
    return any(
        geodesic((lat, lon), harbors[index]).meters <= max_distance
        for index in candidates
    )

# Define state machine states
IN_HARBOR = 'IN_HARBOR'
IN_VOYAGE = 'IN_VOYAGE'

# Directory where VMS files are stored
vms_directory = 'data'

# Process each VMS file: split the vessel's track into trips, where a trip is
# the run of consecutive positions recorded between leaving a harbor and the
# next position found inside a harbor.
for file_name in os.listdir(vms_directory):
    if not file_name.endswith('.csv'):
        continue

    # The vessel identifier is the file name without its extension. Only the
    # final extension is stripped so that identifiers containing dots are not
    # truncated (which could make two vessels share one output folder).
    cfr = os.path.splitext(file_name)[0]
    vms_df = pd.read_csv(os.path.join(vms_directory, file_name))

    # Sort by datetime; a stable sort keeps the file order of rows that
    # share the same timestamp, so the segmentation is reproducible.
    vms_df['datetime'] = pd.to_datetime(vms_df['datetime'])
    vms_df = vms_df.sort_values(by='datetime', kind='mergesort')

    # Every vessel is assumed to start in harbor; the first position found
    # outside a harbor opens the first trip.
    state = IN_HARBOR
    trips = []
    current_trip = []

    for _, row in vms_df.iterrows():
        in_harbor = is_in_harbor(
            row['latitude'], row['longitude'], harbor_tree, harbors
        )
        if not in_harbor:
            # At sea: either a trip starts here or the current one continues.
            state = IN_VOYAGE
            current_trip.append(row)
        elif state == IN_VOYAGE:
            # Back in harbor: close the trip. The arrival position itself is
            # not part of the trip.
            state = IN_HARBOR
            trips.append(pd.DataFrame(current_trip))
            current_trip = []

    # NOTE: a trip that is still open when the data ends (the vessel never
    # returns to a harbor) is left in current_trip and is not saved.

    # Save trips as <cfr>_<n>.csv inside a per-vessel folder
    vessel_folder = os.path.join(vms_directory, cfr)
    os.makedirs(vessel_folder, exist_ok=True)
    for trip_number, trip in enumerate(trips, start=1):
        trip_file_name = os.path.join(vessel_folder, f'{cfr}_{trip_number}.csv')
        trip.to_csv(trip_file_name, index=False)





Cleaned harbor locations saved to Cleaned_MDR_Location.csv


  vms_df = pd.read_csv(os.path.join(vms_directory, file_name))
  vms_df = pd.read_csv(os.path.join(vms_directory, file_name))
  vms_df = pd.read_csv(os.path.join(vms_directory, file_name))
  vms_df = pd.read_csv(os.path.join(vms_directory, file_name))
  vms_df = pd.read_csv(os.path.join(vms_directory, file_name))
  vms_df = pd.read_csv(os.path.join(vms_directory, file_name))
  vms_df = pd.read_csv(os.path.join(vms_directory, file_name))
  vms_df = pd.read_csv(os.path.join(vms_directory, file_name))
  vms_df = pd.read_csv(os.path.join(vms_directory, file_name))
  vms_df = pd.read_csv(os.path.join(vms_directory, file_name))
  vms_df = pd.read_csv(os.path.join(vms_directory, file_name))
  vms_df = pd.read_csv(os.path.join(vms_directory, file_name))
  vms_df = pd.read_csv(os.path.join(vms_directory, file_name))
  vms_df = pd.read_csv(os.path.join(vms_directory, file_name))
  vms_df = pd.read_csv(os.path.join(vms_directory, file_name))
  vms_df = pd.read_csv(os.path.join(vms_directory, file