In [None]:
import os
from Dataloader import *
from utils import download, overhaul_segments, plot_trajectory_on_map

import folium
import zipfile
import requests

import pandas as pd
import pyarrow
import pyarrow.parquet
import matplotlib.pyplot as plt

In [None]:
# Monthly AIS archives for 2023, keyed by month abbreviation (the key is only
# used in progress messages; the file name is taken from the URL).
data_links = {"Jan": "http://aisdata.ais.dk/2023/aisdk-2023-01.zip",
              "Feb": "http://aisdata.ais.dk/2023/aisdk-2023-02.zip",
              "Mar": "http://aisdata.ais.dk/2023/aisdk-2023-03.zip",
              "Apr": "http://aisdata.ais.dk/2023/aisdk-2023-04.zip",
              "May": "http://aisdata.ais.dk/2023/aisdk-2023-05.zip",
              "Jun": "http://aisdata.ais.dk/2023/aisdk-2023-06.zip",
              "Jul": "http://aisdata.ais.dk/2023/aisdk-2023-07.zip",
              "Aug": "http://aisdata.ais.dk/2023/aisdk-2023-08.zip",
              "Sep": "http://aisdata.ais.dk/2023/aisdk-2023-09.zip",
              "Oct": "http://aisdata.ais.dk/2023/aisdk-2023-10.zip",
              "Nov": "http://aisdata.ais.dk/2023/aisdk-2023-11.zip",
              "Dec": "http://aisdata.ais.dk/2023/aisdk-2023-12.zip"}

data_dir = "../data/unprocessed_data"  # downloaded ZIP archives (removed once processed)
end_dir = "../data/processed_data"     # cleaned output, one parquet file per CSV
os.makedirs(data_dir, exist_ok=True)
os.makedirs(end_dir, exist_ok=True)

for month, link in data_links.items():
    filename = link.split("/")[-1]
    filepath = os.path.join(data_dir, filename)

    # "aisdk-2023-01.zip" -> stem "aisdk-2023-01" -> period "2023-01".
    archive_stem = os.path.splitext(filename)[0]
    year_month = archive_stem.split("-", 1)[1]

    # A month is only complete when there is one parquet file per calendar day.
    # A fixed threshold of 28 would permanently skip the last days of a 30/31-day
    # month whose processing was interrupted after the 28th file.
    # NOTE(review): this assumes every archive holds one CSV per day; if an archive
    # is missing a day, that month will be re-downloaded on each run.
    expected_days = pd.Period(year_month, freq="M").days_in_month

    existing_parquets = [
        f for f in os.listdir(end_dir)
        if f.startswith(f"{archive_stem}-") and f.endswith(".parquet")
    ]

    if len(existing_parquets) >= expected_days:
        print(f"⏭️  Skipping month {month}: {len(existing_parquets)} parquet files already exist")
        continue

    # A file left behind by an interrupted download is not a readable ZIP;
    # discard it so it is fetched again instead of failing in ZipFile below.
    if os.path.exists(filepath) and not zipfile.is_zipfile(filepath):
        print(f"Removing incomplete download: {filepath}")
        os.remove(filepath)

    # Download if needed
    if os.path.exists(filepath):
        print(f"⏭️  Skipping download for month {month}: {filepath} (already exists)")
    else:
        print(f"Downloading data for {month}...")
        download(link, filepath)
        print(f"Downloaded data for {month}")

    # Open ZIP once and process all CSV files
    with zipfile.ZipFile(filepath, 'r') as zip_ref:
        csv_files = [f for f in zip_ref.namelist() if f.endswith('.csv')]
        print(f"Found {len(csv_files)} CSV files in {filename}")

        for csv_filename in csv_files:
            # Use only the member's base name so the output lands directly in
            # end_dir (where the completeness check above looks), and swap just
            # the extension rather than every ".csv" substring.
            csv_stem = os.path.splitext(os.path.basename(csv_filename))[0]
            output_filename = f"{csv_stem}.parquet"
            output_path = os.path.join(end_dir, output_filename)

            # Per-file resume: days converted in an earlier run are not redone.
            if os.path.exists(output_path):
                print(f"⏭️  Skipping {output_filename} (already exists)")
                continue

            print(f"📝 Processing: {csv_filename}")

            # The Dataloader is given the archive path and the member name
            # (file_path is left empty); clean_data() is expected to write the
            # result to output_path.
            data = Dataloader(
                file_path="",
                out_path=output_path,
                zip_path=filepath,
                csv_internal_path=csv_filename
            )
            data.clean_data()

    print(f"✅ Processed {month}")

    # Remove ZIP after processing (only reached when every CSV was handled
    # without raising).
    os.remove(filepath)
    print(f"🗑️  Removed {filename}")

print("🎉 All data processed")

# Plotting the clean data

In [None]:
# Folder holding the cleaned per-day parquet files.
path = "../data/processed_data"
dataloader = Dataloader(out_path=path)

# Without arguments, load_data() reads every file in the processed_data folder.
# To load specific files only, pass their names instead, e.g.
#   dataloader.load_data(date_folders=["aisdk-2023-01-01.parquet",
#                                      "aisdk-2023-01-02.parquet"])
df = dataloader.load_data()

# Add a calendar-date column so that ship and segment can be told apart per day.
df["Date"] = df["Timestamp"].dt.date

Rebuild the segment IDs assigned by the data-cleaning function so that segments running overnight are not split.

In [None]:
# Recompute the segments, then let the new Segment_ID column take over the
# "Segment" name in place of the column that was there before.
df = (
    overhaul_segments(df)
    .drop(columns=["Segment"])
    .rename(columns={"Segment_ID": "Segment"})
)
df


In [None]:
# Plot the trajectories in `df` on a map using the helper imported from utils.
# NOTE(review): percentage_of_vessels=0.5 presumably restricts the plot to a
# fraction (half) of the vessels — confirm against plot_trajectory_on_map.
plot_trajectory_on_map(df, percentage_of_vessels=0.5)