AIS Data

In [7]:
# Stream-read data.csv and data2.csv..data7.csv, keep only tanker rows (VesselType == 80), compute per-file overlaps and render a map
import html

import folium
import numpy as np
import pandas as pd
from folium import IFrame, DivIcon, PolyLine
from folium.plugins import Fullscreen
from matplotlib import cm
from matplotlib.colors import Normalize, rgb2hex, LinearSegmentedColormap

# Helper: find a timestamp column in columns list
def find_timestamp_col_from_cols(cols):
    """Return the first known timestamp column name present in ``cols``.

    The known names are tried in a fixed priority order (``BaseDateTime``
    first); matching is exact and case-sensitive.

    Args:
        cols: Any container supporting ``in`` (list of header names,
            ``pandas.Index``, ...).

    Returns:
        The matching column name, or ``None`` when none of the known names
        is present.
    """
    known_names = (
        'BaseDateTime', 'Timestamp', 'Date', 'Datetime', 'date', 'time',
        'BaseDateTimeUTC', 'DateTimeUTC', 'received_at', 'timestamp',
    )
    # next() with a default stops at the first hit and yields None otherwise.
    return next((name for name in known_names if name in cols), None)

# Column names that may carry the ship length, in lookup priority order
# (the first one present with a usable value wins).
length_candidates = [
    'Length',
    'LengthOverall',
    'LOA',
    'LENGTH',
    'length',
    'Length(m)',
]

# Small helper to stream-read only rows where VesselType == 80
def read_tankers(path, chunksize=100_000, vessel_type=80):
    """Stream a CSV file and keep only the rows of the requested vessel type.

    The file is read in chunks so that only matching rows are ever held in
    memory. Only the columns listed in ``desired`` (plus any length column
    from ``length_candidates`` and the detected timestamp column) are loaded,
    and only if they exist in the file header.

    Args:
        path: Path of the CSV file to read.
        chunksize: Number of rows per chunk handed to ``pandas.read_csv``.
        vessel_type: ``VesselType`` code to keep, or a list/tuple/set/range of
            codes. Defaults to ``80``.
            NOTE(review): AIS also defines 81-89 as tanker sub-categories;
            pass ``range(80, 90)`` if those should count as tankers - confirm
            against the data dictionary of the feed.

    Returns:
        ``(df, ts_col)`` where ``df`` holds the matching rows (an empty frame
        with the selected columns when nothing matches or the file has no
        ``VesselType`` column) and ``ts_col`` is the timestamp column name
        found in the header, or ``None``.

    Raises:
        FileNotFoundError: If ``path`` does not exist (raised by pandas).
    """
    desired = ['MMSI','BaseDateTime','LAT','LON','SOG','COG','Heading','VesselName','IMO','CallSign','VesselType','Status','Length','Width','Draft','Cargo','TransceiverClass']
    # Read the header only, to learn which columns this file actually has.
    hdr = pd.read_csv(path, nrows=0).columns.tolist()
    ts_col = find_timestamp_col_from_cols(hdr)
    usecols = [c for c in desired if c in hdr]
    for c in length_candidates:
        if c in hdr and c not in usecols:
            usecols.append(c)
    if ts_col and ts_col not in usecols:
        usecols.append(ts_col)

    # Without a VesselType column nothing can match: skip the expensive scan.
    if 'VesselType' not in usecols:
        return pd.DataFrame(columns=usecols), ts_col

    if isinstance(vessel_type, (list, tuple, set, frozenset, range)):
        wanted_codes = list(vessel_type)
    else:
        wanted_codes = [vessel_type]

    pieces = []
    for chunk in pd.read_csv(path, usecols=usecols, chunksize=chunksize, low_memory=False):
        # Dtypes are inferred per chunk; one stray non-numeric value turns the
        # whole column into strings and a plain `== 80` would then silently
        # drop every row of that chunk. Coerce to numbers before comparing.
        type_codes = pd.to_numeric(chunk['VesselType'], errors='coerce')
        filtered = chunk[type_codes.isin(wanted_codes)]
        if not filtered.empty:
            pieces.append(filtered)

    if pieces:
        df = pd.concat(pieces, ignore_index=True, sort=False)
    else:
        df = pd.DataFrame(columns=usecols)
    return df, ts_col

# Input files, oldest first: data.csv, then data2.csv through data7.csv (newest).
files = ['data.csv'] + [f'data{i}.csv' for i in range(2, 8)]

# Re-running the cell should not re-read the large CSVs. When the in-memory
# flag is set, only re-print the summary from the variables left by the
# previous run; if those variables are gone, clear the flag to force a reload.
if globals().get('_tankers_cache_loaded', False):
    print('Tanker rows already loaded in memory — using cached results.')
    try:
        for f, s in zip(files, u_sets):
            print(f'Unique tankers in {f}: {len(s)}')
        if any(u_sets):
            # Intersect only the non-empty sets (files that contributed rows).
            common_all = set.intersection(*filter(None, u_sets))
        else:
            common_all = set()
        print(f'Present in all files: {len(common_all)}')
    except NameError:
        print('Cached variables partially missing — will reload from files')
        _tankers_cache_loaded = False

# Read tanker rows from each file (streaming) if not already loaded
if not globals().get('_tankers_cache_loaded', False):
    # Pipeline (runs only when the in-memory cache flag is not set):
    #   1. stream tanker rows out of every file and report per-file unique MMSIs
    #   2. parse timestamps into a common `_ts` column and combine all files
    #   3. pick the last known position per MMSI (newest file wins)
    #   4. derive a length range used for colouring / sizing
    #   5. draw trails and last-position arrows on a folium map and save it
    print('Reading tanker rows from files (only tanker rows will be loaded)...')
    dfs = []
    ts_cols = []
    u_sets = []
    for f in files:
        try:
            df, ts = read_tankers(f)
        except FileNotFoundError:
            print(f'File not found: {f} — skipping')
            df = pd.DataFrame()
            ts = None
        if not df.empty:
            df['__source'] = f
        dfs.append(df)
        ts_cols.append(ts)
        if 'MMSI' in df.columns:
            u_sets.append(set(df['MMSI'].dropna().unique()))
        else:
            u_sets.append(set())

    # Print unique counts per file
    for f, s in zip(files, u_sets):
        print(f'Unique tankers in {f}: {len(s)}')

    # Intersection over the files that contributed at least one tanker
    # (empty/missing files are ignored rather than forcing the result to 0).
    all_u = [s for s in u_sets]
    if all_u:
        common_all = set.intersection(*[s for s in all_u if s]) if any(all_u) else set()
        print(f'Present in all files: {len(common_all)}')

    # Unify the per-file timestamp column into a parsed `_ts` column.
    for df, ts in zip(dfs, ts_cols):
        if ts and ts in df.columns:
            df['_ts'] = pd.to_datetime(df[ts], errors='coerce')
        else:
            df['_ts'] = pd.NaT

    # Combine datasets. Empty frames are left out so they cannot influence
    # the dtypes of the combined frame.
    loaded = [d for d in dfs if not d.empty]
    t_all = pd.concat(loaded, ignore_index=True, sort=False) if loaded else pd.DataFrame()

    # Build list of all MMSIs
    all_mmsis = sorted(set(t_all['MMSI'].dropna().unique())) if 'MMSI' in t_all.columns else []

    # Last position per MMSI, preferring the newest file (data7 highest
    # priority). One vectorised pass per file replaces a boolean-mask scan per
    # (MMSI, file) pair. Sorting uses the parsed `_ts` with missing timestamps
    # FIRST, so a row without a timestamp is only chosen when the vessel has
    # no timestamped row in that file; the stable sort keeps file order on ties.
    per_file_last = []
    resolved = set()
    for df in reversed(dfs):
        if df.empty or 'MMSI' not in df.columns:
            continue
        pending = df[df['MMSI'].notna() & ~df['MMSI'].isin(resolved)]
        if pending.empty:
            continue
        pending = pending.sort_values('_ts', kind='stable', na_position='first')
        newest = pending.drop_duplicates(subset='MMSI', keep='last')
        per_file_last.append(newest)
        resolved.update(newest['MMSI'].tolist())
    if per_file_last:
        last_df = pd.concat(per_file_last, sort=False).sort_values('MMSI', kind='stable')
    else:
        last_df = pd.DataFrame(columns=t_all.columns)
    # Kept as a list of row Series for any later cell that still reads it.
    last_positions = [row for _, row in last_df.iterrows()]

    # Overall latest timestamp for disappearance detection
    overall_latest = None
    if '_ts' in t_all.columns and t_all['_ts'].notna().any():
        overall_latest = t_all['_ts'].max()

    # Length statistics: per row, the first length candidate column holding a
    # finite number (vectorised instead of iterating over every row).
    first_length = pd.Series(np.nan, index=t_all.index, dtype=float)
    for c in length_candidates:
        if c in t_all.columns:
            candidate = pd.to_numeric(t_all[c], errors='coerce').replace([np.inf, -np.inf], np.nan)
            first_length = first_length.fillna(candidate)
    lengths = first_length.dropna().tolist()
    if lengths:
        min_len = float(np.min(lengths))
        max_len = float(np.max(lengths))
    else:
        min_len, max_len = 0.0, 1.0
    norm = Normalize(vmin=min_len, vmax=max_len)
    # custom colormap: blue -> purple -> red
    cmap = LinearSegmentedColormap.from_list('blue_purple_red', ['blue', 'purple', 'red'])

    def _finite_float(value):
        """Return ``value`` as a finite float, or None if missing/unparsable/NaN/inf."""
        try:
            number = float(value)
        except (TypeError, ValueError, OverflowError):
            return None
        return number if np.isfinite(number) else None

    def color_for_length(L):
        """Map a ship length to a hex colour on the blue->purple->red scale.

        Unknown lengths (None or non-finite) get the purple default.
        """
        if L is None or not np.isfinite(L):
            return '#800080'  # purple default for unknown
        rgb = cmap(norm(L))[:3]
        return rgb2hex(rgb)

    def get_length_from_row(r):
        """Return the first usable length found in row ``r`` as a float, else None.

        Columns are tried in ``length_candidates`` order; missing, unparsable
        or non-finite values are skipped.
        """
        for c in length_candidates:
            if c in r and pd.notna(r.get(c)):
                value = _finite_float(r.get(c))
                if value is not None:
                    return value
        return None

    # Map center: mean of the last known positions, falling back to [0, 0]
    # when there are no usable coordinates (folium rejects NaN locations).
    center = [0, 0]
    if len(last_df) and 'LAT' in last_df.columns and 'LON' in last_df.columns:
        lat_mean = pd.to_numeric(last_df['LAT'], errors='coerce').mean()
        lon_mean = pd.to_numeric(last_df['LON'], errors='coerce').mean()
        if np.isfinite(lat_mean) and np.isfinite(lon_mean):
            center = [float(lat_mean), float(lon_mean)]

    # Create map
    m = folium.Map(location=center, zoom_start=5, tiles='CartoDB dark_matter')
    folium.TileLayer('https://tiles.openseamap.org/seamark/{z}/{x}/{y}.png', name='SeaMarks', attr='OpenSeaMap - seamark').add_to(m)
    Fullscreen().add_to(m)

    # Plot trails (dashed), ordered by timestamp where available. A single
    # groupby replaces one full-frame filter per MMSI; groups come out in
    # sorted MMSI order and rows with a missing MMSI are excluded.
    if 'MMSI' in t_all.columns and 'LAT' in t_all.columns and 'LON' in t_all.columns:
        for mmsi, group in t_all.groupby('MMSI', sort=True):
            if '_ts' in group.columns and group['_ts'].notna().any():
                group = group.sort_values('_ts', kind='stable')
            points = group[['LAT', 'LON']].apply(pd.to_numeric, errors='coerce').dropna()
            coords = points.to_numpy(dtype=float).tolist()
            if len(coords) >= 2:
                trail_color = color_for_length(get_length_from_row(group.iloc[-1]))
                PolyLine(locations=coords, color=trail_color, weight=2, opacity=0.85, dash_array='6,6').add_to(m)

    # AIS reports 511 when the true heading is not available (ITU-R M.1371);
    # NOTE(review): confirm this feed keeps that sentinel.
    AIS_HEADING_UNAVAILABLE = 511.0

    # Add last-position markers with arrow size based on ship length
    for _, row in last_df.iterrows():
        lat = _finite_float(row.get('LAT'))
        lon = _finite_float(row.get('LON'))
        if lat is None or lon is None:
            continue
        mmsi = row.get('MMSI')

        # float(NaN) does not raise, so missing headings must be detected
        # explicitly; otherwise the SVG ends up with `rotate(nandeg)`.
        heading = _finite_float(row.get('Heading'))
        if heading is not None and heading == AIS_HEADING_UNAVAILABLE:
            heading = None
        # Arrow rotation: true heading, else course over ground, else north.
        rotation = heading
        if rotation is None:
            rotation = _finite_float(row.get('COG'))
        if rotation is None:
            rotation = 0.0

        ship_length = get_length_from_row(row)
        # map length to size: emphasize relative differences but reduce overall sizes
        if ship_length is None or max_len == min_len:
            svg_size = 10
        else:
            frac = (ship_length - min_len) / max(1e-6, (max_len - min_len))
            # Clamp so an out-of-range length cannot make `frac ** 1.7` complex.
            frac = min(1.0, max(0.0, frac))
            raw = 10 + (frac ** 1.7) * 60
            svg_size = max(8, int(raw * 0.5))

        # disappearance detection: last fix more than one day older than the
        # newest timestamp seen anywhere in the data
        last_ts = row.get('_ts')
        disappeared = False
        if overall_latest is not None and last_ts is not None and pd.notna(last_ts):
            try:
                disappeared = (overall_latest - pd.to_datetime(last_ts)) > pd.Timedelta(days=1)
            except Exception:
                disappeared = False
        ex_html = '<span style="position:absolute;top:0;right:0;color:#ff3333;font-weight:bold;font-size:16px;">&#x2757;</span>' if disappeared else ''
        color = color_for_length(ship_length)
        # smaller arrows overall; size differences more noticeable
        svg = f"""<div style=\"position:relative;width:{svg_size}px;height:{svg_size}px;display:flex;align-items:center;justify-content:center;\">{ex_html}
    <svg xmlns=\"http://www.w3.org/2000/svg\" width=\"{svg_size}\" height=\"{svg_size}\" viewBox=\"0 0 24 24\" style=\"transform: rotate({rotation}deg); transform-origin: 12px 12px;\">
          <path d=\"M12 2 L19 21 L12 17 L5 21 Z\" fill=\"{color}\" stroke=\"#000000\" stroke-width=\"0.6\" />
        </svg>
        </div>"""
        icon = DivIcon(html=svg)
        source = row.get('__source', '')
        ts_val = row.get('_ts')
        # Vessel names are free text broadcast by the ships themselves:
        # escape them before embedding in popup HTML (also keeps '&' intact).
        vessel_name = row.get('VesselName', '')
        vessel_name = '' if pd.isna(vessel_name) else html.escape(str(vessel_name))
        heading_label = heading if heading is not None else 'N/A'
        popup_html = f"MMSI: {mmsi}<br>Vessel: {vessel_name}<br>Length: {ship_length if ship_length is not None else 'N/A'} m<br>Speed (SOG): {row.get('SOG','')}<br>Heading: {heading_label}<br>Source: {source}<br>Time: {ts_val}"
        iframe = IFrame(popup_html, width=340, height=140)
        popup = folium.Popup(iframe, max_width=380)
        folium.Marker(location=[lat, lon], icon=icon, popup=popup).add_to(m)

    # Finish and save
    folium.LayerControl().add_to(m)
    m.save('tankers_last_positions_map.html')

    # mark cache loaded so future runs are faster and won't reprint full read messages
    _tankers_cache_loaded = True


Reading tanker rows from files (only tanker rows will be loaded)...
Unique tankers in data.csv: 463
Unique tankers in data2.csv: 446
Unique tankers in data3.csv: 441
Unique tankers in data4.csv: 449
Unique tankers in data5.csv: 456
Unique tankers in data6.csv: 474
Unique tankers in data7.csv: 397
Present in all files: 241
Unique tankers in data.csv: 463
Unique tankers in data2.csv: 446
Unique tankers in data3.csv: 441
Unique tankers in data4.csv: 449
Unique tankers in data5.csv: 456
Unique tankers in data6.csv: 474
Unique tankers in data7.csv: 397
Present in all files: 241
