In [None]:
import os

import altair as alt
import pandas as pd

In [None]:
# Dataset downloaded from http://aisdata.ais.dk/aisdk-2026-01-17.zip
# Set the AIS_DATA_PATH environment variable to point at your own copy of
# the CSV; when it is unset, the original author's download location is used.
dataPath = os.environ.get(
    "AIS_DATA_PATH",
    "/home/joajohan/Downloads/aisdk-2026-01-17.csv",
)

In [None]:
# Read the data file. The full dataset is huge, but we only care about
# one entry per ship (one row per MMSI), so stream it in chunks and
# de-duplicate every chunk before concatenating; that way the whole file
# never has to be held in memory at once.
# Passing `chunksize` is enough to get an iterator of DataFrames, and the
# `with` block closes the underlying file even if parsing fails midway.
with pd.read_csv(dataPath, chunksize = 5000) as iter_csv:
    df = pd.concat([chunk.drop_duplicates(subset=["MMSI"]) for chunk in iter_csv])

# The same MMSI can still show up in several chunks, so de-duplicate once
# more across the concatenated result (the first occurrence is kept).
df = df.drop_duplicates(subset=["MMSI"])

df

A large majority of the entries have `Undefined` ship types:

In [None]:
# Horizontal bar chart: number of rows per ship type, with the count
# axis drawn on a logarithmic scale.
(
    alt.Chart(df)
    .mark_bar()
    .encode(
        alt.X('count()').scale(type="log"),
        alt.Y("Ship type"),
    )
)

In [None]:
def _dimension_range_chart(data, field, hide_y_title=False):
    """Layer per-row ticks over a min-max span bar for one dimension column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a "Ship type" column and the numeric column `field`.
    field : str
        Name of the column to plot on the x axis (e.g. "Length" or "Width").
        The x-axis title becomes "<field> (m)".
    hide_y_title : bool, default False
        When True, the tick layer's y-axis title is set to the empty string,
        which is useful for the right-hand panel of a side-by-side layout
        where the title would otherwise be repeated.

    Returns
    -------
    alt.LayerChart
        The tick layer with the translucent span bar layered on top of it.
    """
    ticks_y = alt.Y("Ship type")
    if hide_y_title:
        ticks_y = ticks_y.title("")

    # One tick per row, grouped by ship type.
    ticks = alt.Chart(data).mark_tick().encode(
        x = alt.X(field),
        y = ticks_y,
    )
    # Thin, rounded, translucent bar stretching from the smallest to the
    # largest value observed for each ship type.
    span = alt.Chart(data).mark_bar(cornerRadius=10, height=5, opacity=0.2).encode(
        x = alt.X(f"min({field})").title(f"{field} (m)"),
        x2 = alt.X2(f"max({field})"),
        y = alt.Y("Ship type"),
    )
    return ticks + span

# Length panel on the left, width panel on the right.
_dimension_range_chart(df, "Length") | _dimension_range_chart(df, "Width", hide_y_title=True)

In [None]:
def scatterMarginHist(df, type, length_domain=(0, 400), width_domain=(0, 60)):
    """Scatter plot of length vs. width for one ship type, with marginal histograms.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with "Ship type", "Length" and "Width" columns.
    type : str
        Value of the "Ship type" column to keep; also used as the title of
        the scatter plot. (The name shadows the `type` builtin inside this
        function; it is kept so existing callers continue to work.)
    length_domain : tuple, default (0, 400)
        Fixed x-axis domain of the scatter plot, also used as the binning
        extent of the length histogram.
    width_domain : tuple, default (0, 60)
        Fixed y-axis domain of the scatter plot, also used as the binning
        extent of the width histogram.

    Returns
    -------
    alt.VConcatChart
        The length histogram stacked above the scatter plot, which has the
        width histogram to its right.
    """
    # Only rows of the requested ship type feed all three sub-charts.
    source = df.loc[df['Ship type'] == type]
    base = alt.Chart(source)
    base_bar = base.mark_bar(opacity=0.3, binSpacing=0)

    # Fixed domains so that panels built for different ship types use the
    # same axis ranges.
    xscale = alt.Scale(domain=length_domain)
    yscale = alt.Scale(domain=width_domain)

    points = base.mark_point(size=100).encode(
        alt.X("Length").scale(xscale).title("Length (m)"),
        alt.Y("Width").scale(yscale).title("Width (m)"),
        color="Ship type",
    ).properties(
        title=type
    )

    # Histogram of lengths, binned over the same extent as the scatter's
    # x axis.
    top_hist = (
        base_bar
        .encode(
            alt.X("Length:Q").bin(
                maxbins=20, extent=xscale.domain
            ).stack(None).title(""),
            alt.Y("count()").stack(None).title(""),
            alt.Color("Ship type:N"),
        )
        .properties(height=60)
    )

    # Histogram of widths, drawn sideways and binned over the same extent
    # as the scatter's y axis.
    right_hist = (
        base_bar
        .encode(
            alt.Y("Width:Q")
                .bin(maxbins=20, extent=yscale.domain)
                .stack(None)
                .title(""),
            alt.X("count()").stack(None).title(""),
            alt.Color("Ship type:N"),
        )
        .properties(width=60)
    )

    return top_hist & (points | right_hist)

# One scatter-plus-histograms panel per ship type, placed side by side.
(
    scatterMarginHist(df, "Cargo")
    | scatterMarginHist(df, "Passenger")
    | scatterMarginHist(df, "Tanker")
)