In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from ise.data.ISMIP6.feature_engineer import FeatureEngineer
from ise.utils import functions as f

In [58]:
# Scratch check: draw one sample from a normal distribution (mean 352, std 128)
# and round it. No seed is set in this cell, so the value changes on every run.
np.round(np.random.normal(loc=352, scale=128))

304.0

In [None]:
# Load dataset.csv into a DataFrame for a first look at the data.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
d = pd.read_csv(r"/users/pvankatw/research/ise/dataset/dataset.csv")

In [None]:
# Histogram of the sle column exactly as read from the CSV (no processing yet).
# Explicit axes + labels so the figure stands on its own, and plt.show() so the
# cell does not echo the (counts, bins, patches) tuple returned by hist().
fig, ax = plt.subplots()
ax.hist(d["sle"])
ax.set_title("dataset.csv: sle distribution")
ax.set_xlabel("sle")
ax.set_ylabel("Frequency")
plt.show()

In [None]:
# Keyword arguments for FeatureEngineer, collected up front and unpacked in the
# call below. The CSV is read fresh here rather than reusing an earlier frame.
fe_options = {
    "ice_sheet": 'AIS',
    "data": pd.read_csv(r"/users/pvankatw/research/ise/dataset/dataset.csv"),
    "fill_mrro_nans": None,
    "split_dataset": False,
    "output_directory": None,
}
fe = FeatureEngineer(**fe_options)

In [None]:
# Remove the mrro_anomaly column from the working frame.
# errors='ignore' keeps this cell idempotent: re-running it after the column is
# already gone is a no-op instead of raising KeyError.
fe.data = fe.data.drop(columns='mrro_anomaly', errors='ignore')

In [None]:
# NOTE(review): presumably augments fe.data with per-model descriptor columns —
# confirm against FeatureEngineer.add_model_characteristics.
fe.add_model_characteristics()

In [None]:
# Outlier handling on 'sle' using the 'quantile' method with bounds 0.005 and
# 0.995 (written as 1-0.005).
# NOTE(review): presumably rows outside these quantiles are removed from
# fe.data — confirm against FeatureEngineer.drop_outliers.
fe.drop_outliers('quantile', 'sle', quantiles=[0.005, 1-0.005])

In [None]:
# NOTE(review): presumably rescales columns of fe.data in place; which columns
# and which scaler are not visible here — confirm against FeatureEngineer.scale_data.
fe.scale_data()

In [None]:
# NOTE(review): presumably adds lagged copies of the predictors for 5 lags —
# confirm the meaning of the argument against FeatureEngineer.add_lag_variables.
fe.add_lag_variables(5)

In [None]:
# Display the current state of fe.data for inspection.
fe.data

In [None]:
# Histogram of sle in fe.data as it stands after the preceding cells.
# Explicit axes + labels so the figure stands on its own, and plt.show() so the
# cell does not echo the (counts, bins, patches) tuple returned by hist().
fig, ax = plt.subplots()
ax.hist(fe.data["sle"])
ax.set_title("fe.data: sle distribution")
ax.set_xlabel("sle")
ax.set_ylabel("Frequency")
plt.show()

In [None]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go

# Interactive line plot of every run in fe.data: one WebGL trace per model,
# plus a dropdown that isolates a single model.

# -----------------------
# 1) Setup
# -----------------------
df = fe.data.copy()  # copy so the series_id column added below stays out of fe.data
# Optional: exclude specific models before plotting, e.g.
# df = df[(df.model != "fETISh_16km") & (df.model != "fETISh_32km")]

# Column drawn on the y-axis (e.g. "smb_anomaly", "pr_anomaly", ...).
y_col = "sle"

# Use "time" when present, otherwise fall back to "year".
time_col = "time" if "time" in df.columns else "year"

# Basic checks
assert time_col in df.columns, "Couldn't find a 'time' or 'year' column."
assert "model" in df.columns, "Expected a 'model' column."

# -----------------------
# 2) Infer series_id from resets in the time column
# -----------------------
t = df[time_col].to_numpy()
# A new run starts wherever time does not increase (diff <= 0). This relies on
# each run's rows being contiguous and in chronological order.
# The slice keeps the mask the same length as df when df is empty.
reset = np.r_[True, np.diff(t) <= 0][: len(t)]
series_id = np.cumsum(reset) - 1
df["series_id"] = series_id

# If every run is known to be exactly 86 rows long and perfectly stacked, the
# inference above can be overridden / verified with:
# df["series_id"] = np.repeat(np.arange(len(df)//86), 86)
# lens = df.groupby("series_id").size()
# assert lens.min() == 86 and lens.max() == 86, "Not all runs are length 86."

# -----------------------
# 3) Build traces: one trace per model, with NaN gaps between runs
#    (a single trace can then draw many separate runs)
# -----------------------
fig = go.Figure()

# Select rows on the stringified model name so the mask always matches the
# names in models_sorted, even when the column is not of string dtype.
model_names = df["model"].astype(str)
models = model_names.unique().tolist()
models_sorted = sorted(models)

# Hover label: model, inferred series id, x and y.
hover_tmpl = (
    f"<b>Model</b>: %{{customdata[0]}}<br>"
    f"<b>Series</b>: %{{customdata[1]}}<br>"
    f"<b>{time_col}</b>: %{{x}}<br>"
    f"<b>{y_col}</b>: %{{y}}<extra></extra>"
)

gap_xy = np.array([[np.nan, np.nan]])  # NaN row that breaks the line between runs
gap_cd = np.array([["", np.nan]])      # matching customdata placeholder row

for m in models_sorted:
    # order by series then time
    sub = df[model_names == m].sort_values(["series_id", time_col])

    parts = []
    custom = []
    for sid, g in sub.groupby("series_id", sort=True):
        parts.append(g[[time_col, y_col]].to_numpy())
        custom.append(np.column_stack([g["model"].astype(str).to_numpy(),
                                       np.full(len(g), sid)]))
        parts.append(gap_xy)
        custom.append(gap_cd)

    if not parts:
        continue

    xy = np.vstack(parts)
    cd = np.vstack(custom)

    fig.add_trace(
        go.Scattergl(
            x=xy[:, 0],
            y=xy[:, 1],
            mode="lines",
            name=m,
            line=dict(width=1),
            opacity=0.6,
            customdata=cd,
            hovertemplate=hover_tmpl,
            visible=True  # visibility is controlled by the dropdown below
        )
    )

# -----------------------
# 4) Layout + dropdown to filter by model
# -----------------------
# Build the visibility lists from the traces that were actually added, so each
# button always toggles the trace it is labelled with.
trace_names = [trace.name for trace in fig.data]

# Button 0: show all models
buttons = [{
    "label": "All models",
    "method": "update",
    "args": [{"visible": [True]*len(trace_names)}]
}]

# One button per model (show only that model)
for name in trace_names:
    buttons.append({
        "label": name,
        "method": "update",
        "args": [{"visible": [other == name for other in trace_names]}]
    })

fig.update_layout(
    title=f"Full {y_col} over {time_col} (86-year runs) — colored by model (dropdown to filter)",
    xaxis_title=time_col,
    yaxis_title=y_col,
    legend_title="Model",
    updatemenus=[{
        "buttons": buttons,
        "direction": "down",
        "showactive": True,
        "x": 1.0, "xanchor": "right",
        "y": 1.15, "yanchor": "top"
    }],
    hovermode="x unified",
    template="plotly_white",
    height=700
)

fig.show()


In [None]:
# Split fractions for train / validation / test, unpacked as keyword arguments.
split_fractions = {"train_size": 0.7, "val_size": 0.15, "test_size": 0.15}
train, val, test = fe.split_data(**split_fractions)

In [None]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go

# Interactive line plot of every run in `train`: one WebGL trace per model,
# plus a dropdown that isolates a single model.

# -----------------------
# 1) Setup
# -----------------------
df = train.copy()  # copy so the series_id column added below stays out of train
# Optional: exclude specific models before plotting, e.g.
# df = df[(df.model != "fETISh_16km") & (df.model != "fETISh_32km")]

# Column drawn on the y-axis (e.g. "smb_anomaly", "pr_anomaly", ...).
y_col = "sle"

# Use "time" when present, otherwise fall back to "year".
time_col = "time" if "time" in df.columns else "year"

# Basic checks
assert time_col in df.columns, "Couldn't find a 'time' or 'year' column."
assert "model" in df.columns, "Expected a 'model' column."

# -----------------------
# 2) Infer series_id from resets in the time column
# -----------------------
t = df[time_col].to_numpy()
# A new run starts wherever time does not increase (diff <= 0). This relies on
# each run's rows being contiguous and in chronological order.
# The slice keeps the mask the same length as df when df is empty.
reset = np.r_[True, np.diff(t) <= 0][: len(t)]
series_id = np.cumsum(reset) - 1
df["series_id"] = series_id

# If every run is known to be exactly 86 rows long and perfectly stacked, the
# inference above can be overridden / verified with:
# df["series_id"] = np.repeat(np.arange(len(df)//86), 86)
# lens = df.groupby("series_id").size()
# assert lens.min() == 86 and lens.max() == 86, "Not all runs are length 86."

# -----------------------
# 3) Build traces: one trace per model, with NaN gaps between runs
#    (a single trace can then draw many separate runs)
# -----------------------
fig = go.Figure()

# Select rows on the stringified model name so the mask always matches the
# names in models_sorted, even when the column is not of string dtype.
model_names = df["model"].astype(str)
models = model_names.unique().tolist()
models_sorted = sorted(models)

# Hover label: model, inferred series id, x and y.
hover_tmpl = (
    f"<b>Model</b>: %{{customdata[0]}}<br>"
    f"<b>Series</b>: %{{customdata[1]}}<br>"
    f"<b>{time_col}</b>: %{{x}}<br>"
    f"<b>{y_col}</b>: %{{y}}<extra></extra>"
)

gap_xy = np.array([[np.nan, np.nan]])  # NaN row that breaks the line between runs
gap_cd = np.array([["", np.nan]])      # matching customdata placeholder row

for m in models_sorted:
    # order by series then time
    sub = df[model_names == m].sort_values(["series_id", time_col])

    parts = []
    custom = []
    for sid, g in sub.groupby("series_id", sort=True):
        parts.append(g[[time_col, y_col]].to_numpy())
        custom.append(np.column_stack([g["model"].astype(str).to_numpy(),
                                       np.full(len(g), sid)]))
        parts.append(gap_xy)
        custom.append(gap_cd)

    if not parts:
        continue

    xy = np.vstack(parts)
    cd = np.vstack(custom)

    fig.add_trace(
        go.Scattergl(
            x=xy[:, 0],
            y=xy[:, 1],
            mode="lines",
            name=m,
            line=dict(width=1),
            opacity=0.6,
            customdata=cd,
            hovertemplate=hover_tmpl,
            visible=True  # visibility is controlled by the dropdown below
        )
    )

# -----------------------
# 4) Layout + dropdown to filter by model
# -----------------------
# Build the visibility lists from the traces that were actually added, so each
# button always toggles the trace it is labelled with.
trace_names = [trace.name for trace in fig.data]

# Button 0: show all models
buttons = [{
    "label": "All models",
    "method": "update",
    "args": [{"visible": [True]*len(trace_names)}]
}]

# One button per model (show only that model)
for name in trace_names:
    buttons.append({
        "label": name,
        "method": "update",
        "args": [{"visible": [other == name for other in trace_names]}]
    })

fig.update_layout(
    title=f"Train {y_col} over {time_col} (86-year runs) — colored by model (dropdown to filter)",
    xaxis_title=time_col,
    yaxis_title=y_col,
    legend_title="Model",
    updatemenus=[{
        "buttons": buttons,
        "direction": "down",
        "showactive": True,
        "x": 1.0, "xanchor": "right",
        "y": 1.15, "yanchor": "top"
    }],
    hovermode="x unified",
    template="plotly_white",
    height=700
)

fig.show()


In [None]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go

# Interactive line plot of every run in `val`: one WebGL trace per model,
# plus a dropdown that isolates a single model.

# -----------------------
# 1) Setup
# -----------------------
df = val.copy()  # copy so the series_id column added below stays out of val
# Optional: exclude specific models before plotting, e.g.
# df = df[(df.model != "fETISh_16km") & (df.model != "fETISh_32km")]

# Column drawn on the y-axis (e.g. "smb_anomaly", "pr_anomaly", ...).
y_col = "sle"

# Use "time" when present, otherwise fall back to "year".
time_col = "time" if "time" in df.columns else "year"

# Basic checks
assert time_col in df.columns, "Couldn't find a 'time' or 'year' column."
assert "model" in df.columns, "Expected a 'model' column."

# -----------------------
# 2) Infer series_id from resets in the time column
# -----------------------
t = df[time_col].to_numpy()
# A new run starts wherever time does not increase (diff <= 0). This relies on
# each run's rows being contiguous and in chronological order.
# The slice keeps the mask the same length as df when df is empty.
reset = np.r_[True, np.diff(t) <= 0][: len(t)]
series_id = np.cumsum(reset) - 1
df["series_id"] = series_id

# If every run is known to be exactly 86 rows long and perfectly stacked, the
# inference above can be overridden / verified with:
# df["series_id"] = np.repeat(np.arange(len(df)//86), 86)
# lens = df.groupby("series_id").size()
# assert lens.min() == 86 and lens.max() == 86, "Not all runs are length 86."

# -----------------------
# 3) Build traces: one trace per model, with NaN gaps between runs
#    (a single trace can then draw many separate runs)
# -----------------------
fig = go.Figure()

# Select rows on the stringified model name so the mask always matches the
# names in models_sorted, even when the column is not of string dtype.
model_names = df["model"].astype(str)
models = model_names.unique().tolist()
models_sorted = sorted(models)

# Hover label: model, inferred series id, x and y.
hover_tmpl = (
    f"<b>Model</b>: %{{customdata[0]}}<br>"
    f"<b>Series</b>: %{{customdata[1]}}<br>"
    f"<b>{time_col}</b>: %{{x}}<br>"
    f"<b>{y_col}</b>: %{{y}}<extra></extra>"
)

gap_xy = np.array([[np.nan, np.nan]])  # NaN row that breaks the line between runs
gap_cd = np.array([["", np.nan]])      # matching customdata placeholder row

for m in models_sorted:
    # order by series then time
    sub = df[model_names == m].sort_values(["series_id", time_col])

    parts = []
    custom = []
    for sid, g in sub.groupby("series_id", sort=True):
        parts.append(g[[time_col, y_col]].to_numpy())
        custom.append(np.column_stack([g["model"].astype(str).to_numpy(),
                                       np.full(len(g), sid)]))
        parts.append(gap_xy)
        custom.append(gap_cd)

    if not parts:
        continue

    xy = np.vstack(parts)
    cd = np.vstack(custom)

    fig.add_trace(
        go.Scattergl(
            x=xy[:, 0],
            y=xy[:, 1],
            mode="lines",
            name=m,
            line=dict(width=1),
            opacity=0.6,
            customdata=cd,
            hovertemplate=hover_tmpl,
            visible=True  # visibility is controlled by the dropdown below
        )
    )

# -----------------------
# 4) Layout + dropdown to filter by model
# -----------------------
# Build the visibility lists from the traces that were actually added, so each
# button always toggles the trace it is labelled with.
trace_names = [trace.name for trace in fig.data]

# Button 0: show all models
buttons = [{
    "label": "All models",
    "method": "update",
    "args": [{"visible": [True]*len(trace_names)}]
}]

# One button per model (show only that model)
for name in trace_names:
    buttons.append({
        "label": name,
        "method": "update",
        "args": [{"visible": [other == name for other in trace_names]}]
    })

fig.update_layout(
    title=f"Val {y_col} over {time_col} (86-year runs) — colored by model (dropdown to filter)",
    xaxis_title=time_col,
    yaxis_title=y_col,
    legend_title="Model",
    updatemenus=[{
        "buttons": buttons,
        "direction": "down",
        "showactive": True,
        "x": 1.0, "xanchor": "right",
        "y": 1.15, "yanchor": "top"
    }],
    hovermode="x unified",
    template="plotly_white",
    height=700
)

fig.show()


In [None]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go

# Interactive line plot of every run in `test`: one WebGL trace per model,
# plus a dropdown that isolates a single model.

# -----------------------
# 1) Setup
# -----------------------
df = test.copy()  # copy so the series_id column added below stays out of test
# Optional: exclude specific models before plotting, e.g.
# df = df[(df.model != "fETISh_16km") & (df.model != "fETISh_32km")]

# Column drawn on the y-axis (e.g. "smb_anomaly", "pr_anomaly", ...).
y_col = "sle"

# Use "time" when present, otherwise fall back to "year".
time_col = "time" if "time" in df.columns else "year"

# Basic checks
assert time_col in df.columns, "Couldn't find a 'time' or 'year' column."
assert "model" in df.columns, "Expected a 'model' column."

# -----------------------
# 2) Infer series_id from resets in the time column
# -----------------------
t = df[time_col].to_numpy()
# A new run starts wherever time does not increase (diff <= 0). This relies on
# each run's rows being contiguous and in chronological order.
# The slice keeps the mask the same length as df when df is empty.
reset = np.r_[True, np.diff(t) <= 0][: len(t)]
series_id = np.cumsum(reset) - 1
df["series_id"] = series_id

# If every run is known to be exactly 86 rows long and perfectly stacked, the
# inference above can be overridden / verified with:
# df["series_id"] = np.repeat(np.arange(len(df)//86), 86)
# lens = df.groupby("series_id").size()
# assert lens.min() == 86 and lens.max() == 86, "Not all runs are length 86."

# -----------------------
# 3) Build traces: one trace per model, with NaN gaps between runs
#    (a single trace can then draw many separate runs)
# -----------------------
fig = go.Figure()

# Select rows on the stringified model name so the mask always matches the
# names in models_sorted, even when the column is not of string dtype.
model_names = df["model"].astype(str)
models = model_names.unique().tolist()
models_sorted = sorted(models)

# Hover label: model, inferred series id, x and y.
hover_tmpl = (
    f"<b>Model</b>: %{{customdata[0]}}<br>"
    f"<b>Series</b>: %{{customdata[1]}}<br>"
    f"<b>{time_col}</b>: %{{x}}<br>"
    f"<b>{y_col}</b>: %{{y}}<extra></extra>"
)

gap_xy = np.array([[np.nan, np.nan]])  # NaN row that breaks the line between runs
gap_cd = np.array([["", np.nan]])      # matching customdata placeholder row

for m in models_sorted:
    # order by series then time
    sub = df[model_names == m].sort_values(["series_id", time_col])

    parts = []
    custom = []
    for sid, g in sub.groupby("series_id", sort=True):
        parts.append(g[[time_col, y_col]].to_numpy())
        custom.append(np.column_stack([g["model"].astype(str).to_numpy(),
                                       np.full(len(g), sid)]))
        parts.append(gap_xy)
        custom.append(gap_cd)

    if not parts:
        continue

    xy = np.vstack(parts)
    cd = np.vstack(custom)

    fig.add_trace(
        go.Scattergl(
            x=xy[:, 0],
            y=xy[:, 1],
            mode="lines",
            name=m,
            line=dict(width=1),
            opacity=0.6,
            customdata=cd,
            hovertemplate=hover_tmpl,
            visible=True  # visibility is controlled by the dropdown below
        )
    )

# -----------------------
# 4) Layout + dropdown to filter by model
# -----------------------
# Build the visibility lists from the traces that were actually added, so each
# button always toggles the trace it is labelled with.
trace_names = [trace.name for trace in fig.data]

# Button 0: show all models
buttons = [{
    "label": "All models",
    "method": "update",
    "args": [{"visible": [True]*len(trace_names)}]
}]

# One button per model (show only that model)
for name in trace_names:
    buttons.append({
        "label": name,
        "method": "update",
        "args": [{"visible": [other == name for other in trace_names]}]
    })

fig.update_layout(
    title=f"Test {y_col} over {time_col} (86-year runs) — colored by model (dropdown to filter)",
    xaxis_title=time_col,
    yaxis_title=y_col,
    legend_title="Model",
    updatemenus=[{
        "buttons": buttons,
        "direction": "down",
        "showactive": True,
        "x": 1.0, "xanchor": "right",
        "y": 1.15, "yanchor": "top"
    }],
    hovermode="x unified",
    template="plotly_white",
    height=700
)

fig.show()


In [None]:
import matplotlib.pyplot as plt

# Side-by-side sle histograms for the three splits, drawn on a shared y-axis.
fig, axes = plt.subplots(1, 3, figsize=(15, 5), sharey=True)

# (panel title, split frame, bar colour) for each of the three panels
panels = [
    ("Train", train, 'steelblue'),
    ("Validation", val, 'seagreen'),
    ("Test", test, 'darkorange'),
]

for ax, (label, split, colour) in zip(axes, panels):
    ax.hist(split['sle'], bins=30, color=colour, alpha=0.7, edgecolor='black')
    ax.set_title(label)
    ax.set_xlabel("SLE")

# Only the left-most panel carries the y-axis label.
axes[0].set_ylabel("Frequency")

plt.tight_layout()
plt.show()


In [None]:
import pandas as pd


def _sle_summary(frame):
    """Return the summary statistics of frame['sle'], in the row order used below."""
    sle = frame['sle']
    return [
        sle.mean(),
        sle.std(),
        sle.min(),
        sle.max(),
        sle.median(),
        sle.quantile(0.25),
        sle.quantile(0.75),
        len(frame),
    ]


# One column per split, one row per statistic.
stat_labels = [
    "Mean",
    "Std Dev",
    "Min",
    "Max",
    "Median",
    "25th Percentile",
    "75th Percentile",
    "Count"
]
splits = (("Train", train), ("Validation", val), ("Test", test))
summary = pd.DataFrame(
    {split_name: _sle_summary(frame) for split_name, frame in splits},
    index=stat_labels,
)

display(summary.round(3))


In [None]:
# Persist the three splits as CSV.
# index=False keeps the row index out of the files, so that reading them back
# with a plain pd.read_csv() does not add a spurious "Unnamed: 0" column.
train.to_csv(r"/users/pvankatw/research/ise/dataset/train.csv", index=False)
val.to_csv(r"/users/pvankatw/research/ise/dataset/val.csv", index=False)
test.to_csv(r"/users/pvankatw/research/ise/dataset/test.csv", index=False)

In [None]:
# Reload the saved training split and pass it to f.get_X_y with the 'sectors'
# argument, requesting numpy output.
# The path is a plain raw string: it has no placeholders, so no f-prefix is needed.
X_train, y_train = f.get_X_y(
    pd.read_csv(r"/users/pvankatw/research/ise/dataset/train.csv"),
    'sectors',
    return_format='numpy',
)

In [None]:
# Display X_train as returned by f.get_X_y.
X_train

In [None]:
# True if at least one entry of X_train is NaN.
np.isnan(X_train).any()

In [None]:
# Dimensions of X_train.
X_train.shape