# Sea Ice Thickness Preprocessing

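This notebook processes the **NetCDF files (`heff.H*.nc`)** containing Arctic sea ice thickness into a clean, aggregated dataset: for each year and month, the mean thickness over all grid cells at latitude ≥ 65° with valid (non-missing) values, combined across files and saved as a single CSV.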

In [None]:
import pandas as pd
import xarray as xr
from pathlib import Path
import numpy as np

In [None]:
# Project root, taken as two directory levels above the current working directory
BASE_DIR = Path(".").resolve().parents[1]
print(BASE_DIR)
thickness_dir = BASE_DIR.joinpath("data", "raw", "thickness")

# Collect the yearly thickness NetCDF files in name order
files = list(thickness_dir.glob("heff.H*.nc"))
files.sort()
print(f"Looking for files in: {thickness_dir}")
print(f"Found {len(files)} files")

# Show a handful of matches as a quick sanity check
for sample in files[:5]:
    print("Example file:", sample.name)

# Nothing to process: stop here rather than run the later cells on no input
if len(files) == 0:
    raise FileNotFoundError(f"No files found in {thickness_dir} matching 'heff.H*.nc'")

# Accumulates one monthly-mean DataFrame per processed file
all_data = []

In [None]:
# Build one monthly-mean thickness table per NetCDF file and collect them in
# `all_data`.
#
# For every file: open it, check the required variables are present, flatten
# the thickness field together with per-cell year/month/lat/lon labels, keep
# cells at or above ARCTIC_MIN_LAT with non-missing thickness, then average
# thickness per (year, month).

# Start from an empty list so re-running this cell does not append a second
# copy of every file's results.
all_data = []

# Variables a file must provide to be processed (loop-invariant)
required_vars = ["heff", "lat_scaler", "lon_scaler", "year", "month"]

# Lowest latitude kept when averaging
ARCTIC_MIN_LAT = 65

for file in files:
    print(f"\nProcessing {file.name}...")
    try:
        ds = xr.open_dataset(file, decode_times=False)
    except Exception as e:
        # Best effort: report the unreadable file and move on to the next one
        print(f"Could not open {file.name}: {e}")
        continue

    # The context manager closes the file handle on every exit path,
    # including the `continue` below; previously datasets were never closed.
    with ds:
        if not all(var in ds.variables for var in required_vars):
            print(f"Skipping {file.name} — missing required variables")
            continue

        # Extract dimensions
        n_time = ds.sizes["n"]
        n_lat = ds.sizes["j"]
        n_lon = ds.sizes["i"]
        n_cells = n_lat * n_lon

        # A scalar year is expanded to one entry per time step
        if ds["year"].size == 1:
            year_vals = np.repeat(ds["year"].item(), n_time)
        else:
            year_vals = ds["year"].values.flatten()

        month_vals = ds["month"].values.flatten()

        # Each time step's year/month is repeated for every grid cell, and the
        # flattened lat/lon grid is tiled once per time step.
        # NOTE(review): this alignment assumes `heff` is stored with dimension
        # order (n, j, i) and the lat/lon grids with (j, i) — confirm.
        years = np.repeat(year_vals, n_cells)
        months = np.repeat(month_vals, n_cells)
        lats = np.tile(ds["lat_scaler"].values.flatten(), n_time)
        lons = np.tile(ds["lon_scaler"].values.flatten(), n_time)

        # `.values` yields in-memory NumPy arrays, so they remain usable
        # after the dataset is closed.
        thick_m = ds["heff"].values.reshape(-1)

    # Create DataFrame with one row per (time step, grid cell)
    df = pd.DataFrame({
        "year": years,
        "month": months,
        "lat": lats,
        "lon": lons,
        "thick_m": thick_m,
    })

    # Filter Arctic latitudes & non-null values
    df = df[(df["lat"] >= ARCTIC_MIN_LAT) & df["thick_m"].notna()]

    if df.empty:
        print(f"No valid data after filtering in {file.name}")
        continue

    # Aggregate monthly mean
    df_monthly = df.groupby(["year", "month"], as_index=False)["thick_m"].mean()
    all_data.append(df_monthly)

In [None]:
# Combine the per-file monthly means into one table, ordered by year and
# month, and write it to data/pre_processed as CSV. If no file produced any
# rows, only a diagnostic message is printed and nothing is written.
if all_data:
    df_final = pd.concat(all_data, ignore_index=True).sort_values(["year", "month"])
    # NOTE(review): the file name says "annual" but the rows are monthly means;
    # the name is kept unchanged in case other code reads this path — confirm.
    output_file = BASE_DIR / "data" / "pre_processed" / "new_thickness_annual.csv"
    # to_csv does not create missing folders, so make sure the target exists
    output_file.parent.mkdir(parents=True, exist_ok=True)
    df_final.to_csv(output_file, index=False)
    print(f"\n Saved combined file: {output_file}")
else:
    print("\n No valid data found in any files — check variable names and filtering criteria.")


D:\Msc Data and Computational Science\Summer\Projects in Maths Modelling\Github\project-acm40960-ss
Looking for files in: D:\Msc Data and Computational Science\Summer\Projects in Maths Modelling\Github\project-acm40960-ss\data\raw\thickness
Found 44 files
Example file: heff.H1979.nc
Example file: heff.H1980.nc
Example file: heff.H1981.nc
Example file: heff.H1982.nc
Example file: heff.H1983.nc

Processing heff.H1979.nc...

Processing heff.H1980.nc...

Processing heff.H1981.nc...

Processing heff.H1982.nc...

Processing heff.H1983.nc...

Processing heff.H1984.nc...

Processing heff.H1985.nc...

Processing heff.H1986.nc...

Processing heff.H1987.nc...

Processing heff.H1988.nc...

Processing heff.H1989.nc...

Processing heff.H1990.nc...

Processing heff.H1991.nc...

Processing heff.H1992.nc...

Processing heff.H1993.nc...

Processing heff.H1994.nc...

Processing heff.H1995.nc...

Processing heff.H1996.nc...

Processing heff.H1997.nc...

Processing heff.H1998.nc...

Processing heff.H1999.n