In [20]:
import xarray as xr
import numpy as np
import pandas as pd
from pathlib import Path

In [21]:
# File Path
BASE_DIR = Path().resolve().parents[1]
output_path = BASE_DIR / "data" / "processed" / "era5_arctic.csv"

# Load Arctic gridded temperature NetCDF file
arctic_path_1 = BASE_DIR / "data" / "raw" / "data_stream-moda_stepType-avgad.nc"
arctic_path_2 = BASE_DIR / "data" / "raw" / "data_stream-moda_stepType-avgua.nc"
ds_1 = xr.open_dataset(arctic_path_1, decode_times= True)
ds_2 = xr.open_dataset(arctic_path_2, decode_times= True)

In [22]:
print(ds_1)

<xarray.Dataset> Size: 2GB
Dimensions:     (valid_time: 1027, latitude: 95, longitude: 1440)
Coordinates:
    number      int64 8B ...
  * valid_time  (valid_time) datetime64[ns] 8kB 1940-01-01T06:00:00 ... 2025-...
  * latitude    (latitude) float64 760B 90.0 89.75 89.5 ... 67.0 66.75 66.5
  * longitude   (longitude) float64 12kB -180.0 -179.8 -179.5 ... 179.5 179.8
    expver      (valid_time) <U4 16kB ...
Data variables:
    cdir        (valid_time, latitude, longitude) float32 562MB ...
    uvb         (valid_time, latitude, longitude) float32 562MB ...
    slhf        (valid_time, latitude, longitude) float32 562MB ...
    sf          (valid_time, latitude, longitude) float32 562MB ...
Attributes:
    GRIB_centre:             ecmf
    GRIB_centreDescription:  European Centre for Medium-Range Weather Forecasts
    GRIB_subCentre:          0
    Conventions:             CF-1.7
    institution:             European Centre for Medium-Range Weather Forecasts
    history:               

In [23]:
print(ds_2)

<xarray.Dataset> Size: 6GB
Dimensions:     (valid_time: 1027, latitude: 95, longitude: 1440)
Coordinates:
    number      int64 8B ...
  * valid_time  (valid_time) datetime64[ns] 8kB 1940-01-01 ... 2025-07-01
  * latitude    (latitude) float64 760B 90.0 89.75 89.5 ... 67.0 66.75 66.5
  * longitude   (longitude) float64 12kB -180.0 -179.8 -179.5 ... 179.5 179.8
    expver      (valid_time) <U4 16kB ...
Data variables:
    t2m         (valid_time, latitude, longitude) float32 562MB ...
    sst         (valid_time, latitude, longitude) float32 562MB ...
    istl1       (valid_time, latitude, longitude) float32 562MB ...
    istl2       (valid_time, latitude, longitude) float32 562MB ...
    istl3       (valid_time, latitude, longitude) float32 562MB ...
    istl4       (valid_time, latitude, longitude) float32 562MB ...
    skt         (valid_time, latitude, longitude) float32 562MB ...
    ishf        (valid_time, latitude, longitude) float32 562MB ...
    tcc         (valid_time, latitu

In [24]:
# Drop extra coords
ds_1 = ds_1.drop_vars(["number", "expver"], errors="ignore")
ds_2 = ds_2.drop_vars(["number", "expver"], errors="ignore")

# Average over spatial dims (lat/lon) if they exist
if {"latitude", "longitude"}.issubset(ds_1.dims):
    ds_1 = ds_1.mean(dim=["latitude", "longitude"], skipna=True)
if {"latitude", "longitude"}.issubset(ds_2.dims):
    ds_2 = ds_2.mean(dim=["latitude", "longitude"], skipna=True)

In [25]:
# Merge on time
ds_combined = xr.merge([ds_1, ds_2])

# Convert to DataFrame
df = ds_combined.to_dataframe().reset_index()

# Extract year/month
df["year"] = df["valid_time"].dt.year
df["month"] = df["valid_time"].dt.month

# Drop 'time' column if not needed
df = df.drop(columns=["valid_time"], errors="ignore")

In [26]:
# Save
df.to_csv(output_path, index=False)

print(df.head())

            cdir           uvb         slhf        sf         t2m         sst  \
0            NaN           NaN          NaN       NaN  248.543884  271.808777   
1   10817.826172   2392.978271 -567206.2500  0.000570         NaN         NaN   
2            NaN           NaN          NaN       NaN  247.050034  271.761047   
3  376952.531250  56916.292969 -594960.5625  0.000517         NaN         NaN   
4            NaN           NaN          NaN       NaN  247.339096  271.748749   

        istl1       istl2       istl3       istl4         skt      ishf  \
0  255.572723  256.910950  261.280609  266.890137  247.700409  8.020420   
1         NaN         NaN         NaN         NaN         NaN       NaN   
2  253.778244  255.169205  259.894867  266.239319  246.275787  5.975832   
3         NaN         NaN         NaN         NaN         NaN       NaN   
4  253.694855  255.157516  259.997375  266.300415  246.610519  4.191311   

        tcc         tsn  year  month  
0  0.791489  248.192078

In [30]:
import pandas as pd

# Read as plain text
with open(output_path, "r", encoding="utf-8") as f:
    lines = f.readlines()

# Extract header
header = lines[0].strip().split(",")

# Prepare merged rows
merged_rows = []

# Go through file two lines at a time
for i in range(1, len(lines), 2):
    part2 = lines[i].strip().split(",")   # This is the ",,,," part
    part1 = lines[i+1].strip().split(",") # This is the "cdir, uvb..." part

    # Fill empty values from part2 into part1
    merged = part1.copy()
    for col_i in range(len(header)):
        if merged[col_i] == "" or merged[col_i] == " ":
            merged[col_i] = part2[col_i]

    merged_rows.append(merged)

# Convert to DataFrame
df = pd.DataFrame(merged_rows, columns=header)

# Save cleaned CSV
output_path_2 = BASE_DIR / "data" / "processed" / "era5_arctic_merged_clean.csv"
df.to_csv(output_path_2, index=False)

# Display first few rows
df.head()


Unnamed: 0,cdir,uvb,slhf,sf,t2m,sst,istl1,istl2,istl3,istl4,skt,ishf,tcc,tsn,year,month
0,10817.826,2392.9783,-567206.25,0.0005697636,248.54388,271.80878,255.57272,256.91095,261.2806,266.89014,247.70041,8.02042,0.79148906,248.19208,1940,1
1,376952.53,56916.293,-594960.56,0.00051717146,247.05003,271.76105,253.77824,255.1692,259.89487,266.23932,246.27579,5.975832,0.78240937,246.77284,1940,2
2,3024464.2,420626.47,-587546.0,0.0003937841,247.3391,271.74875,253.69485,255.15752,259.99738,266.3004,246.61052,4.1913114,0.75285816,247.10875,1940,3
3,10949862.0,1486365.4,-684425.4,0.0005391588,257.493,271.76178,261.07193,261.6873,264.0063,267.97226,257.3393,-0.075946406,0.81511086,257.3759,1940,4
4,20399918.0,2696147.2,-1042911.06,0.0005264869,266.8072,271.80893,267.99844,268.1052,268.69125,270.12183,266.7508,-3.0038044,0.83328825,266.52252,1940,5


In [29]:
df.dtypes

cdir     object
uvb      object
slhf     object
sf       object
t2m      object
sst      object
istl1    object
istl2    object
istl3    object
istl4    object
skt      object
ishf     object
tcc      object
tsn      object
year     object
month    object
dtype: object