In [1]:
import os
import glob
import numpy as np
import pandas as pd
import xarray as xr

In [2]:
# Quick sanity check: list the SEAS5 files matching the month-01 pattern
# and show the first few matches in sorted order.
january_pattern = f'data/SEAS5/ecmwf_year-*_month-{1:02d}_sfc.nc'
seass_files = glob.glob(january_pattern)
seass_files.sort()
print(seass_files[:5])

['data/SEAS5\\ecmwf_year-1993_month-01_sfc.nc', 'data/SEAS5\\ecmwf_year-1994_month-01_sfc.nc', 'data/SEAS5\\ecmwf_year-1995_month-01_sfc.nc', 'data/SEAS5\\ecmwf_year-1996_month-01_sfc.nc', 'data/SEAS5\\ecmwf_year-1997_month-01_sfc.nc']


In [3]:
# ==========================
# Configuration (edit here)
# ==========================

# Names of the data variables that get averaged over the region
vars_to_process = ["T2MAX", "T2MIN"]

# Latitude bounds of the Midwest box (inclusive on both ends)
lat_min = 36
lat_max = 49

# Longitude bounds of the Midwest box (inclusive on both ends)
lon_min = -105
lon_max = -84

# Lead-time indices 0..5, used as positions along each file's time axis
lead_times = range(6)

# True -> cosine-latitude weighted mean; False -> plain arithmetic mean
weighted = False

In [4]:
# Regional (Midwest) mean of each variable in `vars_to_process`, for every
# SEAS5 file, every lead time in `lead_times` and every ensemble member.
# The result is one long table written to output/SEASS_midwest_T2.csv.
all_dfs = []  # One DataFrame per (file, lead time); concatenated after the loop

# The two horizontal grid dimensions that are averaged out
spatial_dims = ['south_north', 'west_east']

# --------------------------
# Loop over forecast start months (January to September)
# --------------------------
for start_month in range(1, 10):

    # Find all SEAS5 files corresponding to this start month
    seass_files = sorted(glob.glob(f'data/SEAS5/ecmwf_year-*_month-{start_month:02d}_sfc.nc'))

    # Loop over each file
    for f in seass_files:
        # The context manager closes the file handle once this file's values
        # have been extracted; every `.values` access happens inside the block.
        with xr.open_dataset(f) as ds:

            # Apply Midwest spatial mask (bounds are inclusive)
            mask = ((ds['lat'] >= lat_min) & (ds['lat'] <= lat_max) &
                    (ds['lon'] >= lon_min) & (ds['lon'] <= lon_max))
            # drop=True trims to the bounding index range of the mask; cells
            # inside that range but outside the mask are left as NaN.
            ds_masked = ds.where(mask, drop=True)

            # Loop over lead times
            for lt in lead_times:
                ds_subset = ds_masked.isel(time=lt)

                # Columns for this lead time; the array-valued entries set the
                # number of rows and the scalar entries are broadcast.
                data_dict = {
                    'file': os.path.basename(f),      # Filename for reference
                    'start_month': start_month,       # Start month of forecast
                    'lead_time': lt,                  # Lead time index
                    'time': ds_subset['time'].values,
                    'number': ds_subset['number'].values  # Ensemble member numbers
                }

                if weighted:
                    # Cosine-latitude weights. NaN weights are replaced by 0
                    # because `.weighted()` rejects NaN weights.
                    weights = np.cos(np.deg2rad(ds_subset['lat'])).fillna(0)

                # Loop over each variable of interest
                for var in vars_to_process:
                    if weighted:
                        # `.weighted().mean()` normalises by the weights of the
                        # non-NaN cells only. Dividing by the sum of all weights
                        # would bias the mean, because cells masked to NaN add
                        # nothing to the numerator.
                        midwest_mean = ds_subset[var].weighted(weights).mean(dim=spatial_dims)
                    else:
                        # Simple arithmetic mean over the Midwest domain
                        midwest_mean = ds_subset[var].mean(dim=spatial_dims)

                    # Store variable mean in dictionary
                    data_dict[var] = midwest_mean.values

                # Convert dictionary to DataFrame (one row per ensemble member)
                df = pd.DataFrame(data_dict)
                all_dfs.append(df)

# --------------------------
# Combine all DataFrames into a single DataFrame
# --------------------------
if not all_dfs:
    # pd.concat([]) would fail with an unhelpful "No objects to concatenate"
    raise FileNotFoundError(
        "No SEAS5 files matched 'data/SEAS5/ecmwf_year-*_month-MM_sfc.nc' "
        f"(or lead_times is empty); working directory is {os.getcwd()!r}"
    )
final_df = pd.concat(all_dfs, ignore_index=True)

# Reorder columns to put 'file' and 'start_month' first
leading_cols = ['file', 'start_month']
cols = [c for c in final_df.columns if c not in leading_cols]
final_df = final_df[leading_cols + cols]

# Save final combined DataFrame to CSV (create the output folder if needed)
os.makedirs('output', exist_ok=True)
final_df.to_csv('output/SEASS_midwest_T2.csv', index=False)

# Preview results
print(final_df.head())


                              file  start_month  lead_time       time  number  \
0  ecmwf_year-1993_month-01_sfc.nc            1          0 1993-02-01       0   
1  ecmwf_year-1993_month-01_sfc.nc            1          0 1993-02-01       1   
2  ecmwf_year-1993_month-01_sfc.nc            1          0 1993-02-01       2   
3  ecmwf_year-1993_month-01_sfc.nc            1          0 1993-02-01       3   
4  ecmwf_year-1993_month-01_sfc.nc            1          0 1993-02-01       4   

        T2MAX       T2MIN  
0  271.899719  261.691162  
1  269.856476  260.007996  
2  272.731934  263.242004  
3  268.732452  259.824219  
4  268.753479  258.570068  


In [5]:
# Regional (Midwest) mean of observed T2MAX and T2MIN for every pair of
# observation files. The result is written to output/OBS_midwest_T2.csv.

# =======================
# File lists
# =======================
t2max_files = sorted(glob.glob('data/OBS/OBS_monthly_T2MAX_*.nc'))
t2min_files = sorted(glob.glob('data/OBS/OBS_monthly_T2MIN_*.nc'))

# Files are paired purely by sorted position. With unequal counts, zip()
# would silently drop files and could pair mismatched periods.
if len(t2max_files) != len(t2min_files):
    raise ValueError(
        f"Found {len(t2max_files)} T2MAX files but {len(t2min_files)} T2MIN "
        "files in data/OBS; they must match one-to-one"
    )
if not t2max_files:
    # pd.concat([]) would fail with an unhelpful "No objects to concatenate"
    raise FileNotFoundError(
        "No files matched 'data/OBS/OBS_monthly_T2MAX_*.nc'; "
        f"working directory is {os.getcwd()!r}"
    )

# =======================
# Initialize list for DataFrames
# =======================
all_dfs = []

# The two horizontal grid dimensions that are averaged out
spatial_dims = ['south_north', 'west_east']

# =======================
# Loop over matched T2MAX/T2MIN files
# =======================
for f_max, f_min in zip(t2max_files, t2min_files):
    # Context managers close both file handles once the values for this pair
    # have been extracted; every `.values` access happens inside the block.
    with xr.open_dataset(f_max) as ds_max, xr.open_dataset(f_min) as ds_min:

        # -----------------------
        # Apply Midwest spatial mask (bounds are inclusive)
        # -----------------------
        mask_max = ((ds_max['lat'] >= lat_min) & (ds_max['lat'] <= lat_max) &
                    (ds_max['lon'] >= lon_min) & (ds_max['lon'] <= lon_max))
        mask_min = ((ds_min['lat'] >= lat_min) & (ds_min['lat'] <= lat_max) &
                    (ds_min['lon'] >= lon_min) & (ds_min['lon'] <= lon_max))

        # drop=True trims to the bounding index range of the mask; cells
        # inside that range but outside the mask are left as NaN.
        ds_max_masked = ds_max.where(mask_max, drop=True)
        ds_min_masked = ds_min.where(mask_min, drop=True)

        # -----------------------
        # Compute regional mean
        # -----------------------
        if weighted:
            # Cosine-latitude weights. NaN weights are replaced by 0 because
            # `.weighted()` rejects NaN weights.
            weights_max = np.cos(np.deg2rad(ds_max_masked['lat'])).fillna(0)
            weights_min = np.cos(np.deg2rad(ds_min_masked['lat'])).fillna(0)

            # `.weighted().mean()` normalises by the weights of the non-NaN
            # cells only. Dividing by the sum of all weights would bias the
            # mean, because cells masked to NaN add nothing to the numerator.
            t2max_mean = ds_max_masked['T2MAX'].weighted(weights_max).mean(dim=spatial_dims)
            t2min_mean = ds_min_masked['T2MIN'].weighted(weights_min).mean(dim=spatial_dims)
        else:
            # Simple arithmetic mean
            t2max_mean = ds_max_masked['T2MAX'].mean(dim=spatial_dims)
            t2min_mean = ds_min_masked['T2MIN'].mean(dim=spatial_dims)

        # -----------------------
        # Convert to DataFrame (one row per time step in the file)
        # -----------------------
        df = pd.DataFrame({
            'file': os.path.basename(f_max),           # Filename reference (T2MAX file)
            'time': ds_max_masked['time'].values,      # Time values (from the T2MAX file)
            'T2MAX': t2max_mean.values,                # Regional mean T2MAX
            'T2MIN': t2min_mean.values                 # Regional mean T2MIN
        })

    all_dfs.append(df)

# =======================
# Combine all years into one DataFrame
# =======================
final_df = pd.concat(all_dfs, ignore_index=True)

# =======================
# Preview results
# =======================
print(final_df.head())

# =======================
# Save to CSV (create the output folder if needed)
# =======================
os.makedirs('output', exist_ok=True)
final_df.to_csv('output/OBS_midwest_T2.csv', index=False)


                                               file       time       T2MAX  \
0  OBS_monthly_T2MAX_1980-01-01-00_1980-12-31-18.nc 1980-01-31  252.042851   
1  OBS_monthly_T2MAX_1980-01-01-00_1980-12-31-18.nc 1980-02-29  253.053187   
2  OBS_monthly_T2MAX_1980-01-01-00_1980-12-31-18.nc 1980-03-31  258.893744   
3  OBS_monthly_T2MAX_1980-01-01-00_1980-12-31-18.nc 1980-04-30  268.373861   
4  OBS_monthly_T2MAX_1980-01-01-00_1980-12-31-18.nc 1980-05-31  274.378134   

        T2MIN  
0  241.870574  
1  242.252095  
2  247.255262  
3  254.732599  
4  260.302375  
