# Develop climate clusters for the study sites

In [None]:
import pandas as pd
import geopandas as gpd
import os
import glob
import numpy as np
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import xarray as xr
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import warnings
warnings.filterwarnings("ignore")
import sys

In [None]:
# Root folder that holds the snow cover mapping data
scm_path = '/Volumes/LaCie/raineyaberle/Research/PhD/snow_cover_mapping/'

# Folder where the outputs of this notebook are written (inside the data folder)
out_path = os.path.join(scm_path, 'analysis')

# Local copy of this code repository
code_path = '/Users/raineyaberle/Research/PhD/snow_cover_mapping/glacier-snow-cover-analysis/'

# Put the repository's "functions" folder on the module search path,
# then load the helper module from it under the short alias `f`
functions_path = os.path.join(code_path, 'functions')
sys.path.append(functions_path)
import utils as f


## Compile glacier boundaries for all sites

In [None]:
# Compile the glacier outlines of all study sites into one GeoDataFrame.
# The compiled table is cached in AOIs.gpkg: if that file exists it is loaded,
# otherwise it is built from the per-site shapefiles and written to disk.
aois_fn = os.path.join(out_path, 'AOIs.gpkg')

if os.path.exists(aois_fn):
    # Reuse the previously compiled outlines
    aois = gpd.read_file(aois_fn)
else:
    # Site names are the names of the "RGI*" folders under study-sites
    site_folders = sorted(glob.glob(os.path.join(scm_path, 'study-sites', 'RGI*')))
    rgi_ids = [os.path.basename(folder) for folder in site_folders]
    if not rgi_ids:
        raise FileNotFoundError(
            'No RGI* site folders found in: ' + os.path.join(scm_path, 'study-sites'))

    # Read each outline and reproject it to WGS84. The outlines are collected
    # in a list and concatenated once, which avoids re-copying the growing
    # table on every iteration.
    outlines = []
    for rgi_id in tqdm(rgi_ids):
        aoi_fn = os.path.join(scm_path, 'study-sites', rgi_id, 'AOIs', f'{rgi_id}_outline.shp')
        outlines.append(gpd.read_file(aoi_fn).to_crs('EPSG:4326'))
    # ignore_index gives unique row labels, matching what a reload from file yields
    aois = pd.concat(outlines, axis=0, ignore_index=True)

    # Add subregion names: one lookup per unique (O1Region, O2Region) pair
    aois[['O1Region', 'O2Region']] = aois[['O1Region', 'O2Region']].astype(float)
    aois['Subregion'] = ''
    for o1, o2 in aois[['O1Region', 'O2Region']].drop_duplicates().values:
        # The helper returns a (name, color) pair; only the name is kept here
        subregion, _ = f.determine_subregion_name_color(o1, o2)
        aois.loc[(aois['O1Region']==o1) & (aois['O2Region']==o2), 'Subregion'] = subregion

    # Save to file
    aois.to_file(aois_fn, index=False)
    print('Compiled glacier boundaries saved to file:', aois_fn)

aois

## Calculate the mean of annual max. precipitation and temperature range

In [None]:
# Summarize the climate at each site with two numbers, averaged over all years
# in the site's ERA5-Land record:
#   - the annual maximum of the water-year cumulative precipitation
#   - the annual range (max - min) of the daily air temperature
# The summary table is cached in mean_climate.csv.
mean_climate_fn = os.path.join(out_path, 'mean_climate.csv')

if os.path.exists(mean_climate_fn):
    # Load the previously computed table
    mean_climate_df = pd.read_csv(mean_climate_fn)

else:
    # One record per site; assembled into a DataFrame once at the end
    records = []

    for rgi_id in tqdm(aois['RGIId'].drop_duplicates().values):
        era_fn = os.path.join(scm_path, 'study-sites', rgi_id, 'ERA', f'{rgi_id}_ERA5-Land_daily_means.csv')
        # Sites without an ERA file are skipped (they get no row in the table)
        if not os.path.exists(era_fn):
            continue
        era = pd.read_csv(era_fn)

        # Add year column (calendar year of each daily record)
        era['Date'] = pd.to_datetime(era['Date'])
        era['Year'] = era['Date'].dt.year

        # Group once and reuse the grouping for both statistics
        by_year = era.groupby(by='Year')
        mean_max_precip = by_year['mean_total_precipitation_sum_wateryear_cumsum'].max().mean()
        yearly_temp = by_year['mean_temperature_2m_C_adj']
        mean_max_temp_range = (yearly_temp.max() - yearly_temp.min()).mean()

        records.append({'RGIId': rgi_id,
                        'mean_annual_precip_cumsum': mean_max_precip,
                        'mean_annual_temp_range': mean_max_temp_range})

    # Explicit columns keep the table well-formed even when no ERA file was found
    mean_climate_df = pd.DataFrame(records, columns=['RGIId',
                                                     'mean_annual_precip_cumsum',
                                                     'mean_annual_temp_range'])

    # Save to file
    mean_climate_df.to_csv(mean_climate_fn, index=False)
    print('Mean climate conditions saved to file:', mean_climate_fn)

# Plot the two climate variables against each other
fig, ax = plt.subplots()
ax.plot(mean_climate_df['mean_annual_precip_cumsum'], mean_climate_df['mean_annual_temp_range'], '.')
ax.grid()
# Raw strings: "\S" is not a valid Python escape sequence
ax.set_xlabel(r'Mean annual $\Sigma$(Precipitation) [m.w.e.]')
ax.set_ylabel('Mean annual air temperature range [$^o$C]')
plt.show()

## Standardize the input features

In [None]:
# Columns used as clustering features
feature_cols = ['mean_annual_precip_cumsum', 'mean_annual_temp_range']

# Take an independent copy of the features so that columns added to X later
# on are never written through to (or warned about as a slice of) mean_climate_df
X = mean_climate_df[feature_cols].copy()

# Standardize each feature with StandardScaler; the fitted scaler is kept so
# the transformation can be reused
scaler = StandardScaler().fit(X)
X_norm = pd.DataFrame(scaler.transform(X), columns=feature_cols, index=X.index)

# Plot the standardized features
fig, ax = plt.subplots()
ax.plot(X_norm['mean_annual_precip_cumsum'], X_norm['mean_annual_temp_range'], '.')
ax.grid()
# Raw string: "\S" is not a valid Python escape sequence
ax.set_xlabel(r'Scaled mean annual $\Sigma$(Precipitation)')
# Standardized values are unitless, so no degree-C unit on this axis
ax.set_ylabel('Scaled mean annual air temperature range')
plt.show()

## Estimate the optimal number of clusters

In [None]:
# Candidate numbers of clusters: 2 through 10
K = np.arange(2,11)

# For each candidate k, fit K-means on the standardized features and record
# the silhouette score and the inertia of the fit.
# random_state and n_init are fixed so the curves are identical on every run
# and do not depend on the scikit-learn version's default for n_init.
sil_coefs = []
inertias = []
for k in K:
    model = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X_norm)
    labels = model.labels_  # cluster label of every training sample
    sil_coefs.append(silhouette_score(X_norm, labels))
    inertias.append(model.inertia_)

# Plot both diagnostics against k on a shared x-axis
inertia_color = 'm'
sil_color = 'b'
fig, ax = plt.subplots(1, 1, figsize=(6,4))
# silhouette coefficient (left y-axis)
ax.plot(K, sil_coefs, '-', color=sil_color)
ax.set_xlabel('Number of clusters')
ax.set_ylabel('Silhouette score', color=sil_color)
# inertia (right y-axis)
ax2 = ax.twinx()
ax2.plot(K, inertias, '-', color=inertia_color)
ax2.spines['right'].set_color(inertia_color)
ax2.set_ylabel('Inertia', color=inertia_color)

ax.grid()
plt.show()


## Cluster the results

In [None]:
# Number of clusters and one color per cluster
n = 5
color_palette = sns.color_palette("hls", n)

# Fit K-means to the standardized features.
# random_state and n_init are fixed so the cluster assignment is reproducible.
clusterer = KMeans(n_clusters=n, n_init=10, random_state=0)
clusterer = clusterer.fit(X_norm[feature_cols])

# Store the predicted cluster of every site, shifted by 1 so the first cluster = 1.
# assign() returns a new frame, so nothing is written onto a slice of mean_climate_df.
X = X.assign(y_pred=clusterer.predict(X_norm[feature_cols]) + 1)
mean_climate_df['cluster'] = X['y_pred']

# Add the cluster number to the AOIs by looking each RGIId up in the results
# (first occurrence wins if an RGIId is repeated)
cluster_by_id = mean_climate_df.drop_duplicates(subset='RGIId').set_index('RGIId')['cluster']
aois['cluster'] = aois['RGIId'].map(cluster_by_id)
# Keep only the AOIs that received a cluster, then order them by cluster number
aois = (aois.dropna(subset=['cluster'])
            .astype({'cluster': int})
            .sort_values(by='cluster'))

# Plot: clusters in feature space (left) and at the site locations (right)
fig, ax = plt.subplots(1, 2, figsize=(12,6))
sns.scatterplot(data=X, x='mean_annual_precip_cumsum', y='mean_annual_temp_range', hue='y_pred', 
                palette=color_palette, legend=False, ax=ax[0])
# Raw string: "\S" is not a valid Python escape sequence
ax[0].set_xlabel(r'Mean annual $\Sigma$(Precipitation) [m.w.e.]')
ax[0].set_ylabel('Mean annual air temperature range [$^o$C]')
sns.scatterplot(data=aois, x='CenLon', y='CenLat', hue='cluster', 
                palette=color_palette, legend=True, ax=ax[1])
fig.tight_layout()
plt.show()

## Assign names to each cluster based on climate

In [None]:
def _climate_cluster_name(precip_mean, air_temp_range_mean):
    """Return the climate name for a cluster from its two mean features.

    precip_mean is the cluster-mean of 'mean_annual_precip_cumsum' and
    air_temp_range_mean the cluster-mean of 'mean_annual_temp_range'.
    The rules are checked in order and the first match wins; anything that
    matches none of them is named 'Transitional-Maritime'.
    """
    if precip_mean < 2 and air_temp_range_mean < 35:
        return 'W. Aleutians'
    if precip_mean < 2 and air_temp_range_mean > 40:
        return 'Continental'
    if precip_mean > 2.5 and air_temp_range_mean < 40:
        return 'Maritime'
    if precip_mean < 1.5 and air_temp_range_mean > 30:
        return 'Transitional-Continental'
    return 'Transitional-Maritime'


# Start with empty name columns in both tables
mean_climate_df['clustName'] = ''
aois['clustName'] = ''

# Name every cluster from the mean climate of its member sites
for i in sorted(mean_climate_df['cluster'].unique()):
    in_cluster = mean_climate_df['cluster'] == i
    members = mean_climate_df.loc[in_cluster]
    # Cluster means of the two features (NaNs ignored)
    cluster_name = _climate_cluster_name(
        np.nanmean(members['mean_annual_precip_cumsum']),
        np.nanmean(members['mean_annual_temp_range']),
    )
    print(i, cluster_name)
    # Write the name to the matching rows of both tables
    mean_climate_df.loc[in_cluster, 'clustName'] = cluster_name
    aois.loc[aois['cluster'] == i, 'clustName'] = cluster_name

# Save the table with cluster numbers and names to CSV
mean_climate_fn = os.path.join(out_path, 'climate_clusters.csv')
mean_climate_df.to_csv(mean_climate_fn, index=False)
print('Results saved to file:', mean_climate_fn)

# Map view of the named clusters
fig = plt.figure(figsize=(6,6))
sns.scatterplot(data=aois, x='CenLon', y='CenLat', hue='clustName')
plt.show()

## Make figure for the supplemental information

In [None]:
# Two-panel supplemental figure:
#   a) sites in feature space, colored by cluster name, with secondary axes
#      showing the standardized value of each feature
#   b) silhouette score and inertia versus the number of clusters

# Color and legend order of the named clusters
cluster_cmap_dict = {'W. Aleutians': '#dd3497', 
                     'Continental': '#a6611a',
                     'Transitional-Continental': '#dfc27d',
                     'Transitional-Maritime': '#80cdc1',
                     'Maritime': '#018571'}
cluster_order = ['W. Aleutians', 'Maritime', 'Transitional-Maritime', 'Transitional-Continental', 'Continental']

# Set up figure
fontsize = 12
plt.rcParams.update({'font.size': fontsize, 'font.sans-serif': 'Arial'})
fig, ax = plt.subplots(1, 2, figsize=(14,6))

# -----Panel a: input features and scaled features
scat = sns.scatterplot(data=mean_climate_df, x='mean_annual_temp_range', y='mean_annual_precip_cumsum',
                       hue='clustName', hue_order=cluster_order, palette=cluster_cmap_dict, ax=ax[0])
ax[0].grid()
ax[0].legend().set_title('')
# Raw string: "\c" is not a valid Python escape sequence
ax[0].set_xlabel(r'Air temperature range [$^{\circ}$C]')
ax[0].set_ylabel('Precipitation sum [m]')

# Limits of the secondary (standardized) axes: the primary-axis limits pushed
# through the fitted scaler, z = (value - mean_) / scale_, so both axes line up.
# Assumes `scaler` was fit on the columns in `feature_cols` order - confirm if
# the standardization step is changed.
temp_idx = feature_cols.index('mean_annual_temp_range')
precip_idx = feature_cols.index('mean_annual_precip_cumsum')
xmin, xmax = (np.array(ax[0].get_xlim()) - scaler.mean_[temp_idx]) / scaler.scale_[temp_idx]
ymin, ymax = (np.array(ax[0].get_ylim()) - scaler.mean_[precip_idx]) / scaler.scale_[precip_idx]

# Secondary axes: the top x-axis mirrors the temperature range (x),
# the right y-axis mirrors the precipitation sum (y)
ax_top = ax[0].twiny()
ax_right = ax[0].twinx()
ax_top.set_xlim(xmin, xmax)
ax_right.set_ylim(ymin, ymax)
ax_top.set_xlabel('Standardized air temperature range [unitless]', color='grey')
ax_right.set_ylabel('Standardized precipitation sum [unitless]', color='grey')
ax_right.spines['top'].set_color('grey')
ax_right.spines['right'].set_color('grey')
# tick_params(colors=...) colors both the tick marks and the tick labels
ax_top.tick_params(axis='x', colors='grey')
ax_right.tick_params(axis='y', colors='grey')

# -----Panel b: silhouette coefficient and inertia
inertia_color = '#b2182b'
sil_color = '#2166ac'
# silhouette coefficient, with a star on the highest score
ax[1].plot(K, sil_coefs, '.-', color=sil_color)
Ibest = int(np.nanargmax(sil_coefs))
ax[1].plot(K[Ibest], sil_coefs[Ibest], '*', color=sil_color, markersize=15)
ax[1].set_xlabel('Number of clusters')
ax[1].set_ylabel('Silhouette score', color=sil_color)
ax[1].grid()
ax[1].tick_params(axis='y', colors=sil_color)

# inertia, with a star on the elbow
ax2 = ax[1].twinx()
ax2.plot(K, inertias, '.-', color=inertia_color)
# Position of the elbow in K is hard-coded (index 3), as is the placement of
# the "elbow" annotation below (in data coordinates)
Ibest = 3
ax2.plot(K[Ibest], inertias[Ibest], '*', color=inertia_color, markersize=15)
ax2.spines['right'].set_color(inertia_color)
ax2.spines['left'].set_color(sil_color)
ax2.set_ylabel('Inertia', color=inertia_color)
ax2.tick_params(axis='y', colors=inertia_color)
ax2.text(4.8, 68, 'elbow', color=inertia_color, rotation=-35, fontsize=14)

# Add panel labels 5% in from the lower-left corner of each panel
labels = ['a', 'b']
for label, axis in zip(labels, ax):
    x0, x1 = axis.get_xlim()
    y0, y1 = axis.get_ylim()
    axis.text(x0 + (x1 - x0)*0.05, y0 + (y1 - y0)*0.05,
              label, fontweight='bold', fontsize=fontsize+4)

fig.tight_layout()
plt.show()

# Save the figure into the repository's figures folder
fig_fn = os.path.join(code_path, 'figures', 'figS1_kmeans.png')
fig.savefig(fig_fn, dpi=300, bbox_inches='tight')
print('Figure saved to file:', fig_fn)

## Apply Koppen-Geiger climate zone classification to sites 

### _Too broad, almost all sites in one category_

From Beck et al. (2023): https://doi.org/10.1038/s41597-023-02549-6

In [None]:
# -----Load Koppen-Geiger climate zones
kg_fn = os.path.join(scm_path, 'koppen_geiger_nc', '1991_2020', 'koppen_geiger_0p01.nc')
kg = xr.open_dataset(kg_fn)
kg = xr.where(kg==0, np.nan, kg) # set no data values to NaN

# -----Sample KG at each AOI
# add centroid columns (point, longitude, latitude) to aois
aois = aois.to_crs('EPSG:4326')
aois['centroid'] = [geom.centroid for geom in aois['geometry']]
aois['centroid_lon'] = [pt.x for pt in aois['centroid']]
aois['centroid_lat'] = [pt.y for pt in aois['centroid']]
# Sample the class grid at the cell nearest to every centroid in one
# vectorized lookup: indexers that share a dimension select point-by-point
# rather than the lon x lat outer product.
site_lons = xr.DataArray(aois['centroid_lon'].values, dims='site')
site_lats = xr.DataArray(aois['centroid_lat'].values, dims='site')
kg_values = kg['kg_class'].sel(lon=site_lons, lat=site_lats, method='nearest').values
# A NaN means the centroid fell on a no-data cell; fail with a clear message
# instead of an opaque float-to-int conversion error
if np.isnan(kg_values).any():
    raise ValueError('Koppen-Geiger class is no-data at %d AOI centroid(s)'
                     % int(np.isnan(kg_values).sum()))
aois['kg_class'] = kg_values.astype(int)

# -----Add KG columns to AOIs 
# Display name and color for each class code handled here
kg_dict = {19: {'name': 'Dsc: Cold, dry summer, cold summer',
                'color': '#969696'},
           27: {'name': 'Dfc: Cold, no dry season, cold summer',
                'color': '#007d7d'},
           29: {'name': 'ET: Polar, tundra',
                'color': '#b2b2b2'},
           30: {'name': 'EF: Polar, frost',
                'color': '#666666'}
          }
# Look every class code up in kg_dict; codes not listed there get an empty string
aois['kg_class_name'] = aois['kg_class'].map(
    {code: info['name'] for code, info in kg_dict.items()}).fillna('')
aois['kg_class_color'] = aois['kg_class'].map(
    {code: info['color'] for code, info in kg_dict.items()}).fillna('')
aois


In [None]:
# -----Plot the Koppen-Geiger class of every site: map on top, counts below
plt.rcParams.update({'font.sans-serif':'Arial', 'font.size':12})
fig, ax = plt.subplots(2, 1, figsize=(8, 10), gridspec_kw={'height_ratios':[3,1]})
map_ax, count_ax = ax

# map of the site centroids, colored by class
sns.scatterplot(ax=map_ax, data=aois, x='centroid_lon', y='centroid_lat', 
                hue='kg_class', s=10, palette='tab10',  legend=True)
map_ax.grid()
map_ax.set_xlabel('')
map_ax.set_ylabel('')

# bar chart of the number of sites per class, smallest count first
counts = aois['kg_class'].value_counts().sort_values()
count_ax.bar(counts.index, counts.values, width=1)
# write each count just above its bar
for class_id, class_count in counts.items():
    count_ax.text(class_id-0.25, class_count + 2, str(class_count))
count_ax.set_xlabel('KG class')
count_ax.set_ylabel('Count')
count_ax.set_xticks(counts.index)
plt.show()