In [1]:
import xarray as xr
import numpy as np

def preprocess_wind(filepath):
    # Load the dataset
    ds = xr.open_dataset(filepath)
    
    # Extract u and v components at 200 hPa
    u_200 = ds['u'].sel(pressure_level=200)
    v_200 = ds['v'].sel(pressure_level=200) - ds['v'].sel(pressure_level=200)

    # Calculate wind speed (magnitude of wind)
    wind_speed_200 = np.sqrt(u_200**2 + v_200**2)

    # Calculate climatology (mean over time)
    wind_speed_clim = wind_speed_200.mean(dim='valid_time')
    
    # Calculate anomalies (subtract the climatology)
    wind_speed_anomalies = wind_speed_200 - wind_speed_clim

    # Save anomalies to NetCDF
    wind_anomalies = xr.Dataset({'wind_speed_anomalies': wind_speed_anomalies})
    wind_anomalies.to_netcdf('U_wind_speed_200hpa_anomalies.nc')

if __name__ == "__main__":
    preprocess_wind('India_Wind_200hpa.nc')


In [2]:
import numpy as np
import xarray as xr
from eofs.xarray import Eof
from sklearn.cluster import KMeans

def run_eof_clustering():
    # Load wind speed anomalies at 200 hPa
    data = xr.open_dataset('U_wind_speed_200hpa_anomalies.nc')['wind_speed_anomalies']

    # Fix dimension if needed
    if 'valid_time' in data.dims:
        data = data.rename({'valid_time': 'time'})

    # EOF analysis
    solver = Eof(data)
    pcs = solver.pcs(npcs=7, pcscaling=1)

    # Save PCs as numpy array for stable plotting
    np.save('U_pcs_wind_speed_200hpa.npy', pcs.values)

    # Save explained variance
    variance = solver.varianceFraction().values[:7]
    np.save('U_explained_variance_wind_speed_200hpa.npy', variance)

    # Normalize PCs for clustering
    pcs_np = pcs.values
    pcs_norm = (pcs_np - pcs_np.mean(axis=0)) / pcs_np.std(axis=0)

    # K-Means clustering
    kmeans = KMeans(n_clusters=4, n_init=10, random_state=42).fit(pcs_norm)
    labels = kmeans.labels_

    # Save regime labels as dataset
    regimes_ds = xr.Dataset({'regime': (['time'], labels)}, coords={'time': data['time']})
    regimes_ds.to_netcdf('U_eof_weather_regimes_wind_speed_200hpa.nc')

    print("✅ EOF clustering completed and outputs saved.")

if __name__ == "__main__":
    run_eof_clustering()


✅ EOF clustering completed and outputs saved.


In [3]:
import numpy as np
import xarray as xr
import matplotlib.pyplot as plt
import seaborn as sns
import cartopy.crs as ccrs
import os
import pandas as pd
from scipy.interpolate import interp1d

def ensure_time(ds):
    """Ensure that the dataset has the correct time dimension."""
    if 'valid_time' in ds.dims:
        ds = ds.rename({'valid_time': 'time'})
    return ds

# Plot Regime Frequency
def plot_regime_frequency(ds):
    labels, counts = np.unique(ds['regime'].values, return_counts=True)
    plt.figure(figsize=(7,5))
    sns.barplot(x=labels, y=counts, palette="deep")
    plt.title("Wind Speed Regime Frequency (2016–2021)", fontsize=14)
    plt.xlabel("Regime", fontsize=12)
    plt.ylabel("Days", fontsize=12)
    plt.tight_layout()
    plt.savefig('plots_U_wind_200hpa/regime_frequency.png', dpi=300)
    plt.close()

# Plot Spatial Composites of Wind Speed Anomalies at 200 hPa
def plot_spatial_composites(ds):
    for r in np.unique(ds['regime'].values):
        with xr.open_dataset('U_wind_speed_200hpa_anomalies.nc') as anomalies:
            anomalies = ensure_time(anomalies)
            wind_speed = anomalies['wind_speed_anomalies']  # Use the correct variable here
            wind_speed_regime = wind_speed.where(ds['regime'] == r).mean(dim='time')

            plt.figure(figsize=(8,6))
            ax = plt.axes(projection=ccrs.PlateCarree())
            wind_speed_regime.plot.contourf(ax=ax, transform=ccrs.PlateCarree(),
                                           cmap='coolwarm', extend='both', add_colorbar=True)
            ax.coastlines()
            ax.set_title(f"Regime {r}: Mean Wind Speed Anomaly", fontsize=14)
            plt.tight_layout()
            plt.savefig(f'plots_U_wind_200hpa/regime_{r}_wind_speed_composite.png', dpi=300)
            plt.close()

# Plot PC1 Time Series with NAO and ENSO Indices
def plot_pc_timeseries(pcs, nao_index, enso_df):
    with xr.open_dataset('U_wind_speed_200hpa_anomalies.nc') as anomalies:
        anomalies = ensure_time(anomalies)
        times = anomalies['time'].values

    explained_variance = np.load('U_explained_variance_wind_speed_200hpa.npy')
    pc1_var = explained_variance[0] * 100

    # Prepare ENSO data
    enso_df['date'] = pd.to_datetime(enso_df[['Year', 'Month']].assign(day=15))
    enso_df = enso_df.set_index('date')
    enso_df_filtered = enso_df.loc["2016-01-01":"2021-12-31"]

    plt.figure(figsize=(10,5))
    plt.plot(times, pcs[:, 0], label='PC1 (EOF1)', linewidth=1.5)
    plt.plot(nao_index['time'].values, nao_index['nao'].values, label='NAO Index', linewidth=1.5)
    plt.plot(enso_df_filtered.index, enso_df_filtered['Anomaly'], label='ENSO Index (2016–2021)', linewidth=1.5)
    plt.legend()
    plt.title(f"PC1 vs NAO and ENSO Index (2016-2021)\nPC1 Explained Variance = {pc1_var:.2f}%", fontsize=14)
    plt.xlabel("Time", fontsize=12)
    plt.ylabel("Index Value", fontsize=12)
    plt.tight_layout()
    plt.savefig('plots_U_wind_200hpa/pc1_vs_enso_nao.png', dpi=300)
    plt.close()

# Plot Seasonal Cycle of Regime Occurrence
def plot_seasonal_cycle(ds):
    ds = ensure_time(ds)
    times = ds['time'].to_index()
    months = times.month
    regimes = ds['regime'].values
    plt.figure(figsize=(10,5))
    for r in np.unique(regimes):
        monthly_counts = [np.sum((months==m) & (regimes==r)) for m in range(1,13)]
        plt.plot(range(1,13), monthly_counts, label=f'Regime {r}')
    plt.xticks(range(1,13), ['Jan','Feb','Mar','Apr','May','Jun',
                             'Jul','Aug','Sep','Oct','Nov','Dec'])
    plt.legend()
    plt.title("Seasonal Cycle of Regime Occurrence", fontsize=14)
    plt.xlabel("Month", fontsize=12)
    plt.ylabel("Days", fontsize=12)
    plt.tight_layout()
    plt.savefig('plots_U_wind_200hpa/seasonal_cycle.png', dpi=300)
    plt.close()

# Plot Correlations Between PCs and External Indices
def plot_pc_index_correlation(pcs, index_array, label, file_name, time_values=None, index_time=None):
    if time_values is not None and index_array.shape[0] != pcs.shape[0]:
        df = pd.DataFrame(pcs, columns=[f"PC{i+1}" for i in range(pcs.shape[1])])
        df['date'] = pd.to_datetime(time_values)
        df = df.set_index('date')
        pcs_monthly = df.resample('M').mean()

        index_df = pd.Series(index_array, index=index_time)
        index_df = index_df.loc[pcs_monthly.index]

        correlations = [np.corrcoef(pcs_monthly.iloc[:, i], index_df)[0, 1] for i in range(pcs.shape[1])]
    else:
        correlations = [np.corrcoef(pcs[:, i], index_array)[0, 1] for i in range(pcs.shape[1])]

    plt.figure(figsize=(8,5))
    sns.barplot(x=[f'PC{i+1}' for i in range(7)], y=correlations, palette="deep")
    plt.ylim(-1, 1)
    plt.title(f"Correlation between EOF PCs and {label} (2016–2021)", fontsize=14)
    plt.ylabel("Pearson Correlation", fontsize=12)
    plt.xlabel("Principal Components", fontsize=12)
    plt.tight_layout()
    plt.savefig(f'plots_U_wind_200hpa/{file_name}.png', dpi=300)
    plt.close()

# Plot Spatial Composites with Weighted EOF Variance
def plot_spatial_composites_new(ds):
    pcs = np.load('U_pcs_wind_speed_200hpa.npy')
    explained_variance = np.load('U_explained_variance_wind_speed_200hpa.npy')
    labels = ds['regime'].values
    explained_variance = explained_variance / explained_variance.sum()

    regime_variance_map = {}
    for r in np.unique(labels):
        regime_pcs = pcs[labels == r]
        pc_var = np.var(regime_pcs, axis=0)
        pc_var_normalized = pc_var / pc_var.sum()
        regime_variance = np.sum(pc_var_normalized * explained_variance)
        regime_variance_map[r] = regime_variance

    for r in np.unique(ds['regime'].values):
        with xr.open_dataset('U_wind_speed_200hpa_anomalies.nc') as anomalies:
            anomalies = ensure_time(anomalies)
            wind_speed = anomalies['wind_speed_anomalies']  # Use the correct variable here
            wind_speed_regime = wind_speed.where(ds['regime'] == r).mean(dim='time')

            plt.figure(figsize=(8, 6))
            ax = plt.axes(projection=ccrs.PlateCarree())
            wind_speed_regime.plot.contourf(ax=ax, transform=ccrs.PlateCarree(),
                                           cmap='coolwarm', extend='both', add_colorbar=True)
            ax.coastlines()
            variance_str = f"{regime_variance_map[r]*100:.2f}%"
            ax.set_title(f"Regime {r}: Mean Wind Speed Anomaly\n(Weighted EOF Variance ≈ {variance_str})", fontsize=13)
            plt.tight_layout()
            plt.savefig(f'plots_U_wind_200hpa_rand/regime_{r}_wind_speed_composite.png', dpi=300)
            plt.close()

# Main Function to Generate All Plots
def main():
    os.makedirs("plots_U_wind_200hpa", exist_ok=True)
    os.makedirs("plots_U_wind_200hpa_rand", exist_ok=True)

    with xr.open_dataset('U_eof_weather_regimes_wind_speed_200hpa.nc') as ds:
        ds = ensure_time(ds)
        pcs = np.load('U_pcs_wind_speed_200hpa.npy')

        with xr.open_dataset('nao_index.nc') as nao_index:
            enso_df = pd.read_csv('Enso_Monthwise_Index.csv')
            enso_df['date'] = pd.to_datetime(enso_df[['Year', 'Month']].assign(day=1)) + pd.offsets.MonthEnd(0)
            enso_df = enso_df.set_index('date').loc["2016-01-31":"2021-12-31"]
            enso_series = enso_df['Anomaly'].values

            anomalies = xr.open_dataset('U_wind_speed_200hpa_anomalies.nc')
            anomalies = ensure_time(anomalies)
            time_values = anomalies['time'].values

            f_nao = interp1d(nao_index['time'].values.astype(np.int64),
                             nao_index['nao'].values, kind='linear', fill_value="extrapolate")
            nao_interp = f_nao(time_values.astype(np.int64))

            plot_regime_frequency(ds)
            plot_spatial_composites(ds)
            plot_pc_timeseries(pcs, nao_index, enso_df)
            plot_pc_index_correlation(pcs, nao_interp, "NAO Index", "pc_nao_correlation")
            plot_pc_index_correlation(
                pcs, enso_series, "ENSO Index", "pc_enso_correlation",
                time_values=time_values, index_time=enso_df.index
            )
            plot_seasonal_cycle(ds)
            plot_spatial_composites_new(ds)

if __name__ == "__main__":
    main()



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=labels, y=counts, palette="deep")

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=[f'PC{i+1}' for i in range(7)], y=correlations, palette="deep")
  pcs_monthly = df.resample('M').mean()

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=[f'PC{i+1}' for i in range(7)], y=correlations, palette="deep")
