In [2]:
# preprocess_tcwv_monsoon.py
import xarray as xr

def preprocess_tcwv(filepath):
    ds = xr.open_dataset(filepath)
    ds = ds.rename({'valid_time': 'time'}) if 'valid_time' in ds.dims else ds
    tcwv = ds['tcwv']  # assuming variable is 'tcwv'
    climatology = tcwv.mean(dim='time')
    anomalies = tcwv - climatology
    anomalies.name = 'tcwv_anomaly'
    anomalies.to_netcdf('tcwv_anomalies_monsoon.nc')

if __name__ == "__main__":
    preprocess_tcwv('total_column_water_vapour_monsoon.nc')


In [3]:
# eof_clustering_tcwv_monsoon.py
import xarray as xr
import numpy as np
from eofs.xarray import Eof
from sklearn.cluster import KMeans

def run_eof_clustering():
    data = xr.open_dataset('tcwv_anomalies_monsoon.nc')['tcwv_anomaly']
    data = data.rename({'valid_time': 'time'}) if 'valid_time' in data.dims else data

    solver = Eof(data)
    pcs = solver.pcs(npcs=7, pcscaling=1)
    np.save('pcs_tcwv_monsoon.npy', pcs.values)

    variance = solver.varianceFraction().values[:7]
    np.save('explained_variance_tcwv_monsoon.npy', variance)

    pcs_np = pcs.values
    pcs_norm = (pcs_np - pcs_np.mean(axis=0)) / pcs_np.std(axis=0)

    kmeans = KMeans(n_clusters=4, n_init=10, random_state=42).fit(pcs_norm)
    labels = kmeans.labels_

    regimes_ds = xr.Dataset({'regime': (['time'], labels)}, coords={'time': data['time']})
    regimes_ds.to_netcdf('eof_weather_regimes_tcwv_monsoon.nc')

    print("✅ EOF clustering (Monsoon TCWV) completed and saved.")

if __name__ == "__main__":
    run_eof_clustering()


✅ EOF clustering (Monsoon TCWV) completed and saved.


In [6]:
# plotting_tcwv_monsoon.py
import numpy as np
import xarray as xr
import matplotlib.pyplot as plt
import seaborn as sns
import cartopy.crs as ccrs
import os
import pandas as pd
from scipy.interpolate import interp1d

def ensure_time(ds):
    if 'valid_time' in ds.dims:
        ds = ds.rename({'valid_time': 'time'})
    return ds

def plot_regime_frequency(ds):
    labels, counts = np.unique(ds['regime'].values, return_counts=True)
    plt.figure(figsize=(7,5))
    sns.barplot(x=labels, y=counts, palette="deep")
    plt.title("Weather Regime Frequency (Monsoon 2016–2021)", fontsize=14)
    plt.xlabel("Regime", fontsize=12)
    plt.ylabel("Days", fontsize=12)
    plt.tight_layout()
    plt.savefig('plots_monsoon/regime_frequency.png', dpi=300)
    plt.close()

def plot_spatial_composites(ds):
    for r in np.unique(ds['regime'].values):
        with xr.open_dataset('tcwv_anomalies_monsoon.nc') as anomalies:
            anomalies = ensure_time(anomalies)
            tcwv = anomalies['tcwv_anomaly']
            tcwv_regime = tcwv.where(ds['regime'] == r).mean(dim='time')

            plt.figure(figsize=(8,6))
            ax = plt.axes(projection=ccrs.PlateCarree())
            tcwv_regime.plot.contourf(ax=ax, transform=ccrs.PlateCarree(),
                                      levels=np.linspace(-5, 5, 21),
                                      cmap='BrBG', extend='both', add_colorbar=True)
            ax.coastlines()
            ax.set_title(f"Regime {r}: Mean TCWV Anomaly (Monsoon)", fontsize=14)
            plt.tight_layout()
            plt.savefig(f'plots_monsoon/regime_{r}_tcwv_composite.png', dpi=300)
            plt.close()

def plot_pc_timeseries(pcs, nao_index, enso_df):
    with xr.open_dataset('tcwv_anomalies_monsoon.nc') as anomalies:
        anomalies = ensure_time(anomalies)
        times = anomalies['time'].values

    explained_variance = np.load('explained_variance_tcwv_monsoon.npy')
    pc1_var = explained_variance[0] * 100

    enso_df['date'] = pd.to_datetime(enso_df[['Year', 'Month']].assign(day=15))
    enso_df = enso_df.set_index('date')
    enso_df_filtered = enso_df.loc["2016-06-01":"2021-09-30"]

    plt.figure(figsize=(10,5))
    plt.plot(times, pcs[:, 0], label='PC1 (EOF1)', linewidth=1.5)
    plt.plot(nao_index['time'].values, nao_index['nao'].values, label='NAO Index', linewidth=1.5)
    plt.plot(enso_df_filtered.index, enso_df_filtered['Anomaly'], label='ENSO Index (Monsoon 2016–2021)', linewidth=1.5)
    plt.legend()
    plt.title(f"PC1 vs NAO and ENSO Index (Monsoon 2016–2021)\nPC1 Explained Variance = {pc1_var:.2f}%", fontsize=14)
    plt.xlabel("Time", fontsize=12)
    plt.ylabel("Index Value", fontsize=12)
    plt.tight_layout()
    plt.savefig('plots_monsoon/pc1_vs_enso_nao.png', dpi=300)
    plt.close()

def plot_seasonal_cycle(ds):
    ds = ensure_time(ds)
    times = ds['time'].to_index()
    months = times.month
    regimes = ds['regime'].values
    plt.figure(figsize=(10,5))
    for r in np.unique(regimes):
        monthly_counts = [np.sum((months==m) & (regimes==r)) for m in range(1,13)]
        plt.plot(range(1,13), monthly_counts, label=f'Regime {r}')
    plt.xticks(range(1,13), ['Jan','Feb','Mar','Apr','May','Jun',
                             'Jul','Aug','Sep','Oct','Nov','Dec'])
    plt.legend()
    plt.title("Seasonal Cycle of Regime Occurrence (Monsoon Data)", fontsize=14)
    plt.xlabel("Month", fontsize=12)
    plt.ylabel("Days", fontsize=12)
    plt.tight_layout()
    plt.savefig('plots_monsoon/seasonal_cycle.png', dpi=300)
    plt.close()
    
def plot_pc_index_correlation(pcs, index_array, label, file_name, time_values=None, index_time=None):
    if time_values is not None and index_array.shape[0] != pcs.shape[0]:
        df = pd.DataFrame(pcs, columns=[f"PC{i+1}" for i in range(pcs.shape[1])])
        df['date'] = pd.to_datetime(time_values)
        df = df.set_index('date')

        index_df = pd.Series(index_array, index=index_time)

        # Interpolate ENSO/NAO index to PC dates
        index_interp = np.interp(
            df.index.astype(np.int64),
            index_df.index.astype(np.int64),
            index_df.values
        )

        correlations = [np.corrcoef(df.iloc[:, i], index_interp)[0, 1] for i in range(pcs.shape[1])]
    else:
        correlations = [np.corrcoef(pcs[:, i], index_array)[0, 1] for i in range(pcs.shape[1])]

    plt.figure(figsize=(8,5))
    sns.barplot(x=[f'PC{i+1}' for i in range(7)], y=correlations, palette="deep")
    plt.ylim(-1, 1)
    plt.title(f"Correlation between EOF PCs and {label} (Monsoon 2016–2021)", fontsize=14)
    plt.ylabel("Pearson Correlation", fontsize=12)
    plt.xlabel("Principal Components", fontsize=12)
    plt.tight_layout()
    plt.savefig(f'plots_monsoon/{file_name}.png', dpi=300)
    plt.close()



def plot_spatial_composites_new(ds):
    pcs = np.load('pcs_tcwv_monsoon.npy')
    explained_variance = np.load('explained_variance_tcwv_monsoon.npy')
    labels = ds['regime'].values
    explained_variance = explained_variance / explained_variance.sum()

    regime_variance_map = {}
    for r in np.unique(labels):
        regime_pcs = pcs[labels == r]
        pc_var = np.var(regime_pcs, axis=0)
        pc_var_normalized = pc_var / pc_var.sum()
        regime_variance = np.sum(pc_var_normalized * explained_variance)
        regime_variance_map[r] = regime_variance

    for r in np.unique(ds['regime'].values):
        with xr.open_dataset('tcwv_anomalies_monsoon.nc') as anomalies:
            anomalies = ensure_time(anomalies)
            tcwv = anomalies['tcwv_anomaly']
            tcwv_regime = tcwv.where(ds['regime'] == r).mean(dim='time')

            plt.figure(figsize=(8, 6))
            ax = plt.axes(projection=ccrs.PlateCarree())
            tcwv_regime.plot.contourf(ax=ax, transform=ccrs.PlateCarree(),
                                      levels=np.linspace(-5, 5, 21),
                                      cmap='BrBG', extend='both', add_colorbar=True)
            ax.coastlines()
            variance_str = f"{regime_variance_map[r]*100:.2f}%"
            ax.set_title(f"Regime {r}: Mean TCWV Anomaly (Monsoon)\n(Weighted EOF Variance ≈ {variance_str})", fontsize=13)
            plt.tight_layout()
            plt.savefig(f'plots_monsoon_rand/regime_{r}_tcwv_composite.png', dpi=300)
            plt.close()

def main():
    os.makedirs("plots_monsoon", exist_ok=True)
    os.makedirs("plots_monsoon_rand", exist_ok=True)

    with xr.open_dataset('eof_weather_regimes_tcwv_monsoon.nc') as ds:
        ds = ensure_time(ds)
        pcs = np.load('pcs_tcwv_monsoon.npy')

        with xr.open_dataset('nao_index.nc') as nao_index:
            enso_df = pd.read_csv('Enso_Monthwise_Index.csv')
            enso_df['date'] = pd.to_datetime(enso_df[['Year', 'Month']].assign(day=1)) + pd.offsets.MonthEnd(0)
            enso_df = enso_df.set_index('date').loc["2016-06-30":"2021-09-30"]
            enso_series = enso_df['Anomaly'].values

            anomalies = xr.open_dataset('tcwv_anomalies_monsoon.nc')
            anomalies = ensure_time(anomalies)
            time_values = anomalies['time'].values

            f_nao = interp1d(nao_index['time'].values.astype(np.int64),
                             nao_index['nao'].values, kind='linear', fill_value="extrapolate")
            nao_interp = f_nao(time_values.astype(np.int64))

            plot_regime_frequency(ds)
            plot_spatial_composites(ds)
            plot_pc_timeseries(pcs, nao_index, enso_df)
            plot_pc_index_correlation(pcs, nao_interp, "NAO Index", "pc_nao_correlation")
            plot_pc_index_correlation(
                pcs, enso_series, "ENSO Index", "pc_enso_correlation",
                time_values=time_values, index_time=enso_df.index
            )
            plot_seasonal_cycle(ds)
            plot_spatial_composites_new(ds)

if __name__ == "__main__":
    main()



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=labels, y=counts, palette="deep")

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=[f'PC{i+1}' for i in range(7)], y=correlations, palette="deep")

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=[f'PC{i+1}' for i in range(7)], y=correlations, palette="deep")
