In [None]:
import os
import tempfile
import zipfile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
import xarray as xr
from scipy.signal import butter, detrend, filtfilt
from scipy.signal import correlate
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.ar_model import AutoReg
from statsmodels.tsa.stattools import adfuller, coint


In [None]:
# Root folder with one sub-folder per month; each sub-folder holds a .zip archive
# that contains the NetCDF file `data.nc`.
# NOTE(review): the folder is named '2020-2021' but the month list runs to
# December 2023 - confirm that all 48 sub-folders really live here.
data_dir = '/Users/avalottig/Desktop/Computer-Science-MSc/ADIA-Project/Code/MScProject/data/ERA5/2020-2021'
month_names = ['January', 'February', 'March', 'April', 'May', 'June',
               'July', 'August', 'September', 'October', 'November', 'December']
# 'January2020', 'February2020', ..., 'December2023' (same order as the folder names).
months = [f'{name}{year}' for year in range(2020, 2024) for name in month_names]
precip_all = []
lat,lon = 41.948936, -93.687760
netcdf_name = 'data.nc'
#---------------------------------------------------------------------------------------------------------------------------
# code adapted from: https://notebook.community/alaindomissy/xarray_example/Exploring%20netCDF%20Datasets%20Using%20xarray 
for month in months:
    month_dir = os.path.join(data_dir, month)
    zip_path = os.path.join(month_dir, [f for f in os.listdir(month_dir) if f.endswith('.zip')][0])

    # Extract into a throw-away directory so that no stale `data.nc` is left in the
    # working directory, even when the file cannot be opened and the month is skipped.
    with tempfile.TemporaryDirectory() as tmp_dir:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            nc_path = zip_ref.extract(netcdf_name, path=tmp_dir)

        try:
            ds = xr.open_dataset(nc_path, engine='scipy')
        except Exception as e:
            print(f"error opening for{month}: {e}")
            continue

        # Close the dataset before the temporary directory is removed.
        with ds:
            print(f"success: {month}")
            # Total precipitation ('tp') at the grid cell nearest to the study site.
            precip = ds['tp'].sel(latitude=41.95, longitude=-93.69, method='nearest').to_dataframe()

    precip_all.append(precip)
#---------------------------------------------------------------------------------------------------------------------------
# Stack the per-month frames into a single precipitation frame indexed by timestamp.
precip_time_series = pd.concat(precip_all)
precip_time_series.index = pd.to_datetime(precip_time_series.index)
# Resample to the same resolution as the soil-moisture time series.
# Lower-case '3h' is the non-deprecated alias, matching the '12h' / 'ME' aliases
# used elsewhere in this notebook.
precip_agg = precip_time_series['tp'].resample('3h').sum()
file_path = '/Users/avalottig/Desktop/Computer-Science-MSc/ADIA-Project/Code/MScProject/data/SPL4SMAU_2020/smap_iowa_2020_2024_surface.csv'
# NOTE(review): `sm` rebinds the name of `import statsmodels.api as sm`; the name is
# kept because later cells read this frame through it.
sm = pd.read_csv(file_path, parse_dates=['date'])
sm = sm.set_index('date')
sm['soil_moisture'] = pd.to_numeric(sm['soil_moisture'])
# Select the study site with a small absolute tolerance instead of exact float
# equality, so a last-digit rounding difference in the CSV cannot empty the selection.
at_site = (
    np.isclose(sm['latitude'], lat, rtol=0.0, atol=1e-6)
    & np.isclose(sm['longitude'], lon, rtol=0.0, atol=1e-6)
)
ldata = sm[at_site]

# Overlay site soil moisture and 3-hourly precipitation totals on one axes.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(ldata.index, ldata['soil_moisture'], label='Soil Moisture', color='green')
ax.plot(precip_agg.index, precip_agg, label='Total Precipitation', color='blue')

ax.set_title('Soil Moisture and Total Precipitation for 2020-2024 (41.948936, -93.687760)')
ax.set_xlabel('Date')
ax.set_ylabel('Value (m and m^3/m^3)')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:
# Month-end aggregates: mean soil moisture and total precipitation.
monthly_sm = ldata['soil_moisture'].resample('ME').mean()
monthly_p = precip_agg.resample('ME').sum()
# Keep only the months present in both series.
monthly = pd.merge(
    left=monthly_sm,
    right=monthly_p,
    how='inner',
    left_index=True,
    right_index=True,
)

In [None]:
# Precipitation totals at weekly and daily resolution (monthly comes from the cell above).
weekly_p = precip_agg.resample('W').sum()
daily_p = precip_agg.resample('D').sum()

resampled_curves = (
    (weekly_p, 'Weekly Precipitation', 'blue'),
    (daily_p, 'Daily Precipitation', 'red'),
    (monthly_p, 'Monthly Precipitation', 'green'),
)

fig, ax = plt.subplots()
for series, label, color in resampled_curves:
    ax.plot(series.index, series, label=label, color=color)
ax.set_title('Precipitation Resampling')
ax.set_xlabel('Date')
ax.set_ylabel('Precipitation')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()


Augmented Dickey-Fuller (ADF) Test

In [None]:
# Summary statistics of the 3-hourly precipitation totals.
print(precip_agg.describe())
# ADF test on 12-hourly precipitation totals, lag order selected by AIC.
twelve = precip_time_series['tp'].resample('12h').sum()
adf_tp = adfuller(twelve, autolag='AIC')
# adf_tp[4] maps significance level -> critical value; print the header once,
# not once per level.
print('critical values:')
for key, value in adf_tp[4].items():
    print(f' {key}, {value}')

print('ADF :', adf_tp[0])
print('p-value:', adf_tp[1])

In [None]:
import matplotlib.pyplot as plt
from pandas.plotting import autocorrelation_plot

# (series, title) pairs: first-differenced soil moisture and precipitation.
autocorr_panels = [
    (ldata['soil_moisture'].diff().dropna(), 'Autocorrelation of Soil Moisture'),
    (precip_time_series['tp'].dropna(), 'Autocorrelation of Precipitation'),
]

for series, title in autocorr_panels:
    fig, ax = plt.subplots(figsize=(14, 7))
    autocorrelation_plot(series, ax=ax)
    ax.set_title(title)
    plt.show()

In [None]:
# plot_acf / plot_pacf open their own figure unless an axes is passed in, so the
# sized axes must be handed over explicitly; otherwise the 14x7 figure stays blank.
fig, ax = plt.subplots(figsize=(14, 7))
plot_acf(ldata['soil_moisture'].diff().dropna(), lags=50, ax=ax)
ax.set_title('Autocorrelation of Soil Moisture')
plt.show()

fig, ax = plt.subplots(figsize=(14, 7))
plot_acf(precip_time_series['tp'].dropna(), lags=50, ax=ax)
ax.set_title('Autocorrelation of Precipitation')
plt.show()

# Long-horizon correlograms of the hourly series.
fig, ax = plt.subplots()
plot_acf(precip_time_series['tp'], lags=1000, ax=ax)
ax.set_title('ACF of Hourly Precipitation')
plt.show()

fig, ax = plt.subplots()
plot_pacf(precip_time_series['tp'], lags=1000, ax=ax)
ax.set_title('PACF of Hourly Precipitation')
plt.show()

# Same diagnostics on daily totals.
precip_daily = precip_time_series['tp'].resample('D').sum()
fig, ax = plt.subplots()
plot_acf(precip_daily, lags=100, ax=ax)
ax.set_title('ACF of DailyPrecipitation')
plt.show()

fig, ax = plt.subplots()
plot_pacf(precip_daily, ax=ax)
ax.set_title('PACF of Daily Precipitation')
plt.show()



In [None]:
import pandas as pd
import numpy as np

def adf_test(y, max_lag):
    """Fit AR(1) ... AR(max_lag) models to ``y`` and collect their information criteria.

    Despite its name this helper does not run an ADF test; it only produces the
    AIC/BIC values used to choose a lag order for one.

    Parameters
    ----------
    y : array-like
        Series to which the autoregressive models are fitted.
    max_lag : int
        Largest AR order to try.

    Returns
    -------
    aics, bics : list of float
        ``aics[i]`` / ``bics[i]`` belong to the model with ``i + 1`` lags. Both
        lists are empty when ``max_lag < 1``.
    """
    aics = []
    bics = []
    for lag in range(1, max_lag + 1):
        # hold_back=max_lag drops the same initial observations for every order, so
        # all models are estimated on one sample and their AIC/BIC are comparable.
        model = AutoReg(y, lags=lag, hold_back=max_lag).fit()
        aics.append(model.aic)
        bics.append(model.bic)
    return aics, bics

# Choose AR lag orders (1..10) by AIC and BIC, then run the ADF test with each fixed lag.
hourly_tp = precip_time_series['tp']
aic_values, bic_values = adf_test(hourly_tp, 10)
# Position i in the lists corresponds to lag i + 1.
best_aic, best_bic = np.argmin(aic_values) + 1, np.argmin(bic_values) + 1

print(f'Optimal lag length based on AIC: {best_aic}')
print(f'Optimal lag length based on BIC: {best_bic}')
adf_aic, adf_bic = (
    adfuller(hourly_tp, maxlag=chosen_lag, autolag=None)
    for chosen_lag in (best_aic, best_bic)
)
print(f'ADF(AIC): {adf_aic[0]}, p-value (AIC): {adf_aic[1]}, ADF (BIC): {adf_bic[0]}, p-value (BIC): {adf_bic[1]}')


In [None]:
# Raw precipitation series at its native resolution.
fig, ax = plt.subplots()
ax.plot(precip_time_series['tp'])
ax.set_title('Precipitation Time Series')
plt.show()

In [None]:
# Calendar-year precipitation totals. 'YE' (year end) is the non-deprecated alias,
# consistent with the 'ME' alias used for the monthly aggregates.
yearly_totals = precip_time_series['tp'].resample('YE').sum()
print("Yearly Totals of Precipitation:")
print(yearly_totals)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(yearly_totals.index, yearly_totals)
ax.set_title('Yearly Totals of Precipitation')
ax.set_xlabel('Year')
ax.set_ylabel('Total Precipitation')
ax.grid(True)
plt.show()


In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose

# Additive decomposition with a period of 8760 samples (365 days x 24 hourly values).
# The variable keeps its historical name, but the model fitted here is additive.
multiplicative_decomp = seasonal_decompose(precip_time_series['tp'], model='additive', period=8760)

decomp_fig = multiplicative_decomp.plot()
# The title now states the model that is actually fitted.
decomp_fig.suptitle('Additive Decomposition')
# Grid on every panel, not only on the last (residual) one.
for panel in decomp_fig.axes:
    panel.grid(True)
decomp_fig.tight_layout()
plt.show()

In [None]:
# Additive decomposition of the site soil-moisture series.
# NOTE(review): period=4 is kept from the original; confirm it matches the intended
# cycle length for this series' sampling interval.
multiplicative_decomp = seasonal_decompose(ldata['soil_moisture'], model='additive', period=4)
decomp_fig = multiplicative_decomp.plot()
# The title now states the model that is actually fitted.
decomp_fig.suptitle('Additive Decomposition')
# Grid on every panel, not only on the last (residual) one.
for panel in decomp_fig.axes:
    panel.grid(True)
decomp_fig.tight_layout()
plt.show()

In [None]:
# First difference of the precipitation series; the leading NaN is dropped.
data_diff = precip_time_series['tp'].diff().dropna()
fig, ax = plt.subplots()
ax.plot(data_diff)
ax.set_title('Differenced Precipitation')
plt.show()

In [None]:
from scipy.signal import find_peaks

# Local maxima at least 8 samples apart in each series.
soil_moisture_peaks, _ = find_peaks(ldata['soil_moisture'], distance=8)
precip_peaks, _ = find_peaks(precip_agg, distance=8)

# Timestamps of the precipitation peaks; looked up once, outside the loop.
precip_peak_times = precip_agg.index[precip_peaks]

# For every soil-moisture peak: signed distance in hours to the closest precipitation
# peak (positive when the soil-moisture peak comes later).
lags = []
for sm_peak_time in ldata.index[soil_moisture_peaks]:
    nearest_position = abs(precip_peak_times - sm_peak_time).argmin()
    closest_precip_peak_time = precip_peak_times[nearest_position]
    lags.append((sm_peak_time - closest_precip_peak_time).total_seconds() / 3600)

print("Lags between soil moisture peaks and precipitation events (in hours):")
print(lags)

# Summary statistics of the lag distribution.
lag_times = np.array(lags)
lag_range = np.ptp(lag_times)
lag_median = np.median(lag_times)
lag_mean = np.mean(lag_times)
lag_min = np.min(lag_times)
lag_max = np.max(lag_times)

print("Lags between soil moisture peaks and precipitation events (in hours):")
for stat_name, stat_value in (
    ("Range", lag_range),
    ("Median", lag_median),
    ("Mean", lag_mean),
    ("Min", lag_min),
    ("Max", lag_max),
):
    print(f"{stat_name}: {stat_value:.2f} hours")
# Both series with their detected peaks marked.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(ldata.index, ldata['soil_moisture'], label='Soil Moisture', color='green')
ax.plot(precip_agg.index, precip_agg, label='Total Precipitation', color='blue')
ax.scatter(ldata.index[soil_moisture_peaks], ldata['soil_moisture'].iloc[soil_moisture_peaks], color='red')
ax.scatter(precip_agg.index[precip_peaks], precip_agg.iloc[precip_peaks], color='orange')
ax.set_title('Soil Moisture and Total Precipitation for 2020 at Location (41.948936, -93.687760)')
ax.set_xlabel('Date')
ax.set_ylabel('Value (m and m^3/m^3)')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Work on the two measurement columns only. Aligning the full DataFrames with
# join='inner' also intersects their columns, which drops both 'soil_moisture'
# and 'tp' because neither exists in the other frame.
# NOTE(review): `sm` holds every row of the SMAP CSV; if only the study site is
# wanted, `ldata['soil_moisture']` is the site-filtered equivalent - confirm.
soil_moisture_daily = sm['soil_moisture'].resample('D').mean()
precipitation_daily = precip_time_series['tp'].resample('D').sum()
aligned_sm, aligned_p = soil_moisture_daily.align(precipitation_daily, join='inner')
# Drop a day from both series when either value is missing, so the two stay the
# same length and sample-for-sample aligned for the spectral analysis below.
both_present = aligned_sm.notna() & aligned_p.notna()
aligned_soil_moisture, aligned_precipitation = aligned_sm[both_present], aligned_p[both_present]

In [None]:
# Detrend
# `detrend` comes from scipy.signal (imported in the first cell). axis=0 removes the
# linear trend along time for both 1-D series and (time, column) tables; the default
# axis=-1 would detrend across the columns of a 2-D input.
detrended_soil_moisture = detrend(aligned_soil_moisture, axis=0)
detrended_precipitation = detrend(aligned_precipitation, axis=0)

# Normalise
# Zero mean and unit variance, computed along the time axis.
normal_soil_moisture = (
    (detrended_soil_moisture - detrended_soil_moisture.mean(axis=0))
    / detrended_soil_moisture.std(axis=0)
)
normal_precipitation = (
    (detrended_precipitation - detrended_precipitation.mean(axis=0))
    / detrended_precipitation.std(axis=0)
)


In [None]:
def butter_highpass(cutoff, fs, order=5):
    """Design a digital Butterworth high-pass filter.

    Parameters
    ----------
    cutoff : float
        Cut-off frequency, in the same units as ``fs``.
    fs : float
        Sampling frequency of the signal that will be filtered.
    order : int, optional
        Filter order (default 5).

    Returns
    -------
    b, a : ndarray
        Numerator and denominator coefficients of the filter.
    """
    # Passing ``fs`` lets SciPy normalise the cut-off by the Nyquist frequency
    # (fs / 2) itself instead of doing it by hand.
    b, a = butter(order, cutoff, btype='high', analog=False, fs=fs)
    return b, a

def highpass_filter(data, cutoff, fs, order=5, axis=-1):
    """Apply a zero-phase Butterworth high-pass filter to ``data``.

    Parameters
    ----------
    data : array-like
        Signal to filter.
    cutoff : float
        Cut-off frequency, in the same units as ``fs``.
    fs : float
        Sampling frequency of ``data``.
    order : int, optional
        Order of the underlying Butterworth design (default 5).
    axis : int, optional
        Axis of ``data`` along which to filter (default -1, the last axis).
        Use ``axis=0`` for tables laid out as (time, column).

    Returns
    -------
    ndarray
        Filtered signal with the same shape as ``data``.
    """
    b, a = butter_highpass(cutoff, fs, order=order)
    # filtfilt runs the filter forwards and backwards, so no phase shift is added.
    y = filtfilt(b, a, data, axis=axis)
    return y


# Remove variability slower than 100 days; the series are sampled once per day.
cutoff_frequency = 1 / 100  # cycles per day
fs = 1  # samples per day

filtered_soil_moisture, filtered_precipitation = (
    highpass_filter(signal, cutoff_frequency, fs)
    for signal in (normal_soil_moisture, normal_precipitation)
)


In [None]:
from scipy.signal import welch, coherence, csd

# Power spectral density of each filtered series (Welch estimate).
frequencies, soil_psd = welch(filtered_soil_moisture, fs=fs)
_, precip_psd = welch(filtered_precipitation, fs=fs)

# Cross power spectral density (computed once; it was previously evaluated twice).
frequencies, cpsd = csd(filtered_soil_moisture, filtered_precipitation, fs=fs)

# Magnitude-squared coherence between the two series.
frequencies, coherence_values = coherence(filtered_soil_moisture, filtered_precipitation, fs=fs)

# Phase difference: the angle of the complex cross-spectrum at each frequency.
phase_difference = np.angle(cpsd)  # radians


# Cross-spectrum phase, converted from radians to degrees for plotting.
fig, ax = plt.subplots()
ax.plot(frequencies, np.degrees(phase_difference))
ax.set_title('Phase Difference between Soil Moisture and Precipitation')
ax.set_xlabel('Frequency (cycles/day)')
ax.set_ylabel('Phase Difference (degrees)')
ax.grid(True)
plt.show()

# Three stacked panels: individual PSDs, cross-PSD magnitude and coherence.
fig, (ax_psd, ax_csd, ax_coh) = plt.subplots(3, 1)

ax_psd.semilogy(frequencies, soil_psd, label='Soil Moisture PSD')
ax_psd.semilogy(frequencies, precip_psd, label='Precipitation PSD')
ax_psd.set_title('Power Spectral Density')
ax_psd.set_xlabel('Frequency (cycles/day)')
ax_psd.set_ylabel('Power')
ax_psd.legend()

ax_csd.semilogy(frequencies, np.abs(cpsd), label='Cross Power Spectral Density')
ax_csd.set_title('Cross Power Spectral Density')
ax_csd.set_xlabel('Frequency (cycles/day)')
ax_csd.set_ylabel('Power')
ax_csd.legend()

ax_coh.plot(frequencies, coherence_values, label='Coherence')
ax_coh.set_title('Coherence between Soil Moisture and Precipitation')
ax_coh.set_xlabel('Frequency (cycles/day)')
ax_coh.set_ylabel('Coherence')
ax_coh.legend()

fig.tight_layout()
plt.show()


In [None]:
from scipy.fft import fft, fftfreq
from scipy.signal import periodogram, welch, coherence, csd, detrend
from sklearn.preprocessing import StandardScaler
# The approaches used here are adapted from the SciPy documentation:
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.csd.html

# Standardise the soil-moisture measurements. Only the measurement column is scaled:
# flattening every column of `sm` gives an array longer than the index. `sm` itself is
# left untouched so this cell can be re-run safely.
scaler_soil = StandardScaler()
soil_scaled = pd.Series(
    scaler_soil.fit_transform(sm['soil_moisture'].values.reshape(-1, 1)).flatten(),
    index=sm.index,
)

# NOTE(review): `annual_precip_data` was never defined in this notebook; it is assumed
# to be the ERA5 precipitation frame built in the loading cell - confirm.
annual_precip_data = precip_time_series
scaled_p = StandardScaler()
# Stored under its own name: `precip_agg` is a Series, so the former item assignment
# `precip_agg['tp'] = ...` could not hold a scaled column.
# NOTE(review): the loop below aggregates the raw 'tp' values, as the original
# `resampled_precip` line did; switch it to `precip_scaled` if the standardised
# series was the intended input.
precip_scaled = pd.Series(
    scaled_p.fit_transform(annual_precip_data['tp'].values.reshape(-1, 1)).flatten(),
    index=annual_precip_data.index,
    name='tp',
)

# Non-deprecated aliases ('ME', '3h'), consistent with the rest of the notebook.
agg_frequencies = {'Monthly': 'ME', 'Weekly': 'W', 'Daily': 'D', '3-Hourly': '3h'}
# Sampling interval of each aggregation expressed in months, so the frequencies below
# are in cycles per month and the periods in months.
timesteps = {'ME': 1, 'W': 1/4, 'D': 1/30, '3h': 1/240}

for freq_name, freq in agg_frequencies.items():
    #print(f"agg: {freq_name}")

    # Aggregate at the current resolution, then difference. The differenced series is
    # built from the resampled data so each pass really analyses a different resolution.
    resampled_precip = annual_precip_data['tp'].resample(freq).sum().dropna()
    precip_diff = resampled_precip.diff().dropna()
    smagg = soil_scaled.resample(freq).mean().dropna()
    smdiff = smagg.diff().dropna()

    # Find the top frequencies and convert them to periods for easier interpretation.
    timestep = timesteps[freq]
    Fs = 1 / timestep
    precip_diff_values = precip_diff.values
    n = len(precip_diff_values)
    y_fft = fft(precip_diff_values)
    fr = fftfreq(n, d=timestep)[:n//2]
    y_m = 2/n * np.abs(y_fft[:n//2])
    # Rank the non-zero frequencies only: the DC bin (index 0) has no finite period.
    top = np.argsort(y_m[1:])[-5:][::-1] + 1
    top_frequencies = fr[top]
    top_magnitudes = y_m[top]
    top_periods = 1 / top_frequencies

    print("Top 5:")
    # `peak_freq` avoids shadowing the outer loop variable `freq`.
    for i, (peak_freq, mag, period) in enumerate(zip(top_frequencies, top_magnitudes, top_periods)):
        print(f"{i+1}. Frequency: {peak_freq:.4f}, Magnitude: {mag:.4f}, Period: {period:.2f} {freq_name}")

    frequencies, power_spectral_density = periodogram(precip_diff_values, fs=Fs, window='flattop')
    fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))

    ax[0].plot(precip_diff.index, precip_diff_values)
    ax[0].set_title(f'Differenced Precipitation Time Series - {freq_name}')
    ax[0].set_xlabel('Date')
    ax[0].set_ylabel('Differenced Precipitation')
    ax[0].grid(True)
    ax[1].stem(fr, y_m)
    ax[1].set_title(f'Frequency Domain - {freq_name}')
    ax[1].set_xlabel(f'Frequency ({freq_name})')
    ax[1].set_ylabel('Magnitude')
    ax[1].grid(True)

    plt.tight_layout()
    plt.show()

    plt.figure(figsize=(14, 7))
    plt.semilogy(frequencies, power_spectral_density)
    plt.title(f'Periodogram - {freq_name}')
    plt.xlabel(f'Frequency ({freq_name})')
    plt.ylabel('Power Spectral Density')
    plt.grid(True)
    plt.tight_layout()
    plt.show()


# Full cross-correlation of the differenced series left over from the last pass of the
# aggregation loop above.
cross_corr = correlate(smdiff, precip_diff, mode='full')
# Lag axis matching mode='full': from -(len(precip_diff) - 1) to len(smdiff) - 1.
lags = np.arange(1 - len(precip_diff), len(smdiff))
max_corr_index = np.argmax(np.abs(cross_corr))
max_corr, lag_at_max_corr = cross_corr[max_corr_index], lags[max_corr_index]

print(f"Max corr: {max_corr}, Lag : {lag_at_max_corr}")

fig, ax = plt.subplots()
ax.plot(lags, cross_corr)
ax.axvline(x=lag_at_max_corr, color='r', linestyle='--', label=f'Lag: {lag_at_max_corr}')
ax.set_title('Cross-Correlation between Precipitation and Soil Moisture')
ax.set_xlabel('Lag')
ax.set_ylabel('Cross-Correlation')
ax.legend()
plt.show()

# Spectral estimates for the differenced series left over from the last pass of the
# aggregation loop (sampling frequency `Fs`).
frequencies, soil_psd = welch(smdiff, fs=Fs)
_, precip_psd = welch(precip_diff, fs=Fs)
frequencies, cpsd = csd(smdiff, precip_diff, fs=Fs)
frequencies, coherence_values = coherence(smdiff, precip_diff, fs=Fs)

# One panel per quantity, matching the earlier spectral figure. Drawing all three on a
# single axes overwrote the titles and put the 0-1 coherence on the PSD log scale.
fig, (ax_psd, ax_csd, ax_coh) = plt.subplots(3, 1, figsize=(14, 10))

ax_psd.semilogy(frequencies, soil_psd, label='Soil Moisture PSD')
ax_psd.semilogy(frequencies, precip_psd, label='Precipitation PSD')
ax_psd.set_title('Power Spectral Density')
ax_psd.set_xlabel('Frequency')
ax_psd.set_ylabel('Power')
ax_psd.legend()

ax_csd.semilogy(frequencies, np.abs(cpsd), label='Cross Power Spectral Density')
ax_csd.set_title('Cross Power Spectral Density')
ax_csd.set_xlabel('Frequency ')
ax_csd.set_ylabel('Power')
ax_csd.legend()

ax_coh.plot(frequencies, coherence_values, label='Coherence')
ax_coh.set_title('Coherence between Soil Moisture and Precipitation')
ax_coh.set_xlabel('Frequency ')
ax_coh.set_ylabel('Coherence')
ax_coh.legend()

fig.tight_layout()
plt.show()
