In [None]:
import rasterio
from rasterio.warp import transform
import pandas as pd
import numpy as np
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
import matplotlib.pyplot as plt
import os
import h5py
import matplotlib.pyplot as plt
import mplfinance as mpf
import matplotlib.dates as mdates
from mplfinance.original_flavor import candlestick_ohlc
import geopandas as gpd


Send a request to NASA (NSIDC) for the relevant SMAP L4 soil-moisture data.

In [None]:
# Bounding box for the data request, ordered [min_lon, min_lat, max_lon, max_lat]
# (the order in which it is unpacked further down).
iowa_bbox = [-96.639704, 40.375437, -90.140061, 43.501196]  

def get_smapL4_data(min_lon, min_lat, max_lon, max_lat, session, auth_token=None, timeout=30):
    """Submit an asynchronous SPL4SMGP (version 007) request for a bounding box.

    Parameters
    ----------
    min_lon, min_lat, max_lon, max_lat : float
        Bounding-box corners, sent as "min_lon,min_lat,max_lon,max_lat".
    session : requests.Session-like
        Object whose ``get`` method performs the HTTP call.
    auth_token : str, optional
        Bearer token for the Authorization header. When omitted, the
        module-level ``token`` is used, so existing five-argument calls keep
        working.
    timeout : float, optional
        Seconds allowed for each HTTP call (default 30, the value used before).

    Returns
    -------
    bytes
        Raw body of the final response.

    Raises
    ------
    Exception
        Whatever ``response.raise_for_status()`` raises for an error status
        (``requests.HTTPError`` with a real session), plus any error raised by
        ``session.get`` itself.
    """
    if auth_token is None:
        auth_token = token  # fall back to the notebook-level credential

    bounding_box = f"{min_lon},{min_lat},{max_lon},{max_lat}"
    params = {
        'short_name': 'SPL4SMGP',
        'version': '007',
        'temporal': '2021-06-30T00:00:00Z,2022-03-31T23:59:59Z',
        'bounding_box': bounding_box,
        'bbox': bounding_box,
        'format': 'HDF-EOS5',
        'projection': 'GEOGRAPHIC',
        'page_size': 2000,
        'request_mode': 'async',
        'email': ''
    }
    headers = {
        'Authorization': f'Bearer {auth_token}'
    }

    site = 'https://n5eil02u.ecs.nsidc.org/egi/request'
    # Let the HTTP library build (and percent-encode) the query string instead
    # of joining raw key=value pairs by hand.
    response = session.get(site, params=params, headers=headers, timeout=timeout)
    if response.status_code == 401:
        # Retry against the final URL. The Authorization header is sent again
        # (requests drops it when redirected to another host) and the retry is
        # bounded by the same timeout so it cannot hang indefinitely.
        response = session.get(response.url, headers=headers, timeout=timeout)
    response.raise_for_status()
    return response.content

def get_data(min_lon, min_lat, max_lon, max_lat, session):
    """Request data for a bounding box and report whether the request succeeded.

    Parameters
    ----------
    min_lon, min_lat, max_lon, max_lat : float
        Bounding-box corners forwarded to ``get_smapL4_data``.
    session : requests.Session-like
        HTTP session forwarded to ``get_smapL4_data``.

    Returns
    -------
    bool
        True when the request returned a non-empty body; False when the body
        was empty or when any exception was raised (the error is printed, not
        propagated).
    """
    try:
        data = get_smapL4_data(min_lon, min_lat, max_lon, max_lat, session)
    except Exception as e:
        # Best-effort: report the failure and signal it through the return value.
        print(f"Error encountered: {e}")
        return False

    if data:
        print("Data has been requested")
        return True
    return False

# Read the bearer token from the environment so that no credential is stored in
# the notebook. EARTHDATA_TOKEN is the variable name chosen here; export it
# before starting the kernel.
token = os.environ.get('EARTHDATA_TOKEN', '')
iowa_min_lon, iowa_min_lat, iowa_max_lon, iowa_max_lat = iowa_bbox

if token:
    # The session is closed automatically when the block exits.
    with requests.Session() as session:
        get_data(iowa_min_lon, iowa_min_lat, iowa_max_lon, iowa_max_lat, session)
else:
    print("EARTHDATA_TOKEN is not set; skipping the data request.")



Open each data file and extract the relevant information.

In [None]:
root = ''
all_data = []
for dir, dirs, files in os.walk(root):
    for file in files:
        if file.endswith('.h5') or file.endswith('.he5'):  
            file_path = os.path.join(dir, file)
            try:
                with h5py.File(file_path, mode='r') as f:
                   
                    if 'gph' in file:
                        dataset_path = '/HDFEOS/GRIDS/Geophysical_Data/Data Fields/sm_surface'
                    else:
                        dataset_path = '/HDFEOS/GRIDS/Analysis_Data/Data Fields/sm_surface_analysis'
                    
           
                    data = f[dataset_path][:]
                    attrs = f[dataset_path].attrs
                    _FillValue = attrs['_FillValue']
                    valid_max = attrs['valid_max']
                    valid_min = attrs['valid_min']
                    invalid = np.logical_or(data > valid_max, data < valid_min)
                    invalid = np.logical_or(invalid, data == _FillValue)
                    data[invalid] = np.nan
                    data = np.ma.masked_where(np.isnan(data), data)
                    lat_path = '/HDFEOS/GRIDS/FileMainGroup/Data Fields/cell_lat'
                    lon_path = '/HDFEOS/GRIDS/FileMainGroup/Data Fields/cell_lon'
                    latitude = f[lat_path][:]
                    longitude = f[lon_path][:]
                    
                    try:
                        #print(f"filename: {file}")
                        date_str = file.split('_')[4]
                        #print(f"date string: {date_str}")
                        date = pd.to_datetime(date_str, format='%Y%m%dT%H%M%S')
                        #print(f"date: {date}")
                    except (IndexError, ValueError) as date_error:
                        print(f"Date extraction failed for filename: {file}. Error: {date_error}")
                        date = None 
                    for lat, lon, sm in zip(latitude.flatten(), longitude.flatten(), data.flatten()):
                        all_data.append({'date': date, 'latitude': lat, 'longitude': lon, 'soil_moisture': sm, 'file': file})


            except OSError as e:
                print(f"problem opening file: {e}")
            except KeyError as e:
                print(f"not in file {file_path}: {e}")
            except Exception as e:
                print(f"problem processing file {file_path}: {e}")


df = pd.DataFrame(all_data)

'''if 'date' in df.columns and not df['date'].isnull().all():
    df['date'] = pd.to_datetime(df['date'])
else:
    print("'date' column invalid.")'''


Here we go to QGIS and adapt the resolution accordingly, then return and extract data from the new CSV file. In QGIS, indexes were assigned based on whether a point (lat, lon) was within a grid cell. The result was then saved as a CSV, so filtering by index means we can separate by grid cell and therefore aggregate spatially.

In [None]:
# Load the CSV (its path is left blank here) and make sure 'date' is datetime.
path = ''
df = pd.read_csv(path, parse_dates=['date'])
df['date'] = pd.to_datetime(df['date'])

# Window: from six days before `end` up to and including `end`.
end = pd.Timestamp('2023-06-07 23:59:59')
start = end - pd.Timedelta(days=6)

# Keep the rows whose id is 1, then only those dated inside the window
# (both bounds inclusive).
df_agg = df.loc[df['id'] == 1]
df_agg = df_agg.loc[df_agg['date'].between(start, end)]

def aggregate(df):
    """Summarise soil moisture for every (date, latitude, longitude) group.

    Returns a frame with a default integer index and the columns
    date, latitude, longitude, open, close, max, min, mean, where open/close
    are the first/last soil-moisture values of the group and max/min/mean are
    the corresponding statistics.
    """
    keys = ['date', 'latitude', 'longitude']
    return df.groupby(keys, as_index=False).agg(
        open=('soil_moisture', 'first'),
        close=('soil_moisture', 'last'),
        max=('soil_moisture', 'max'),
        min=('soil_moisture', 'min'),
        mean=('soil_moisture', 'mean'),
    )
def open_val_shift(df, first_open=None):
    """Set each row's 'open' to the previous row's 'close'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'close' column, in the row order the shift should follow.
    first_open : scalar, optional
        Opening value for the first row, which has no predecessor. When
        omitted, the module-level ``mean_open`` is used, so
        ``groupby(...).apply(open_val_shift)`` keeps working unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is not modified. An empty input yields an empty
        result instead of raising.
    """
    if first_open is None:
        first_open = mean_open

    # shift(fill_value=...) fills the first slot directly. This avoids the
    # chained `df['open'].iloc[0] = ...` assignment, which may act on a
    # temporary copy and leave the frame unchanged.
    return df.assign(open=df['close'].shift(1, fill_value=first_open))


# Per-(date, latitude, longitude) summary of the selected rows.
aggregated = aggregate(df_agg)

# Average of the 'open' column; open_val_shift reads it as a module-level value.
mean_open = aggregated['open'].mean()

# Within each (latitude, longitude) group, derive 'open' from the previous 'close'.
aggregated = (
    aggregated
    .groupby(['latitude', 'longitude'])
    .apply(open_val_shift)
    .reset_index(drop=True)
)

# Collapse all locations into one row per date: open/close/mean are averaged,
# max is the largest max and min the smallest min.
ans = aggregated.groupby('date', as_index=False).agg(
    open=('open', 'mean'),
    close=('close', 'mean'),
    max=('max', 'max'),
    min=('min', 'min'),
    mean=('mean', 'mean'),
)


print(ans)

def candlestick(df, title):
    """Draw a candlestick chart from a frame of open/close/max/min values.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'date', 'open', 'close', 'max' and 'min' columns. The
        frame is not modified, so the function can be called repeatedly on the
        same object.
    title : str
        Chart title.

    Returns
    -------
    None
        The chart is drawn through ``mpf.plot``.
    """
    # Build the plotting frame on a derived copy. Setting the index in place on
    # the caller's frame would remove its 'date' column and make a second call
    # fail with a KeyError.
    ohlc = (
        df.assign(date=pd.to_datetime(df['date']))
        .set_index('date')
        .rename_axis('Date')
        .rename(columns={'open': 'Open', 'close': 'Close', 'max': 'High', 'min': 'Low'})
    )
    mpf.plot(ohlc, type='candle', style='charles', title=title, ylabel='Soil Moisture', volume=False)


# Draw the candlestick chart from the per-date summary held in `ans`.
candlestick(ans, 'Candlestick Plot for SM at 27km Resolution')


In [None]:
# Summary statistics of the soil-moisture column.
soil_moisture_stats = df['soil_moisture'].describe()
print("Soil Moisture Statistics:")
print(soil_moisture_stats)

# Resample on the 'date' column: `df` is read with a default integer index, and
# resampling the bare column raises TypeError because it needs a datetime-like
# index. This is still a lazy Resampler; apply an aggregation (e.g. .mean()) to
# obtain daily values.
dailySm = df.resample('D', on='date')['soil_moisture']

In [None]:
# Latitude/longitude bounds for each named region. `regional` treats every bound
# as inclusive and returns the first region (in this order) that contains a point.
regions = {
    'Northwest': {'lat_min': 42, 'lat_max': 44, 'lon_min': -96, 'lon_max': -94},
    'Central': {'lat_min': 41, 'lat_max': 43, 'lon_min': -94, 'lon_max': -92},
    'Southeast': {'lat_min': 40, 'lat_max': 42, 'lon_min': -92, 'lon_max': -90}
}

def regional(row):
    """Return the name of the first region in ``regions`` containing the row.

    ``row`` is indexed by 'latitude' and 'longitude'. A region contains the
    point when both coordinates fall within its bounds (inclusive). 'Other' is
    returned when no region matches.
    """
    def _contains(box):
        # Latitude is tested first; longitude is only looked at if it passes.
        return (
            box['lat_min'] <= row['latitude'] <= box['lat_max']
            and box['lon_min'] <= row['longitude'] <= box['lon_max']
        )

    hits = (name for name, box in regions.items() if _contains(box))
    return next(hits, 'Other')

# Label every row with its region and drop the rows that fall in none of them.
df['region'] = df.apply(regional, axis=1)
df_filtered = df[df['region'] != 'Other']

# Daily mean soil moisture per region, reshaped to one column per region.
# It is stored under its own name: `regional` looks up the module-level
# `regions` mapping of bounds, so rebinding `regions` to this frame would break
# any later call to `regional`.
region_daily = (
    df_filtered
    .groupby(['region', pd.Grouper(key='date', freq='D')])['soil_moisture']
    .mean()
    .reset_index()
    .pivot(index='date', columns='region', values='soil_moisture')
)


plt.figure(figsize=(12, 6))
for region in region_daily.columns:
    plt.plot(region_daily.index, region_daily[region], label=region)

plt.xlabel('Date')
plt.ylabel('Volumetric Soil Moisture')
plt.title('Soil Moisture Time Series Daily Resolution Aggregated by Region for June')
plt.legend()
plt.grid(True)
plt.xticks(rotation=45, ha='right')
plt.show()