<a href="https://colab.research.google.com/github/BYU-Hydroinformatics/gwbf-notebooks/blob/main/2_GSLB_PCHIP_DeltaWTE_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1.Load ML model BFD results

In [1]:
import pandas as pd
import geopandas as gpd
import glob
import os
import time

def load_streamflow_data(data_folder):
    """
    Load all CSV files from the specified folder, parsing 'Date' column as datetime

    Parameters:
    data_folder: Path to the folder containing streamflow CSV files

    Returns:
    Dictionary: Keys are file names (without .csv), values are DataFrames
    """
    streamflow_data = {}
    csv_files = glob.glob(os.path.join(data_folder, '*.csv'))

    for file_path in csv_files:
        file_name = os.path.splitext(os.path.basename(file_path))[0]
        # Parse 'Date' column as datetime while reading the CSV
        df = pd.read_csv(file_path, parse_dates=['Date'])
        streamflow_data[file_name] = df

    print(f"Loaded {len(streamflow_data)} CSV files: {list(streamflow_data.keys())}")
    return streamflow_data

In [2]:
from google.colab import drive
drive.mount('/content/drive')
data_folder = '/content/drive/My Drive/xueyi_research/GSLB_gages/'


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
streamflow_data = load_streamflow_data(data_folder)
measurements = pd.read_csv('GSLB_1900-2023_TS_with_aquifers.csv')

Loaded 68 CSV files: ['10141000', '10026500', '10041000', '10134500', '10157500', '10015900', '10092700', '10105900', '10132000', '10106000', '10128500', '10159500', '10167000', '10133650', '10155500', '10129900', '10118000', '10136500', '10020300', '10171000', '10143500', '10132500', '10155000', '10079500', '10104700', '10153100', '10163000', '10155200', '10038000', '10133800', '10131000', '10168000', '10011500', '10130500', '10126000', '10020100', '10172860', '10152000', '10104900', '10039500', '10046500', '10142000', '10125500', '10109000', '10129300', '10172700', '10139300', '10023000', '10109001', '10113500', '10133600', '10016900', '10146000', '10102250', '10150500', '10137500', '10172952', '10140100', '10129500', '10154200', '10153800', '10058600', '10146400', '10015700', '10164500', '10068500', '10156000', '10168500']


In [4]:
type(streamflow_data) # To check the type of the streamflow_data variable itself


dict

In [5]:
measurements.head()

Unnamed: 0,AquiferID,Well_ID,Date,WTE,State
0,1,381033113480701,2012-09-06,7092.99,UT
1,1,381037113474001,2012-09-06,7175.95,UT
2,1,381152113442801,1995-11-22,6200.0,UT
3,1,381236113485601,2014-07-23,7151.0,UT
4,1,382113113435401,2008-09-03,5395.95,UT


## 1.1 PCHIP interpolation of WTE and calculate Delta_WTE

In [6]:
from scipy.interpolate import PchipInterpolator
import numpy as np

# Convert 'Date' column to datetime objects if not already done
measurements['Date'] = pd.to_datetime(measurements['Date'])

# Group data by 'Well'
grouped = measurements.groupby('Well_ID')

# Iterate through each well's data
interpolated_data = []
for well, data in grouped:
    if len(data) >= 2:
        # Create daily date range for the well's data period
        date_range = pd.date_range(start=data['Date'].min(), end=data['Date'].max(), freq='D')

        # Prepare x (date) and y (measurement) data for interpolation
        x = data['Date'].astype(np.int64) // 10**9  # Convert datetime to Unix timestamps in seconds
        y = data['WTE']

        # Create the PCHIP interpolator
        f = PchipInterpolator(x, y)

        # Interpolate for the full daily range
        x_new = date_range.astype(np.int64) // 10**9
        y_new = f(x_new)

        # Create a new DataFrame with daily interpolated data
        interpolated_df = pd.DataFrame({'Date': date_range, 'Well_ID': well, 'WTE': y_new})
        interpolated_data.append(interpolated_df)
    else:
        # Append original data if there are less than 2 data points
        interpolated_data.append(data)

# Concatenate all the interpolated dataframes
final_measurements = pd.concat(interpolated_data)

print(final_measurements.head())

   AquiferID          Well_ID       Date      WTE State
0        1.0  381033113480701 2012-09-06  7092.99    UT
1        1.0  381037113474001 2012-09-06  7175.95    UT
2        1.0  381152113442801 1995-11-22  6200.00    UT
3        1.0  381236113485601 2014-07-23  7151.00    UT
0        NaN  382113113435401 2008-09-03  5395.95   NaN


In [7]:
if 'AquiferID' in final_measurements.columns:
    final_measurements = final_measurements.drop('AquiferID', axis=1)
if 'State' in final_measurements.columns:
    final_measurements = final_measurements.drop('State', axis=1)
final_measurements.head()

Unnamed: 0,Well_ID,Date,WTE
0,381033113480701,2012-09-06,7092.99
1,381037113474001,2012-09-06,7175.95
2,381152113442801,1995-11-22,6200.0
3,381236113485601,2014-07-23,7151.0
0,382113113435401,2008-09-03,5395.95


In [18]:
# Group the final_measurements DataFrame by 'Well_ID'
grouped_wells = final_measurements.groupby('Well_ID')

# Store the results
delta_wte_data = []

# Iterate through each well
for well, data in grouped_wells:
    # Get the first WTE value for the well
    first_wte = data['WTE'].iloc[0]

    # Calculate Delta_WTE for each row in the well's data
    data['Delta_WTE'] = data['WTE'] - first_wte
    delta_wte_data.append(data)

# Concatenate the results back into a single DataFrame
final_measurements_delta = pd.concat(delta_wte_data)
final_measurements_delta.sort_values(by='Well_ID', inplace=True)

# Reduce precision of float columns
float_columns = final_measurements_delta.select_dtypes(include=['float64', 'float32','float16']).columns
for col in float_columns:
    final_measurements_delta[col] = final_measurements_delta[col].round(3).astype('float32')

print(final_measurements_delta.head())

              Well_ID       Date          WTE  Delta_WTE
0     381033113480701 2012-09-06  7092.990234      0.000
1     381037113474001 2012-09-06  7175.950195      0.000
2     381152113442801 1995-11-22  6200.000000      0.000
3     381236113485601 2014-07-23  7151.000000      0.000
3551  382113113435401 2018-05-25  5397.996094      2.046


In [20]:
# prompt: export final_measurements_delta to csv

# Export the final_measurements_delta DataFrame to a CSV file
final_measurements_delta.to_csv('final_measurements_delta.csv', index=False)

In [10]:
streamflow_data['10141000'].head()


Unnamed: 0,Date,Q,ML_BFD
0,2018-09-28,17.7,1.0
1,2018-09-29,18.1,0.0
2,2018-09-30,18.1,1.0
3,2018-10-01,17.4,1.0
4,2018-10-02,16.8,1.0


# 3.Create pairs

In [21]:
def find_pairs(streamflow_data, final_measurements_delta, output_dir='output', float_precision=3):
    """
    Finds pairs of well data and streamflow data based on matching dates and ML_BFD filter.
    Saves individual CSV files for each gage ID.

    Args:
        streamflow_data (dict): Dictionary of streamflow DataFrames.
        final_measurements_delta (pd.DataFrame): DataFrame containing well data with Delta_WTE.
        output_dir (str): Directory to store the output CSV files.

    Returns:
        list: List of created CSV filenames.
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    created_files = []  # Keep track of created files

    for gage_id, streamflow_df in streamflow_data.items():
        print(f"Working on Gage {gage_id}")
        start = time.time()

        # Filter streamflow data based on ML_BFD
        filtered_streamflow = streamflow_df[streamflow_df['ML_BFD'] == 1][['Date', 'Q']]
        print(f"Filtered Streamflow Data Shape: {filtered_streamflow.shape}")

        # Add gage_id to the filtered streamflow data
        filtered_streamflow['gage_id'] = gage_id

        # Perform the merge operation
        paired = filtered_streamflow.merge(
            final_measurements_delta[['Well_ID', 'Date', 'Delta_WTE', 'WTE']],
            on='Date',
            how='inner'  # Only keep matches
        )

        # Reorder columns to place 'gage_id' first
        paired = paired[['gage_id'] + [col for col in paired.columns if col != 'gage_id']]

        # # Reduce precision of float columns
        # float_columns = paired.select_dtypes(include=['float64', 'float32']).columns
        # for col in float_columns:
        #     paired[col] = paired[col].round(float_precision).astype('float32')

        # Write to individual file if pairs were found
        if not paired.empty:
            output_file = os.path.join(output_dir, f'paired_gage_{gage_id}.csv')
            paired.to_csv(output_file, index=False)
            created_files.append(output_file)

        # Clear memory
        del filtered_streamflow
        del paired

        print(f"Finished Gage {gage_id} with {(time.time()-start):.1f} seconds")


    return created_files

In [22]:
## find some samples

# Select a subset of gages for testing
selected_gages = list(streamflow_data.keys())[:1]  # Adjust the number as needed

# Create a temporary dictionary to hold the selected streamflow data
selected_streamflow_data = {}
for gage_id in selected_gages:
    selected_streamflow_data[gage_id] = streamflow_data[gage_id]

# Now use the selected_streamflow_data in the find_pairs function
paired_data = find_pairs(selected_streamflow_data, final_measurements_delta)


Working on Gage 10141000
Filtered Streamflow Data Shape: (825, 2)
Finished Gage 10141000 with 5.7 seconds


In [13]:
test_df=find_pairs(streamflow_data, merge)

NameError: name 'merge' is not defined