## Undercatch Correction ##

Implement undercatch corrections to station data


### Define input file locations ###

In [57]:
import xarray as xr
import pandas as pd
import os
from geopy.distance import geodesic
from scipy.spatial import KDTree
import matplotlib.pyplot as plt
from pathlib import Path
import shutil

import sys
sys.path.append('../scripts')
from undercatch_processing import *

# add autoreload
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Define input file locations ###

In [40]:
# Bow

# Tuolumne




Processing station BEISEKERAGCM
Processing station DELBURNEAGCM
Processing station BANFFCS
Processing station CHAMPIONAGDM
Processing station BOWVALLEY
Processing station CROWSNEST
Processing station SUNDREA
Processing station OLDSAGDM
Processing station PRENTISS
Processing station CRESTOMEREAGCM
Processing station JASPERWARDEN
Processing station NIERAGDM
Processing station BLOODTRIBEAGDM
Processing station STRATHMOREAGDM
Processing station HESPEROAGCM
Processing station LEEDALEAGDM
Processing station PINCHERCREEKCLIMATE
Processing station CALGARYINTLA
Processing station CALGARYINT'LA
Processing station REVELSTOKEAIRPORTAUTO
Processing station WETASKIWINAGCM
Processing station ROCKYMTNHOUSE(AUT)


In [11]:
import xarray as xr
import numpy as np
import pandas as pd
import os
# Load each dataset and rename the main variable
file_paths = ['/Users/dcasson/Data/gpep/tuolumne/data_prep/stations/wind_subset.nc',
            '/Users/dcasson/Data/gpep/tuolumne/data_prep/stations/tmean_subset.nc',
            '/Users/dcasson/Data/gpep/tuolumne/data_prep/stations/trange_subset.nc',
            '/Users/dcasson/Data/gpep/tuolumne/data_prep/stations/prcp_subset.nc']

output_file = '/Users/dcasson/Data/gpep/tuolumne/data_prep/stations/tuolumne_subset.nc'

def merge_scearth_subset_files(file_paths, output_file):
    """
    Merges multiple NetCDF files by 'station_ID' and updates the variable name 'SC_Earth_final' in each file to the filename prefix.
    
    Parameters:
    - file_paths: List of paths to the NetCDF files to be merged.
    - output_file: Path to the output NetCDF file.
    """
    # Create a list to store datasets
    datasets = []

    # Collect all station IDs across all files
    all_station_ids = set()

    # Iterate over the files and update SC_Earth_final variable name
    for file_path in file_paths:
        ds = xr.open_dataset(file_path)
        filename_prefix = os.path.basename(file_path).split('_')[0]
        ds = ds.rename({'SC_Earth_final': filename_prefix})
        
        # Ensure station_ID is a coordinate
        station_ids = [''.join(id_arr.astype(str)).strip() for id_arr in ds.station_ID.values]
        ds = ds.assign_coords(station_ID=station_ids)
        #ds = ds.swap_dims({'station_number': 'station_ID'})
        
        all_station_ids.update(station_ids)
        datasets.append(ds)
    
    # Convert to list and sort station IDs
    all_station_ids = sorted(list(all_station_ids))
    #Remove duplicates
    all_station_ids = list(set(all_station_ids))

    # Create a template dataset to ensure all station IDs are present
    template_ds = xr.Dataset(coords={'station_ID': all_station_ids})
    
    # Align each dataset with the template to ensure all station IDs are present
    extended_datasets = []
    for ds in datasets:
        aligned_ds = ds.reindex(station_ID=all_station_ids)
        extended_datasets.append(aligned_ds)
    
    # Merge all datasets based on station_ID
    merged_ds = xr.merge(extended_datasets, compat='override', join='outer')
    
    # Check for duplicates in station_ID and remove them
    _, index = np.unique(merged_ds['station_ID'], return_index=True)
    merged_ds = merged_ds.isel(station_ID=index)

    # Reset station_number dimension
    num_stations = merged_ds.sizes['station_ID']
    new_station_numbers = np.arange(num_stations)
    merged_ds = merged_ds.assign_coords(station_number=('station_ID', new_station_numbers))
    #merged_ds = merged_ds.swap_dims({'station_ID': 'station_number'})
    
    # Save the merged dataset to a new NetCDF file
    merged_ds.to_netcdf(output_file)

    return merged_ds

merge_scearth_subset_files(file_paths, output_file)

# Function to flatten the list of arrays to a single list
def flatten(nested_list):
    return [item for sublist in nested_list for item in sublist]

# Set to store unique station_IDs
unique_station_ids = set()

# Iterate over the files to collect station_IDs
for file_path in file_paths:
    ds = xr.open_dataset(file_path)
    print(ds['station_ID'].values)
    station_ids = flatten(ds['station_ID'].values)
    print(station_ids)
    #Convert to string and remove leading/trailing whitespace, keep each station ID as a single string
    station_ids = [''.join(id_arr.astype(str)).strip() for id_arr in station_ids]
    print(station_ids)
    unique_station_ids.update(station_ids)

num_unique_station_ids = len(unique_station_ids)
num_unique_station_ids

[[b'G' b'H' b'C' b'N' b'_' b'U' b'S' b'S' b'0' b'0' b'1' b'9' b'L' b'4'
  b'2' b'S']
 [b'G' b'H' b'C' b'N' b'_' b'U' b'S' b'W' b'0' b'0' b'0' b'2' b'3' b'2'
  b'5' b'7']
 [b'G' b'H' b'C' b'N' b'_' b'U' b'S' b'W' b'0' b'0' b'0' b'2' b'3' b'2'
  b'5' b'8']
 [b'G' b'S' b'O' b'D' b'_' b'7' b'2' b'0' b'6' b'1' b'5' b'0' b'0' b'2'
  b'0' b'6']
 [b'G' b'S' b'O' b'D' b'_' b'7' b'2' b'0' b'6' b'4' b'5' b'0' b'0' b'2'
  b'2' b'7']
 [b'G' b'S' b'O' b'D' b'_' b'7' b'2' b'3' b'8' b'9' b'4' b'0' b'3' b'1'
  b'8' b'1']
 [b'G' b'S' b'O' b'D' b'_' b'7' b'2' b'3' b'8' b'9' b'4' b'9' b'9' b'9'
  b'9' b'9']
 [b'G' b'S' b'O' b'D' b'_' b'7' b'2' b'4' b'8' b'0' b'0' b'2' b'3' b'1'
  b'5' b'7']
 [b'G' b'S' b'O' b'D' b'_' b'7' b'2' b'4' b'8' b'1' b'0' b'2' b'3' b'2'
  b'0' b'3']
 [b'G' b'S' b'O' b'D' b'_' b'7' b'2' b'4' b'8' b'1' b'5' b'9' b'9' b'9'
  b'9' b'9']
 [b'G' b'S' b'O' b'D' b'_' b'7' b'2' b'4' b'9' b'2' b'6' b'2' b'3' b'2'
  b'5' b'8']
 [b'G' b'S' b'O' b'D' b'_' b'7' b'2' b'5' b'8' b'4' b'7' b'9' b'3

35

### Filter Canadian Large Dataset, to remove old stations

In [3]:
def filter_stations_by_year(input_csv_path, output_csv_path, year):
    # Load the CSV file with appropriate delimiter and skip initial lines
    stations_df = pd.read_csv(input_csv_path, delimiter=',', skiprows=3)
    
    # Filter out rows where 'DLY Last Year' is before the specified year
    filtered_df = stations_df[stations_df['DLY Last Year'] >= year]
    
    # Save the filtered DataFrame to a new CSV file
    filtered_df.to_csv(output_csv_path, index=False)
    print(f"Filtered stations saved to {output_csv_path}")

filter_stations_by_year(cdn_complete_stations, cdn_year_filtered_stations, first_filter_year)

Filtered stations saved to /Users/dcasson/Data/pems/undercatch/cdn_station_inventory.csv


### Read stations from SC-Earth

In [21]:
def read_undercatch_stations(nc_file_path, output_csv_path):
    # Open the netCDF file using xarray
    ds = xr.open_dataset(nc_file_path)
    
    # Extract variables
    station_ids = ds['station_ID'].values
    print(station_ids)
    latitudes = ds['latitude'].values
    longitudes = ds['longitude'].values
    elevations = ds['elevation'].values
    
    # Process station_IDs to remove prefix
    # Convert each sublist to a single string
    station_id_str = [''.join([elem.decode('utf-8') for elem in sublist]) for sublist in station_ids]
    processed_station_ids = [sid.split('_')[1][:] for (sid) in station_id_str]
    
    # Create DataFrame
    df = pd.DataFrame({
        'full_station_ID':station_id_str,
        'station_ID': processed_station_ids,
        'latitude': latitudes,
        'longitude': longitudes,
        'elevation': elevations
    })
    
    # Output to CSV
    df.to_csv(output_csv_path, index=False)
    print(f"Undercatch stations saved to {output_csv_path}")
prcp_stations = '/Users/dcasson/Data/gpep/tuolumne/data_prep/stations/prcp_subset.nc'
prcp_stations_csv = '/Users/dcasson/Data/gpep/tuolumne/data_prep/stations/prcp_subset.csv'
read_undercatch_stations(prcp_stations, prcp_stations_csv )

[[b'G' b'H' b'C' ... b'0' b'0' b'3']
 [b'G' b'H' b'C' ... b'0' b'1' b'0']
 [b'G' b'H' b'C' ... b'0' b'0' b'2']
 ...
 [b'G' b'S' b'O' ... b'2' b'3' b'0']
 [b'G' b'S' b'O' ... b'1' b'5' b'0']
 [b'G' b'S' b'O' ... b'2' b'4' b'3']]
Undercatch stations saved to /Users/dcasson/Data/gpep/tuolumne/data_prep/stations/prcp_subset.csv


### Merge station metadata

In [5]:
def join_datasets(file1_path, file2_path, join_column_file1, join_column_file2, output_path, join_type='inner'):
    # Load the datasets
    df1 = pd.read_csv(file1_path)
    df2 = pd.read_csv(file2_path)

    # Ensure the columns for joining are strings and strip trailing .0 if present
    df1[join_column_file1] = df1[join_column_file1].astype(str).str.rstrip('.0')
    df2[join_column_file2] = df2[join_column_file2].astype(str).str.rstrip('.0')

    # Perform the join
    merged_df = pd.merge(df1, df2, left_on=join_column_file1, right_on=join_column_file2, how=join_type)

    # Save the result to CSV
    merged_df.to_csv(output_path, index=False)
    print(f"Joined {file1_path} and {file2_path} saved to {output_path}")

join_datasets(undercatch_stations_csv, cdn_year_filtered_stations, 'station_ID', 'WMO ID', undercatch_stations_update_csv)

join_datasets(undercatch_stations_update_csv,smith_2019_stations,'Climate ID', 'StationID', evaluation_stations_csv)

Joined /Users/dcasson/Data/pems/undercatch/undercatch_from_nc.csv and /Users/dcasson/Data/pems/undercatch/cdn_station_inventory.csv saved to /Users/dcasson/Data/pems/undercatch/undercatch_stations_update.csv
Joined /Users/dcasson/Data/pems/undercatch/undercatch_stations_update.csv and /Users/dcasson/Data/pems/station_data/EN_StationCatalogue_2019.csv saved to /Users/dcasson/Data/pems/undercatch/evaluation_stations.csv


In [59]:
def extract_station_data(nc_file_path, csv_file_path, output_dir):
    # Load the netCDF file
    ds = xr.open_dataset(nc_file_path)
    
    # Load the CSV file
    stations_df = pd.read_csv(csv_file_path)
    
    # Ensure the output directory exists
    os.makedirs(output_dir, exist_ok=True)
    
    # Create a dictionary to map station_ID to station_number
    station_id_map = {str(ds['station_ID'][i].values): i for i in range(len(ds['station_ID']))}
    
    # Iterate through the stations in the CSV file
    for _, row in stations_df.iterrows():
        climate_id = row['Climate ID']
        if not pd.isna(climate_id):
            station_id = row['full_station_ID']
            station_number = station_id_map.get(station_id, None)
            
            if station_number is not None:
                # Extract data for the station
                time = ds['time'].values
                prcp = ds['prcp'].isel(station_number=station_number).values
                tmean = ds['tmean'].isel(station_number=station_number).values
                wind = ds['wind'].isel(station_number=station_number).values
                
                # Create a DataFrame for the station data
                station_data = pd.DataFrame({
                    'time': time,
                    'prcp': prcp,
                    'tmean': tmean,
                    'wind': wind
                })
                
                # Define the output file path
                output_file_path = os.path.join(output_dir, f'station_{str(climate_id)}.csv')
                
                # Save the DataFrame to a CSV file
                station_data.to_csv(output_file_path, index=False)
                print(f"Station data saved to {output_file_path}")

extract_station_data(undercatch_stations_nc, undercatch_stations_update_csv, output_station_path)


Station data saved to /Users/dcasson/Data/pems/undercatch/undercatch_stations/station_3020610.csv
Station data saved to /Users/dcasson/Data/pems/undercatch/undercatch_stations/station_3012050.csv
Station data saved to /Users/dcasson/Data/pems/undercatch/undercatch_stations/station_3050519.csv
Station data saved to /Users/dcasson/Data/pems/undercatch/undercatch_stations/station_3031480.csv
Station data saved to /Users/dcasson/Data/pems/undercatch/undercatch_stations/station_3050778.csv
Station data saved to /Users/dcasson/Data/pems/undercatch/undercatch_stations/station_3051R4R.csv
Station data saved to /Users/dcasson/Data/pems/undercatch/undercatch_stations/station_3026KNQ.csv
Station data saved to /Users/dcasson/Data/pems/undercatch/undercatch_stations/station_3024925.csv
Station data saved to /Users/dcasson/Data/pems/undercatch/undercatch_stations/station_3025297.csv
Station data saved to /Users/dcasson/Data/pems/undercatch/undercatch_stations/station_3011892.csv
Station data saved t

In [60]:
def calculate_undercatch_for_gsod_stations(directory):
    for filename in os.listdir(directory):
        if filename.endswith(".csv"):
            file_path = os.path.join(directory, filename)
            df = pd.read_csv(file_path)
            
            # Ensure required columns are present
            if all(col in df.columns for col in ['wind', 'tmean', 'prcp']):
                # Calculate CE and corrected precipitation
                df['CE'] = df.apply(lambda row: calculate_CE(row['wind'], row['tmean']), axis=1)
                df['corrected_prcp'] = df.apply(lambda row: apply_undercatch(row['prcp'], row['CE']), axis=1)
                #Update so that if corrected_prcp is 0, set CE to 1
                #df.loc[df['corrected_prcp'] == 0, 'CE'] = 1
                
                # Save the updated DataFrame back to the CSV file
                df.to_csv(file_path, index=False)
                print(f"Undercatch calculated for {file_path}")

# Process the CSV files in the directory
calculate_undercatch_for_gsod_stations(output_station_path)

Undercatch calculated for /Users/dcasson/Data/pems/undercatch/undercatch_stations/station_3053536.csv
Undercatch calculated for /Users/dcasson/Data/pems/undercatch/undercatch_stations/station_3030720.csv
Undercatch calculated for /Users/dcasson/Data/pems/undercatch/undercatch_stations/station_3023200.csv
Undercatch calculated for /Users/dcasson/Data/pems/undercatch/undercatch_stations/station_3011892.csv
Undercatch calculated for /Users/dcasson/Data/pems/undercatch/undercatch_stations/station_3034795.csv
Undercatch calculated for /Users/dcasson/Data/pems/undercatch/undercatch_stations/station_3051R4R.csv
Undercatch calculated for /Users/dcasson/Data/pems/undercatch/undercatch_stations/station_3026KNQ.csv
Undercatch calculated for /Users/dcasson/Data/pems/undercatch/undercatch_stations/station_3012050.csv
Undercatch calculated for /Users/dcasson/Data/pems/undercatch/undercatch_stations/station_1176755.csv
Undercatch calculated for /Users/dcasson/Data/pems/undercatch/undercatch_stations/

In [62]:
def convert_txt_to_csv(txt_file, csv_file):
    # Read the .txt file
    with open(txt_file, 'r') as file:
        lines = file.readlines()

    # Assume the first non-empty line is the header and skip the second header line
    header = next(line for line in lines if line.strip())
    lines = lines[lines.index(header) + 2:]  # Skip the header line and the next line

    # Write the content to a .csv file with the extracted header
    with open(csv_file, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(header.split())

        for line in lines:
            if line.strip():
                writer.writerow(line.split())

def calculate_daily_averages_from_eccc_paper(hourly_data_file, daily_data_file):

    # Define columns
    columns = ["YYYYMMDDThhmm", "Unadj_P(mm)", "Tair(C)", "Wind(m/s)", "Wind_Flag", 
               "CE", "UTF_Adj_P(mm)", "CODECON(mm)", "UTF_Adj+CODECON_P(mm)", "Adj_Flag"]
    
    # Read the data file
    df = pd.read_csv(hourly_data_file, delim_whitespace=True, skiprows=2, names=columns)
    
    # Replace -99999 with NaN
    df.replace(-99999, np.nan, inplace=True)

    # Read date column to datetime
    df['YYYYMMDDThhmm'] = pd.to_datetime(df['YYYYMMDDThhmm'], format='%Y%m%dT%H%M')

    # Shift the date column back by 1 hour
    df['YYYYMMDDThhmm'] = df['YYYYMMDDThhmm'] + pd.Timedelta(hours=1)

    # Write YYYYMMDD to Date column
    df['Date'] = df['YYYYMMDDThhmm'].dt.strftime('%Y%m%d')
    
    # Extract date part from datetime
    #df['Date'] = df['YYYYMMDDThhmm'].str[:8]
    
    # Calculate daily averages, ignoring flags
    daily_avg = df.groupby('Date').agg({
        "Unadj_P(mm)": "sum",
        "Tair(C)": "mean",
        "Wind(m/s)": "mean",
        "CE": "mean",
        "UTF_Adj_P(mm)": "sum",
        "CODECON(mm)": "mean",
        "UTF_Adj+CODECON_P(mm)": "sum"
    }).reset_index()
    
    # Write daily averages to a CSV file
    daily_avg.to_csv(daily_data_file, index=False)
    print(f"Daily averages have been written to {daily_data_file}")


def read_station_id_list(csv_file, column_name):
    # Read the CSV file
    df = pd.read_csv(csv_file)
    
    # Return a list of the entries in the specified column
    return df[column_name].tolist()

def select_and_convert_raw_data_stations(station_id_list,hourly_path, daily_path):
    for station in station_id_list:
        station = str(station)
        for file_name in os.listdir(hourly_path):
            if station in file_name and file_name.endswith('.txt'):
                # Construct the full file paths
                hourly_file = os.path.join(hourly_path, file_name)
                daily_file = os.path.join(daily_path, file_name)

                #Update .txt to .csv
                daily_file = daily_file.replace('.txt', '.csv')

                # Define the destination .csv file name
                calculate_daily_averages_from_eccc_paper(hourly_file, daily_file)
                print(f"Converted and copied: {hourly_file} to {daily_file}")


def merge_obs_and_model_csv_files(station_id_list, smith_2019_daily_path, gsod_path, merged_station_path,station_name_list):

    for station, name in zip(station_id_list,station_name_list):
        station = str(station)
        for file_name in os.listdir(smith_2019_daily_path):
            if station in file_name and file_name.endswith('.csv'):
                # Construct the full file paths
                obs_file = os.path.join(smith_2019_daily_path, file_name)
                model_file = os.path.join(gsod_path, f'station_{station}.csv')
                df1 = pd.read_csv(obs_file)
                df2 = pd.read_csv(model_file)

                # Rename the date columns to a common name
                df1 = df1.rename(columns={'Date': 'date'})
                df2 = df2.rename(columns={'time': 'date'})

                # Strip any leading/trailing whitespace from the date column in df1
                df1['date'] = df1['date'].astype(str).str.strip()
                
                # Convert the 'YYYYMMDD' date format in df1 to a proper datetime format
                df1['date'] = pd.to_datetime(df1['date'])
                
                # Convert the date column in df2 to datetime format
                df2['date'] = pd.to_datetime(df2['date'], format='%Y-%m-%d')
                
                # Convert all other columns to numeric, coercing errors to NaN
                for col in df1.columns:
                    if col != 'date':
                        df1[col] = pd.to_numeric(df1[col], errors='coerce')
                
                for col in df2.columns:
                    if col != 'date':
                        df2[col] = pd.to_numeric(df2[col], errors='coerce')
                
                # Merge the dataframes on the 'date' column
                merged_df = pd.merge(df1, df2, on='date', how='outer')
                
                # Drop rows where there are NaN values in any of the columns
                merged_df = merged_df.dropna()
                
                # Save the merged DataFrame to a new CSV file
                merged_df.to_csv(Path(merged_station_path, f'{name}.csv'), index=False)
                print(f"Merged data saved to {merged_station_path}/{name}.csv")

station_id_list = read_station_id_list(evaluation_stations_csv, 'Climate ID')
station_name_list = read_station_id_list(evaluation_stations_csv, 'Name')
station_name_list = [s.replace(' ', '') for s in station_name_list]


select_and_convert_raw_data_stations(station_id_list, smith_2019_hourly_path, smith_2019_daily_path)
merge_obs_and_model_csv_files(station_id_list, smith_2019_daily_path, output_station_path, merged_station_path, station_name_list)


  df = pd.read_csv(hourly_data_file, delim_whitespace=True, skiprows=2, names=columns)


Daily averages have been written to /Users/dcasson/Data/pems/smith_2019/daily_data/3050519_UTF_hly_prec.csv
Converted and copied: /Users/dcasson/Data/pems/smith_2019/hourly_data/3050519_UTF_hly_prec.txt to /Users/dcasson/Data/pems/smith_2019/daily_data/3050519_UTF_hly_prec.csv


  df = pd.read_csv(hourly_data_file, delim_whitespace=True, skiprows=2, names=columns)


Daily averages have been written to /Users/dcasson/Data/pems/smith_2019/daily_data/3050778_UTF_hly_prec.csv
Converted and copied: /Users/dcasson/Data/pems/smith_2019/hourly_data/3050778_UTF_hly_prec.txt to /Users/dcasson/Data/pems/smith_2019/daily_data/3050778_UTF_hly_prec.csv


  df = pd.read_csv(hourly_data_file, delim_whitespace=True, skiprows=2, names=columns)


Daily averages have been written to /Users/dcasson/Data/pems/smith_2019/daily_data/3053536_UTF_hly_prec.csv
Converted and copied: /Users/dcasson/Data/pems/smith_2019/hourly_data/3053536_UTF_hly_prec.txt to /Users/dcasson/Data/pems/smith_2019/daily_data/3053536_UTF_hly_prec.csv


  df = pd.read_csv(hourly_data_file, delim_whitespace=True, skiprows=2, names=columns)


Daily averages have been written to /Users/dcasson/Data/pems/smith_2019/daily_data/3035208_UTF_hly_prec.csv
Converted and copied: /Users/dcasson/Data/pems/smith_2019/hourly_data/3035208_UTF_hly_prec.txt to /Users/dcasson/Data/pems/smith_2019/daily_data/3035208_UTF_hly_prec.csv


  df = pd.read_csv(hourly_data_file, delim_whitespace=True, skiprows=2, names=columns)


Daily averages have been written to /Users/dcasson/Data/pems/smith_2019/daily_data/1176755_UTF_hly_prec.csv
Converted and copied: /Users/dcasson/Data/pems/smith_2019/hourly_data/1176755_UTF_hly_prec.txt to /Users/dcasson/Data/pems/smith_2019/daily_data/1176755_UTF_hly_prec.csv


  df = pd.read_csv(hourly_data_file, delim_whitespace=True, skiprows=2, names=columns)


Daily averages have been written to /Users/dcasson/Data/pems/smith_2019/daily_data/3015523_UTF_hly_prec.csv
Converted and copied: /Users/dcasson/Data/pems/smith_2019/hourly_data/3015523_UTF_hly_prec.txt to /Users/dcasson/Data/pems/smith_2019/daily_data/3015523_UTF_hly_prec.csv
Merged data saved to /Users/dcasson/Data/pems/undercatch/merged_stations/BANFFCS.csv
Merged data saved to /Users/dcasson/Data/pems/undercatch/merged_stations/BOWVALLEY.csv
Merged data saved to /Users/dcasson/Data/pems/undercatch/merged_stations/JASPERWARDEN.csv
Merged data saved to /Users/dcasson/Data/pems/undercatch/merged_stations/PINCHERCREEKCLIMATE.csv
Merged data saved to /Users/dcasson/Data/pems/undercatch/merged_stations/REVELSTOKEAIRPORTAUTO.csv
Merged data saved to /Users/dcasson/Data/pems/undercatch/merged_stations/ROCKYMTNHOUSE(AUT).csv


In [42]:
import pandas as pd
import matplotlib.pyplot as plt

def generate_accumulated_precipitation_plots_per_water_year(input_csv_path,station,output_dir):
    df = pd.read_csv(input_csv_path)
    
    # Convert date column to datetime format
    df['date'] = pd.to_datetime(df['date'])
    
    # Add a water year column
    df['water_year'] = df['date'].apply(lambda x: x.year if x.month < 10 else x.year + 1)
    
    # Get unique water years
    water_years = df['water_year'].unique()
    
    # Calculate the number of rows and columns for subplots
    num_plots = len(water_years)
    num_cols = math.ceil(math.sqrt(num_plots))
    num_rows = math.ceil(num_plots / num_cols)
    
    # Create a figure with subplots
    fig, axes = plt.subplots(num_rows, num_cols, figsize=(24, 6 * num_rows), sharex=False)
    
    # Flatten axes array for easy iteration
    if num_plots == 1:
        axes = [axes]
    else:
        axes = axes.flatten()
    
    # Calculate accumulated precipitation for each water year and plot
    for ax, water_year in zip(axes, water_years):
        group = df[df['water_year'] == water_year].sort_values(by='date')
        group['Accum_Unadj_P(mm)'] = group['Unadj_P(mm)'].cumsum()
        group['Accum_prcp'] = group['prcp'].cumsum()
        group['Accum_UTF_Adj_P(mm)'] = group['UTF_Adj_P(mm)'].cumsum()
        group['Accum_corrected_prcp'] = group['corrected_prcp'].cumsum()

        ax.plot(group['date'], group['Accum_Unadj_P(mm)'], label='Raw Gauge (ECCC)', linestyle='-', color='red')
        ax.plot(group['date'], group['Accum_prcp'], label='Raw Gauge (SC-Earth)', linestyle='-', color='blue')
        ax.plot(group['date'], group['Accum_UTF_Adj_P(mm)'], label='Undercatch Corrected (ECCC)', linestyle=':', color='red')
        ax.plot(group['date'], group['Accum_corrected_prcp'], label='Undercatch Corrected (SC-Earth)', linestyle=':', color='blue')
        ax.set_xlabel('Date')
        ax.set_ylabel('Accumulated Precipitation (mm)')
        ax.set_title(f'Accumulated Precipitation (Water Year {water_year})')
        ax.legend()
        ax.grid(True)
        
        # Maintain the right x-axis bound
        ax.set_xlim([group['date'].min(), group['date'].max()])
    
    # Hide any unused subplots
    for i in range(num_plots, num_rows * num_cols):
        fig.delaxes(axes[i])
    
    # Adjust layout to prevent overlap
    plt.tight_layout()

    # Save the combined plot to a PNG file
    output_plot_path = os.path.join(output_dir, f'{station}_accumulated_precipitation.png')
    plt.savefig(output_plot_path)
    plt.close()

# Run the function to generate the plots

for station in station_name_list:
    station = str(station)
    comparative_file = Path(merged_station_path,f'{station}.csv')
    plot_path = Path(output_path,'plots')
    os.makedirs(plot_path, exist_ok=True)
    generate_accumulated_precipitation_plots_per_water_year(comparative_file,station,plot_path)



In [63]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

def plot_ce_boxplot(file_path,station,output_dir,legend_title='Method', method_labels=None):
    """
    Plots a boxplot of CE for each method aggregated by month.

    Parameters:
    - file_path: str, path to the CSV file
    - legend_title: str, title for the legend
    - method_labels: dict, dictionary to map method column names to desired labels
    """
    # Load the CSV file
    data = pd.read_csv(file_path)

    # Convert the date column to datetime format
    data['date'] = pd.to_datetime(data['date'])

    # Extract the month from the date column
    data['month'] = data['date'].dt.month

    # Melt the dataset to have one column for method and another for CE values
    melted_data = pd.melt(data, id_vars=['month'], value_vars=['CE_x', 'CE_y'], var_name='method', value_name='CE')
    #Drop all values where CE is 1
    melted_data = melted_data[melted_data['CE'] != 1]

    # Apply method labels if provided
    if method_labels:
        melted_data['method'] = melted_data['method'].map(method_labels)

    # Create the boxplot
    plt.figure(figsize=(14, 8))
    sns.boxplot(x='month', y='CE', hue='method', data=melted_data)
    plt.title(f'Boxplot of CE for {station}')
    plt.xlabel('Month')
    plt.ylabel('CE')
    plt.legend(title=legend_title)
    plt.grid(True)
    # Save the plot to a PNG file
    output_plot_path = f'{output_dir}/CE_{station}.png'
    plt.savefig(output_plot_path)
    plt.close()

# Usage example
for station in station_name_list:
    station = str(station)
    comparative_file = Path(merged_station_path,f'{station}.csv')
    
    method_labels = {'CE_x': 'ECCC Analysis', 'CE_y': 'SPICE for SC-Earth'}
    plot_ce_boxplot(comparative_file,station,plot_path,legend_title='CE Methods', method_labels=method_labels)

In [45]:
import pandas as pd
from scipy import stats

def generate_comparative_plots(file_path, col1, col2, output_dir, station_name, x_label=None, y_label=None):
    # Load the CSV file
    df = pd.read_csv(file_path)
    
    # Ensure the output directory exists
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    # Plot correlation with regression line
    plt.figure(figsize=(10, 6))
    sns.regplot(x=col1, y=col2, data=df, line_kws={"color": "red", "alpha": 0.7, "lw": 2})
    
    # Calculate correlation coefficient and regression equation
    slope, intercept, r_value, p_value, std_err = stats.linregress(df[col1], df[col2])
    regression_formula = f'Y = {intercept:.2f} + {slope:.2f}X'
    correlation_coefficient = f'R^2 = {r_value**2:.2f}'
    
        # Add a 1:1 line for reference
    max_val = max(df[col1].max(), df[col2].max())
    min_val = min(df[col1].min(), df[col2].min())
    plt.plot([min_val, max_val], [min_val, max_val], color='blue', linestyle='--', linewidth=1, label='1:1 Line')
    
    # Annotate plot with regression equation and R^2
    plt.text(0.05, 0.95, regression_formula, transform=plt.gca().transAxes, fontsize=12, verticalalignment='top')
    plt.text(0.05, 0.90, correlation_coefficient, transform=plt.gca().transAxes, fontsize=12, verticalalignment='top')
    
    # Set labels
    plt.title(f'Correlation between {col1} and {col2}')
    plt.xlabel(x_label if x_label else col1)
    plt.ylabel(y_label if y_label else col2)
    plt.grid(True)
    
    # Define the output file path
    output_file = os.path.join(output_dir, f'{station_name}_{col2}.png')
    
    # Save the plot to the specified directory
    plt.savefig(output_file)
    plt.close()
    print(f"Plot saved to {output_file}")

for station in station_name_list:
    station = str(station)
    comparative_file = Path(merged_station_path,f'{station}.csv')
    generate_comparative_plots(comparative_file, 'Wind(m/s)', 'wind',plot_path, station, x_label='ECCC Wind', y_label='EC-Earth Wind')
    generate_comparative_plots(comparative_file, 'Tair(C)', 'tmean', plot_path, station, x_label='ECCC Temperature (°C)', y_label='EC-Earth Temperature (°C)')
    

Plot saved to /Users/dcasson/Data/pems/undercatch/plots/BANFFCS_wind.png
Plot saved to /Users/dcasson/Data/pems/undercatch/plots/BANFFCS_tmean.png
Plot saved to /Users/dcasson/Data/pems/undercatch/plots/BOWVALLEY_wind.png
Plot saved to /Users/dcasson/Data/pems/undercatch/plots/BOWVALLEY_tmean.png
Plot saved to /Users/dcasson/Data/pems/undercatch/plots/JASPERWARDEN_wind.png
Plot saved to /Users/dcasson/Data/pems/undercatch/plots/JASPERWARDEN_tmean.png
Plot saved to /Users/dcasson/Data/pems/undercatch/plots/PINCHERCREEKCLIMATE_wind.png
Plot saved to /Users/dcasson/Data/pems/undercatch/plots/PINCHERCREEKCLIMATE_tmean.png
Plot saved to /Users/dcasson/Data/pems/undercatch/plots/REVELSTOKEAIRPORTAUTO_wind.png
Plot saved to /Users/dcasson/Data/pems/undercatch/plots/REVELSTOKEAIRPORTAUTO_tmean.png
Plot saved to /Users/dcasson/Data/pems/undercatch/plots/ROCKYMTNHOUSE(AUT)_wind.png
Plot saved to /Users/dcasson/Data/pems/undercatch/plots/ROCKYMTNHOUSE(AUT)_tmean.png
