# Notebook to download MLFlow Metrics to a local file 

# Step 0 - Set up Notebook 

In [5]:
# import needed libaries
import os
import boto3
import mlflow
import time
from PIL import Image
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
from snowML.datapipe import snow_types as st
from snowML.datapipe import get_geos as gg
from snowML.datapipe import data_utils as du
from snowML.datapipe import get_dem as gd

In [6]:
# initialize erathengine credentials
import ee
ee.Authenticate()

True

# Step 1 Get MLFlow Metrics

In [7]:
# function to retrieve metrics from ML server 
def load_ml_metrics(tracking_uri, run_id, save_local=False):
    mlflow.set_tracking_uri(tracking_uri)
    client = mlflow.MlflowClient()
    # Get all metric keys from the run
    run_data = client.get_run(run_id).data
    metric_keys = run_data.metrics.keys()
    # Retrieve full metric history for each key
    all_metrics = []
    for metric in metric_keys:
        history = client.get_metric_history(run_id, metric)
        for record in history:
            all_metrics.append({
                "Metric": metric,
                "Step": record.step,
                "Value": record.value
            })
    
    # Convert to DataFrame
    metrics_df = pd.DataFrame(all_metrics)
    
    # Save to CSV if needed
    if save_local:
        f_out = f"run_id_data/metrics_from_{run_id}.csv"
        metrics_df.to_csv(f_out, index=False)

    return metrics_df

In [8]:
# function to extract only a specific metric
def extract_metric(df, metric_name):
    """Extracts rows where the Metric column ends with 'metric_name' and returns only Metric and Value columns."""
    return df[df['Metric'].str.endswith(metric_name)][['Metric', 'Value']].sort_values(by='Metric')

In [9]:
# function to extract data from a given epoch 
def summarize_by_step(df, step, agg_lev = 12):
    df_filtered = df[df["Step"] == step].copy()
    df_filtered["Metric_Type"] = df_filtered["Metric"].str.extract(r"(test_mse|test_kge|train_mse|train_kge)")
    df_filtered["HUC_ID"] = df_filtered["Metric"].str.extract(fr"(\d{{{agg_lev}}})")  

    # Take mean across HUC_ID if duplicates exist
    if df_filtered.duplicated(subset=["HUC_ID", "Metric_Type"]).any():
        df_filtered = df_filtered.groupby(["HUC_ID", "Metric_Type"], as_index=False)["Value"].mean()

    df_pivot = df_filtered.pivot(index="HUC_ID", columns="Metric_Type", values="Value")
    df_pivot.columns = ["Test KGE", "Test MSE", "Train KGE", "Train_MSE"]
    df_pivot_sorted = df_pivot.sort_index()
    df_selected = df_pivot_sorted[["Test MSE", "Test KGE"]]
    # print(df_selected)
    return df_selected

**Note** To extract metrics from the MLFlow Tracking Server you must be logged in to AWS with access to the MLFlow Server.  If this is not the case, use the upload from local option.  

In [10]:
# set mlFlow tracking server
tracking_uri = "arn:aws:sagemaker:us-west-2:677276086662:mlflow-tracking-server/dawgsML"



# Download Metrics Re: validation set 


In [None]:
# define our run_ids by recognizable names
multi_hucs = "7bc43aac04414e989fb0fb3a244b138e" # lyrical-wren; Validation Set.  30 epochs MandM


In [None]:
df_metrics  = load_ml_metrics(tracking_uri, multi_hucs, save_local=True)
print(df_metrics.shape)
df_metrics.head(2)

# Download Metrics Re: Training Set Predictions

In [11]:
# define our run_ids by recognizable names
train_metrics = "5f37d241bcc540f78a00814bae222ca3" #Auspicious-mare.  Predict on training set 

In [18]:
df_metrics_train = load_ml_metrics(tracking_uri, train_metrics, save_local=False)
print(df_metrics_train.shape)
df_metrics_train.head(2)

(656, 3)


Unnamed: 0,Metric,Step,Value
0,train_kge_171100060303,0,-500.0
1,train_kge_171100090101,0,-500.0


In [14]:
# here "test_kge"/"test_mse" are actually the *training values* so make that adjustment
# Drop rows where Value is -500 (-500 is just a placeholder . . .)
df_metrics_train = df_metrics_train[df_metrics_train["Value"] != -500]
df_metrics_train.head(2)


Unnamed: 0,Metric,Step,Value
2,test_kge_171100060101,0,0.690896
4,test_mse_170103040201,0,0.002256


In [16]:
df_metrics_train["Metric"] = df_metrics_train["Metric"].str.replace("test", "train", regex=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_metrics_train["Metric"] = df_metrics_train["Metric"].str.replace("test", "train", regex=False)


In [17]:
df_metrics_train.head(2)

Unnamed: 0,Metric,Step,Value
2,train_kge_171100060101,0,0.690896
4,train_mse_170103040201,0,0.002256


In [None]:
# Save local
f_out = f"run_id_data/training_set_metrics.csv"
df_metrics_train.to_csv(f_out, index=False)