Copyright Psitron Technologies

# Setting up the API key and installing dependencies

In [None]:
import os
# Expose the Anthropic API key through the process environment; later cells
# read it back with os.environ.get("ANTHROPIC_API_KEY").
# NOTE(review): do not commit a real key in this literal — prefer a secrets
# store or an interactive prompt.
os.environ["ANTHROPIC_API_KEY"] = "your-api-key"  # Replace with your actual key

In [None]:
!pip install mlflow pandas datasets transformers anthropic pyngrok

Tracking URI: Notebook 2 must know where to find the MLflow tracking server. If you started a local tracking server in Notebook 1 (the default), Notebook 2 must point to the same server at http://localhost:5000; if that server is exposed through an ngrok tunnel, use the public ngrok URL instead. Set the MLFLOW_TRACKING_URI environment variable in Notebook 2:



In [None]:
import os
# Tell the MLflow client which tracking server to use (here: an ngrok tunnel URL).
os.environ["MLFLOW_TRACKING_URI"] = "https://896e-35-204-243-208.ngrok-free.app" # Replace with your own ngrok URL

Self-contained code that wraps the Claude model as an MLflow model and logs it, ready to be executed in a Colab notebook:

In [None]:
import mlflow
import anthropic
import os
import pandas as pd
from typing import Dict, Any, List

# 1. Define the ClaudeModelWrapper class
class ClaudeModelWrapper(mlflow.pyfunc.PythonModel):
    """A custom MLflow model that wraps the Anthropic Claude API.

    The API key is deliberately NOT stored on the instance. The instance is
    serialized when the model is logged, so a key captured in ``__init__``
    would be written into the model artifact. The key is instead looked up in
    the environment every time it is needed, which also means the check in
    ``load_context`` validates the environment the model is loaded in.
    """

    def __init__(self, model_name: str, system_prompt: str):
        """Store the model identifier and the system prompt.

        Args:
            model_name: Anthropic model identifier passed to the Messages API.
            system_prompt: System prompt sent with every request.
        """
        self.model_name = model_name
        self.system_prompt = system_prompt

    @property
    def anthropic_api_key(self):
        """Current value of ANTHROPIC_API_KEY in the environment, or None."""
        return os.environ.get("ANTHROPIC_API_KEY")

    def load_context(self, context: mlflow.pyfunc.model.PythonModelContext) -> None:
        """Validate the runtime environment (there are no artifacts to load).

        Raises:
            ValueError: If ANTHROPIC_API_KEY is not set where the model is loaded.
        """
        if not self.anthropic_api_key:
            raise ValueError("Anthropic API key not found. Set ANTHROPIC_API_KEY environment variable.")

    def predict(self, context: mlflow.pyfunc.model.PythonModelContext, model_input: pd.DataFrame) -> list[str]:
        """
        Generates predictions using the Claude API.

        Args:
            context: MLflow context (unused).
            model_input: Pandas DataFrame with a 'question' column containing prompts.

        Returns:
            One entry per input row, in order: Claude's response text, or
            None for a row whose API call failed.
        """
        # The client picks up its credentials from the environment.
        client = anthropic.Anthropic()
        responses = []
        for question in model_input["question"]:
            try:
                message = client.messages.create(
                    model=self.model_name,
                    max_tokens=1024,
                    system=self.system_prompt,
                    messages=[
                        {"role": "user", "content": question}
                    ],
                )
                responses.append(message.content[0].text)
            except Exception as e:
                # Best effort: one failed row must not abort the whole batch;
                # keep a None placeholder so outputs stay aligned with inputs.
                print(f"Error calling Claude API: {e}")
                responses.append(None)
        return responses

# 2. Set up the Experiment and Tracking
# (Optional, but recommended for organization)
# Runs started below are recorded under the "claude_wrapping" experiment.
mlflow.set_experiment("claude_wrapping")

# 3. Run: wrap Claude in a pyfunc model and log it inside a single MLflow run
with mlflow.start_run() as run:
    # 4. Define Claude Model and System Prompt
    claude_model_name = "claude-3-opus-20240229"  # Or another Claude model that is accessible to your account
    system_prompt = "Answer the following question concisely."

    # 5. Create the ClaudeModelWrapper Instance
    claude_model = ClaudeModelWrapper(model_name=claude_model_name, system_prompt=system_prompt)

    # 6. Log the MLflow Model
    # The model is stored under the "claude_model" artifact path of this run;
    # later cells load it back through a "runs:/<run_id>/claude_model" URI.
    # NOTE(review): supplying input_example may make MLflow call predict() to
    # infer a signature, i.e. a real API request at logging time — confirm.
    mlflow.pyfunc.log_model(
        python_model=claude_model,
        artifact_path="claude_model",
        input_example=pd.DataFrame({"question": ["What is the capital of France?"]}),
    )

    # Print the run ID: the evaluation cells need it to locate the model.
    print(f"Successfully logged Claude model to run: {run.info.run_id}")

In [None]:
import mlflow
import pandas as pd
import anthropic
import os

# 1. Install Dependencies (Run this cell once if you haven't already)
!pip install mlflow anthropic pandas

# 2. Set Your Anthropic API Key (Replace with your actual key!)
# os.environ["ANTHROPIC_API_KEY"] = "your-api-key"  # Replace!

# 3. Configuration
# 4. Define your model run ID (the run in which the Claude wrapper was logged)
model_run_id = "58868c4b1e1a424d9fb7433a76f2ac0d"

# Set your model and experiment name
claude_model_name = "claude-3-opus-20240229" # @param {type:"string"}
MLFLOW_EXPERIMENT_NAME = "claude_wrapping" # @param {type:"string"}

mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)

# Fail early if the API key is not available to this process
if not os.environ.get("ANTHROPIC_API_KEY"):
    raise ValueError("Anthropic API key not found. Set ANTHROPIC_API_KEY environment variable.")

# 5. Load the Model
# Build the URI from model_run_id so that changing the run ID above is enough;
# "claude_model" is the artifact path the wrapper was logged under.
model_uri = f"runs:/{model_run_id}/claude_model"
loaded_model = mlflow.pyfunc.load_model(model_uri)  # Load once and reuse for the whole evaluation

# 6. Define the evaluation Function
def get_claude_judgment(question, answer, ground_truth, prompt, claude_model_id):
    """Ask Claude to grade an answer against the ground truth.

    Args:
        question: The question that was asked.
        answer: The answer being graded.
        ground_truth: The reference answer.
        prompt: System prompt given to the judge model.
        claude_model_id: Anthropic model identifier used as the judge.

    Returns:
        A ``(score, justification)`` tuple. When no score can be extracted the
        result is ``(1, "Could not parse Claude's response.")``; when the API
        call (or anything else) fails it is ``(1, "Error during evaluation.")``.
    """
    import re  # Local import: the surrounding cell does not import ``re``.

    try:
        client = anthropic.Anthropic()
        message = client.messages.create(
            model=claude_model_id,
            max_tokens=300,
            system=prompt,
            messages=[{"role": "user", "content": f"Question: {question}\nAnswer: {answer}\nGround Truth: {ground_truth}"}],
        )
        claude_response = message.content[0].text

        tokens = claude_response.split()
        try:
            # Preferred format: "<score> <justification ...>".
            return int(tokens[0]), " ".join(tokens[1:])
        except (ValueError, IndexError):
            pass

        # Fallback for replies such as "Score: 4/5 ..." where the score is not
        # the first token: take the first standalone digit 1-5 that is not part
        # of a longer or decimal number. This is a heuristic, so the full reply
        # is kept as the justification for later inspection.
        found = re.search(r"(?<![\d.])[1-5](?!\d|\.\d)", claude_response)
        if found:
            return int(found.group()), claude_response.strip()

        print(f"Could not parse Claude's response: {claude_response}")
        return 1, "Could not parse Claude's response."

    except Exception as e:
        # Best effort: a failed judgment yields the lowest score and never raises.
        print(f"Error calling Claude API: {e}")
        return 1, "Error during evaluation."


def claude_evaluate(model, data, claude_model_id):
    """Score the model's answers with Claude as judge and return the mean score.

    Args:
        model: Object whose ``predict(data)`` returns one answer per row.
        data: DataFrame with ``question`` and ``ground_truth`` columns.
        claude_model_id: Anthropic model identifier used as the judge.

    Returns:
        ``{"score": <mean of the per-row scores>}``.
    """
    judge_prompt = "You will score from 1 to 5 the answer, comparing it with the Ground Truth"

    questions = data["question"].tolist()
    predicted = model.predict(data)
    references = data["ground_truth"].tolist()

    collected_scores = []
    for position, (asked, expected) in enumerate(zip(questions, references)):
        # Only the numeric score feeds the aggregate; the justification is dropped.
        row_score, _ = get_claude_judgment(
            question=asked,
            answer=predicted[position],
            ground_truth=expected,
            prompt=judge_prompt,
            claude_model_id=claude_model_id,
        )
        collected_scores.append(row_score)

    return {"score": sum(collected_scores) / len(collected_scores)}

# 7. Define Evaluation Data
# Each question is paired (by position) with its reference answer.
eval_data = pd.DataFrame({
    "question": [
        "What is MLflow?",
        "What is Spark?",
    ],
    "ground_truth": [
        "MLflow is an open-source platform for managing the end-to-end machine learning (ML) lifecycle.",
        "Apache Spark is a fast, general-purpose cluster computing system for big data processing."
    ]
})

# 8. Evaluate the Model
# Any exception raised inside the run is caught below and only printed, so
# this cell never raises.
try:
  with mlflow.start_run() as run: # Open a dedicated run for this evaluation
    results = claude_evaluate(loaded_model, eval_data, claude_model_name) # The same Claude model name is used as the judge

    mlflow.log_metric("mean_score", results["score"]) # Log the mean judge score
    print(f"Evaluation Results: {results}")

except Exception as e:
    print(f"An error occurred during the MLflow run: {e}")

In [None]:
import mlflow
import pandas as pd
import anthropic
import os

# 2. Set Your Anthropic API Key (Replace with your actual key!)
# os.environ["ANTHROPIC_API_KEY"] = "your-api-key" # Replace!

# 3. Configuration
# 4. Define your model run ID (the run in which the Claude wrapper was logged)
model_run_id = "58868c4b1e1a424d9fb7433a76f2ac0d" # Fill in the run ID of a previously saved model!

# Set your model and experiment name
claude_model_name = "claude-3-opus-20240229" # @param {type:"string"}
MLFLOW_EXPERIMENT_NAME = "claude_wrapping" # @param {type:"string"}

mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)

# Fail early if the API key is not available to this process
if not os.environ.get("ANTHROPIC_API_KEY"):
    raise ValueError("Anthropic API key not found. Set ANTHROPIC_API_KEY environment variable.")

# 5. Load the Model
# Build the URI from model_run_id so that changing the run ID above is enough;
# "claude_model" is the artifact path the wrapper was logged under.
model_uri = f"runs:/{model_run_id}/claude_model"
loaded_model = mlflow.pyfunc.load_model(model_uri)  # Load once and reuse for the whole evaluation

# 6. Define the evaluation Function
def get_claude_judgment(question, answer, ground_truth, prompt, claude_model_id):
    """Ask Claude to grade an answer against the ground truth.

    Args:
        question: The question that was asked.
        answer: The answer being graded.
        ground_truth: The reference answer.
        prompt: System prompt given to the judge model.
        claude_model_id: Anthropic model identifier used as the judge.

    Returns:
        A ``(score, justification)`` tuple. When no score can be extracted the
        result is ``(1, "Could not parse Claude's response.")``; when the API
        call (or anything else) fails it is ``(1, "Error during evaluation.")``.
    """
    import re  # Local import: the surrounding cell does not import ``re``.

    try:
        client = anthropic.Anthropic()
        message = client.messages.create(
            model=claude_model_id,
            max_tokens=300,
            system=prompt,
            messages=[{"role": "user", "content": f"Question: {question}\nAnswer: {answer}\nGround Truth: {ground_truth}"}],
        )
        claude_response = message.content[0].text

        tokens = claude_response.split()
        try:
            # Preferred format: "<score> <justification ...>".
            return int(tokens[0]), " ".join(tokens[1:])
        except (ValueError, IndexError):
            pass

        # Fallback for replies such as "Score: 4/5 ..." where the score is not
        # the first token: take the first standalone digit 1-5 that is not part
        # of a longer or decimal number. This is a heuristic, so the full reply
        # is kept as the justification for later inspection.
        found = re.search(r"(?<![\d.])[1-5](?!\d|\.\d)", claude_response)
        if found:
            return int(found.group()), claude_response.strip()

        print(f"Could not parse Claude's response: {claude_response}")
        return 1, "Could not parse Claude's response."

    except Exception as e:
        # Best effort: a failed judgment yields the lowest score and never raises.
        print(f"Error calling Claude API: {e}")
        return 1, "Error during evaluation."


# 7. Load and eval
def claude_evaluate(eval_data, claude_model_name, loaded_model):
    """Generate an answer per question with Claude, have Claude grade it, and average.

    Args:
        eval_data: DataFrame with ``question`` and ``ground_truth`` columns.
        claude_model_name: Anthropic model identifier, used both to produce the
            answers and as the judge.
        loaded_model: Accepted for call compatibility; not used here, because
            the answers come from a direct API call.

    Returns:
        ``{"score": <mean of the per-row scores>}``. A row whose API call
        fails contributes the lowest score (1).

    Raises:
        ValueError: If ``eval_data`` has no rows (there is nothing to average).
    """
    questions = eval_data["question"].tolist()
    references = eval_data["ground_truth"].tolist()
    if not questions:
        raise ValueError("eval_data is empty: nothing to evaluate.")

    judge_prompt = "You will score from 1 to 5 the answer, comparing it with the Ground Truth"
    scores = []
    justifications = []

    # One client is enough for every request; no need to rebuild it per row.
    client = anthropic.Anthropic()

    for question, ground_truth in zip(questions, references):
        try:
            # NOTE(review): the ground truth is part of the answer-generation
            # prompt, so the graded answer was produced with the reference in
            # view — confirm this is the intended evaluation setup.
            message = client.messages.create(
                model=claude_model_name,
                max_tokens=1024,
                system="You will evaluate the answer based on the Ground Truth",
                messages=[
                    {"role": "user", "content": f"Question: {question}\n Ground Truth:{ground_truth}"}
                ],
            )
            score, justification = get_claude_judgment(
                question=question,
                answer=message.content[0].text,
                ground_truth=ground_truth,
                prompt=judge_prompt,
                claude_model_id=claude_model_name,
            )
        except Exception as e:
            # Best effort: record the lowest score for this row and carry on.
            print(f"Error calling Claude API: {e}")
            score = 1
            justification = "Failure from the bot"

        scores.append(score)
        justifications.append(justification)

    result = sum(scores) / len(scores)
    print(f"Justifications: {justifications}")

    return {"score": result}

# 8. Define Evaluation Data
# Each question is paired (by position) with its reference answer.
eval_data = pd.DataFrame({
    "question": [
        "What is MLflow?",
        "What is Spark?",
    ],
    "ground_truth": [
        "MLflow is an open-source platform for managing the end-to-end machine learning (ML) lifecycle.",
        "Apache Spark is a fast, general-purpose cluster computing system for big data processing."
    ]
})

# Run the evaluation inside its own MLflow run. Any exception raised inside
# the run is caught below and only printed, so this cell never raises.
try:
  with mlflow.start_run() as run: # Open a dedicated run for this evaluation
    results = claude_evaluate(eval_data, claude_model_name, loaded_model) # Note the argument order of this cell's claude_evaluate

    mlflow.log_metric("mean_score", results["score"]) # Log the mean judge score
    print(f"Evaluation Results: {results}")

except Exception as e:
    print(f"An error occurred during the MLflow run: {e}")

In [None]:
!pip install mlflow anthropic pandas textstat

In [None]:
import mlflow
import anthropic
import pandas as pd
import os
from typing import List
import textstat  # For readability metrics

# 1. Set Your Anthropic API Key
# os.environ["ANTHROPIC_API_KEY"] = "your-api-key"  # Replace!

# 2. Define Evaluation Data
# "inputs" holds the prompts sent to the model; "ground_truth" holds the
# reference answer for the prompt at the same position.
eval_data = pd.DataFrame(
    {
        "inputs": [
            "What is MLflow?",
            "What is Spark?",
        ],
        "ground_truth": [
            "MLflow is an open-source platform for managing the end-to-end machine learning (ML) lifecycle. It was developed by Databricks, a company that specializes in big data and machine learning solutions. MLflow is designed to address the challenges that data scientists and machine learning engineers face when developing, training, and deploying machine learning models.",
            "Apache Spark is an open-source, distributed computing system designed for big data processing and analytics. It was developed in response to limitations of the Hadoop MapReduce computing model, offering improvements in speed and ease of use. Spark provides libraries for various tasks such as data ingestion, processing, and analysis through its components like Spark SQL for structured data, Spark Streaming for real-time data processing, and MLlib for machine learning tasks",
        ],
    }
)

# 3. Define the Claude Model Function
def anthropic_qa(
    inputs: pd.DataFrame,
    *,
    model: str = "claude-3-opus-20240229",
    system_prompt: str = "Please answer the following question in formal language.",
    max_tokens: int = 1024,
) -> List[str]:
    """
    Generates responses using the Anthropic Claude API.

    Args:
        inputs: A Pandas DataFrame with an 'inputs' column containing prompts.
        model: Anthropic model identifier; replace with one available to you.
        system_prompt: System prompt sent with every request.
        max_tokens: Upper bound on the length of each response.

    Returns:
        One entry per input row, in order: Claude's response text, or None
        for a row whose API call failed.

    Raises:
        KeyError: If ``inputs`` has no 'inputs' column.
    """
    # Read the column up front so that a malformed frame fails loudly instead
    # of being reported as a per-row API error.
    questions = inputs["inputs"].tolist()
    client = anthropic.Anthropic()

    predictions = []
    for question in questions:
        try:
            message = client.messages.create(
                model=model,
                max_tokens=max_tokens,
                system=system_prompt,
                messages=[
                    {"role": "user", "content": question},
                ],
            )
            predictions.append(message.content[0].text)
        except Exception as e:
            # Best effort: keep a None placeholder so outputs stay aligned
            # with the input rows.
            print(f"Error calling Claude API: {e}")
            predictions.append(None)

    return predictions

# 4. Define the all the custom metric in just a func
def calculate_metrics(df):
    """Compute readability and exact-match metrics over a predictions table.

    Args:
        df: DataFrame with ``predictions`` and ``ground_truth`` columns. A
            prediction that is not a string (e.g. None after a failed API
            call) is left out of the readability statistics and counted as a
            non-match.

    Returns:
        Dict of metric name -> value: mean, population variance and p90 for the
        Flesch-Kincaid and ARI grade levels, plus the exact-match rate.

    Raises:
        ValueError: If no row holds a usable (string) prediction.
    """

    def summarize(values):
        """Return (mean, population variance, p90) of a non-empty list."""
        count = len(values)
        mean = sum(values) / count
        # Population variance: divides by n, not n - 1.
        variance = sum((value - mean) ** 2 for value in values) / count
        # Element at index floor(0.9 * n) of the sorted values.
        p90 = sorted(values)[int(0.9 * count)]
        return mean, variance, p90

    flesch_kincaid_grade_levels = []
    ari_grade_levels = []
    exact_matches = []

    for prediction, ground_truth in zip(df["predictions"], df["ground_truth"]):
        if not isinstance(prediction, str):
            # Missing prediction: nothing to measure, and it cannot match.
            exact_matches.append(0.0)
            continue

        # Readability metrics
        flesch_kincaid_grade_levels.append(textstat.flesch_kincaid_grade(prediction))
        ari_grade_levels.append(textstat.automated_readability_index(prediction))

        # Exact match (basic, case-insensitive)
        exact_matches.append(1.0 if ground_truth.lower() == prediction.lower() else 0.0)

    if not flesch_kincaid_grade_levels:
        raise ValueError("No valid predictions to score: every prediction is missing or the data is empty.")

    fk_mean, fk_variance, fk_p90 = summarize(flesch_kincaid_grade_levels)
    ari_mean, ari_variance, ari_p90 = summarize(ari_grade_levels)
    exact_match_mean = sum(exact_matches) / len(exact_matches)

    return {
        "flesch_kincaid_grade_level/v1/mean": fk_mean,
        "flesch_kincaid_grade_level/v1/variance": fk_variance,
        "flesch_kincaid_grade_level/v1/p90": fk_p90,
        "ari_grade_level/v1/mean": ari_mean,
        "ari_grade_level/v1/variance": ari_variance,
        "ari_grade_level/v1/p90": ari_p90,
        "exact_match/v1": exact_match_mean,
    }

# 5. Run the evaluation
# Everything below is recorded in one MLflow run.
with mlflow.start_run():
    # Generate Claude model outputs
    predictions = anthropic_qa(eval_data)
    # Adds a "predictions" column to eval_data in place.
    eval_data["predictions"] = predictions

    metrics = calculate_metrics(eval_data) # Calculate the metrics on the new predictions

    # Log results
    mlflow.log_metrics(metrics)

    print(f"Metrics: {metrics}")
    print(eval_data)  # Display the dataframe, now including the predictions column

In [None]:
import mlflow
import anthropic
import pandas as pd
import os
from typing import List
import textstat  # For readability metrics
import re

# 1. Set Your Anthropic API Key
# os.environ["ANTHROPIC_API_KEY"] = "your-api-key"  # Replace!

# 2. Define Evaluation Data
# "inputs" holds the prompts sent to the model; "ground_truth" holds the
# reference answer for the prompt at the same position.
eval_data = pd.DataFrame(
    {
        "inputs": [
            "What is MLflow?",
            "What is Spark?",
        ],
        "ground_truth": [
            "MLflow is an open-source platform for managing the end-to-end machine learning (ML) lifecycle. It was developed by Databricks, a company that specializes in big data and machine learning solutions. MLflow is designed to address the challenges that data scientists and machine learning engineers face when developing, training, and deploying machine learning models.",
            "Apache Spark is an open-source, distributed computing system designed for big data processing and analytics. It was developed in response to limitations of the Hadoop MapReduce computing model, offering improvements in speed and ease of use. Spark provides libraries for various tasks such as data ingestion, processing, and analysis through its components like Spark SQL for structured data, Spark Streaming for real-time data processing, and MLlib for machine learning tasks",
        ],
    }
)

# 3. Define the Claude Model Function
def anthropic_qa(
    inputs: pd.DataFrame,
    *,
    model: str = "claude-3-opus-20240229",
    system_prompt: str = "Please answer the following question in formal language.",
    max_tokens: int = 1024,
) -> List[str]:
    """
    Generates responses using the Anthropic Claude API.

    Args:
        inputs: A Pandas DataFrame with an 'inputs' column containing prompts.
        model: Anthropic model identifier; replace with one available to you.
        system_prompt: System prompt sent with every request.
        max_tokens: Upper bound on the length of each response.

    Returns:
        One entry per input row, in order: Claude's response text, or None
        for a row whose API call failed.

    Raises:
        KeyError: If ``inputs`` has no 'inputs' column.
    """
    # Read the column up front so that a malformed frame fails loudly instead
    # of being reported as a per-row API error.
    questions = inputs["inputs"].tolist()
    client = anthropic.Anthropic()

    predictions = []
    for question in questions:
        try:
            message = client.messages.create(
                model=model,
                max_tokens=max_tokens,
                system=system_prompt,
                messages=[
                    {"role": "user", "content": question},
                ],
            )
            predictions.append(message.content[0].text)
        except Exception as e:
            # Best effort: keep a None placeholder so outputs stay aligned
            # with the input rows.
            print(f"Error calling Claude API: {e}")
            predictions.append(None)

    return predictions

# 4. Define the all the custom metric in just a func
def calculate_metrics(df):
    """Compute readability and normalized exact-match metrics over a predictions table.

    Args:
        df: DataFrame with ``predictions`` and ``ground_truth`` columns. A
            prediction that is not a string (e.g. None after a failed API
            call) is left out of the readability statistics and counted as a
            non-match.

    Returns:
        Dict of metric name -> value: mean, population variance and p90 for the
        Flesch-Kincaid and ARI grade levels, plus the exact-match rate.

    Raises:
        ValueError: If no row holds a usable (string) prediction.
    """

    def normalize_text(text):
        """Lowercase, trim surrounding whitespace and drop punctuation."""
        text = text.lower().strip()
        return re.sub(r'[^\w\s]', '', text)

    def summarize(values):
        """Return (mean, population variance, p90) of a non-empty list."""
        count = len(values)
        mean = sum(values) / count
        # Population variance: divides by n, not n - 1.
        variance = sum((value - mean) ** 2 for value in values) / count
        # Element at index floor(0.9 * n) of the sorted values.
        p90 = sorted(values)[int(0.9 * count)]
        return mean, variance, p90

    flesch_kincaid_grade_levels = []
    ari_grade_levels = []
    exact_matches = []

    for prediction, ground_truth in zip(df["predictions"], df["ground_truth"]):
        if not isinstance(prediction, str):
            # Missing prediction: nothing to measure, and it cannot match.
            exact_matches.append(0.0)
            continue

        # Readability metrics
        flesch_kincaid_grade_levels.append(textstat.flesch_kincaid_grade(prediction))
        ari_grade_levels.append(textstat.automated_readability_index(prediction))

        # Exact match after normalization (case- and punctuation-insensitive)
        matched = normalize_text(ground_truth) == normalize_text(prediction)
        exact_matches.append(1.0 if matched else 0.0)

    if not flesch_kincaid_grade_levels:
        raise ValueError("No valid predictions to score: every prediction is missing or the data is empty.")

    fk_mean, fk_variance, fk_p90 = summarize(flesch_kincaid_grade_levels)
    ari_mean, ari_variance, ari_p90 = summarize(ari_grade_levels)
    exact_match_mean = sum(exact_matches) / len(exact_matches)

    return {
        "flesch_kincaid_grade_level/v1/mean": fk_mean,
        "flesch_kincaid_grade_level/v1/variance": fk_variance,
        "flesch_kincaid_grade_level/v1/p90": fk_p90,
        "ari_grade_level/v1/mean": ari_mean,
        "ari_grade_level/v1/variance": ari_variance,
        "ari_grade_level/v1/p90": ari_p90,
        "exact_match/v1": exact_match_mean,
    }

# 5. Run the evaluation
# Everything below is recorded in one MLflow run.
with mlflow.start_run():
    # Generate Claude model outputs
    predictions = anthropic_qa(eval_data)
    # Adds a "predictions" column to eval_data in place.
    eval_data["predictions"] = predictions

    metrics = calculate_metrics(eval_data)  # Calculate the metrics on the new predictions

    # Log results
    mlflow.log_metrics(metrics)

    print(f"Metrics: {metrics}")
    print(eval_data)  # Display the dataframe, now including the predictions column

In [None]:
import mlflow
import pandas as pd
from mlflow.metrics import make_metric

# 1. Define Evaluation Data (with pre-computed predictions)
# No model is called in this cell: the "predictions" column is filled in by
# hand and compared against "ground_truth" row by row.
eval_data = pd.DataFrame({
    "inputs": [
        "What is MLflow?",
        "What is Spark?",
    ],
    "ground_truth": [
        "MLflow is an open-source platform for managing the end-to-end machine learning (ML) lifecycle.",
        "Apache Spark is a fast, general-purpose cluster computing system for big data processing."
    ],
    "predictions": [ # The model is not run, so the answers are supplied here directly
        "MLflow is a platform for managing the ML lifecycle.",
        "Spark is a fast, general-purpose cluster computing system."
    ]
})

# 2. Create a simple, basic metrics
def simple_metric(row):
    """Return 1.0 if the ground truth occurs, ignoring case, inside the prediction; else 0.0."""
    reference = row["ground_truth"].lower()
    response = row["predictions"].lower()
    # Substring containment, not equality: the whole reference must appear in the response.
    return float(reference in response)

# 3. Start MLflow and make the evaluation
# The per-row metric and its average are computed and logged inside one run.
with mlflow.start_run() as run:
    eval_data["metric"] = eval_data.apply(simple_metric, axis=1)  # Apply the metric to each row (adds a "metric" column in place)
    average_metric = eval_data["metric"].mean()  # Average of the per-row 0.0/1.0 values

    # Log results
    mlflow.log_metric("simple_matching_accuracy", average_metric) # Log the simple matching accuracy
    print(f"Simple Matching Accuracy: {average_metric}")

    print(eval_data)  # Display the dataframe with the predictions and the per-row metric

In [None]:
import mlflow
import anthropic
import pandas as pd
import os
from typing import List
import textstat  # For readability metrics
import re
from mlflow.data import from_pandas
import tempfile #Import temporal directory

# 1. Set Your Anthropic API Key
# os.environ["ANTHROPIC_API_KEY"] = "your-api-key"  # Replace!

# 2. Define Evaluation Data
# "inputs" holds the prompts sent to the model; "ground_truth" holds the
# reference answer for the prompt at the same position.
eval_data = pd.DataFrame(
    {
        "inputs": [
            "What is MLflow?",
            "What is Spark?",
        ],
        "ground_truth": [
            "MLflow is an open-source platform for managing the end-to-end machine learning (ML) lifecycle. It was developed by Databricks, a company that specializes in big data and machine learning solutions. MLflow is designed to address the challenges that data scientists and machine learning engineers face when developing, training, and deploying machine learning models.",
            "Apache Spark is an open-source, distributed computing system designed for big data processing and analytics. It was developed in response to limitations of the Hadoop MapReduce computing model, offering improvements in speed and ease of use. Spark provides libraries for various tasks such as data ingestion, processing, and analysis through its components like Spark SQL for structured data, Spark Streaming for real-time data processing, and MLlib for machine learning tasks",
        ],
    }
)

# 3. Define the Claude Model Function
def anthropic_qa(
    inputs: pd.DataFrame,
    *,
    model: str = "claude-3-opus-20240229",
    system_prompt: str = "Please answer the following question in formal language.",
    max_tokens: int = 1024,
) -> List[str]:
    """
    Generates responses using the Anthropic Claude API.

    Args:
        inputs: A Pandas DataFrame with an 'inputs' column containing prompts.
        model: Anthropic model identifier; replace with one available to you.
        system_prompt: System prompt sent with every request.
        max_tokens: Upper bound on the length of each response.

    Returns:
        One entry per input row, in order: Claude's response text, or None
        for a row whose API call failed.

    Raises:
        KeyError: If ``inputs`` has no 'inputs' column.
    """
    # Read the column up front so that a malformed frame fails loudly instead
    # of being reported as a per-row API error.
    questions = inputs["inputs"].tolist()
    client = anthropic.Anthropic()

    predictions = []
    for question in questions:
        try:
            message = client.messages.create(
                model=model,
                max_tokens=max_tokens,
                system=system_prompt,
                messages=[
                    {"role": "user", "content": question},
                ],
            )
            predictions.append(message.content[0].text)
        except Exception as e:
            # Best effort: keep a None placeholder so outputs stay aligned
            # with the input rows.
            print(f"Error calling Claude API: {e}")
            predictions.append(None)

    return predictions

# 4. Define the all the custom metric in just a func
def calculate_metrics(df):
    """Compute readability and normalized exact-match metrics over a predictions table.

    Args:
        df: DataFrame with ``predictions`` and ``ground_truth`` columns. A
            prediction that is not a string (e.g. None after a failed API
            call) is left out of the readability statistics and counted as a
            non-match.

    Returns:
        Dict of metric name -> value: mean, population variance and p90 for the
        Flesch-Kincaid and ARI grade levels, plus the exact-match rate.

    Raises:
        ValueError: If no row holds a usable (string) prediction.
    """

    def normalize_text(text):
        """Lowercase, trim surrounding whitespace and drop punctuation."""
        text = text.lower().strip()
        return re.sub(r'[^\w\s]', '', text)

    def summarize(values):
        """Return (mean, population variance, p90) of a non-empty list."""
        count = len(values)
        mean = sum(values) / count
        # Population variance: divides by n, not n - 1.
        variance = sum((value - mean) ** 2 for value in values) / count
        # Element at index floor(0.9 * n) of the sorted values.
        p90 = sorted(values)[int(0.9 * count)]
        return mean, variance, p90

    flesch_kincaid_grade_levels = []
    ari_grade_levels = []
    exact_matches = []

    for prediction, ground_truth in zip(df["predictions"], df["ground_truth"]):
        if not isinstance(prediction, str):
            # Missing prediction: nothing to measure, and it cannot match.
            exact_matches.append(0.0)
            continue

        # Readability metrics
        flesch_kincaid_grade_levels.append(textstat.flesch_kincaid_grade(prediction))
        ari_grade_levels.append(textstat.automated_readability_index(prediction))

        # Exact match after normalization (case- and punctuation-insensitive)
        matched = normalize_text(ground_truth) == normalize_text(prediction)
        exact_matches.append(1.0 if matched else 0.0)

    if not flesch_kincaid_grade_levels:
        raise ValueError("No valid predictions to score: every prediction is missing or the data is empty.")

    fk_mean, fk_variance, fk_p90 = summarize(flesch_kincaid_grade_levels)
    ari_mean, ari_variance, ari_p90 = summarize(ari_grade_levels)
    exact_match_mean = sum(exact_matches) / len(exact_matches)

    return {
        "flesch_kincaid_grade_level/v1/mean": fk_mean,
        "flesch_kincaid_grade_level/v1/variance": fk_variance,
        "flesch_kincaid_grade_level/v1/p90": fk_p90,
        "ari_grade_level/v1/mean": ari_mean,
        "ari_grade_level/v1/variance": ari_variance,
        "ari_grade_level/v1/p90": ari_p90,
        "exact_match/v1": exact_match_mean,
    }

# 5. Run the evaluation
# Everything below (metrics and the CSV artifact) is recorded in one MLflow run.
with mlflow.start_run() as run:
    # Generate Claude model outputs
    predictions = anthropic_qa(eval_data)
    # Adds a "predictions" column to eval_data in place.
    eval_data["predictions"] = predictions

    metrics = calculate_metrics(eval_data)  # Calculate the metrics on the new predictions

    # Log results
    mlflow.log_metrics(metrics)

    print(f"Metrics: {metrics}")
    print(eval_data)  # Display the dataframe, now including the predictions column
    # Write the table to a temporary CSV and upload it to the run; the
    # temporary directory is removed when the inner block exits.
    with tempfile.TemporaryDirectory() as tmpdir:
        csv_path = os.path.join(tmpdir, "eval_results.csv")
        eval_data.to_csv(csv_path, index=False)  # Save without index
        mlflow.log_artifact(csv_path, artifact_path="eval_results_table")