In [2]:
import mlflow
import openai
import os
import pandas as pd
from getpass import getpass
from mlflow.metrics import latency
from mlflow.metrics.genai import answer_correctness

# OpenAI client built from environment configuration. Both settings are read
# with a `None` fallback, so an unset variable is passed through as `None`.
client = openai.OpenAI(
    base_url=os.environ.get("OPENAI_BASE_URL"),
    api_key=os.environ.get("OPENAI_API_KEY"),
)

# Evaluation set: one row per question ("inputs") together with the reference
# answer ("ground_truth") for that question.
eval_data = pd.DataFrame(
    dict(
        inputs=["What is MLflow?", "What is Spark?"],
        ground_truth=[
            # Reference answer for "What is MLflow?"
            (
                "MLflow is an open-source platform for managing the end-to-end machine learning (ML) "
                "lifecycle. It was developed by Databricks, a company that specializes in big data and "
                "machine learning solutions. MLflow is designed to address the challenges that data "
                "scientists and machine learning engineers face when developing, training, and deploying "
                "machine learning models."
            ),
            # Reference answer for "What is Spark?"
            (
                "Apache Spark is an open-source, distributed computing system designed for big data "
                "processing and analytics. It was developed in response to limitations of the Hadoop "
                "MapReduce computing model, offering improvements in speed and ease of use. Spark "
                "provides libraries for various tasks such as data ingestion, processing, and analysis "
                "through its components like Spark SQL for structured data, Spark Streaming for "
                "real-time data processing, and MLlib for machine learning tasks"
            ),
        ],
    )
)

with mlflow.start_run() as run:
    # Instruction sent as the system message ahead of every question.
    system_prompt = "Answer the following question in two sentences"

    # Log a "gpt-4" chat-completions prompt template as an MLflow model.
    logged_model_info = mlflow.openai.log_model(
        model="gpt-4",
        task=openai.chat.completions,
        name="model",
        messages=[
            dict(role="system", content=system_prompt),
            dict(role="user", content="{question}"),
        ],
    )

    # Evaluate the logged model as a question-answering model, adding the
    # answer-correctness and latency metrics on top of the defaults.
    results = mlflow.evaluate(
        model=logged_model_info.model_uri,
        data=eval_data,
        model_type="question-answering",
        targets="ground_truth",
        extra_metrics=[answer_correctness(), latency()],
    )

    # Aggregated metrics over the whole evaluation set.
    print("See aggregated evaluation results below: ", results.metrics, sep="\n")

    # Per-record results are exposed through `results.tables`.
    eval_table = results.tables["eval_results_table"]
    print("See evaluation table below: ", eval_table, sep="\n")

Downloading artifacts: 100%|██████████| 5/5 [00:00<00:00, 524.83it/s]
2025/07/06 17:08:46 INFO mlflow.tracking.fluent: Active model is set to the logged model with ID: m-94ea81a0b1994bb4a4917fc22bdc87d5
2025/07/06 17:08:46 INFO mlflow.tracking.fluent: Use `mlflow.set_active_model` to set the active model to a different one if needed.
2025/07/06 17:08:46 INFO mlflow.tracking.fluent: Active model is set to the logged model with ID: m-94ea81a0b1994bb4a4917fc22bdc87d5
2025/07/06 17:08:46 INFO mlflow.tracking.fluent: Use `mlflow.set_active_model` to set the active model to a different one if needed.
2025/07/06 17:08:46 INFO mlflow.models.evaluation.evaluators.default: Computing model predictions.
2025/07/06 17:08:51 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
100%|██████████| 1/1 [00:05<00:00,  5.23s/it]
100%|██████████| 2/2 [00:05<00:00,  2.70s/it]


See aggregated evaluation results below: 
{'latency/mean': 2.387356162071228, 'latency/variance': 0.01065148637771074, 'latency/p90': 2.4699209928512573, 'exact_match/v1': 0.0, 'answer_correctness/v1/mean': 4.5, 'answer_correctness/v1/variance': 0.25, 'answer_correctness/v1/p90': 4.9}


Downloading artifacts: 100%|██████████| 1/1 [00:00<00:00, 215.95it/s]
Downloading artifacts: 100%|██████████| 1/1 [00:00<00:00, 273.08it/s]

See evaluation table below: 
            inputs                                       ground_truth  \
0  What is MLflow?  MLflow is an open-source platform for managing...   
1   What is Spark?  Apache Spark is an open-source, distributed co...   

                                             outputs   latency  token_count  \
0  MLflow is an open-source platform developed by...  2.490562           54   
1  Spark is an open-source distributed general-pu...  2.284150           31   

   answer_correctness/v1/score  \
0                            5   
1                            4   

                 answer_correctness/v1/justification  
0  The output provided by the model is correct. I...  
1  The output provided by the model is mostly cor...  
🏃 View run sincere-fawn-947 at: http://127.0.0.1:5001/#/experiments/0/runs/7cecbc79c558404d9c5cd2da7bd37cf8
🧪 View experiment at: http://127.0.0.1:5001/#/experiments/0





In [3]:
# Graded example of a casual-sounding answer (score 2); it is supplied in the
# `examples` list of the custom "professionalism" metric defined in this cell.
professionalism_example_score_2 = mlflow.metrics.genai.EvaluationExample(
    score=2,
    justification=(
        "The response is written in a casual tone. It uses contractions, filler words such as 'like', and "
        "exclamation points, which make it sound less professional. "
    ),
    input="What is MLflow?",
    output=(
        "MLflow is like your friendly neighborhood toolkit for managing your machine learning projects. It helps "
        "you track experiments, package your code and models, and collaborate with your team, making the whole ML "
        "workflow smoother. It's like your Swiss Army knife for machine learning!"
    ),
)
# Graded example of a formal, neutral answer (score 4); it is supplied in the
# `examples` list of the custom "professionalism" metric defined in this cell.
#
# NOTE: `output` must be a single string. Do not leave a trailing comma after
# the last fragment inside the parentheses: that turns the expression into a
# one-element tuple instead of the implicitly concatenated string.
professionalism_example_score_4 = mlflow.metrics.genai.EvaluationExample(
    input="What is MLflow?",
    output=(
        "MLflow is an open-source platform for managing the end-to-end machine learning (ML) lifecycle. It was "
        "developed by Databricks, a company that specializes in big data and machine learning solutions. MLflow is "
        "designed to address the challenges that data scientists and machine learning engineers face when "
        "developing, training, and deploying machine learning models."
    ),
    score=4,
    justification=("The response is written in a formal language and a neutral tone. "),
)


# Custom LLM-judged metric named "professionalism", scored on the 0-4 rubric
# spelled out in `grading_prompt` and illustrated by the two graded examples.
professionalism = mlflow.metrics.genai.make_genai_metric(
    name="professionalism",
    definition=(
        "Professionalism refers to the use of a formal, respectful, and appropriate style of communication that is "
        "tailored to the context and audience. It often involves avoiding overly casual language, slang, or "
        "colloquialisms, and instead using clear, concise, and respectful language."
    ),
    # Adjacent string literals are concatenated verbatim, so every fragment
    # ends with a space; without it the rubric entries run together
    # (e.g. "contexts.- Score 1:").
    grading_prompt=(
        "Professionalism: If the answer is written using a professional tone, below are the details for different scores: "
        "- Score 0: Language is extremely casual, informal, and may include slang or colloquialisms. Not suitable for "
        "professional contexts. "
        "- Score 1: Language is casual but generally respectful and avoids strong informality or slang. Acceptable in "
        "some informal professional settings. "
        "- Score 2: Language is overall formal but still have casual words/phrases. Borderline for professional contexts. "
        "- Score 3: Language is balanced and avoids extreme informality or formality. Suitable for most professional contexts. "
        "- Score 4: Language is noticeably formal, respectful, and avoids casual elements. Appropriate for formal "
        "business or academic settings. "
    ),
    examples=[professionalism_example_score_2, professionalism_example_score_4],
    # Judge model URI and the parameters passed along with it.
    model="openai:/gpt-4o-mini",
    parameters={"temperature": 0.0},
    # Aggregations reported for the per-row scores.
    aggregations=["mean", "variance"],
    greater_is_better=True,
)

In [5]:
# OpenAI client built from environment configuration. Both settings are read
# with a `None` fallback, so an unset variable is passed through as `None`.
client = openai.OpenAI(
    base_url=os.environ.get("OPENAI_BASE_URL"),
    api_key=os.environ.get("OPENAI_API_KEY"),
)

# Evaluation set: one row per question ("inputs") together with the reference
# answer ("ground_truth") for that question.
eval_data = pd.DataFrame(
    dict(
        inputs=["What is MLflow?", "What is Spark?"],
        ground_truth=[
            # Reference answer for "What is MLflow?"
            (
                "MLflow is an open-source platform for managing the end-to-end machine learning (ML) "
                "lifecycle. It was developed by Databricks, a company that specializes in big data and "
                "machine learning solutions. MLflow is designed to address the challenges that data "
                "scientists and machine learning engineers face when developing, training, and deploying "
                "machine learning models."
            ),
            # Reference answer for "What is Spark?"
            (
                "Apache Spark is an open-source, distributed computing system designed for big data "
                "processing and analytics. It was developed in response to limitations of the Hadoop "
                "MapReduce computing model, offering improvements in speed and ease of use. Spark "
                "provides libraries for various tasks such as data ingestion, processing, and analysis "
                "through its components like Spark SQL for structured data, Spark Streaming for "
                "real-time data processing, and MLlib for machine learning tasks"
            ),
        ],
    )
)

with mlflow.start_run() as run:
    # Instruction sent as the system message ahead of every question.
    system_prompt = "Answer the following question in two sentences"

    # Log a "gpt-4" chat-completions prompt template as an MLflow model.
    logged_model_info = mlflow.openai.log_model(
        model="gpt-4",
        task=openai.chat.completions,
        name="model",
        messages=[
            dict(role="system", content=system_prompt),
            dict(role="user", content="{question}"),
        ],
    )

    # Evaluate the logged model as a question-answering model, adding answer
    # correctness, latency and the custom professionalism metric on top of
    # the defaults.
    results = mlflow.evaluate(
        model=logged_model_info.model_uri,
        data=eval_data,
        model_type="question-answering",
        targets="ground_truth",
        extra_metrics=[answer_correctness(), latency(), professionalism],
    )

    # Aggregated metrics over the whole evaluation set.
    print("See aggregated evaluation results below: ", results.metrics, sep="\n")

    # Per-record results are exposed through `results.tables`.
    eval_table = results.tables["eval_results_table"]
    print("See evaluation table below: ", eval_table, sep="\n")

Downloading artifacts: 100%|██████████| 5/5 [00:00<00:00, 444.48it/s]
2025/07/06 17:22:50 INFO mlflow.tracking.fluent: Active model is set to the logged model with ID: m-193fa6f33ec24ebeb48ebdc821a9f321
2025/07/06 17:22:50 INFO mlflow.tracking.fluent: Use `mlflow.set_active_model` to set the active model to a different one if needed.
2025/07/06 17:22:50 INFO mlflow.tracking.fluent: Active model is set to the logged model with ID: m-193fa6f33ec24ebeb48ebdc821a9f321
2025/07/06 17:22:50 INFO mlflow.tracking.fluent: Use `mlflow.set_active_model` to set the active model to a different one if needed.
2025/07/06 17:22:50 INFO mlflow.models.evaluation.evaluators.default: Computing model predictions.
2025/07/06 17:22:56 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
100%|██████████| 1/1 [00:03<00:00,  3.91s/it]
100%|██████████| 1/1 [00:01<00:00,  1.80s/it]
100%|██████████| 2/2 [00:05<00:00,  2.57s/it]
100%|██████████| 2/2 [00:02<00:00,  1.08s/it]


See aggregated evaluation results below: 
{'latency/mean': 2.6847835779190063, 'latency/variance': 0.3293514773020121, 'latency/p90': 3.143896794319153, 'exact_match/v1': 0.0, 'answer_correctness/v1/mean': 4.5, 'answer_correctness/v1/variance': 0.25, 'answer_correctness/v1/p90': 4.9, 'professionalism/v1/mean': 4.0, 'professionalism/v1/variance': 0.0}


Downloading artifacts: 100%|██████████| 1/1 [00:00<00:00, 225.79it/s]
Downloading artifacts: 100%|██████████| 1/1 [00:00<00:00, 252.26it/s]

See evaluation table below: 
            inputs                                       ground_truth  \
0  What is MLflow?  MLflow is an open-source platform for managing...   
1   What is Spark?  Apache Spark is an open-source, distributed co...   

                                             outputs   latency  token_count  \
0  MLflow is an open-source platform developed by...  3.258675           55   
1  Spark is an open-source, distributed computing...  2.110892           41   

   answer_correctness/v1/score  \
0                            5   
1                            4   

                 answer_correctness/v1/justification  \
0  The output provided by the model is correct. I...   
1  The output provided by the model is mostly cor...   

   professionalism/v1/score                   professionalism/v1/justification  
0                         4  The response is written in a formal and respec...  
1                         4  The response is written in a formal and respec... 


