In [10]:
import mlflow
import openai
import os
import pandas as pd
from getpass import getpass
from mlflow.metrics import latency
from mlflow.metrics.genai import answer_correctness

# OpenAI client configured purely from the environment; either value is None
# when its variable is unset.
# NOTE(review): `client` is not referenced again in this cell.
client = openai.OpenAI(
    base_url=os.environ.get("OPENAI_BASE_URL"),
    api_key=os.environ.get("OPENAI_API_KEY"),
)

# Two-row evaluation set, written one record per question: the prompt goes in
# "inputs" and its reference answer in "ground_truth" (the column passed as
# `targets` to the evaluation call).
eval_data = pd.DataFrame(
    data=[
        (
            "What is MLflow?",
            (
                "MLflow is an open-source platform for managing the end-to-end machine learning (ML) "
                "lifecycle. It was developed by Databricks, a company that specializes in big data and "
                "machine learning solutions. MLflow is designed to address the challenges that data "
                "scientists and machine learning engineers face when developing, training, and deploying "
                "machine learning models."
            ),
        ),
        (
            "What is Spark?",
            (
                "Apache Spark is an open-source, distributed computing system designed for big data "
                "processing and analytics. It was developed in response to limitations of the Hadoop "
                "MapReduce computing model, offering improvements in speed and ease of use. Spark "
                "provides libraries for various tasks such as data ingestion, processing, and analysis "
                "through its components like Spark SQL for structured data, Spark Streaming for "
                "real-time data processing, and MLlib for machine learning tasks"
            ),
        ),
    ],
    columns=["inputs", "ground_truth"],
)

# Register the system prompt with MLflow and keep the returned prompt object;
# its `.template` is used as the system message when the model is logged.
# NOTE(review): the registry name is spelled "prompot". It is left unchanged
# because a later cell loads the prompt by this exact name.
system_prompt = mlflow.genai.register_prompt(
    template="Answer the following question in two sentences",
    name="gpt-4-system-prompot",
    commit_message="Initial version of chatbot",
)

with mlflow.start_run() as run:
    # Log "gpt-4" via the MLflow OpenAI flavor. The registered prompt's template
    # is the system message; "{question}" is the placeholder for the user turn.
    logged_model_info = mlflow.openai.log_model(
        name="model",
        model="gpt-4",
        task=openai.chat.completions,
        messages=[
            dict(role="system", content=system_prompt.template),
            dict(role="user", content="{question}"),
        ],
    )

    # Evaluate the logged model on `eval_data` as a question-answering task,
    # adding answer-correctness and latency on top of the default metrics.
    results = mlflow.evaluate(
        model=logged_model_info.model_uri,
        data=eval_data,
        model_type="question-answering",
        targets="ground_truth",
        extra_metrics=[answer_correctness(), latency()],
    )
    print("See aggregated evaluation results below: \n{}".format(results.metrics))

    # Per-record scores are stored under the "eval_results_table" key.
    eval_table = results.tables["eval_results_table"]
    print("See evaluation table below: \n{}".format(eval_table))

2025/07/06 22:14:36 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for prompt version to finish creation. Prompt name: gpt-4-system-prompot, version 1
Downloading artifacts: 100%|██████████| 5/5 [00:00<00:00, 687.37it/s]
2025/07/06 22:14:37 INFO mlflow.tracking.fluent: Active model is set to the logged model with ID: m-a2ccca652c274e6685470d3843337b06
2025/07/06 22:14:37 INFO mlflow.tracking.fluent: Use `mlflow.set_active_model` to set the active model to a different one if needed.
2025/07/06 22:14:37 INFO mlflow.models.evaluation.evaluators.default: Computing model predictions.
2025/07/06 22:14:42 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
100%|██████████| 1/1 [00:04<00:00,  4.05s/it]
100%|██████████| 2/2 [00:04<00:00,  2.32s/it]


See aggregated evaluation results below: 
{'latency/mean': 2.570592164993286, 'latency/variance': 0.023621233035612477, 'latency/p90': 2.693545770645142, 'exact_match/v1': 0.0, 'answer_correctness/v1/mean': 4.5, 'answer_correctness/v1/variance': 0.25, 'answer_correctness/v1/p90': 4.9}


Downloading artifacts: 100%|██████████| 1/1 [00:00<00:00, 292.22it/s]
Downloading artifacts: 100%|██████████| 1/1 [00:00<00:00, 272.09it/s]

See evaluation table below: 
            inputs                                       ground_truth  \
0  What is MLflow?  MLflow is an open-source platform for managing...   
1   What is Spark?  Apache Spark is an open-source, distributed co...   

                                             outputs   latency  token_count  \
0  MLflow is an open-source platform designed to ...  2.724284           51   
1  Spark is an open-source, distributed computing...  2.416900           34   

   answer_correctness/v1/score  \
0                            5   
1                            4   

                 answer_correctness/v1/justification  
0  The output provided by the model is correct. I...  
1  The output provided by the model is mostly cor...  
🏃 View run secretive-skink-96 at: http://127.0.0.1:5001/#/experiments/483845778527648610/runs/50f8ad68cb664c34a311742f27473145
🧪 View experiment at: http://127.0.0.1:5001/#/experiments/483845778527648610





In [11]:
# Few-shot example for the professionalism metric: a casual answer, graded 2.
# The adjacent string literals below are joined into a single string each.
professionalism_example_score_2 = mlflow.metrics.genai.EvaluationExample(
    input="What is MLflow?",
    output=(
        "MLflow is like your friendly neighborhood toolkit for managing your machine learning projects. "
        "It helps you track experiments, package your code and models, and collaborate with your team, "
        "making the whole ML workflow smoother. "
        "It's like your Swiss Army knife for machine learning!"
    ),
    score=2,
    justification=(
        "The response is written in a casual tone. "
        "It uses contractions, filler words such as 'like', and exclamation points, "
        "which make it sound less professional. "
    ),
)
# Few-shot example for the professionalism metric: a formal answer, graded 4.
professionalism_example_score_4 = mlflow.metrics.genai.EvaluationExample(
    input="What is MLflow?",
    # The parentheses only group the adjacent literals into ONE string. There
    # must be no trailing comma after the last literal, otherwise the value
    # becomes a 1-tuple rather than text.
    output=(
        "MLflow is an open-source platform for managing the end-to-end machine learning (ML) lifecycle. It was "
        "developed by Databricks, a company that specializes in big data and machine learning solutions. MLflow is "
        "designed to address the challenges that data scientists and machine learning engineers face when "
        "developing, training, and deploying machine learning models."
    ),
    score=4,
    justification="The response is written in a formal language and a neutral tone. ",
)


# Custom LLM-judged metric named "professionalism".
#   definition / grading_prompt: what the judge is told to assess and the 0-4 rubric.
#   examples: the two graded few-shot examples defined above.
#   model / parameters: judge endpoint "openai:/gpt-4o-mini" called with temperature 0.0.
#   aggregations: per-row scores are summarised as mean and variance.
#   greater_is_better: higher scores are treated as better.
professionalism = mlflow.metrics.genai.make_genai_metric(
    name="professionalism",
    definition=(
        "Professionalism refers to the use of a formal, respectful, and appropriate style of communication that is "
        "tailored to the context and audience. It often involves avoiding overly casual language, slang, or "
        "colloquialisms, and instead using clear, concise, and respectful language."
    ),
    # The fragments below are joined by implicit concatenation, so every fragment
    # that is followed by another must end with a space. Without it the rubric
    # entries fuse together (e.g. "contexts.- Score 1: ...").
    grading_prompt=(
        "Professionalism: If the answer is written using a professional tone, below are the details for different scores: "
        "- Score 0: Language is extremely casual, informal, and may include slang or colloquialisms. Not suitable for "
        "professional contexts. "
        "- Score 1: Language is casual but generally respectful and avoids strong informality or slang. Acceptable in "
        "some informal professional settings. "
        "- Score 2: Language is overall formal but still have casual words/phrases. Borderline for professional contexts. "
        "- Score 3: Language is balanced and avoids extreme informality or formality. Suitable for most professional contexts. "
        "- Score 4: Language is noticeably formal, respectful, and avoids casual elements. Appropriate for formal "
        "business or academic settings. "
    ),
    examples=[professionalism_example_score_2, professionalism_example_score_4],
    model="openai:/gpt-4o-mini",
    parameters={"temperature": 0.0},
    aggregations=["mean", "variance"],
    greater_is_better=True,
)

In [None]:
# Left commented out: an earlier cell already performs this registration under
# the same prompt name. Kept only as a reference for how the prompt was created.
# NOTE(review): re-running it presumably adds another prompt version - confirm
# before re-enabling.
# system_prompt = mlflow.genai.register_prompt(
#     name="gpt-4-system-prompot",
#     template="Answer the following question in two sentences",
#     commit_message="Initial version of chatbot",
# )

In [20]:
import mlflow.openai
import openai


# OpenAI client configured purely from the environment; either value is None
# when its variable is unset.
# NOTE(review): `client` is not referenced again in this cell.
client = openai.OpenAI(
    base_url=os.environ.get("OPENAI_BASE_URL"),
    api_key=os.environ.get("OPENAI_API_KEY"),
)

# Two-row evaluation set, written one record per question: the prompt goes in
# "inputs" and its reference answer in "ground_truth" (the column passed as
# `targets` to the evaluation call).
eval_data = pd.DataFrame(
    data=[
        (
            "What is MLflow?",
            (
                "MLflow is an open-source platform for managing the end-to-end machine learning (ML) "
                "lifecycle. It was developed by Databricks, a company that specializes in big data and "
                "machine learning solutions. MLflow is designed to address the challenges that data "
                "scientists and machine learning engineers face when developing, training, and deploying "
                "machine learning models."
            ),
        ),
        (
            "What is Spark?",
            (
                "Apache Spark is an open-source, distributed computing system designed for big data "
                "processing and analytics. It was developed in response to limitations of the Hadoop "
                "MapReduce computing model, offering improvements in speed and ease of use. Spark "
                "provides libraries for various tasks such as data ingestion, processing, and analysis "
                "through its components like Spark SQL for structured data, Spark Streaming for "
                "real-time data processing, and MLlib for machine learning tasks"
            ),
        ),
    ],
    columns=["inputs", "ground_truth"],
)


# Session-level MLflow setup followed by loading the registered system prompt.
# These calls change global MLflow state, so their order is kept as written.
# Select the "my-prompt-app" experiment for the runs started below.
mlflow.set_experiment("my-prompt-app")
# Make a logged model named "gpt-4" the active model for this session.
mlflow.set_active_model(name="gpt-4")
# Turn on MLflow autologging for the OpenAI integration.
# NOTE(review): presumably this captures traces of OpenAI calls - confirm.
mlflow.openai.autolog()

# Prompt URI pinned to version 3 of the registered system prompt. The version
# must already exist in the registry in use.
# NOTE(review): the name keeps the original "prompot" spelling on purpose; it
# must match the name used when the prompt was registered.
prompt_version = "prompts:/gpt-4-system-prompot/3"

# Fetch that prompt version; its `.template` is used as the system message below.
prompt = mlflow.genai.load_prompt(prompt_version)


with mlflow.start_run() as run:
    # Log "gpt-4" via the MLflow OpenAI flavor and pass the prompt URI through
    # `prompts`. The loaded template is the system message; "{question}" is the
    # placeholder for the user turn.
    logged_model_info = mlflow.openai.log_model(
        name="gpt-4",
        model="gpt-4",
        task=openai.chat.completions,
        prompts=[prompt_version],
        messages=[
            dict(role="system", content=prompt.template),
            dict(role="user", content="{question}"),
        ],
    )

    # Evaluate as a question-answering task, adding answer-correctness, latency
    # and the custom professionalism judge on top of the default metrics.
    results = mlflow.evaluate(
        model=logged_model_info.model_uri,
        data=eval_data,
        model_type="question-answering",
        targets="ground_truth",
        extra_metrics=[answer_correctness(), latency(), professionalism],
    )
    print("See aggregated evaluation results below: \n{}".format(results.metrics))

    # Per-record scores are stored under the "eval_results_table" key.
    eval_table = results.tables["eval_results_table"]
    print("See evaluation table below: \n{}".format(eval_table))

2025/07/06 22:30:49 INFO mlflow.tracking.fluent: Active model is set to the logged model with ID: m-1b68c33cd82b44f2bf02fff8f894c385
Downloading artifacts: 100%|██████████| 5/5 [00:00<00:00, 696.17it/s]
2025/07/06 22:30:51 INFO mlflow.tracking.fluent: Active model is set to the logged model with ID: m-840c4406366948d397e88fc2b95798b9
2025/07/06 22:30:51 INFO mlflow.tracking.fluent: Use `mlflow.set_active_model` to set the active model to a different one if needed.
2025/07/06 22:30:51 INFO mlflow.models.evaluation.evaluators.default: Computing model predictions.
2025/07/06 22:30:52 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
100%|██████████| 1/1 [00:00<00:00,  1.69it/s]
100%|██████████| 1/1 [00:00<00:00,  1.69it/s]
100%|██████████| 2/2 [00:00<00:00,  3.30it/s]
100%|██████████| 2/2 [00:00<00:00,  3.23it/s]


See aggregated evaluation results below: 
{'latency/mean': 0.6516404151916504, 'latency/variance': 3.064817519771168e-07, 'latency/p90': 0.6520833015441895, 'exact_match/v1': 0.0, 'answer_correctness/v1/mean': 5.0, 'answer_correctness/v1/variance': 0.0, 'answer_correctness/v1/p90': 5.0, 'professionalism/v1/mean': 4.0, 'professionalism/v1/variance': 0.0}


Downloading artifacts: 100%|██████████| 1/1 [00:00<00:00, 209.16it/s]
Downloading artifacts: 100%|██████████| 1/1 [00:00<00:00, 237.13it/s]

See evaluation table below: 
            inputs                                       ground_truth  \
0  What is MLflow?  MLflow is an open-source platform for managing...   
1   What is Spark?  Apache Spark is an open-source, distributed co...   

                                             outputs   latency  token_count  \
0  MLflow is an open-source platform developed by...  0.651087           58   
1  Apache Spark is an open-source, distributed co...  0.652194           48   

   answer_correctness/v1/score  \
0                            5   
1                            5   

                 answer_correctness/v1/justification  \
0  The output provided by the model is correct. I...   
1  The output provided by the model is correct. I...   

   professionalism/v1/score                   professionalism/v1/justification  
0                         4  The response is written in a formal and respec...  
1                         4  The response is written in a formal and respec... 


