In [5]:
import openai
import pandas as pd
import mlflow
from dotenv import load_dotenv
from mlflow.metrics.genai import (
    answer_similarity,
    answer_relevance,
    answer_correctness,
)
import warnings

In [11]:
warnings.filterwarnings("ignore")
load_dotenv("myenv/.env")
openai.api_type = "openai"

## Scenario 1: Generating responses from the model and then evaluating them

In [6]:
eval_df = pd.DataFrame(
    {
        "inputs": [
            "How does useEffect() work?",
            "What does the static keyword in a function mean?",
            "What does the 'finally' block in Python do?",
            "What is the difference between multiprocessing and multithreading?",
        ],
        "ground_truth": [
            "The useEffect() hook tells React that your component needs to do something after render. React will remember the function you passed (we’ll refer to it as our “effect”), and call it later after performing the DOM updates.",
            "Static members belongs to the class, rather than a specific instance. This means that only one instance of a static member exists, even if you create multiple objects of the class, or if you don't create any. It will be shared by all objects.",
            "'Finally' defines a block of code to run when the try... except...else block is final. The finally block will be executed no matter if the try block raises an error or not.",
            "Multithreading refers to the ability of a processor to execute multiple threads concurrently, where each thread runs a process. Whereas multiprocessing refers to the ability of a system to run multiple processors in parallel, where each processor can run one or more threads.",
        ],
    }
)

In [9]:
with mlflow.start_run() as run:
    system_prompt = "Answer the following question in two sentences"
    basic_qa_model = mlflow.openai.log_model(
        model="gpt-4o-mini",
        task=openai.chat.completions,
        artifact_path="model",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": "{question}"},
        ],
    )
    results = mlflow.evaluate(
        basic_qa_model.model_uri,
        eval_df,
        targets="ground_truth",  # specify which column corresponds to the expected output
        model_type="question-answering",  # model type indicates which metrics are relevant for this task
        evaluators="default",
        extra_metrics=[
            mlflow.metrics.latency(),
            mlflow.metrics.exact_match(),
            answer_similarity(),
            answer_relevance(),
            answer_correctness(),
        ],  # use the answer similarity metric created above
    )

2024/10/24 13:48:24 INFO mlflow.models.evaluation.default_evaluator: Computing model predictions.
2024/10/24 13:48:29 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
Using default facebook/roberta-hate-speech-dynabench-r4-target checkpoint
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
100%|██████████| 1/1 [00:05<00:00,  5.06s/it]
100%|██████████| 1/1 [00:04<00:00,  4.08s/it]
100%|██████████| 1/1 [00:04<00:00,  4.80s/it]
100%|██████████| 4/4 [00:04<00:00,  1.21s/it]
100%|██████████| 4/4 [00:04<00:00,  1.23s/it]
100%|██████████| 4/4 [00:05<00:00,  1.38s/it]


In [10]:
results.metrics

{'latency/mean': 1.2901974320411682,
 'latency/variance': 0.02751488093023724,
 'latency/p90': 1.4098666667938233,
 'toxicity/v1/mean': 0.00019685498045873828,
 'toxicity/v1/variance': 1.47081003484041e-09,
 'toxicity/v1/p90': 0.00023989629989955576,
 'toxicity/v1/ratio': 0.0,
 'flesch_kincaid_grade_level/v1/mean': 13.225,
 'flesch_kincaid_grade_level/v1/variance': 4.771875,
 'flesch_kincaid_grade_level/v1/p90': 15.34,
 'ari_grade_level/v1/mean': 16.05,
 'ari_grade_level/v1/variance': 7.247499999999999,
 'ari_grade_level/v1/p90': 18.68,
 'exact_match/v1': 0.0,
 'answer_similarity/v1/mean': 4.0,
 'answer_similarity/v1/variance': 0.5,
 'answer_similarity/v1/p90': 4.7,
 'answer_relevance/v1/mean': 5.0,
 'answer_relevance/v1/variance': 0.0,
 'answer_relevance/v1/p90': 5.0,
 'answer_correctness/v1/mean': 4.75,
 'answer_correctness/v1/variance': 0.1875,
 'answer_correctness/v1/p90': 5.0}

In [12]:
results.tables["eval_results_table"]

Downloading artifacts: 100%|██████████| 1/1 [00:00<00:00, 1218.21it/s]
Downloading artifacts: 100%|██████████| 1/1 [00:00<00:00, 1457.87it/s]


Unnamed: 0,inputs,ground_truth,outputs,latency,token_count,toxicity/v1/score,flesch_kincaid_grade_level/v1/score,ari_grade_level/v1/score,answer_similarity/v1/score,answer_similarity/v1/justification,answer_relevance/v1/score,answer_relevance/v1/justification,answer_correctness/v1/score,answer_correctness/v1/justification
0,How does useEffect() work?,The useEffect() hook tells React that your com...,The `useEffect()` hook in React is used to per...,1.380328,59,0.000193,13.2,16.1,4,The provided output aligns closely with the ta...,5,The output directly addresses the input questi...,5,The output provided by the model is correct. I...
1,What does the static keyword in a function mean?,"Static members belongs to the class, rather th...",The static keyword in a function means that th...,1.422526,62,0.000159,13.8,16.3,3,The output has moderate semantic similarity to...,5,The output provided by the model directly addr...,4,The output provided by the model is mostly cor...
2,What does the 'finally' block in Python do?,'Finally' defines a block of code to run when ...,The 'finally' block in Python is used to defin...,1.006224,52,0.00026,9.9,12.1,5,The model's output aligns closely with the pro...,5,The output directly addresses the input questi...,5,The output provided by the model is correct. I...
3,What is the difference between multiprocessing...,Multithreading refers to the ability of a proc...,Multiprocessing involves using multiple proces...,1.351712,61,0.000175,16.0,19.7,4,The provided output aligns closely with the ta...,5,The output provided by the model directly addr...,5,The output provided by the model is correct. I...


## Scenario 2: Evaluating existing responses in the DataFrame

In [13]:
# Step 1: Create the sample DataFrame with model-generated answers
eval_df = pd.DataFrame(
    {
        "inputs": [
            "How does useEffect() work?",
            "What does the static keyword in a function mean?",
            "What does the 'finally' block in Python do?",
            "What is the difference between multiprocessing and multithreading?",
        ],
        "ground_truth": [
            "The useEffect() hook tells React that your component needs to do something after render. React will remember the function you passed (we’ll refer to it as our “effect”), and call it later after performing the DOM updates.",
            "Static members belong to the class, rather than a specific instance. This means that only one instance of a static member exists, even if you create multiple objects of the class, or if you don't create any. It will be shared by all objects.",
            "'Finally' defines a block of code to run when the try... except...else block is final. The finally block will be executed no matter if the try block raises an error or not.",
            "Multithreading refers to the ability of a processor to execute multiple threads concurrently, where each thread runs a process. Whereas multiprocessing refers to the ability of a system to run multiple processors in parallel, where each processor can run one or more threads.",
        ],
        "predicted_answer": [
            "The useEffect() hook is used to handle side effects in React. It runs after rendering and updates based on dependencies.",
            "Static members belong to a class and are shared among all instances. They are initialized only once at the start.",
            "The 'finally' block always runs after the try-except block, regardless of whether an error occurred. It is used for cleanup actions.",
            "Multithreading allows running multiple threads concurrently, whereas multiprocessing involves running multiple processes with separate memory space.",
        ],
    }
)

In [14]:
# Step 2: Start an MLflow run for evaluation
with mlflow.start_run() as run:
    # Step 3: Evaluate the existing predictions against ground truth
    results = mlflow.evaluate(
        model=None,  # No model URI needed since we are evaluating existing outputs
        data=eval_df,
        targets="ground_truth",  # Column with expected answers
        predictions="predicted_answer",  # Column with model-generated answers
        model_type="question-answering",  # Relevant model type
        evaluators="default",
        extra_metrics=[
            mlflow.metrics.latency(),
            mlflow.metrics.exact_match(),
            answer_similarity(),
            answer_relevance(),
            answer_correctness(),
        ],
    )

2024/10/24 13:52:09 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
100%|██████████| 1/1 [00:06<00:00,  6.42s/it]
100%|██████████| 1/1 [00:04<00:00,  4.90s/it]
100%|██████████| 1/1 [00:06<00:00,  6.26s/it]
100%|██████████| 4/4 [00:04<00:00,  1.14s/it]
100%|██████████| 4/4 [00:04<00:00,  1.21s/it]
100%|██████████| 4/4 [00:05<00:00,  1.49s/it]


In [15]:
results.metrics

{'latency/mean': 0.0,
 'latency/variance': 0.0,
 'latency/p90': 0.0,
 'toxicity/v1/mean': 0.0001968843316717539,
 'toxicity/v1/variance': 4.421298881753588e-09,
 'toxicity/v1/p90': 0.00026772937271744016,
 'toxicity/v1/ratio': 0.0,
 'flesch_kincaid_grade_level/v1/mean': 9.875,
 'flesch_kincaid_grade_level/v1/variance': 44.236875000000005,
 'flesch_kincaid_grade_level/v1/p90': 17.070000000000004,
 'ari_grade_level/v1/mean': 11.925,
 'ari_grade_level/v1/variance': 64.53187499999999,
 'ari_grade_level/v1/p90': 20.660000000000004,
 'exact_match/v1': 0.0,
 'answer_similarity/v1/mean': 4.25,
 'answer_similarity/v1/variance': 0.1875,
 'answer_similarity/v1/p90': 4.7,
 'answer_relevance/v1/mean': 4.0,
 'answer_relevance/v1/variance': 1.5,
 'answer_relevance/v1/p90': 5.0,
 'answer_correctness/v1/mean': 4.25,
 'answer_correctness/v1/variance': 0.1875,
 'answer_correctness/v1/p90': 4.7}

In [16]:
results.tables["eval_results_table"]

Downloading artifacts: 100%|██████████| 1/1 [00:00<00:00, 1064.00it/s]
Downloading artifacts: 100%|██████████| 1/1 [00:00<00:00, 768.61it/s] 


Unnamed: 0,inputs,ground_truth,predicted_answer,latency,token_count,toxicity/v1/score,flesch_kincaid_grade_level/v1/score,ari_grade_level/v1/score,answer_similarity/v1/score,answer_similarity/v1/justification,answer_relevance/v1/score,answer_relevance/v1/justification,answer_correctness/v1/score,answer_correctness/v1/justification
0,How does useEffect() work?,The useEffect() hook tells React that your com...,The useEffect() hook is used to handle side ef...,0,23,0.000165,7.2,7.4,4,The provided output aligns closely with the ta...,4,The output provided by the model is relevant t...,4,The output provided by the model is mostly cor...
1,What does the static keyword in a function mean?,"Static members belong to the class, rather tha...",Static members belong to a class and are share...,0,22,0.000167,4.8,5.7,4,The provided output aligns closely with the ta...,2,The output provided by the model does give som...,4,The output provided by the model is mostly cor...
2,What does the 'finally' block in Python do?,'Finally' defines a block of code to run when ...,The 'finally' block always runs after the try-...,0,28,0.000311,6.2,8.9,5,The provided output closely aligns with the ta...,5,The output directly addresses the input questi...,5,The output provided by the model is correct. I...
3,What is the difference between multiprocessing...,Multithreading refers to the ability of a proc...,Multithreading allows running multiple threads...,0,20,0.000145,21.3,25.7,4,The provided output aligns closely with the ta...,5,The output directly addresses the input questi...,4,The output provided by the model is mostly cor...
