In [0]:
pip install --upgrade "mlflow[databricks]>=3.1.0" openai "databricks-connect>=16.1"

In [0]:
import mlflow
# Select the MLflow experiment used by the rest of this notebook.
mlflow.set_experiment(experiment_name="/Users/benmackenzie3775@gmail.com/ex1")

In [0]:
import mlflow
from openai import OpenAI

# Enable automatic tracing for all OpenAI API calls
#mlflow.openai.autolog()

# Build an OpenAI-compatible client aimed at the workspace's serving endpoints,
# reusing the Databricks host credentials that MLflow resolves.
# (Your own OpenAI credentials could be supplied here instead.)
mlflow_creds = mlflow.utils.databricks_utils.get_databricks_host_creds()
client = OpenAI(
    base_url=f"{mlflow_creds.host}/serving-endpoints",
    api_key=mlflow_creds.token,
)

# Create a RAG app with tracing
@mlflow.trace
def my_chatbot(user_question: str) -> str:
    """Answer a question with the LLM, grounded in retrieved context.

    Args:
        user_question: The question text supplied by the user.

    Returns:
        The message content of the first choice in the chat completion.
    """
    # Retrieval happens first so its result can be embedded in the user turn.
    context = retrieve_context(user_question)

    system_message = {
        "role": "system",
        "content": "You are a helpful assistant. Use the provided context to answer questions.",
    }
    user_message = {
        "role": "user",
        "content": f"Context: {context}\n\nQuestion: {user_question}",
    }

    completion = client.chat.completions.create(
        # If using OpenAI directly, use "gpt-4o" or "gpt-3.5-turbo" instead
        model="databricks-llama-4-maverick",
        messages=[system_message, user_message],
        temperature=0.7,
        max_tokens=150,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

@mlflow.trace(span_type="RETRIEVER")
def retrieve_context(query: str) -> str:
    """Return canned context for a query (a stand-in for a vector-database search).

    Args:
        query: The user's question.

    Returns:
        A description of MLflow when "mlflow" occurs in the query
        (case-insensitive); otherwise a generic sentence.
    """
    mlflow_context = "MLflow is an open-source platform for managing the end-to-end machine learning lifecycle. It provides tools for experiment tracking, model packaging, and deployment."
    generic_context = "General information about machine learning and data science."

    mentions_mlflow = "mlflow" in query.lower()
    return mlflow_context if mentions_mlflow else generic_context

# Call the app once so a trace is generated, then capture that trace's ID;
# the feedback step later in the notebook needs it.
response = my_chatbot("What is MLflow?")
trace_id = mlflow.get_last_active_trace_id()

print(f"Response: {response}")
print(f"Trace ID: {trace_id}")

In [0]:
import mlflow
from mlflow.entities.assessment import AssessmentSource, AssessmentSourceType

# Attach a simulated thumbs-down from an end user to the trace captured earlier.
# In production this would be triggered when a user clicks thumbs down in your UI.
mlflow.log_feedback(
    trace_id=trace_id,
    name="user_feedback",
    source=AssessmentSource(
        source_id="enduser_123",  # Placeholder; would be the actual user ID in production
        source_type=AssessmentSourceType.HUMAN,
    ),
    value=False,  # False == thumbs down: the user is unsatisfied
    rationale="Missing details about MLflow's key features like Projects and Model Registry",
)

print("✅ End-user feedback recorded!")

# In a real app, you would:
# 1. Return the trace_id with your response to the frontend
# 2. When user clicks thumbs up/down, call your backend API
# 3. Your backend would then call mlflow.log_feedback() with the trace_id

In [0]:
import mlflow
from mlflow.genai.label_schemas import create_label_schema, InputCategorical, InputText, InputTextList
from mlflow.genai.labeling import create_labeling_session

# Label schema: a free-form list of text entries titled "What are the expected facts?".
expected_facts_schema = create_label_schema(
    name="expected_facts",
    title="What are the expected facts?",
    type="expectation",  # The Correctness judge requires the "expectation" type
    input=InputTextList(),
    overwrite=True,
)


# Labeling session that uses only the schema defined above.
labeling_session = create_labeling_session(
    label_schemas=[expected_facts_schema.name],
    name="quickstart_review",
)

# Pull a single trace from the current experiment (intended to be the most
# recent one) and add it to the labeling session.
traces = mlflow.search_traces(max_results=1)
labeling_session.add_traces(traces)

# Print the link reviewers should open.
print("✅ Trace sent for review!")
print(f"Share this link with reviewers: {labeling_session.url}")

In [0]:
# Labeling sessions are MLflow runs, so the session's run ID selects its traces.
labeled_traces = mlflow.search_traces(run_id=labeling_session.mlflow_run_id)

In [0]:
# Display the traces returned for the labeling session.
labeled_traces

In [0]:
import mlflow
import mlflow.genai.datasets
import time


# 1. Create an evaluation dataset

# Replace with a Unity Catalog schema where you have CREATE TABLE permission
uc_schema = "ml_demo.default"
# This table will be created in the above UC schema
evaluation_dataset_table_name = "databricks_agent_eval_v2"
# Fully qualified table name, built once and reused by every call below
uc_table_name = f"{uc_schema}.{evaluation_dataset_table_name}"

# Remove the dataset from a previous run before recreating it.
# NOTE(review): this runs unconditionally, so it presumably fails when the table
# does not exist yet (first run) - confirm and guard if needed.
mlflow.genai.delete_dataset(uc_table_name=uc_table_name)

eval_dataset = mlflow.genai.datasets.create_evaluation_dataset(
    uc_table_name=uc_table_name,
)
print(f"Created evaluation dataset: {uc_table_name}")


# Populate the dataset from the traces collected by the labeling session.
eval_dataset.merge_records(labeled_traces)
# Count the records that were actually merged (`labeled_traces`), not the
# unrelated `traces` frame from the labeling-session cell.
print(f"Added {len(labeled_traces)} records to evaluation dataset")

# Preview the dataset
df = eval_dataset.to_df()
print("\nDataset preview:")
print(f"Total records: {len(df)}")
if len(df) > 0:
    print("\nSample record:")
    sample = df.iloc[0]
    print(f"Inputs: {sample['inputs']}")
else:
    # Avoid an IndexError from df.iloc[0] when nothing was merged.
    print("\nDataset is empty - no sample record to show.")

In [0]:
# Re-open the evaluation dataset by its fully qualified Unity Catalog table name.
uc_schema = "ml_demo.default"
evaluation_dataset_table_name = "databricks_agent_eval_v2"

eval_dataset = mlflow.genai.get_dataset(
    uc_table_name=".".join([uc_schema, evaluation_dataset_table_name]),
)

In [0]:
import mlflow
from mlflow.genai.scorers import Correctness

# Run the chatbot over the evaluation dataset and score it with the Correctness judge.
eval_results = mlflow.genai.evaluate(
    predict_fn=my_chatbot,  # The traced app defined earlier in this notebook
    data=eval_dataset,
    scorers=[Correctness()],
)

### Observations

1. There is no traceability to the 'model' yet.
2. I don't see the Expectations in the experiment UI. **They are in the trace, not the table.**


In [0]:
%%writefile function_agent.py
import mlflow
from openai import OpenAI

# Enable automatic tracing for all OpenAI API calls
#mlflow.openai.autolog()

# Build an OpenAI-compatible client aimed at the workspace's serving endpoints,
# reusing the Databricks host credentials that MLflow resolves.
# (Your own OpenAI credentials could be supplied here instead.)
mlflow_creds = mlflow.utils.databricks_utils.get_databricks_host_creds()
client = OpenAI(
    base_url=f"{mlflow_creds.host}/serving-endpoints",
    api_key=mlflow_creds.token,
)

# Create a RAG app with tracing
@mlflow.trace
def my_chatbot(user_question: str) -> str:
    """Answer a question with the LLM, grounded in retrieved context.

    Args:
        user_question: The question text supplied by the user.

    Returns:
        The message content of the first choice in the chat completion.
    """
    # Retrieval happens first so its result can be embedded in the user turn.
    context = retrieve_context(user_question)

    system_message = {
        "role": "system",
        "content": "You are a helpful assistant. Use the provided context to answer questions.",
    }
    user_message = {
        "role": "user",
        "content": f"Context: {context}\n\nQuestion: {user_question}",
    }

    completion = client.chat.completions.create(
        # If using OpenAI directly, use "gpt-4o" or "gpt-3.5-turbo" instead
        model="databricks-llama-4-maverick",
        messages=[system_message, user_message],
        temperature=0.7,
        max_tokens=150,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

@mlflow.trace(span_type="RETRIEVER")
def retrieve_context(query: str) -> str:
    """Return canned context for a query (a stand-in for a vector-database search).

    Args:
        query: The user's question.

    Returns:
        A description of MLflow when "mlflow" occurs in the query
        (case-insensitive); otherwise a generic sentence.
    """
    mlflow_context = "MLflow is an open-source platform for managing the end-to-end machine learning lifecycle. It provides tools for experiment tracking, model packaging, and deployment."
    generic_context = "General information about machine learning and data science."

    mentions_mlflow = "mlflow" in query.lower()
    return mlflow_context if mentions_mlflow else generic_context

# Register my_chatbot as the model object for this file, so that logging the
# file path with log_model(python_model=...) picks it up.
mlflow.models.set_model(my_chatbot)

In [0]:
import mlflow
from mlflow.models.resources import DatabricksServingEndpoint

# The serving endpoint the chatbot calls, declared as a resource of the logged model.
resources = [DatabricksServingEndpoint(endpoint_name="databricks-llama-4-maverick")]

# A run name makes this model distinguishable from later ones in the UI.
with mlflow.start_run(run_name="chatbot_v1"):
    logged_agent_info = mlflow.pyfunc.log_model(
        name="agent",
        # Log the file this notebook writes with %%writefile (function_agent.py);
        # no cell creates a "chatbot_v1.py".
        python_model="function_agent.py",
        pip_requirements=[
            "mlflow",
            "openai",  # function_agent.py imports the OpenAI client
            "pydantic",
        ],
        resources=resources,
    )


## Observations

1. Without a run name, I would not be able to distinguish future models without looking at the artifacts.
2. When I run the eval using this model, MLflow will track it in the traces.

In [0]:
# Load the model through the URI recorded when it was logged instead of a
# hard-coded run ID, so this cell still works after the notebook is re-run.
loaded_model = mlflow.pyfunc.load_model(logged_agent_info.model_uri)


In [0]:
def eval_fn(user_question: str) -> str:
    """Forward a question to ``loaded_model`` and return its prediction.

    Args:
        user_question: The question text to pass to the loaded model.

    Returns:
        Whatever ``loaded_model.predict`` returns for the question.
    """
    prediction = loaded_model.predict(user_question)
    return prediction

In [0]:
import mlflow
from mlflow.genai.scorers import Correctness


# Score the loaded model (through eval_fn) on the evaluation dataset, passed as
# a DataFrame, with the Correctness judge.
eval_results = mlflow.genai.evaluate(
    predict_fn=eval_fn,
    data=eval_dataset.to_df(),
    scorers=[Correctness()],
)


## Trying to register the model with Unity Catalog requires a signature, which ruins everything else


In [0]:
# Register the model that was just logged, using its recorded URI instead of a
# hard-coded run ID so the cell refers to the current run after a re-run.
uc_registered_model_info = mlflow.register_model(
    model_uri=logged_agent_info.model_uri,
    name="ml_demo.default.agent_1"
)

In [0]:
from mlflow.models.signature import infer_signature
from mlflow.models.resources import DatabricksServingEndpoint
import pandas as pd

# Create example input and output dataframes
inputs = pd.DataFrame(["What is MLflow?"], columns=["user_question"])
outputs = pd.DataFrame(["MLflow is an open-source platform for managing the end-to-end machine learning lifecycle. It provides tools for experiment tracking, model packaging, and deployment"], columns=["answer"])

# Infer the model signature from the example frames above.
signature = infer_signature(inputs, outputs)

# The serving endpoint the chatbot calls, declared as a resource of the logged model.
resources = [DatabricksServingEndpoint(endpoint_name="databricks-llama-4-maverick")]

with mlflow.start_run(run_name="chatbot_v2"):
    logged_agent_info = mlflow.pyfunc.log_model(
        name="agent",
        # Log the file this notebook writes with %%writefile (function_agent.py);
        # no cell creates a "chatbot_v1.py".
        python_model="function_agent.py",
        pip_requirements=[
            "mlflow",
            "openai",  # function_agent.py imports the OpenAI client
            "pydantic",
        ],
        resources=resources,
        signature=signature
    )

In [0]:
# Register the most recently logged model under the Unity Catalog name agent_2.
uc_registered_model_info = mlflow.register_model(
    model_uri=logged_agent_info.model_uri,
    name="ml_demo.default.agent_2",
)

In [0]:
# Reload the most recently logged model from its recorded URI.
loaded_model = mlflow.pyfunc.load_model(model_uri=logged_agent_info.model_uri)


def eval_fn(user_question: str) -> str:
    """Forward a question to ``loaded_model`` and return its prediction.

    Args:
        user_question: The question text to pass to the loaded model.

    Returns:
        Whatever ``loaded_model.predict`` returns for the question.
    """
    prediction = loaded_model.predict(user_question)
    return prediction

# Score the reloaded model (through eval_fn) on the evaluation dataset, passed
# as a DataFrame, with the Correctness judge.
eval_results = mlflow.genai.evaluate(
    predict_fn=eval_fn,
    data=eval_dataset.to_df(),
    scorers=[Correctness()],
)


In [0]:
import mlflow.pyfunc

# A bare registered-model name is not a model URI; load_model needs the
# "models:/<name>/<version>" form. Build it from the ModelVersion returned by
# mlflow.register_model so it points at the version registered above.
loaded_model = mlflow.pyfunc.load_model(
    f"models:/{uc_registered_model_info.name}/{uc_registered_model_info.version}"
)