# Arize Setup

In [1]:
# !pip install -qe '.'
!pip install -qe '.[ml-batch]'
# !pip install -q 'arize[spans]==8.0.0a2'
!pip freeze | grep arize

-e git+ssh://git@github.com/Arize-ai/arize.git@853d50dd2866f19d0600e4eb740c9e9aa28bd27f#egg=arize&subdirectory=sdk/python/loggerv8


In [2]:
# Identifiers reused by the log_batch / export_to_df calls in this notebook.
SPACE_ID = "U3BhY2U6NTA3MDpsTlIr"  # passed as `space_id`
MODEL_NAME = "test-sdkv8-batch-09-30-25-d"  # passed as `model_name`
MODEL_VERSION = "1.0"  # passed as `model_version`

In [3]:
import os

# Logging-related environment variables for the Arize SDK, set in one go.
# NOTE(review): presumably these must be set before the client is created
# for them to take effect — confirm against the SDK docs.
os.environ.update(
    {
        "ARIZE_LOG_ENABLE": "true",
        "ARIZE_LOG_LEVEL": "debug",
        "ARIZE_LOG_STRUCTURED": "false",
    }
)

# Get Data

In [5]:
import pandas as pd

# Common prefix of the two parquet fixtures; only the suffix differs between
# the training and the production split.
url = "https://storage.googleapis.com/arize-assets/fixtures/Embeddings/arize-demo-models-data/CV/Object-Detection/coco_detection_quality_drift"
# Download training first, then production.
train_df, prod_df = (
    pd.read_parquet(url + suffix)
    for suffix in ("_training.parquet", "_production.parquet")
)

In [6]:
from datetime import datetime

# Shift both datasets forward by the same offset so that the newest
# production record lands at the current time.
# NOTE(review): assumes `prediction_ts` holds epoch seconds, matching the
# unit returned by `datetime.timestamp()` — confirm against the fixture.

# Series.max() is vectorised and skips NaN; the builtin max() iterates in
# Python and gives order-dependent results if a NaN is present.
last_ts = prod_df["prediction_ts"].max()
now_ts = datetime.now().timestamp()
delta_ts = now_ts - last_ts

train_df["prediction_ts"] = (train_df["prediction_ts"] + delta_ts).astype(float)
prod_df["prediction_ts"] = (prod_df["prediction_ts"] + delta_ts).astype(float)

In [7]:
import uuid


def add_prediction_id(df):
    """Generate one random UUID4 string per row of ``df``.

    Despite its name, this function does not modify ``df``; the caller is
    expected to assign the returned list to a column.

    Args:
        df: Any object exposing ``shape``; ``shape[0]`` is used as the
            number of IDs to generate.

    Returns:
        A list of ``shape[0]`` UUID4 strings.
    """
    row_count = df.shape[0]
    prediction_ids = []
    for _ in range(row_count):
        prediction_ids.append(str(uuid.uuid4()))
    return prediction_ids

In [8]:
# Give every row of both datasets its own freshly generated prediction ID.
for frame in (train_df, prod_df):
    frame["prediction_id"] = add_prediction_id(frame)

# Batch ML Records

In [10]:
import logging

from arize import ArizeClient

# No credentials are passed explicitly: the API key is picked up from an
# environment variable.
client = ArizeClient()

# Show which handlers are attached to the SDK loggers.
for logger_name in ("arize", "arize.spans.client"):
    print(f"{logger_name} handlers:", logging.getLogger(logger_name).handlers)

print(client)

arize handlers: [<StreamHandler stdout (DEBUG)>]
arize.spans.client handlers: []
ArizeClient(
  sdk_config=SDKConfiguration(
    api_key='ak-ed5***',
    api_host='api.arize.com',
    api_scheme='https',
    flight_server_host='flight.arize.com',
    flight_server_port=443,
    flight_scheme='grpc+tls',
    request_verify=True,
    stream_max_workers=8,
    stream_max_queue_bound=5000,
  )
  subclients={
    'datasets': lazy,
    'experiments': lazy,
    'spans': lazy,
    'models': lazy,
  }
)


In [11]:
from arize.types import (
    EmbeddingColumnNames,
    Environments,
    ModelTypes,
    ObjectDetectionColumnNames,
    Schema,
)

# Columns logged as tags.
tags = ["drift_type"]

# Ground-truth boxes: coordinates and categories only (no scores column).
object_detection_actual_column_names = ObjectDetectionColumnNames(
    bounding_boxes_coordinates_column_name="actual_bboxes",
    categories_column_name="actual_categories",
)

# Predicted boxes: coordinates, categories and scores.
object_detection_prediction_column_names = ObjectDetectionColumnNames(
    bounding_boxes_coordinates_column_name="prediction_bboxes",
    categories_column_name="prediction_categories",
    scores_column_name="prediction_scores",
)

# Single embedding feature: the vector column plus the column linking back to
# the source data.
image_embedding_columns = EmbeddingColumnNames(
    vector_column_name="image_vector",
    link_to_data_column_name="url",
)
embedding_feature_column_names = {"image_embedding": image_embedding_columns}

# The Schema tells Arize which dataframe columns to read when logging.
schema = Schema(
    prediction_id_column_name="prediction_id",
    timestamp_column_name="prediction_ts",
    tag_column_names=tags,
    embedding_feature_column_names=embedding_feature_column_names,
    object_detection_prediction_column_names=object_detection_prediction_column_names,
    object_detection_actual_column_names=object_detection_actual_column_names,
)

## Log Training Data

In [12]:
# Upload the training dataframe to the TRAINING environment.
response = client.models.log_batch(
    dataframe=train_df,
    schema=schema,
    space_id=SPACE_ID,
    model_name=MODEL_NAME,
    model_version=MODEL_VERSION,
    model_type=ModelTypes.OBJECT_DETECTION,
    environment=Environments.TRAINING,
)

# A status_code of 200 means the server accepted the upload; anything else is
# reported together with the response body.
if response.status_code == 200:
    print("✅ You have successfully logged training set to Arize")
else:
    print(
        f"❌ logging failed with response code {response.status_code}, {response.text}"
    )

[38;5;39m  arize.models.client | DEBUG | Performing required validation.[0m
[38;5;39m  arize.models.client | DEBUG | Performing parameters validation.[0m
[38;5;39m  arize.models.client | DEBUG | Removing unnecessary columns.[0m
[38;5;39m  arize.models.client | DEBUG | Converting data to Arrow format[0m
[38;5;39m  arize.models.client | DEBUG | Performing types validation.[0m
[38;5;39m  arize.models.client | DEBUG | Performing values validation.[0m
[38;5;39m  arize.utils.arrow | DEBUG | Preparing to log Arrow table via file upload[0m
[38;5;39m  arize.utils.arrow | DEBUG | Preparing to log Arrow table via file upload | rows=7925 cols=10[0m
[38;5;39m  arize.utils.arrow | DEBUG | Serializing schema[0m
[38;5;39m  arize.utils.arrow | DEBUG | Writing table to temporary file: /var/folders/53/611kkp5s3ds5yjcy4_sl8yw80000gn/T/tmpi03q2icv/arize-uh9asz8q.arrow[0m
[38;5;39m  arize.utils.arrow | DEBUG | Uploading file to Arize | path='/var/folders/53/611kkp5s3ds5yjcy4_sl8yw80000g

## Log Production Data

In [13]:
from arize.types import Environments, ModelTypes

# Upload the production dataframe to the PRODUCTION environment.
response = client.models.log_batch(
    space_id=SPACE_ID,
    model_name=MODEL_NAME,
    model_type=ModelTypes.OBJECT_DETECTION,
    dataframe=prod_df,
    schema=schema,
    environment=Environments.PRODUCTION,
    model_version=MODEL_VERSION,
)

# If successful, the server will return a status_code of 200
if response.status_code != 200:
    print(
        f"❌ logging failed with response code {response.status_code}, {response.text}"
    )
else:
    # This cell uploads prod_df, so the message names the production set.
    print("✅ You have successfully logged production set to Arize")

[38;5;39m  arize.models.client | DEBUG | Performing required validation.[0m
[38;5;39m  arize.models.client | DEBUG | Performing parameters validation.[0m
[38;5;39m  arize.models.client | DEBUG | Removing unnecessary columns.[0m
[38;5;39m  arize.models.client | DEBUG | Converting data to Arrow format[0m
[38;5;39m  arize.models.client | DEBUG | Performing types validation.[0m
[38;5;39m  arize.models.client | DEBUG | Performing values validation.[0m
[38;5;39m  arize.utils.arrow | DEBUG | Preparing to log Arrow table via file upload[0m
[38;5;39m  arize.utils.arrow | DEBUG | Preparing to log Arrow table via file upload | rows=31702 cols=10[0m
[38;5;39m  arize.utils.arrow | DEBUG | Serializing schema[0m
[38;5;39m  arize.utils.arrow | DEBUG | Writing table to temporary file: /var/folders/53/611kkp5s3ds5yjcy4_sl8yw80000gn/T/tmpofy_1mwu/arize-thnfbmmr.arrow[0m
[38;5;39m  arize.utils.arrow | DEBUG | Uploading file to Arize | path='/var/folders/53/611kkp5s3ds5yjcy4_sl8yw80000

# Export Data

In [15]:
from datetime import datetime

# Export window, parsed from ISO-style dates into naive datetimes (no tzinfo).
FMT = "%Y-%m-%d"
start_time, end_time = (
    datetime.strptime(day, FMT) for day in ("2024-01-01", "2026-01-01")
)
start_time, end_time

(datetime.datetime(2024, 1, 1, 0, 0), datetime.datetime(2026, 1, 1, 0, 0))

In [16]:
# Export the TRAINING-environment records of this model version that fall
# inside the [start_time, end_time] window.
export_request = {
    "space_id": SPACE_ID,
    "model_name": MODEL_NAME,
    "model_version": MODEL_VERSION,
    "environment": Environments.TRAINING,
    "start_time": start_time,
    "end_time": end_time,
}
df = client.models.export_to_df(**export_request)

[38;5;39m  arize._exporter.client | DEBUG | Getting stream reader... | component='exporter' operation='export_to_df' space_id='U3BhY2U6NTA3MDpsTlIr' model_id='test-sdkv8-batch-09-30-25-d' environment='TRAINING' model_version='1.0' batch_id='' include_actuals=False where='' columns=None similarity_search_params=None stream_chunk_size=None start_time=datetime.datetime(2024, 1, 1, 0, 0) end_time=datetime.datetime(2026, 1, 1, 0, 0)[0m
[38;21m  arize._exporter.client | INFO | Fetching data...[0m
[38;5;39m  arize._exporter.client | DEBUG | Ticket: <pyarrow.flight.Ticket ticket=b'{"datasetUuid":"07f3a775-30dd-40fc-952a-c3fb2ba334bd", "datasourceType":"PREPRODUCTION", "filters":[{"dimension":"environment", "value":"training"}, {"dimension":"modelVersion", "value":"1.0"}], "startTime":"2024-01-01T08:00:00Z", "endTime":"2026-01-01T08:00:00Z"}'>[0m


  exporting 55475 rows: 100%|[38;2;0;128;0m██████████████[0m| 55475/55475 [00:02, 23465.64 row/s][0m


In [17]:
# Column labels, number of columns, number of rows.
df.columns, df.shape[1], df.shape[0]

(Index(['index', 'drift_type__tag', 'boxPredictionCoordinates',
        'image_embedding__linkToData', 'boxPredictionScores',
        'boxPredictionLabels', 'predictionID', 'boxActualCoordinates',
        'image_embedding__embVector', 'boxActualLabels', 'time'],
       dtype='object'),
 11,
 55475)

In [18]:
# Same export as before, this time with include_actuals=True.
# NOTE(review): presumably this adds the logged actuals to the result —
# confirm against the SDK docs. `df` from the previous export is overwritten.
export_with_actuals_request = {
    "space_id": SPACE_ID,
    "model_name": MODEL_NAME,
    "model_version": MODEL_VERSION,
    "environment": Environments.TRAINING,
    "start_time": start_time,
    "end_time": end_time,
    "include_actuals": True,
}
df = client.models.export_to_df(**export_with_actuals_request)

[38;5;39m  arize._exporter.client | DEBUG | Getting stream reader... | component='exporter' operation='export_to_df' space_id='U3BhY2U6NTA3MDpsTlIr' model_id='test-sdkv8-batch-09-30-25-d' environment='TRAINING' model_version='1.0' batch_id='' include_actuals=True where='' columns=None similarity_search_params=None stream_chunk_size=None start_time=datetime.datetime(2024, 1, 1, 0, 0) end_time=datetime.datetime(2026, 1, 1, 0, 0)[0m
[38;21m  arize._exporter.client | INFO | Fetching data...[0m
[38;5;39m  arize._exporter.client | DEBUG | Ticket: <pyarrow.flight.Ticket ticket=b'{"datasetUuid":"07f3a775-30dd-40fc-952a-c3fb2ba334bd", "datasourceType":"PREPRODUCTION", "filters":[{"dimension":"environment", "value":"training"}, {"dimension":"modelVersion", "value":"1.0"}], "startTime":"2024-01-01T08:00:00Z", "endTime":"2026-01-01T08:00:00Z"}'>[0m


  exporting 55475 rows: 100%|[38;2;0;128;0m██████████████[0m| 55475/55475 [00:02, 24390.45 row/s][0m


In [19]:
# Column labels, number of columns, number of rows.
df.columns, df.shape[1], df.shape[0]

(Index(['index', 'image_embedding__linkToData', 'drift_type__tag',
        'boxPredictionCoordinates', 'boxPredictionScores', 'boxActualLabels',
        'time', 'predictionID', 'image_embedding__embVector',
        'boxPredictionLabels', 'boxActualCoordinates'],
       dtype='object'),
 11,
 55475)