## Send 1 million inferences with 200 features to Arize in chunks of 100K records at a time

This notebook includes sample code showing how to split a large DataFrame into smaller chunks before sending it to Arize.

In [1]:
# Install the Arize SDK in quiet mode before importing it below.
!pip -q install arize

# NOTE: `import datetime` binds the module, but the later
# `from datetime import datetime, timedelta` rebinds the name `datetime` to the
# class, so only the class (and `timedelta`) is reachable after this cell.
import datetime
import random
import time
import uuid
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
from arize.pandas.logger import Client, Schema
from arize.utils.types import Environments, ModelTypes

import arize

# Report the installed SDK version so the run is traceable to a release.
print(f"Step 1 ✅: Install Arize, you are using sdk version: {arize.__version__}")

[K     |████████████████████████████████| 25.5 MB 51.8 MB/s 
[?25hStep 1 ✅: Install Arize, you are using sdk version: 3.1.3


### Set up Arize Client with your API and Space Keys

In [10]:
# Replace both placeholders with the keys of your own Arize space.
SPACE_KEY = "SPACE_KEY"
API_KEY = "API_KEY"

# Fail fast on untouched placeholders, before a client is built with
# credentials that cannot work.
if SPACE_KEY == "SPACE_KEY" or API_KEY == "API_KEY":
    raise ValueError("❌ NEED TO CHANGE SPACE AND/OR API_KEY")

# No `uri` argument is passed: the name `uri` is never defined in this
# notebook, so forwarding it raised NameError on a fresh kernel. Without it
# the client uses its default endpoint.
arize_client = Client(space_key=SPACE_KEY, api_key=API_KEY)
print("Step 2 ✅: Import and Setup Arize Client Done! Now we can start using Arize!")

Step 2 ✅: Import and Setup Arize Client Done! Now we can start using Arize!


### Sample functions to emulate a large dataset

In [3]:
def simulate_production_timestamps(X, days=30):
    """Build evenly spaced epoch timestamps covering the last ``days`` days.

    Args:
        X: Object with a length and an ``index`` attribute (e.g. a DataFrame);
            one timestamp is produced per row.
        days: Width of the simulated window, counted back from now.

    Returns:
        A ``pd.Series`` of POSIX timestamps (floats, in seconds) that runs
        from ``now - days`` to ``now`` and shares ``X``'s index.
    """
    now = datetime.now()
    window_start = now - timedelta(days=days)
    stamps = np.linspace(window_start.timestamp(), now.timestamp(), num=len(X))
    return pd.Series(stamps, index=X.index)


def get_feature_columns(num_cols):
    """Return the feature column names ``feat_0`` .. ``feat_{num_cols - 1}``.

    Args:
        num_cols: Number of names to generate.

    Returns:
        A list of ``num_cols`` strings, in ascending index order.
    """
    return [f"feat_{idx}" for idx in range(num_cols)]


def get_shap_columns(num_cols):
    """Return the SHAP column names ``feat_0_shap`` .. ``feat_{num_cols - 1}_shap``.

    The names follow the ``<feature>_shap`` convention used for the SHAP
    mapping in this notebook. The previous body was a copy of
    ``get_feature_columns`` and returned the plain feature names, which would
    collide with the feature columns if both were placed in one DataFrame.

    Args:
        num_cols: Number of names to generate.

    Returns:
        A list of ``num_cols`` strings, in ascending index order.
    """
    return [f"feat_{i}_shap" for i in range(num_cols)]


def generate_prediction_ids(X):
    """Create one random UUID4 string per row of ``X``.

    Args:
        X: Object with a length and an ``index`` attribute (e.g. a DataFrame).

    Returns:
        A ``pd.Series`` of UUID strings that shares ``X``'s index.
    """
    fresh_ids = [str(uuid.uuid4()) for _ in range(len(X))]
    return pd.Series(fresh_ids, index=X.index)

### Generate random data for a DataFrame, which we will chunk in a later step

In [4]:
# Build a synthetic inference table: random features, SHAP values, predicted
# and actual labels/scores, unique prediction ids and simulated timestamps.
LABELS = ["Item 1", "Item 2", "Item 3"]

NUM_RECORDS = 1_000_000
NUM_FEATS = 200

feat_names = get_feature_columns(NUM_FEATS)

features = pd.DataFrame(
    np.random.random(size=(NUM_RECORDS, NUM_FEATS)),
    columns=feat_names,
)

# Maps every feature column to the column that stores its SHAP value.
shap_values_column_names_mapping = {feat: f"{feat}_shap" for feat in feat_names}
shap_columns = list(shap_values_column_names_mapping.values())

# The frame is created with the "_shap" names directly. The former
# `shap_values.rename(...)` call discarded its result and had no effect, so it
# has been removed.
shap_values = pd.DataFrame(
    np.random.random(size=(NUM_RECORDS, NUM_FEATS)),
    columns=shap_columns,
)

# Build each label/score frame in one step instead of filling the label
# column with random floats and overwriting it afterwards.
pLabels = random.choices(LABELS, k=NUM_RECORDS)
pred_labels = pd.DataFrame(
    {
        "prediction_label": pLabels,
        "prediction_score": np.random.random(NUM_RECORDS),
    }
)

aLabels = random.choices(LABELS, k=NUM_RECORDS)
actual_labels = pd.DataFrame(
    {
        "actual_label": aLabels,
        "actual_score": np.random.random(NUM_RECORDS),
    }
)

# Reuse the notebook's helper rather than re-implementing UUID generation.
ids = generate_prediction_ids(features).to_frame(name="prediction_id")

inferences = pd.concat([features, pred_labels, ids, actual_labels, shap_values], axis=1)

inferences["prediction_ts"] = simulate_production_timestamps(inferences, 364)

# Cheap invariants that catch index misalignment or column-name collisions.
assert len(inferences) == NUM_RECORDS
assert inferences.columns.is_unique

### Send data to Arize

In [11]:
# Describe which columns of `inferences` play which role when logging to Arize.
# Every name below refers to a column created in the data-generation cell.
production_schema = Schema(
    prediction_id_column_name="prediction_id",
    timestamp_column_name="prediction_ts",
    prediction_label_column_name="prediction_label",
    prediction_score_column_name="prediction_score",
    actual_label_column_name="actual_label",
    actual_score_column_name="actual_score",
    feature_column_names=feat_names,
    # Dict of feature column name -> name of the column holding its SHAP value.
    shap_values_column_names=shap_values_column_names_mapping,
)

### We will chunk the dataframe into 100K records at a time
start = 0
stop = 100_000
step = stop

model_id = "model_to_split"
model_version = "1.0"
model_type = ModelTypes.SCORE_CATEGORICAL

# Bound the retries so a persistent failure (bad key, schema error, outage)
# stops the notebook instead of looping forever.
MAX_ATTEMPTS_PER_CHUNK = 5
RETRY_BACKOFF_SECONDS = 2

total_records = len(inferences)
attempts = 0

# Loop on `start`, not `stop`, so a final chunk smaller than `step` is still
# sent when the row count is not an exact multiple of the chunk size.
while start < total_records:
    chunk_end = min(stop, total_records)
    attempts += 1
    succeeded = False
    try:
        response = arize_client.log(
            dataframe=inferences.iloc[start:chunk_end],
            schema=production_schema,
            model_id=model_id,
            model_version=model_version,
            model_type=model_type,
            environment=Environments.PRODUCTION,
        )
        # If successful, the server will return a status_code of 200
        if response.status_code != 200:
            ## In the case a 200 was not received, you'll want to try the chunk again, so we dont increment the start/stop variables
            print(
                f"❌ Logging failed with response code {response.status_code}, {response.text}, will try again"
            )
        else:
            print(
                f"✅ You have successfully logged records from index {start} to {chunk_end} to Arize!"
            )
            succeeded = True
    except Exception as err:
        print(
            f"An exception occurred when logging index {start} to {chunk_end}, trying again\n Exception: {err}"
        )

    if succeeded:
        ## If we got a 200 ACK, we can move on to the next chunk
        start = start + step
        stop = stop + step
        attempts = 0
        continue

    if attempts >= MAX_ATTEMPTS_PER_CHUNK:
        raise RuntimeError(
            f"Giving up on records {start} to {chunk_end} after {attempts} failed attempts"
        )
    # Linear backoff before retrying the same chunk.
    time.sleep(RETRY_BACKOFF_SECONDS * attempts)

✅ You have successfully logged records from index 0 to 100000 to Arize!
✅ You have successfully logged records from index 100000 to 200000 to Arize!
✅ You have successfully logged records from index 200000 to 300000 to Arize!
