## Send 1 million inferences with 200 features to Arize in chunks of 100K records at a time

This notebook includes sample code showing how to split a large dataframe into smaller chunks before sending it to Arize.

In [None]:
!pip -q install arize

import datetime
import random
import uuid
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
from arize.pandas.logger import Client
from arize.utils.types import Environments, ModelTypes, Schema

import arize

# Confirm the SDK imported correctly and report which version is in use.
_sdk_version = arize.__version__
print(f"Step 1 ✅: Install Arize, you are using sdk version: {_sdk_version}")

### Set up Arize Client with your Space ID and API key
You can find your `API_KEY` and `SPACE_ID` by navigating to the space settings page as shown below (only space admins can see the keys). 


<img src="https://storage.cloud.google.com/arize-assets/fixtures/copy-id-and-key.png" width="700">

In [None]:
# Replace both placeholders with the values from your space settings page.
SPACE_ID = "SPACE_ID"
API_KEY = "API_KEY"
arize_client = Client(space_id=SPACE_ID, api_key=API_KEY)

# Guard clause: stop right here if either placeholder was left unchanged.
if SPACE_ID == "SPACE_ID" or API_KEY == "API_KEY":
    raise ValueError("❌ CHANGE SPACE_ID AND/OR API_KEY")

print(
    "Step 2 ✅: Import and Setup Arize Client Done! Now we can start using Arize!"
)

### Sample functions to emulate a large dataset

In [None]:
def simulate_production_timestamps(X, days=30):
    """Return evenly spaced epoch timestamps covering the last ``days`` days.

    The result is a ``pd.Series`` of float POSIX timestamps (seconds), one per
    row of ``X`` and sharing ``X.index``. Values run linearly from
    ``now - days`` (first row) up to ``now`` (last row), where ``now`` is
    ``datetime.now()`` taken once at call time.
    """
    now = datetime.now()
    window_end = now.timestamp()
    window_start = (now - timedelta(days=days)).timestamp()

    # One timestamp per row, linearly interpolated across the window.
    evenly_spaced = np.linspace(window_start, window_end, num=len(X))
    return pd.Series(evenly_spaced, index=X.index)


def get_feature_columns(num_cols):
    """Return the synthetic feature column names.

    Args:
        num_cols: Number of names to generate.

    Returns:
        A list ``["feat_0", "feat_1", ..., f"feat_{num_cols - 1}"]``; empty
        when ``num_cols`` is 0.
    """
    return [f"feat_{i}" for i in range(num_cols)]


def get_shap_columns(num_cols):
    """Return the SHAP-value column names matching the synthetic features.

    Each name is the feature name with a ``_shap`` suffix, which is the
    naming this notebook uses for its SHAP columns. Previously this function
    returned the bare feature names, making it indistinguishable from
    ``get_feature_columns``.

    Args:
        num_cols: Number of names to generate.

    Returns:
        A list ``["feat_0_shap", ..., f"feat_{num_cols - 1}_shap"]``; empty
        when ``num_cols`` is 0.
    """
    return [f"feat_{i}_shap" for i in range(num_cols)]


def generate_prediction_ids(X):
    """Return a Series of fresh random UUID4 strings, one per row of ``X``.

    The returned ``pd.Series`` shares ``X.index`` so it can be assigned back
    onto ``X`` as a column.
    """
    row_count = len(X)
    fresh_ids = [str(uuid.uuid4()) for _ in range(row_count)]
    return pd.Series(fresh_ids, index=X.index)

### Generate random data for a DataFrame, which we will chunk in a later step

In [None]:
LABELS = ["Item 1", "Item 2", "Item 3"]

NUM_RECORDS = 1_000_000
NUM_FEATS = 200

# Feature columns are named feat_0 ... feat_{NUM_FEATS - 1}.
feat_names = get_feature_columns(NUM_FEATS)

features = pd.DataFrame(
    np.random.random(size=(NUM_RECORDS, NUM_FEATS)),
    columns=feat_names,
)

# Maps each feature name to the name of the column holding its SHAP value.
shap_values_column_names_mapping = {
    feat: f"{feat}_shap" for feat in feat_names
}

# Dicts preserve insertion order, so these line up with `feat_names`.
shap_columns = list(shap_values_column_names_mapping.values())

# The frame is created with the "*_shap" names directly, so no rename is
# needed afterwards (the previous `rename` call discarded its result and
# matched no columns anyway).
shap_values = pd.DataFrame(
    np.random.random(size=(NUM_RECORDS, NUM_FEATS)),
    columns=shap_columns,
)

# Random categorical prediction plus a random score in [0, 1).
pLabels = random.choices(LABELS, k=NUM_RECORDS)
pred_labels = pd.DataFrame(
    {
        "prediction_label": pLabels,
        "prediction_score": np.random.random(size=NUM_RECORDS),
    }
)

# Random ground truth, generated independently of the predictions.
aLabels = random.choices(LABELS, k=NUM_RECORDS)
actual_labels = pd.DataFrame(
    {
        "actual_label": aLabels,
        "actual_score": np.random.random(size=NUM_RECORDS),
    }
)

# One unique UUID string per record, aligned to the features' index.
ids = generate_prediction_ids(features).to_frame(name="prediction_id")

# All frames share the same default RangeIndex, so a column-wise concat
# lines the rows up one-to-one.
inferences = pd.concat(
    [features, pred_labels, ids, actual_labels, shap_values], axis=1
)

# Spread the records evenly over the last 364 days.
inferences["prediction_ts"] = simulate_production_timestamps(inferences, 364)

### Send data to Arize

In [None]:
# Schema telling Arize which column of the dataframe plays which role.
production_schema = Schema(
    prediction_id_column_name="prediction_id",
    timestamp_column_name="prediction_ts",
    prediction_label_column_name="prediction_label",
    prediction_score_column_name="prediction_score",
    actual_label_column_name="actual_label",
    actual_score_column_name="actual_score",
    # List of feature column names (feat_0 ... feat_{NUM_FEATS - 1}).
    feature_column_names=feat_names,
    # Dict mapping each feature name to its "<feature>_shap" column name.
    shap_values_column_names=shap_values_column_names_mapping,
)

### We will chunk the dataframe into 100K records at a time
import time  # only needed for the retry backoff in this cell

start = 0
stop = 100_000
step = stop

# Give up on a chunk after this many consecutive failures rather than
# retrying forever (e.g. bad credentials or an invalid schema never recover).
max_attempts = 5
attempt = 0

model_id = "model_to_split"
model_version = "1.0"
model_type = ModelTypes.SCORE_CATEGORICAL

total_records = len(inferences)

# Loop on `start` rather than `stop` so that a final, smaller chunk is still
# sent when the number of records is not an exact multiple of `step`.
while start < total_records:
    stop = min(start + step, total_records)
    failure = None
    try:
        response = arize_client.log(
            dataframe=inferences.iloc[start:stop],
            schema=production_schema,
            model_id=model_id,
            model_version=model_version,
            model_type=model_type,
            environment=Environments.PRODUCTION,
        )
    except Exception as err:
        failure = (
            f"An exception occurred when logging index {start} to {stop}\n"
            f" Exception: {err}"
        )
    else:
        # If successful, the server will return a status_code of 200
        if response.status_code != 200:
            failure = (
                f"❌ Logging failed with response code "
                f"{response.status_code}, {response.text}"
            )

    if failure is None:
        print(
            f"✅ You have successfully logged records from index {start} to {stop} to Arize!"
        )
        ## If we got a 200 ACK, we can move on to the next chunk
        start = stop
        attempt = 0
        continue

    ## The chunk was not acknowledged, so `start` is left unchanged and the
    ## same chunk is sent again, up to `max_attempts` times.
    attempt += 1
    if attempt >= max_attempts:
        raise RuntimeError(
            f"{failure}; giving up after {max_attempts} attempts"
        )
    # Exponential backoff (capped) so a struggling server is not hammered.
    delay_seconds = min(2**attempt, 30)
    print(f"{failure}, will try again in {delay_seconds}s")
    time.sleep(delay_seconds)

### Check Data Ingestion Information

Data will be available in the UI about 10 minutes after it is received. If you send data for a new model, the model itself will appear in the Arize platform almost immediately, but its data will not be visible yet. To verify that the data was sent correctly and is being processed, we recommend checking the Data Ingestion tab.

You will be able to see the predictions, actuals, and feature importances that have been sent in the last week, last day or last 30 minutes.

An example view of the Data Ingestion tab from a model, when data is sent continuously over 30 minutes, is shown in the image below.

<img src="https://storage.cloud.google.com/arize-assets/fixtures/data-ingestion-tab.png" width="700">



### Overview
Arize is an end-to-end ML observability and model monitoring platform. The platform is designed to help ML engineers and data science practitioners surface and fix issues with ML models in production faster with:
- Automated ML monitoring and model monitoring
- Workflows to troubleshoot model performance
- Real-time visualizations for model performance monitoring, data quality monitoring, and drift monitoring
- Model prediction cohort analysis
- Pre-deployment model validation
- Integrated model explainability

### Website
Visit Us At: https://arize.com/model-monitoring/

### Additional Resources
- [What is ML observability?](https://arize.com/what-is-ml-observability/)
- [Playbook to model monitoring in production](https://arize.com/the-playbook-to-monitor-your-models-performance-in-production/)
- [Using statistical distance metrics for ML monitoring and observability](https://arize.com/using-statistical-distance-metrics-for-machine-learning-observability/)
- [ML infrastructure tools for data preparation](https://arize.com/ml-infrastructure-tools-for-data-preparation/)
- [ML infrastructure tools for model building](https://arize.com/ml-infrastructure-tools-for-model-building/)
- [ML infrastructure tools for production](https://arize.com/ml-infrastructure-tools-for-production-part-1/)
- [ML infrastructure tools for model deployment and model serving](https://arize.com/ml-infrastructure-tools-for-production-part-2-model-deployment-and-serving/)
- [ML infrastructure tools for ML monitoring and observability](https://arize.com/ml-infrastructure-tools-ml-observability/)

Visit the [Arize Blog](https://arize.com/blog) and [Resource Center](https://arize.com/resource-hub/) for more resources on ML observability and model monitoring.
