# Example 1-Hot Encoding Decomposition
This is a simple example of how to decompose 1-hot encoded features and/or SHAP values into their original multi-class state prior to sending data to Arize.

In this case, we have features, predictions, actuals, and their respective SHAP values all in a single dataframe. In the case where your data is not colocated, you can send each piece (prediction, actual, and SHAP values) separately as long as the `prediction_id` variable from a SHAP and/or Actual latent call matches a previously sent Prediction.

In [25]:
import pandas as pd

## Sample data set with features, predictions, actuals and SHAP values, all in a single dataframe
df = pd.read_csv('https://storage.googleapis.com/arize-assets/fixtures/example_shap_data.zip')

In [26]:
## Here is an example of data where some features are 1-hot encoded while others are not
df.head(2)

Unnamed: 0,term_one_h_0_shap,term_one_h_1_shap,term_one_h_2_shap,installment_shap,grade_shap,home_ownership_shap,annual_income_shap,verification_status_shap,pymnt_plan_shap,purpose_shap,...,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,fico_score,fico_range,prediction,actual
0,0.66,0.1,0.2,0.1,0.2,0.43,0.26,0.4,0.3,0.4,...,0.9,1.6,0.8,1.2,0.6,1.5,1.5,2.2,Default,Not Default
1,0.64,0.01,0.01,0.01,0.02,0.43,0.34,0.04,0.02,5.6,...,0.1,0.15,0.16,0.14,0.02,0.04,0.1,0.16,Not Default,Not Default


In [27]:
## The SHAP columns must use the same feature names as the original prediction inputs,
## so the 1-hot encoded features have to be "un-encoded" first.
## Here the term and addr_state features were 1-hot encoded: each key is the decomposed
## column name and each value lists every 1-hot encoded column that rolls up into it.
encoding_map = {
    "term_shap": [
        'term_one_h_0_shap',
        'term_one_h_1_shap',
        'term_one_h_2_shap',
        'term_one_h_3_shap',
    ],
    "addr_state_shap": [
        'addr_state_one_h_0_shap',
        'addr_state_one_h_1_shap',
        'addr_state_one_h_2_shap',
    ],
}

In [28]:
## This helper function decomposes the 1-hot encoded columns into their original names.
## We calculate the sum of the SHAP values for each original column from each 1-hot column
## Reference: https://github.com/slundberg/shap/issues/679#issuecomment-508575567

def map_shap(shap_df, one_h_map):
  """Collapse 1-hot encoded SHAP columns into one column per original feature.

  Args:
    shap_df: pandas DataFrame containing the 1-hot encoded SHAP columns.
    one_h_map: dict mapping each decomposed column name to the list of
      1-hot encoded column names whose values are summed into it.

  Returns:
    A new DataFrame in which, for every entry of `one_h_map`, the listed
    columns are replaced by a single column holding their row-wise sum.
    The input `shap_df` is left untouched.

  Raises:
    KeyError: raised by pandas if a listed column is not present.
  """
  # Work on a copy: assigning a new column to `shap_df` directly would leak the
  # first aggregated column into the caller's dataframe.
  decomposed_df = shap_df.copy()
  for key, value in one_h_map.items():
    decomposed_df[key] = decomposed_df[value].sum(axis=1)
    decomposed_df = decomposed_df.drop(columns=value)
  return decomposed_df

In [29]:
## Collapse the 1-hot encoded SHAP columns listed in encoding_map, then preview the result
shap_values = map_shap(df, encoding_map)
shap_values.head(2)

Unnamed: 0,installment_shap,grade_shap,home_ownership_shap,annual_income_shap,verification_status_shap,pymnt_plan_shap,purpose_shap,inq_last_6mths_shap,mths_since_last_delinq_shap,mths_since_last_record_shap,...,pub_rec,revol_bal,revol_util,total_acc,fico_score,fico_range,prediction,actual,term_shap,addr_state_shap
0,0.1,0.2,0.43,0.26,0.4,0.3,0.4,0.3,0.7,0.9,...,0.8,1.2,0.6,1.5,1.5,2.2,Default,Not Default,2.56,1.3
1,0.01,0.02,0.43,0.34,0.04,0.02,5.6,0.13,0.01,0.1,...,0.16,0.14,0.02,0.04,0.1,0.16,Not Default,Not Default,0.88,0.16


In [30]:
## Feature names for your model (term and addr_state use their decomposed names)
feature_names = [
    'installment',
    'grade',
    'home_ownership',
    'annual_income',
    'verification_status',
    'pymnt_plan',
    'purpose',
    'inq_last_6mths',
    'mths_since_last_delinq',
    'mths_since_last_record',
    'open_acc',
    'pub_rec',
    'revol_bal',
    'revol_util',
    'total_acc',
    'fico_score',
    'fico_range',
    'term',
    'addr_state',
]

In [31]:
## Helper function to derive the SHAP column name for each feature
def get_shap_column_names(feature_names):
  """Return a list with "_shap" appended to every name in `feature_names`, in the same order."""
  return [f"{feature}_shap" for feature in feature_names]

## SHAP column names, in the same order as feature_names
shap_column_names = get_shap_column_names(feature_names)

In [34]:
## Now send decomposed Shap values into Arize
!pip install -q arize
from arize.utils.types import ModelTypes, Environments
from arize.pandas.logger import Client, Schema

## Placeholder credentials: replace with your own organization key and API key before running
ORGANIZATION_KEY = "ORGANIZATION_KEY"
API_KEY = "API_KEY"
arize_client = Client(organization_key=ORGANIZATION_KEY, api_key=API_KEY)

In [35]:
# Pair every feature name with the column that holds its SHAP values.
shap_columns_by_feature = dict(zip(feature_names, shap_column_names))

# Tell the logger which dataframe columns carry ids, labels, features and SHAP values.
log_schema = Schema(
    prediction_id_column_name="ids",
    prediction_label_column_name="prediction",
    actual_label_column_name="actual",
    feature_column_names=feature_names,
    shap_values_column_names=shap_columns_by_feature,
)

response = arize_client.log(
    dataframe=shap_values,
    model_id="Example-SHAP-Decomposition",
    model_version="1.0",
    model_type=ModelTypes.CATEGORICAL,
    environment=Environments.PRODUCTION,
    schema=log_schema,
)

# Report the outcome; only a 200 status code is treated as success.
if response.status_code == 200:
    print(f"logging completed with response code {response.status_code}")
else:
    print(f"logging failed with response code {response.status_code}, {response.text}")

logging completed with response code 200
