# Debugging

In [1]:
import datetime
import pandas as pd

from evidently import ColumnMapping
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset

from evidently.test_suite import TestSuite
from evidently.test_preset import DataDriftTestPreset

import joblib
from joblib import dump, load

import warnings

# Install a global "ignore" filter for Python warnings for the rest of the session.
# NOTE(review): a blanket filter also hides pandas/numpy warnings raised by later
# cells; consider narrowing it to the specific categories that are pure noise.
warnings.filterwarnings("ignore")



In [2]:
# Load the prepared fraud dataset from parquet using the fastparquet engine.
data = pd.read_parquet('../data/gold/df_fraud_final.parquet', engine='fastparquet')

# Reference window = first 1000 rows, current window = last 100 rows.
# .copy() detaches both windows from `data`, so adding columns to them later
# writes to an independent frame instead of to a positional slice of `data`.
ref_data = data.iloc[:1000].copy()
current_data = data.iloc[-100:].copy()
current_data

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrig,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,day_of_month,amount_range,diffbalanceOrig,diffbalanceDest
6362520,735,TRANSFER,4.171037e+05,C336307904,4.171037e+05,0.0,C1155915285,0.000000e+00,0.000,fraud,31,100.000-1.000.000,-4.171037e+05,0.000000e+00
6362521,735,CASH_OUT,4.171037e+05,C1450763584,4.171037e+05,0.0,C1377830519,3.423206e+04,451335.750,fraud,31,100.000-1.000.000,-4.171037e+05,4.171037e+05
6362522,735,TRANSFER,9.273571e+04,C1351323617,9.273571e+04,0.0,C413722554,0.000000e+00,0.000,fraud,31,10.000-100.000,-9.273571e+04,0.000000e+00
6362523,735,CASH_OUT,9.273571e+04,C786761311,9.273571e+04,0.0,C570188819,9.215833e+05,1014319.000,fraud,31,10.000-100.000,-9.273571e+04,9.273569e+04
6362524,735,TRANSFER,1.231463e+05,C1625883009,1.231463e+05,0.0,C918154390,0.000000e+00,0.000,fraud,31,100.000-1.000.000,-1.231463e+05,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6362615,743,CASH_OUT,3.396821e+05,C786484425,3.396821e+05,0.0,C776919290,0.000000e+00,339682.125,fraud,31,100.000-1.000.000,-3.396821e+05,3.396821e+05
6362616,743,TRANSFER,6.311410e+06,C1529008245,6.311410e+06,0.0,C1881841831,0.000000e+00,0.000,fraud,31,1.000.000-10.000.000,-6.311410e+06,0.000000e+00
6362617,743,CASH_OUT,6.311410e+06,C1162922333,6.311410e+06,0.0,C1365125890,6.848884e+04,6379898.000,fraud,31,1.000.000-10.000.000,-6.311410e+06,6.311409e+06
6362618,743,TRANSFER,8.500025e+05,C1685995037,8.500025e+05,0.0,C2080388513,0.000000e+00,0.000,fraud,31,100.000-1.000.000,-8.500025e+05,0.000000e+00


In [3]:
# Locations of the serialized classifier and preprocessing pipeline.
model_path = "../orchestration/models/random_forest_classifier.b"
prep_path = "../orchestration/models/preprocessor_pipeline.b"

# NOTE: joblib files are pickle-based, so only load artifacts from a trusted source.
with open(model_path, "rb") as model_file:
    model = joblib.load(model_file)

with open(prep_path, "rb") as prep_file:
    preprocessor = joblib.load(prep_file)

In [4]:
# Split the column names of `data` by dtype: 'int'/'float' columns become the
# numerical features, 'category' columns become the categorical features.
# NOTE(review): these selectors may skip narrower numeric dtypes (e.g. int8/int16);
# confirm against `data.dtypes` that no numeric column is left out.
numeric_columns = data.select_dtypes(include=['int', 'float']).columns
categorical_columns = data.select_dtypes(include=['category']).columns

num_features = numeric_columns.tolist()
cat_features = categorical_columns.tolist()

In [5]:
# Keep only the current-window rows whose amount lies strictly between 2 and 100000.
amount_in_range = (current_data.amount > 2) & (current_data.amount < 100000)

# .copy() makes the subset an independent frame, so a later column assignment
# does not go through a filtered selection of `current_data`.
problematic_data = current_data.loc[amount_in_range].copy()
problematic_data

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrig,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,day_of_month,amount_range,diffbalanceOrig,diffbalanceDest
6362522,735,TRANSFER,92735.710938,C1351323617,92735.710938,0.0,C413722554,0.0,0.0,fraud,31,10.000-100.000,-92735.710938,0.0
6362523,735,CASH_OUT,92735.710938,C786761311,92735.710938,0.0,C570188819,921583.3,1014319.0,fraud,31,10.000-100.000,-92735.710938,92735.6875
6362526,736,TRANSFER,53315.308594,C914459666,53315.308594,0.0,C73348302,0.0,0.0,fraud,31,10.000-100.000,-53315.308594,0.0
6362527,736,CASH_OUT,53315.308594,C810591075,53315.308594,0.0,C1614619549,0.0,53315.31,fraud,31,10.000-100.000,-53315.308594,53315.308594
6362566,739,TRANSFER,8116.529785,C2072430566,8116.529785,0.0,C765215468,0.0,0.0,fraud,31,1.000-10.000,-8116.529785,0.0
6362567,739,CASH_OUT,8116.529785,C564539602,8116.529785,0.0,C1935865739,7638.26,15754.79,fraud,31,1.000-10.000,-8116.529785,8116.530273
6362586,741,TRANSFER,62297.179688,C1565612524,62297.179688,0.0,C435734396,0.0,0.0,fraud,31,10.000-100.000,-62297.179688,0.0
6362587,741,CASH_OUT,62297.179688,C1597957921,62297.179688,0.0,C734451013,1060553.0,1122850.0,fraud,31,10.000-100.000,-62297.179688,62297.125
6362596,741,TRANSFER,48442.878906,C1112979339,48442.878906,0.0,C2114078084,0.0,0.0,fraud,31,10.000-100.000,-48442.878906,0.0
6362597,741,CASH_OUT,48442.878906,C1706094385,48442.878906,0.0,C2109905271,513746.2,562189.1,fraud,31,10.000-100.000,-48442.878906,48442.875


In [6]:
# Describe the column roles for evidently: no target column is declared, the
# model output lives in 'prediction', and the feature lists come from the
# dtype-based selection of `data`.
column_mapping = ColumnMapping(
    target=None,
    prediction='prediction',
    numerical_features=num_features,
    categorical_features=cat_features,
)


In [7]:
# Run the reference window through the loaded preprocessor, then score it.
ref_processed_data = preprocessor.transform(ref_data)

# assign() returns a new frame carrying the predictions, rather than inserting
# a column in place into a frame that was derived by slicing `data`.
ref_data = ref_data.assign(prediction=model.predict(ref_processed_data))

In [8]:
# Apply the loaded preprocessor to the filtered rows and display the transformed output.
processed_data = preprocessor.transform(problematic_data)
processed_data


array([[ 7.34000000e+02, -1.07775047e-01, -3.76015872e-01,
        -3.29644799e-01, -1.08441031e+00, -2.69484907e-01,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00],
       [ 7.34000000e+02, -1.07775047e-01, -3.76015872e-01,
         1.45109162e-01, -1.08441031e+00, -2.23761156e-01,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00],
       [ 7.35000000e+02, -2.62392581e-01, -3.95514816e-01,
        -3.29644799e-01, -7.08835721e-01, -2.69484907e-01,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00],
       [ 7.35000000e+02, -2.62392581e-01, -3.95514816e-01,
        -3.29644799e-01, -7.08835721e-01, -2.43197575e-01,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00],
       [ 7.38000000e+02, -4.39674526e-01, -4.17872041e-01,
        -3.29644799e-01, -2.78208047e-01, -2.69484907e-01,
  

In [9]:
# Score the transformed rows and attach the result as a 'prediction' column.
# assign() returns a new frame, rather than inserting a column in place into a
# frame that was derived by filtering `current_data`.
problematic_data = problematic_data.assign(prediction=model.predict(processed_data))
problematic_data

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrig,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,day_of_month,amount_range,diffbalanceOrig,diffbalanceDest,prediction
6362522,735,TRANSFER,92735.710938,C1351323617,92735.710938,0.0,C413722554,0.0,0.0,fraud,31,10.000-100.000,-92735.710938,0.0,0.0
6362523,735,CASH_OUT,92735.710938,C786761311,92735.710938,0.0,C570188819,921583.3,1014319.0,fraud,31,10.000-100.000,-92735.710938,92735.6875,0.0
6362526,736,TRANSFER,53315.308594,C914459666,53315.308594,0.0,C73348302,0.0,0.0,fraud,31,10.000-100.000,-53315.308594,0.0,0.0
6362527,736,CASH_OUT,53315.308594,C810591075,53315.308594,0.0,C1614619549,0.0,53315.31,fraud,31,10.000-100.000,-53315.308594,53315.308594,0.0
6362566,739,TRANSFER,8116.529785,C2072430566,8116.529785,0.0,C765215468,0.0,0.0,fraud,31,1.000-10.000,-8116.529785,0.0,0.0
6362567,739,CASH_OUT,8116.529785,C564539602,8116.529785,0.0,C1935865739,7638.26,15754.79,fraud,31,1.000-10.000,-8116.529785,8116.530273,0.0
6362586,741,TRANSFER,62297.179688,C1565612524,62297.179688,0.0,C435734396,0.0,0.0,fraud,31,10.000-100.000,-62297.179688,0.0,0.0
6362587,741,CASH_OUT,62297.179688,C1597957921,62297.179688,0.0,C734451013,1060553.0,1122850.0,fraud,31,10.000-100.000,-62297.179688,62297.125,0.0
6362596,741,TRANSFER,48442.878906,C1112979339,48442.878906,0.0,C2114078084,0.0,0.0,fraud,31,10.000-100.000,-48442.878906,0.0,0.0
6362597,741,CASH_OUT,48442.878906,C1706094385,48442.878906,0.0,C2109905271,513746.2,562189.1,fraud,31,10.000-100.000,-48442.878906,48442.875,0.0


In [10]:
# Count how often each distinct row (all columns combined) occurs.
# NOTE(review): with the per-transaction columns included, each row appears to be
# unique, so every count is 1. If the goal is the distribution of predictions,
# `problematic_data['prediction'].value_counts()` may be what was meant; confirm.
problematic_data.value_counts()

step  type      amount        nameOrig     oldbalanceOrig  newbalanceOrig  nameDest     oldbalanceDest  newbalanceDest  isFraud  day_of_month  amount_range    diffbalanceOrig  diffbalanceDest  prediction
735   CASH_OUT  92735.710938  C786761311   92735.710938    0.0             C570188819   9.215833e+05    1.014319e+06    fraud    31            10.000-100.000  -92735.710938    92735.687500     0.0           1
      TRANSFER  92735.710938  C1351323617  92735.710938    0.0             C413722554   0.000000e+00    0.000000e+00    fraud    31            10.000-100.000  -92735.710938    0.000000         0.0           1
736   CASH_OUT  53315.308594  C810591075   53315.308594    0.0             C1614619549  0.000000e+00    5.331531e+04    fraud    31            10.000-100.000  -53315.308594    53315.308594     0.0           1
      TRANSFER  53315.308594  C914459666   53315.308594    0.0             C73348302    0.000000e+00    0.000000e+00    fraud    31            10.000-100.000  -53315.308

In [11]:
# Confirm that both frames carry the model output column before running evidently.
for frame in (ref_data, problematic_data):
    print('prediction' in frame.columns)


True
True


In [12]:
# Build a test suite from the data-drift test preset and evaluate the filtered
# current rows against the reference window.
drift_tests = [DataDriftTestPreset()]
test_suite = TestSuite(tests=drift_tests)
test_suite.run(
    reference_data=ref_data,
    current_data=problematic_data,
    column_mapping=column_mapping,
)

  c /= stddev[:, None]
  c /= stddev[None, :]
  terms = (f_obs_float - f_exp)**2 / f_exp
  terms = (f_obs_float - f_exp)**2 / f_exp
  terms = (f_obs_float - f_exp)**2 / f_exp
  terms = (f_obs_float - f_exp)**2 / f_exp


In [13]:
# Render the test-suite results inside the notebook.
test_suite.show(mode='inline')

In [None]:
# Build a report from the data-drift metric preset and compute it for the same
# reference / current pair used by the test suite.
drift_metrics = [DataDriftPreset()]
report = Report(metrics=drift_metrics)
report.run(
    reference_data=ref_data,
    current_data=problematic_data,
    column_mapping=column_mapping,
)

In [None]:
# Render the drift report inside the notebook.
report.show(mode='inline')