# Data drift dashboard in a Jupyter notebook

## Imports

In [23]:
import io
import numpy as np
import os
import pandas as pd
from pathlib import Path
import requests
import zipfile

from datetime import datetime

from evidently.pipeline.column_mapping import ColumnMapping
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset
from evidently.metric_preset import TargetDriftPreset
from evidently.metric_preset import DataQualityPreset
from evidently.metric_preset.regression_performance import RegressionPreset 
from evidently.metric_preset import ClassificationPreset
from evidently.metrics.custom_metric import CustomValueMetric
from evidently.renderers.html_widgets import WidgetSize
from evidently.metrics import (
    RegressionQualityMetric,
    RegressionPredictedVsActualScatter,
    RegressionPredictedVsActualPlot,
    RegressionErrorPlot,
    RegressionAbsPercentageErrorPlot,
    RegressionErrorDistribution,
    RegressionErrorNormality,
    RegressionTopErrorMetric,
    RegressionErrorBiasTable,
    DatasetSummaryMetric,
    ColumnSummaryMetric,
    DatasetMissingValuesMetric,
    DatasetCorrelationsMetric    
)
from evidently.metrics import ClassificationQualityMetric, ClassificationClassBalance, ClassificationConfusionMatrix, ClassificationProbDistribution, ClassificationPRCurve, ClassificationPRTable

from xgboost import XGBClassifier
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn import datasets, ensemble

In [24]:
# Customize color scheme for evidently report
from evidently.options import ColorOptions

# Shared palette passed to every Report below via `options=[color_scheme]`
color_scheme = ColorOptions(
    # dataset-specific colours
    current_data_color="#cb5ee6",
    reference_data_color="#c0b7c2",
    # general plot colours
    primary_color="#bc4fd8",
    fill_color="#cb5ee6",
    zero_line_color="#016795",
)

## Credit Score Data

### Load data

In [25]:
# Read the scoring sheet from the workbook into a DataFrame
raw_data = pd.read_excel(
    io="data/DB.xlsx",
    sheet_name="credit_scoring_dataset",
)

In [26]:
# Give the long label column a short alias
raw_data = raw_data.rename(
    {'default_payment_next_month': 'default'},
    axis='columns',
)

In [27]:
# Preview the first rows of the loaded sheet
raw_data.head()

Unnamed: 0,MODEL_CODE,APP_DATE,ID,target_prediction,default_payment_next_month_GRADE,target_gt,AGE,BILL_AMT1,BILL_AMT2,BILL_AMT3,...,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,SEX,Unnamed_ 0,default,target_gt.1
0,M0001,2022-01-01,17987,0.585299,7,0,38,6333,3360,1291,...,0,10,2740,2241,0,2731,1,17986,0,0
1,M0001,2022-01-01,9761,0.028572,2,0,29,233717,87171,87786,...,10004,3066,3171,3501,3030,2789,2,9760,0,0
2,M0001,2022-01-01,29613,0.382736,6,1,47,390,780,390,...,780,0,0,780,0,0,1,29612,1,1
3,M0001,2022-01-01,19638,0.853776,8,1,49,48297,50487,51400,...,3000,2400,0,0,0,0,2,19637,1,1
4,M0001,2022-01-01,22373,0.015585,1,0,27,101489,78738,78639,...,10006,8000,1610,10000,1333,8818,2,22372,0,0


## Classification Model

### Config

In [28]:
# Reference window: rows whose APP_DATE falls in this range form the baseline.
REF_MONTH_START = '2022-01-01'
REF_MONTH_END = '2022-02-28'

# Current window: rows whose APP_DATE falls in this range are compared to the baseline.
CUR_MONTH_START = '2024-02-01'
CUR_MONTH_END = '2024-02-29'

# Column roles used by the column mappings and the AUC/Gini calculations.
target = 'target_gt'
prediction = 'target_prediction'
# NOTE(review): the frame preview shows a column named 'Unnamed_ 0' (with a
# space), not 'Unnamed_0' - confirm the intended name.
# NOTE(review): 'default' equals target_gt in every previewed row; it looks
# like a copy of the label rather than a feature - confirm it belongs here.
numerical_features = [
    'AGE', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 
    'BILL_AMT5', 'BILL_AMT6', 'LIMIT_BAL', 'PAY_0', 
    'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 
    'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 
    'PAY_AMT6', 'Unnamed_0', 'default'
]
categorical_features = ['EDUCATION', 'MARRIAGE', 'SEX']

# One output folder per monitored window. parents=True creates the top-level
# `reports/` folder too, so the cell also works on a fresh checkout.
reports_dir = Path('reports') / f'{CUR_MONTH_START}_{CUR_MONTH_END}'
reports_dir.mkdir(parents=True, exist_ok=True)

In [29]:
# Slice the raw data into the two windows; Series.between keeps both endpoints
reference = raw_data.loc[raw_data['APP_DATE'].between(REF_MONTH_START, REF_MONTH_END)]
current = raw_data.loc[raw_data['APP_DATE'].between(CUR_MONTH_START, CUR_MONTH_END)]


In [30]:
# Display the reference-window slice
reference

Unnamed: 0,MODEL_CODE,APP_DATE,ID,target_prediction,default_payment_next_month_GRADE,target_gt,AGE,BILL_AMT1,BILL_AMT2,BILL_AMT3,...,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,SEX,Unnamed_ 0,default,target_gt.1
0,M0001,2022-01-01,17987,0.585299,7,0,38,6333,3360,1291,...,0,10,2740,2241,0,2731,1,17986,0,0
1,M0001,2022-01-01,9761,0.028572,2,0,29,233717,87171,87786,...,10004,3066,3171,3501,3030,2789,2,9760,0,0
2,M0001,2022-01-01,29613,0.382736,6,1,47,390,780,390,...,780,0,0,780,0,0,1,29612,1,1
3,M0001,2022-01-01,19638,0.853776,8,1,49,48297,50487,51400,...,3000,2400,0,0,0,0,2,19637,1,1
4,M0001,2022-01-01,22373,0.015585,1,0,27,101489,78738,78639,...,10006,8000,1610,10000,1333,8818,2,22372,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,M0001,2022-02-28,6025,0.189376,5,0,29,1767,-3355,-3987,...,0,0,2000,4000,3000,5000,2,6024,0,0
3996,M0001,2022-02-28,9290,0.579722,6,1,30,105348,106432,102996,...,4268,3251,2900,5000,0,105300,1,9289,1,1
3997,M0001,2022-02-28,3330,0.173870,5,0,35,4050,4914,6630,...,1073,1789,0,3425,1000,3425,2,3329,0,0
3998,M0001,2022-02-28,28772,0.015577,1,0,38,2410,7830,4024,...,7954,4044,6715,1956,2076,2003,2,28771,0,0


# Model Monitoring

In [31]:
# Tell evidently which columns hold the label, the model score and the features
column_mapping = ColumnMapping(
    target='target_gt',
    prediction='target_prediction',
    numerical_features=numerical_features,
    categorical_features=categorical_features,
)

In [32]:
# Preview the first rows of the reference window
reference.head()

Unnamed: 0,MODEL_CODE,APP_DATE,ID,target_prediction,default_payment_next_month_GRADE,target_gt,AGE,BILL_AMT1,BILL_AMT2,BILL_AMT3,...,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,SEX,Unnamed_ 0,default,target_gt.1
0,M0001,2022-01-01,17987,0.585299,7,0,38,6333,3360,1291,...,0,10,2740,2241,0,2731,1,17986,0,0
1,M0001,2022-01-01,9761,0.028572,2,0,29,233717,87171,87786,...,10004,3066,3171,3501,3030,2789,2,9760,0,0
2,M0001,2022-01-01,29613,0.382736,6,1,47,390,780,390,...,780,0,0,780,0,0,1,29612,1,1
3,M0001,2022-01-01,19638,0.853776,8,1,49,48297,50487,51400,...,3000,2400,0,0,0,0,2,19637,1,1
4,M0001,2022-01-01,22373,0.015585,1,0,27,101489,78738,78639,...,10006,8000,1610,10000,1333,8818,2,22372,0,0


## Model performance

In [33]:
def gini_func(data):
    """Return the Gini coefficient (2 * ROC-AUC - 1) of the scores in `data`.

    `data` is either a pandas DataFrame, or the object evidently's
    CustomValueMetric passes to its callback, in which case its
    `current_data` frame is scored. Previously the argument was ignored and
    the global `reference` frame was always scored.

    The label and score columns are taken from the module-level `target`
    and `prediction` names.
    """
    # Accept a plain frame for ad-hoc calls and evidently's input object
    # when used inside a Report.
    frame = data if isinstance(data, pd.DataFrame) else data.current_data
    roc_auc = roc_auc_score(frame[target], frame[prediction])
    return 2 * roc_auc - 1

def auc(data):
    """Return the ROC-AUC of the scores in `data`.

    `data` is either a pandas DataFrame, or the object evidently's
    CustomValueMetric passes to its callback, in which case its
    `current_data` frame is scored. Previously the argument was ignored and
    the global `reference` frame was always scored.

    The label and score columns are taken from the module-level `target`
    and `prediction` names.
    """
    frame = data if isinstance(data, pd.DataFrame) else data.current_data
    return roc_auc_score(frame[target], frame[prediction])


In [34]:
# Gini on the reference window (pass the frame that is actually being scored)
gini_func(reference)

0.5457421054524725

In [35]:
# Classification performance report: quality summary, custom Gini value,
# class balance, confusion matrix, probability distribution and PR views.
regression_performance_report = Report(
    metrics=[
        ClassificationQualityMetric(),
        CustomValueMetric(func=gini_func, title="Gini"),
        ClassificationClassBalance(),
        ClassificationConfusionMatrix(),
        ClassificationProbDistribution(),
        ClassificationPRCurve(),
        ClassificationPRTable(),
    ],
    options=[color_scheme],
)
regression_performance_report.run(
    column_mapping=column_mapping,
    current_data=current,
    reference_data=reference,
)

# Reference window: ROC-AUC and the Gini derived from it
roc_auc_reference = roc_auc_score(reference[target], reference[prediction])
gini_reference = roc_auc_reference * 2 - 1

# Current window: ROC-AUC and the Gini derived from it
roc_auc_current = roc_auc_score(current[target], current[prediction])
gini_current = roc_auc_current * 2 - 1

In [36]:
# Write the performance report as HTML into this window's report folder
model_performance_report_path = str(reports_dir.joinpath('2. model_performance.html'))
regression_performance_report.save_html(model_performance_report_path)

## Target drift

In [37]:
# Target drift preset, comparing the current window against the reference
target_drift_report = Report(
    metrics=[TargetDriftPreset()],
    options=[color_scheme],
)
target_drift_report.run(
    column_mapping=column_mapping,
    current_data=current,
    reference_data=reference,
)

In [38]:
# Write the target drift report as HTML into this window's report folder
target_drift_report_path = str(reports_dir.joinpath('4. target_drift.html'))
target_drift_report.save_html(target_drift_report_path)

## Data drift

In [39]:
# Fresh mapping for the drift report: only the numerical feature list is set
column_mapping = ColumnMapping(numerical_features=numerical_features)

In [40]:
# Data drift preset, comparing the current window against the reference
data_drift_report = Report(
    metrics=[DataDriftPreset()],
    options=[color_scheme],
)
data_drift_report.run(
    column_mapping=column_mapping,
    current_data=current,
    reference_data=reference,
)

In [41]:
# Write the data drift report as HTML into this window's report folder
data_drift_report_path = str(reports_dir.joinpath('3. data_drift.html'))
data_drift_report.save_html(data_drift_report_path)

## Data quality

In [42]:
# Fresh mapping for the quality report: only the numerical feature list is set
column_mapping = ColumnMapping(numerical_features=numerical_features)

In [43]:
# Data quality preset, run on both the reference and the current window
data_quality_report = Report(
    metrics=[DataQualityPreset()],
    options=[color_scheme],
)
data_quality_report.run(
    column_mapping=column_mapping,
    current_data=current,
    reference_data=reference,
)

In [44]:
# Write the data quality report as HTML into this window's report folder
data_quality_report_path = str(reports_dir.joinpath('1. data_quality.html'))
data_quality_report.save_html(data_quality_report_path)