# Data drift dashboard in a Jupyter notebook

## Imports

In [373]:
import io
import numpy as np
import os
import pandas as pd
from pathlib import Path
import requests
import zipfile

from datetime import datetime

from evidently.pipeline.column_mapping import ColumnMapping
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset
from evidently.metric_preset import TargetDriftPreset
from evidently.metric_preset import DataQualityPreset
from evidently.metric_preset.regression_performance import RegressionPreset 
from evidently.metric_preset import ClassificationPreset
from evidently.metrics.custom_metric import CustomValueMetric
from evidently.renderers.html_widgets import WidgetSize
from evidently.metrics import (
    RegressionQualityMetric,
    RegressionPredictedVsActualScatter,
    RegressionPredictedVsActualPlot,
    RegressionErrorPlot,
    RegressionAbsPercentageErrorPlot,
    RegressionErrorDistribution,
    RegressionErrorNormality,
    RegressionTopErrorMetric,
    RegressionErrorBiasTable,
    DatasetSummaryMetric,
    ColumnSummaryMetric,
    DatasetMissingValuesMetric,
    DatasetCorrelationsMetric    
)

from xgboost import XGBClassifier
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn import datasets, ensemble

In [374]:
# Customize color scheme for evidently report.
# The resulting `color_scheme` object is passed as `options=[color_scheme]`
# to every Report built later in this notebook.
from evidently.options import ColorOptions

color_scheme = ColorOptions(
    primary_color="#bc4fd8",
    fill_color="#cb5ee6",
    zero_line_color="#016795",
    current_data_color="#cb5ee6",
    reference_data_color="#c0b7c2"
)

## Credit Score Data

### Load data

In [344]:
# Read the `credit_scoring_dataset` sheet from the workbook under the relative `data/` folder.
raw_data = pd.read_excel("data/credit_scoring_dataset.xlsx", sheet_name="credit_scoring_dataset")

In [345]:
# Shorten the verbose source column names in a single pass.
SHORT_COLUMN_NAMES = {
    'person_age': 'age',
    'person_income': 'income',
    'person_home_ownership': 'home',
    'person_emp_length': 'emp_length',
    'cb_person_default_on_file': 'default',
    'cb_person_cred_hist_length': 'hist_length',
}
raw_data = raw_data.rename(columns=SHORT_COLUMN_NAMES)

In [346]:
# Preview the first rows after the column renames.
raw_data.head()

Unnamed: 0,customer_id,date,age,income,home,emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,default,hist_length
0,ca1b3907-5b30-4959-b15a-5bb0b4e4ceaa,2024-01-01,24,75000,RENT,4.0,EDUCATION,B,25000,10.25,1,N,4
1,664be97b-1998-4f86-ab0e-cb1f14fc6542,2024-01-01,23,22800,MORTGAGE,3.0,PERSONAL,A,4800,5.79,0,N,2
2,e696b465-1e26-4051-acbb-80446d317faa,2024-01-01,25,38000,RENT,8.0,MEDICAL,C,16000,12.87,1,Y,3
3,012f962f-e7b5-4815-a3f7-18042e90ce17,2024-01-01,23,15600,RENT,0.0,VENTURE,D,1200,14.91,1,N,4
4,6b6c1ce5-b89a-432c-b7c2-8f6eda450547,2024-01-01,25,16800,RENT,4.0,MEDICAL,B,1200,10.38,0,N,2


### Cleaning

In [347]:
# (rows, columns) of the loaded frame.
raw_data.shape

(32579, 13)

In [348]:
# Count missing values per column to decide which columns need imputation.
raw_data.isnull().sum()

customer_id         0
date                0
age                 0
income              0
home                0
emp_length        895
loan_intent         0
loan_grade          0
loan_amnt           0
loan_int_rate    3116
loan_status         0
default             0
hist_length         0
dtype: int64

In [349]:
# Fill NA for emp_length (renamed from person_emp_length) with the mode
mode_value = raw_data['emp_length'].mode()[0]  # First entry of the Series returned by .mode()
raw_data['emp_length'] = raw_data['emp_length'].fillna(mode_value)


In [350]:
# Fill NA for loan_int_rate with mean
mean_value = raw_data['loan_int_rate'].mean()  # Get the mean of the column
raw_data['loan_int_rate'] = raw_data['loan_int_rate'].fillna(mean_value)

## Classification Model

### Config

In [351]:
# Date windows (inclusive, ISO strings compared against the `date` column).
# Reference window: data the model is trained on and compared against.
REF_MONTH_START = '2024-01-01'
REF_MONTH_END = '2024-04-30'

# Current window: the month being monitored.
CUR_MONTH_START = '2024-05-01'
CUR_MONTH_END = '2024-05-31'

# Reporting week: used to name the output folder for the generated reports.
CUR_WEEK_START = '2024-05-27'
CUR_WEEK_END = '2024-05-31'

# Column roles shared by model training and the Evidently column mapping.
target = 'default'
prediction = 'prediction'
numerical_features = ['age', 'income', 'emp_length', 'loan_amnt', 'loan_int_rate', 'hist_length']
# Raw (pre-encoding) categorical column names; this name is reassigned to the
# encoded column names once the encoders have run.
categorical_features = ['home', 'loan_intent', 'loan_grade', 'loan_status']

# One output folder per reporting week. `parents=True` creates the top-level
# `reports/` folder as well, so a fresh checkout does not fail with
# FileNotFoundError; `exist_ok=True` keeps the cell safe to re-run.
reports_dir = Path('reports') / f'{CUR_WEEK_START}_{CUR_WEEK_END}'
reports_dir.mkdir(parents=True, exist_ok=True)

### Model training

#### Encoding categorical columns

In [352]:
one_hot_features = ['home', 'loan_intent', 'loan_status']
ordinal_features = ['loan_grade']

# Initialize the encoders
one_hot_encoder = OneHotEncoder(drop='first', sparse_output=False)
ordinal_encoder = OrdinalEncoder(categories=[['A', 'B', 'C', 'D', 'E', 'F', 'G']])  # Adjust categories based on your data

# One-hot encode the selected features.
# The encoded frame reuses raw_data's index so that the column-wise concat
# below lines rows up by label even if raw_data does not have a default
# 0..n-1 index (otherwise concat would introduce NaN-filled rows).
one_hot_encoded = one_hot_encoder.fit_transform(raw_data[one_hot_features])
one_hot_encoded_df = pd.DataFrame(
    one_hot_encoded,
    columns=one_hot_encoder.get_feature_names_out(one_hot_features),
    index=raw_data.index,
)

# Ordinal encode the selected feature starting from 1 to 7
# (the encoder itself yields 0..6 for the seven listed grades, hence the + 1).
ordinal_encoded = ordinal_encoder.fit_transform(raw_data[ordinal_features]) + 1
ordinal_encoded_df = pd.DataFrame(
    ordinal_encoded,
    columns=ordinal_features,
    index=raw_data.index,
)

# Drop the original columns and concatenate the new encoded columns.
# NOTE: this rebinds raw_data, so the cell is not safe to run twice without
# re-running the load/cleaning cells first.
raw_data = raw_data.drop(columns=one_hot_features + ordinal_features)
raw_data = pd.concat([raw_data, one_hot_encoded_df, ordinal_encoded_df], axis=1)

# Encode the target column to 0/1
raw_data[target] = raw_data[target].map({'Y': 1, 'N': 0})

# Display the transformed DataFrame (rich notebook rendering instead of print)
raw_data.head()

                            customer_id       date  age  income  emp_length  \
0  ca1b3907-5b30-4959-b15a-5bb0b4e4ceaa 2024-01-01   24   75000         4.0   
1  664be97b-1998-4f86-ab0e-cb1f14fc6542 2024-01-01   23   22800         3.0   
2  e696b465-1e26-4051-acbb-80446d317faa 2024-01-01   25   38000         8.0   
3  012f962f-e7b5-4815-a3f7-18042e90ce17 2024-01-01   23   15600         0.0   
4  6b6c1ce5-b89a-432c-b7c2-8f6eda450547 2024-01-01   25   16800         4.0   

   loan_amnt  loan_int_rate  default  hist_length  home_OTHER  home_OWN  \
0      25000          10.25        0            4         0.0       0.0   
1       4800           5.79        0            2         0.0       0.0   
2      16000          12.87        1            3         0.0       0.0   
3       1200          14.91        0            4         0.0       0.0   
4       1200          10.38        0            2         0.0       0.0   

   home_RENT  loan_intent_EDUCATION  loan_intent_HOMEIMPROVEMENT  \
0     

In [353]:
# From here on, "categorical features" means the encoded columns produced above
# (one-hot columns first, then the ordinal column).
categorical_features = [*one_hot_encoded_df.columns, *ordinal_encoded_df.columns]

In [354]:
# Split into the reference and current windows by date (bounds inclusive).
# `.copy()` gives each window its own frame, so adding columns to it later
# does not raise SettingWithCopyWarning or touch raw_data.
reference = raw_data[(raw_data['date'] >= REF_MONTH_START) & (raw_data['date'] <= REF_MONTH_END)].copy()
# NOTE(review): the current window runs from CUR_MONTH_START to CUR_WEEK_END
# (month start through the end of the reporting week) — confirm this mix of
# month and week bounds is intended rather than CUR_MONTH_END.
current = raw_data[(raw_data['date'] >= CUR_MONTH_START) & (raw_data['date'] <= CUR_WEEK_END)].copy()


In [355]:
# Preview the frame after encoding.
raw_data.head()

Unnamed: 0,customer_id,date,age,income,emp_length,loan_amnt,loan_int_rate,default,hist_length,home_OTHER,home_OWN,home_RENT,loan_intent_EDUCATION,loan_intent_HOMEIMPROVEMENT,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE,loan_status_1,loan_grade
0,ca1b3907-5b30-4959-b15a-5bb0b4e4ceaa,2024-01-01,24,75000,4.0,25000,10.25,0,4,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,2.0
1,664be97b-1998-4f86-ab0e-cb1f14fc6542,2024-01-01,23,22800,3.0,4800,5.79,0,2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,e696b465-1e26-4051-acbb-80446d317faa,2024-01-01,25,38000,8.0,16000,12.87,1,3,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,3.0
3,012f962f-e7b5-4815-a3f7-18042e90ce17,2024-01-01,23,15600,0.0,1200,14.91,0,4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,4.0
4,6b6c1ce5-b89a-432c-b7c2-8f6eda450547,2024-01-01,25,16800,4.0,1200,10.38,0,2,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0


In [356]:
# Create an instance of the XGBoost classifier (default hyperparameters)
model = XGBClassifier()

In [357]:
# Train on the reference window only, using the numerical plus encoded categorical columns.
model.fit(reference[numerical_features + categorical_features], reference[target])

In [358]:
# Score both windows with the fitted model, using the same feature columns as in training.
# These are the outputs of `model.predict` (not `predict_proba`).
ref_prediction = model.predict(reference[numerical_features + categorical_features])
current_prediction = model.predict(current[numerical_features + categorical_features])

In [359]:
# Attach the model outputs as a `prediction` column.
# `assign` returns a new frame instead of writing into a boolean-mask slice of
# raw_data, which avoids SettingWithCopyWarning and keeps this cell re-runnable.
reference = reference.assign(prediction=ref_prediction)
current = current.assign(prediction=current_prediction)

# Model Monitoring

In [360]:
# Tell Evidently which columns play which role in the reports below.
column_mapping = ColumnMapping(
    target='default',
    prediction='prediction',
    numerical_features=numerical_features,
    categorical_features=categorical_features,
)

In [361]:
# Preview the reference window, now including the `prediction` column.
reference.head()

Unnamed: 0,customer_id,date,age,income,emp_length,loan_amnt,loan_int_rate,default,hist_length,home_OTHER,home_OWN,home_RENT,loan_intent_EDUCATION,loan_intent_HOMEIMPROVEMENT,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE,loan_status_1,loan_grade,prediction
0,ca1b3907-5b30-4959-b15a-5bb0b4e4ceaa,2024-01-01,24,75000,4.0,25000,10.25,0,4,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,2.0,0
1,664be97b-1998-4f86-ab0e-cb1f14fc6542,2024-01-01,23,22800,3.0,4800,5.79,0,2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0
2,e696b465-1e26-4051-acbb-80446d317faa,2024-01-01,25,38000,8.0,16000,12.87,1,3,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,3.0,1
3,012f962f-e7b5-4815-a3f7-18042e90ce17,2024-01-01,23,15600,0.0,1200,14.91,0,4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,4.0,0
4,6b6c1ce5-b89a-432c-b7c2-8f6eda450547,2024-01-01,25,16800,4.0,1200,10.38,0,2,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,0


## Model performance

In [362]:
def gini_func(data):
    """Return the Gini coefficient (2 * ROC-AUC - 1) for the report's current data.

    Used as the ``func`` callback of ``CustomValueMetric``. The previous
    version ignored ``data`` and always scored the global ``reference`` frame,
    so the widget never reflected the data being monitored.

    Args:
        data: Input object supplied by Evidently when the report runs; its
            ``current_data`` DataFrame must contain the module-level
            ``target`` and ``prediction`` columns.

    Returns:
        float: ``2 * roc_auc - 1`` computed on ``data.current_data``.
    """
    scored = data.current_data
    # NOTE(review): in this notebook the prediction column comes from
    # model.predict, so the AUC is computed from those outputs rather than
    # from predict_proba scores — confirm that is intended.
    roc_auc_current = roc_auc_score(scored[target], scored[prediction])
    return 2 * roc_auc_current - 1

def auc(data):
    """Return the ROC-AUC for the report's current data.

    Used as the ``func`` callback of ``CustomValueMetric``. The previous
    version ignored ``data`` and always scored the global ``reference`` frame,
    so the widget never reflected the data being monitored.

    Args:
        data: Input object supplied by Evidently when the report runs; its
            ``current_data`` DataFrame must contain the module-level
            ``target`` and ``prediction`` columns.

    Returns:
        float: ``roc_auc_score`` of target versus prediction on
        ``data.current_data``.
    """
    scored = data.current_data
    return roc_auc_score(scored[target], scored[prediction])


In [363]:
# Model performance report: two half-width scalar widgets (Gini, AUC) followed
# by the classification preset. Despite the `regression_` prefix in its name,
# this report is built from classification metrics.
regression_performance_report = Report(
    metrics=[
        CustomValueMetric(func=gini_func, title="Gini", size=WidgetSize.HALF),
        CustomValueMetric(func=auc, title="AUC", size=WidgetSize.HALF),
        ClassificationPreset(),
    ],
    options=[color_scheme],
)
regression_performance_report.run(reference_data=reference, current_data=current, column_mapping=column_mapping)

# ROC-AUC and the derived Gini (2 * AUC - 1) on the reference window
roc_auc_reference = roc_auc_score(reference[target], reference[prediction])
gini_reference = 2 * roc_auc_reference - 1

# Same two figures on the current window
roc_auc_current = roc_auc_score(current[target], current[prediction])
gini_current = 2 * roc_auc_current - 1

In [364]:
# Write the performance report as a standalone HTML file in this run's reports folder.
model_performance_report_path = str(reports_dir / 'model_performance.html')
regression_performance_report.save_html(model_performance_report_path)

## Target drift

In [365]:
# Compare the reference and current windows with Evidently's target drift preset,
# using the column mapping that declares `default` as target and `prediction` as prediction.
target_drift_report = Report(metrics=[TargetDriftPreset()], options=[color_scheme])
target_drift_report.run(
    reference_data=reference,
    current_data=current,
    column_mapping=column_mapping
)

In [366]:
# Write the target drift report as a standalone HTML file in this run's reports folder.
target_drift_report_path = str(reports_dir / 'target_drift.html')
target_drift_report.save_html(target_drift_report_path)

## Data drift

In [367]:
# Start from a fresh mapping that only declares the numerical features.
# The target, prediction and categorical settings assigned to the earlier
# `column_mapping` are not carried over to this new object.
column_mapping = ColumnMapping()
column_mapping.numerical_features = numerical_features

In [368]:
# Drift report restricted to the six numerical input columns.
data_drift_report = Report(
    metrics=[
        DataDriftPreset(
            columns=[
                "hist_length",
                "age",
                "income",
                "emp_length",
                "loan_amnt",
                "loan_int_rate",
            ]
        )
    ],
    options=[color_scheme],
)
data_drift_report.run(reference_data=reference, current_data=current, column_mapping=column_mapping)

In [369]:
# Write the data drift report as a standalone HTML file in this run's reports folder.
data_drift_report_path = str(reports_dir / 'data_drift.html')
data_drift_report.save_html(data_drift_report_path)

## Data quality

In [370]:
# Fresh mapping for the data quality report; only the numerical features are declared.
# (Same construction as the mapping used for the data drift report.)
column_mapping = ColumnMapping()
column_mapping.numerical_features = numerical_features

In [371]:
# Quality report over the numerical inputs plus the `prediction` and `default` columns.
data_quality_report = Report(
    metrics=[
        DataQualityPreset(
            columns=[
                "prediction",
                "hist_length",
                "age",
                "income",
                "emp_length",
                "loan_amnt",
                "loan_int_rate",
                "default",
            ]
        )
    ],
    options=[color_scheme],
)
data_quality_report.run(reference_data=reference, current_data=current, column_mapping=column_mapping)

In [372]:
# Write the data quality report as a standalone HTML file in this run's reports folder.
data_quality_report_path = str(reports_dir / 'data_quality.html')
data_quality_report.save_html(data_quality_report_path)