# Data drift reports with evidently.ai

In [13]:
from evidently import ColumnMapping

from evidently.report import Report
from evidently.metrics.base_metric import generate_column_metrics
from evidently.metric_preset import DataDriftPreset, TargetDriftPreset, DataQualityPreset, RegressionPreset
from evidently.metrics import *

from evidently.test_suite import TestSuite
from evidently.tests.base_test import generate_column_tests
from evidently.test_preset import DataStabilityTestPreset, NoTargetPerformanceTestPreset
from evidently.tests import *

import pandas as pd
import numpy as np

In [14]:
def get_raw_data(data_dir='../data'):
    """Load the raw train and test splits from CSV files.

    Args:
        data_dir: Directory that contains ``train.csv`` and ``test.csv``.
            Defaults to ``'../data'``, the location this notebook reads from.

    Returns:
        A ``(df_train, df_test)`` tuple of pandas DataFrames, in that order.

    Raises:
        FileNotFoundError: If either CSV file is missing from ``data_dir``.
    """
    df_train = pd.read_csv(f'{data_dir}/train.csv')
    df_test = pd.read_csv(f'{data_dir}/test.csv')

    return df_train, df_test

In [15]:
# Read the train and test CSVs into DataFrames using the helper defined above.
df_train, df_test = get_raw_data()

In [16]:
# Work on independent copies so the loaded frames stay untouched:
# the train split serves as the reference data, the test split as the current data.
reference, current = df_train.copy(), df_test.copy()

In [17]:
# Fixed seed so the random labels below are the same on every run;
# drop or change it to get different values.
np.random.seed(seed=0)
# Give every row of `current` a random 0/1 value in the 'Survived' column.
current['Survived'] = np.random.randint(low=0, high=2, size=current.shape[0]).astype(int)
current.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,1
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,1
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,1


In [8]:
# Preview the first rows of the reference data (the copy of the train split).
reference.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Get data drift

In [18]:
# Build a report that contains only the data-drift preset, then evaluate
# `current` against `reference`.
# NOTE(review): no column mapping is passed, so both frames are handed to the
# preset as-is — confirm that every column should take part in the drift check.
report = Report(metrics=[DataDriftPreset()])
report.run(current_data=current, reference_data=reference)

  terms = (f_obs_float - f_exp)**2 / f_exp
  terms = (f_obs_float - f_exp)**2 / f_exp
  terms = (f_obs_float - f_exp)**2 / f_exp
  terms = (f_obs_float - f_exp)**2 / f_exp
  terms = (f_obs_float - f_exp)**2 / f_exp
  terms = (f_obs_float - f_exp)**2 / f_exp


In [19]:
# Show the report results as a nested Python dict.
report.as_dict()

{'metrics': [{'metric': 'DatasetDriftMetric',
   'result': {'drift_share': 0.5,
    'number_of_columns': 12,
    'number_of_drifted_columns': 6,
    'share_of_drifted_columns': 0.5,
    'dataset_drift': True}},
  {'metric': 'DataDriftTable',
   'result': {'number_of_columns': 12,
    'number_of_drifted_columns': 6,
    'share_of_drifted_columns': 0.5,
    'dataset_drift': True,
    'drift_by_columns': {'Age': {'column_name': 'Age',
      'column_type': 'num',
      'stattest_name': 'K-S p_value',
      'stattest_threshold': 0.05,
      'drift_score': 0.7023427077378919,
      'drift_detected': False,
      'current': {'small_distribution': {'x': [0.17,
         7.753,
         15.336,
         22.919000000000004,
         30.502000000000002,
         38.085,
         45.668000000000006,
         53.251000000000005,
         60.834,
         68.417,
         76.0],
        'y': [0.006355370049365337,
         0.006355370049365337,
         0.028201954594058668,
         0.03852943092427

In [20]:
# Serialize the report results to a JSON string.
report.json()

'{"version": "0.4.40", "metrics": [{"metric": "DatasetDriftMetric", "result": {"drift_share": 0.5, "number_of_columns": 12, "number_of_drifted_columns": 6, "share_of_drifted_columns": 0.5, "dataset_drift": true}}, {"metric": "DataDriftTable", "result": {"number_of_columns": 12, "number_of_drifted_columns": 6, "share_of_drifted_columns": 0.5, "dataset_drift": true, "drift_by_columns": {"Age": {"column_name": "Age", "column_type": "num", "stattest_name": "K-S p_value", "stattest_threshold": 0.05, "drift_score": 0.7023427077378919, "drift_detected": false, "current": {"small_distribution": {"x": [0.17, 7.753, 15.336, 22.919000000000004, 30.502000000000002, 38.085, 45.668000000000006, 53.251000000000005, 60.834, 68.417, 76.0], "y": [0.006355370049365337, 0.006355370049365337, 0.028201954594058668, 0.03852943092427736, 0.017080057007669347, 0.01469679323915733, 0.00993026570213334, 0.006752580677450671, 0.0035748956527680027, 0.00039721062808533363]}}, "reference": {"small_distribution": {"

In [21]:
# Save the drift report as an HTML file named report.html.
report.save_html('report.html')

# Get tests

In [22]:
# Assemble a test suite from individual checks (missing values, constant and
# duplicated columns/rows, column types, drifted columns). Each test class is
# instantiated with its defaults, in the order listed.
tests = TestSuite(tests=[
    test_cls()
    for test_cls in (
        TestNumberOfColumnsWithMissingValues,
        TestNumberOfRowsWithMissingValues,
        TestNumberOfConstantColumns,
        TestNumberOfDuplicatedRows,
        TestNumberOfDuplicatedColumns,
        TestColumnsType,
        TestNumberOfDriftedColumns,
    )
])

# Evaluate `current` against `reference`.
tests.run(current_data=current, reference_data=reference)


divide by zero encountered in divide


divide by zero encountered in divide


divide by zero encountered in divide



In [23]:
# Save the test suite results as an HTML file named tests.html.
tests.save_html('tests.html')

In [24]:
# Run the NoTargetPerformanceTestPreset as its own suite, evaluating
# `current` against `reference`.
suite = TestSuite(tests=[NoTargetPerformanceTestPreset()])
suite.run(current_data=current, reference_data=reference)


divide by zero encountered in divide


divide by zero encountered in divide


divide by zero encountered in divide



In [25]:
# Save the preset suite results as an HTML file named suite.html.
suite.save_html('suite.html')