In [9]:
import pandas as pd
from sklearn.datasets import fetch_openml
import sys

# Put the parent directory at the front of the module search path so that the
# import below can resolve check_data_consistency (presumably the module lives
# one level above this notebook -- confirm against the repository layout).
sys.path.insert(0, "..")
from check_data_consistency import DataConsistencyChecker

In [10]:
# This notebook provides examples of clearing selected results with
# clear_results() and restoring them afterwards with restore_results().

In [11]:
# Load in the data: fetch version 1 of the 'hypothyroid' dataset from OpenML
# and place its features in a DataFrame, using the dataset's feature names as
# the column labels. Only the first rows are displayed, as a preview.

data = fetch_openml('hypothyroid', version=1)
data_df = pd.DataFrame(data.data, columns=data.feature_names)
display(data_df.head())

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,T3,TT4_measured,TT4,T4U_measured,T4U,FTI_measured,FTI,TBG_measured,TBG,referral_source
0,41.0,F,f,f,f,f,f,f,f,f,...,2.5,t,125.0,t,1.14,t,109.0,f,,SVHC
1,23.0,F,f,f,f,f,f,f,f,f,...,2.0,t,102.0,f,,f,,f,,other
2,46.0,M,f,f,f,f,f,f,f,f,...,,t,109.0,t,0.91,t,120.0,f,,other
3,70.0,F,t,f,f,f,f,f,f,f,...,1.9,t,175.0,f,,f,,f,,other
4,70.0,F,f,f,f,f,f,f,f,f,...,1.2,t,61.0,t,0.87,t,70.0,f,,SVI


In [12]:
# Create the checker (with verbose=0) and hand it the DataFrame to examine.
dc = DataConsistencyChecker(verbose=0) 
dc.init_data(data_df)

# This dataset is quick to test anyway, but we specify fast_only in order
# to reduce the results found, to make the example simpler.
dc.check_data_quality(fast_only=True) 

# Get a view of the patterns and exceptions found: one row per test ID, with
# the number of patterns found without exceptions and with exceptions.
ret = dc.summarize_patterns_and_exceptions()
display(ret)


Data consistency check complete.
Analysed 3,772 rows, 27 columns
Executed 70 tests.

Patterns without Exceptions:
Found 20 patterns without exceptions
7 tests (10.00% of tests) identified at least one pattern without exceptions each. 
By default some patterns are not listed in calls to display_detailed_results().

Patterns with Exceptions:
Found 11 patterns with exceptions
5 tests (7.14% of tests) flagged at least one exception each.
Flagged 69 row(s) with at least one exception.
Flagged 4 column(s) with at least one exception.


Unnamed: 0,Test ID,Number Patterns without Exceptions,Number Patterns with Exceptions
0,POSITIVE,6.0,
1,NUMBER_DECIMALS,2.0,
2,RARE_DECIMALS,,2.0
3,UNUSUAL_ORDER_MAGNITUDE,,3.0
4,FEW_NEIGHBORS,,1.0
5,VERY_SMALL,,2.0
6,VERY_LARGE,,3.0
7,NON_ZERO,6.0,
8,GREATER_THAN_ONE,3.0,
9,FIRST_CHAR_ALPHA,1.0,


In [13]:
# After some examination, it may be decided that the 'UNIQUE_VALUES_PER_ROW'
# and 'NEGATIVE_VALUES_PER_ROW' tests were not relevant, or will be handled
# elsewhere, so we can clear these by passing their IDs in test_id_list.
# NOTE(review): neither of these test IDs appears in the summary displayed by
# the previous cell, and the summary displayed below is identical to it, so in
# this run the call does not visibly remove anything. Consider demonstrating
# with test IDs that are present in the results -- TODO confirm.

dc.clear_results(test_id_list=['UNIQUE_VALUES_PER_ROW', 'NEGATIVE_VALUES_PER_ROW'])

# Examine the new set of results with these test results removed
ret = dc.summarize_patterns_and_exceptions()
display(ret)

Unnamed: 0,Test ID,Number Patterns without Exceptions,Number Patterns with Exceptions
0,POSITIVE,6.0,
1,NUMBER_DECIMALS,2.0,
2,RARE_DECIMALS,,2.0
3,UNUSUAL_ORDER_MAGNITUDE,,3.0
4,FEW_NEIGHBORS,,1.0
5,VERY_SMALL,,2.0
6,VERY_LARGE,,3.0
7,NON_ZERO,6.0,
8,GREATER_THAN_ONE,3.0,
9,FIRST_CHAR_ALPHA,1.0,


In [14]:
# Calling summarize_patterns_by_test_and_feature(), we
# can see which features were involved with the results. The displayed table
# has one row per test ID and one column per feature, with a check mark
# where a feature is involved in a result for that test.

ret = dc.summarize_patterns_by_test_and_feature()
display(ret)

Unnamed: 0_level_0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,TSH,T3_measured,T3,TT4_measured,TT4,T4U_measured,T4U,FTI_measured,FTI,referral_source
Test ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
POSITIVE,✔,,,,,,,,,,...,✔,,✔,,✔,,✔,,✔,
NUMBER_DECIMALS,,,,,,,,,,,...,,,✔,,,,✔,,,
NON_ZERO,✔,,,,,,,,,,...,✔,,✔,,✔,,✔,,✔,
GREATER_THAN_ONE,✔,,,,,,,,,,...,,,,,✔,,,,✔,
FIRST_CHAR_ALPHA,,,,,,,,,,,...,,,,,,,,,,✔


In [15]:
# We may determine that the results related to 2 columns may be cleared;
# col_name_list selects the results to clear by column name.
# In the table displayed by the previous cell, 'age' has check marks while
# 'T4U_measured' has none, so only the 'age' column changes in the table below.

dc.clear_results(col_name_list=['T4U_measured', 'age'])

# Examine the set of features for each pattern again with these removed.
ret = dc.summarize_patterns_by_test_and_feature()
display(ret)

Unnamed: 0_level_0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,TSH,T3_measured,T3,TT4_measured,TT4,T4U_measured,T4U,FTI_measured,FTI,referral_source
Test ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
POSITIVE,,,,,,,,,,,...,✔,,✔,,✔,,✔,,✔,
NUMBER_DECIMALS,,,,,,,,,,,...,,,✔,,,,✔,,,
NON_ZERO,,,,,,,,,,,...,✔,,✔,,✔,,✔,,✔,
GREATER_THAN_ONE,,,,,,,,,,,...,,,,,✔,,,,✔,
FIRST_CHAR_ALPHA,,,,,,,,,,,...,,,,,,,,,,✔


In [16]:
# We can look at the exceptions as well. Calling get_exceptions_list(),
# we get a list of the issues found: one row per issue, giving the test ID,
# the column(s), a description of the pattern, the number of exceptions, and
# an Issue ID that identifies the issue.

ret = dc.get_exceptions_list()
display(ret)

Unnamed: 0,Test ID,Column(s),Description of Pattern,Number of Exceptions,Issue ID
0,RARE_DECIMALS,TT4,The column consistently contains values with o...,5,0
1,RARE_DECIMALS,FTI,The column consistently contains values with o...,7,1
3,UNUSUAL_ORDER_MAGNITUDE,TT4,This test checks for values of an unusual orde...,7,3
4,UNUSUAL_ORDER_MAGNITUDE,FTI,This test checks for values of an unusual orde...,8,4
5,FEW_NEIGHBORS,TT4,The test marked any values more than 42.800 aw...,1,5
6,VERY_SMALL,T4U,The test marked any values less than 0.3800000...,4,6
7,VERY_SMALL,FTI,The test marked any values less than 10.0 as v...,17,7
9,VERY_LARGE,TT4,The test marked any values larger than 250.00 ...,17,9
10,VERY_LARGE,T4U,The test marked any values larger than 1.78 as...,17,10


In [17]:
# We may determine that issues 3 and 4 may be cleared. These numbers are the
# values in the 'Issue ID' column of the exceptions list displayed above
# (the two UNUSUAL_ORDER_MAGNITUDE issues, on TT4 and FTI).

dc.clear_results(issue_id_list=[3, 4])

# Examine the results with these two issues removed.
ret = dc.get_exceptions_list()
display(ret)

Unnamed: 0,Test ID,Column(s),Description of Pattern,Number of Exceptions,Issue ID
0,RARE_DECIMALS,TT4,The column consistently contains values with o...,5,0
1,RARE_DECIMALS,FTI,The column consistently contains values with o...,7,1
5,FEW_NEIGHBORS,TT4,The test marked any values more than 42.800 aw...,1,5
6,VERY_SMALL,T4U,The test marked any values less than 0.3800000...,4,6
7,VERY_SMALL,FTI,The test marked any values less than 10.0 as v...,17,7
9,VERY_LARGE,TT4,The test marked any values larger than 250.00 ...,17,9
10,VERY_LARGE,T4U,The test marked any values larger than 1.78 as...,17,10


In [18]:
# Again call summarize_patterns_and_exceptions() to see what is remaining
# after the clear_results() calls made in the cells above.

ret = dc.summarize_patterns_and_exceptions()
display(ret)

Unnamed: 0,Test ID,Number Patterns without Exceptions,Number Patterns with Exceptions
0,POSITIVE,5.0,
1,NUMBER_DECIMALS,2.0,
2,RARE_DECIMALS,,2.0
3,FEW_NEIGHBORS,,1.0
4,VERY_SMALL,,2.0
5,VERY_LARGE,,2.0
6,NON_ZERO,5.0,
7,GREATER_THAN_ONE,2.0,
8,FIRST_CHAR_ALPHA,1.0,


In [19]:
# We may choose to clear all patterns and leave only the exceptions, by
# passing clear_all_patterns=True.

dc.clear_results(clear_all_patterns=True)

# Examine the remaining results: only tests with patterns that have
# exceptions are expected to remain in the summary.
ret = dc.summarize_patterns_and_exceptions()
display(ret)

Unnamed: 0,Test ID,Number Patterns without Exceptions,Number Patterns with Exceptions
0,RARE_DECIMALS,,2
1,FEW_NEIGHBORS,,1
2,VERY_SMALL,,2
3,VERY_LARGE,,2


In [20]:
# Calling restore_results() will restore the state after the last call
# to check_data_quality(), undoing the clear_results() calls made above.

dc.restore_results()

# The summary should again match the one displayed right after
# check_data_quality() was run.
ret = dc.summarize_patterns_and_exceptions()
display(ret)

Unnamed: 0,Test ID,Number Patterns without Exceptions,Number Patterns with Exceptions
0,POSITIVE,6.0,
1,NUMBER_DECIMALS,2.0,
2,RARE_DECIMALS,,2.0
3,UNUSUAL_ORDER_MAGNITUDE,,3.0
4,FEW_NEIGHBORS,,1.0
5,VERY_SMALL,,2.0
6,VERY_LARGE,,3.0
7,NON_ZERO,6.0,
8,GREATER_THAN_ONE,3.0,
9,FIRST_CHAR_ALPHA,1.0,
