In [16]:
import pandas as pd
import numpy as np
import random
import sklearn.datasets as datasets
import sys

# Put the parent directory on the module search path so the local
# `check_data_consistency` module imported below can be resolved
# (presumably it lives one level above this notebook -- confirm).
sys.path.insert(1, '..')
from check_data_consistency import DataConsistencyChecker

In [17]:
# Raise pandas' display limits so the wide result tables and the long
# pattern descriptions shown below are not truncated.
pd.set_option("display.max_columns", 1000)
pd.set_option("display.max_colwidth", 1000)
pd.set_option("display.max_rows", 1000)
pd.set_option("display.width", 10000)

In [18]:
# Load the Boston housing features into a DataFrame.
#
# `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2, where
# accessing it raises ImportError. To keep this notebook runnable from a fresh
# kernel on newer installs, fall back to rebuilding the same table from the
# original StatLib file, as the scikit-learn removal notice recommends.
# NOTE: the fallback needs network access.
try:
    data = datasets.load_boston()
except (ImportError, AttributeError):
    from sklearn.utils import Bunch

    # In the raw file each record is split over two physical lines
    # (11 values, then 3) following a 22-line text header.
    raw_df = pd.read_csv(
        "http://lib.stat.cmu.edu/datasets/boston",
        sep=r"\s+",
        skiprows=22,
        header=None,
    )
    # Wrap the arrays in a Bunch so `data.data` / `data.feature_names` /
    # `data.target` keep working exactly as with `load_boston()`.
    data = Bunch(
        data=np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]]),
        target=raw_df.values[1::2, 2],
        feature_names=np.array([
            'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
            'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
        ]),
    )

df = pd.DataFrame(data.data, columns=data.feature_names)
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [19]:
# Create the checker and hand it the DataFrame that the tests below will analyse.
dc = DataConsistencyChecker() 
dc.init_data(df)

## Run the SIMILAR_PREVIOUS test

In [20]:
# Run a small set of tests to start. In this example, we run a single test,
# selected by name through `execute_list`.
# The return value is assigned to `_` so it is not echoed as the cell output.

_ = dc.check_data_quality(execute_list=['SIMILAR_PREVIOUS'])

Executing test  16: SIMILAR_PREVIOUS              

Data consistency check complete.
Analysed 506 rows, 13 columns
Executed 1 tests.

Patterns without Exceptions:
Found 1 patterns without exceptions
1 tests (100.00% of tests) identified at least one pattern without exceptions each. 
By default some patterns are not listed in calls to display_detailed_results().

Patterns with Exceptions:
Found 1 patterns with exceptions
1 tests (100.00% of tests) flagged at least one exception each.
Flagged 2 row(s) with at least one exception.
Flagged 1 column(s) with at least one exception.


In [21]:
# In the next few cells, we look at the output of the tests, starting with
# a per-test count of patterns found without and with exceptions.

dc.summarize_patterns_and_exceptions()

Unnamed: 0,Test ID,Number Patterns without Exceptions,Number Patterns with Exceptions
0,SIMILAR_PREVIOUS,1,1


In [22]:
# List the patterns that held with no exceptions (one row per pattern).
dc.get_patterns_list()

Unnamed: 0,Test ID,Column(s),Description of Pattern,Pattern ID
0,SIMILAR_PREVIOUS,TAX,"The values in ""TAX"" are consistently similar to the previous value, more so than they are similar to the median value of the column (330.0)",0


In [23]:
# List the patterns that held with exceptions, with the number of
# exceptions found for each.
dc.get_exceptions_list()

Unnamed: 0,Test ID,Column(s),Description of Pattern,Number of Exceptions,Issue ID
0,SIMILAR_PREVIOUS,DIS,"The values in ""DIS"" are consistently similar to the previous value, more so than they are similar to the median value of the column (3.2074499999999997), with exceptions.",2,0


## Run the RARE_DECIMALS test

In [24]:
# Here we run a second test on the same dataset. `append_results` is not
# passed, so the default behaviour applies.

_ = dc.check_data_quality(execute_list=['RARE_DECIMALS'])

Executing test  11: RARE_DECIMALS                 

Data consistency check complete.
Analysed 506 rows, 13 columns
Executed 1 tests.

Patterns without Exceptions:
Found 1 patterns without exceptions
1 tests (100.00% of tests) identified at least one pattern without exceptions each. 
By default some patterns are not listed in calls to display_detailed_results().

Patterns with Exceptions:
Found 0 patterns with exceptions
0 tests (0.00% of tests) flagged at least one exception each.
Flagged 0 row(s) with at least one exception.
Flagged 0 column(s) with at least one exception.


In [25]:
# By default, calling check_data_quality() again will start fresh, which
# is useful where you have a new set of tests you wish to execute. We
# see in the next few cells that the RARE_DECIMALS test found one pattern
# and no exceptions. These are shown, but the results of SIMILAR_PREVIOUS
# are removed.

dc.summarize_patterns_and_exceptions()

Unnamed: 0,Test ID,Number Patterns without Exceptions,Number Patterns with Exceptions
0,RARE_DECIMALS,1,


In [26]:
# Only the RARE_DECIMALS pattern is listed now; the earlier
# SIMILAR_PREVIOUS results were discarded by the fresh run.
dc.get_patterns_list()

Unnamed: 0,Test ID,Column(s),Description of Pattern,Pattern ID
0,RARE_DECIMALS,ZN,"The column consistently contains values with one of '', '5' after the decimal point",0


In [27]:
# RARE_DECIMALS flagged no exceptions, so this table has no rows.
dc.get_exceptions_list()

Unnamed: 0,Test ID,Column(s),Description of Pattern,Number of Exceptions,Issue ID


## Run both tests again

In [28]:
# We repeat the above example, this time combining the results of all tests.
# First, SIMILAR_PREVIOUS is run again on its own; as before, this call
# starts fresh, so only its results are held afterwards.

_ = dc.check_data_quality(execute_list=['SIMILAR_PREVIOUS'])

Executing test  16: SIMILAR_PREVIOUS              

Data consistency check complete.
Analysed 506 rows, 13 columns
Executed 1 tests.

Patterns without Exceptions:
Found 1 patterns without exceptions
1 tests (100.00% of tests) identified at least one pattern without exceptions each. 
By default some patterns are not listed in calls to display_detailed_results().

Patterns with Exceptions:
Found 1 patterns with exceptions
1 tests (100.00% of tests) flagged at least one exception each.
Flagged 2 row(s) with at least one exception.
Flagged 1 column(s) with at least one exception.


In [29]:
# Confirm that only the SIMILAR_PREVIOUS results are present at this point.
dc.summarize_patterns_and_exceptions()

Unnamed: 0,Test ID,Number Patterns without Exceptions,Number Patterns with Exceptions
0,SIMILAR_PREVIOUS,1,1


In [30]:
# Here we set append_results to True, which keeps the previous findings and
# appends any results from the current set of tests.
# NOTE(review): the summary printed by this call reports "2 tests (200.00% of
# tests)" while stating "Executed 1 tests" -- the percentage looks like it is
# taken against only the tests executed in this call; confirm in the library.

_ = dc.check_data_quality(execute_list=['RARE_DECIMALS'], append_results=True)

Executing test  11: RARE_DECIMALS                 

Data consistency check complete.
Analysed 506 rows, 13 columns
Executed 1 tests.

Patterns without Exceptions:
Found 2 patterns without exceptions
2 tests (200.00% of tests) identified at least one pattern without exceptions each. 
By default some patterns are not listed in calls to display_detailed_results().

Patterns with Exceptions:
Found 1 patterns with exceptions
1 tests (100.00% of tests) flagged at least one exception each.
Flagged 2 row(s) with at least one exception.
Flagged 1 column(s) with at least one exception.


In [31]:
# With append_results=True, the summary now covers both tests.
dc.summarize_patterns_and_exceptions()

Unnamed: 0,Test ID,Number Patterns without Exceptions,Number Patterns with Exceptions
0,RARE_DECIMALS,1,
1,SIMILAR_PREVIOUS,1,1.0


In [33]:
# The patterns from both runs are listed together, each with its own Pattern ID.
dc.get_patterns_list()

Unnamed: 0,Test ID,Column(s),Description of Pattern,Pattern ID
0,SIMILAR_PREVIOUS,TAX,"The values in ""TAX"" are consistently similar to the previous value, more so than they are similar to the median value of the column (330.0)",0
1,RARE_DECIMALS,ZN,"The column consistently contains values with one of '', '5' after the decimal point",1
