In [3]:
import pandas as pd
import numpy as np
from ydata_profiling import ProfileReport

In [4]:
# Get data from the link
csv_url =\
    'http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'

data = pd.read_csv(csv_url, sep=';')
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


## Simple inspection

In [5]:
# with pandas
data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
fixed acidity,1599.0,8.319637,1.741096,4.6,7.1,7.9,9.2,15.9
volatile acidity,1599.0,0.527821,0.17906,0.12,0.39,0.52,0.64,1.58
citric acid,1599.0,0.270976,0.194801,0.0,0.09,0.26,0.42,1.0
residual sugar,1599.0,2.538806,1.409928,0.9,1.9,2.2,2.6,15.5
chlorides,1599.0,0.087467,0.047065,0.012,0.07,0.079,0.09,0.611
free sulfur dioxide,1599.0,15.874922,10.460157,1.0,7.0,14.0,21.0,72.0
total sulfur dioxide,1599.0,46.467792,32.895324,6.0,22.0,38.0,62.0,289.0
density,1599.0,0.996747,0.001887,0.99007,0.9956,0.99675,0.997835,1.00369
pH,1599.0,3.311113,0.154386,2.74,3.21,3.31,3.4,4.01
sulphates,1599.0,0.658149,0.169507,0.33,0.55,0.62,0.73,2.0


In [None]:
# Generate the profile report with Pandas Profiling
profile = ProfileReport(
    data,
    title="Example of summarization of wine data"
)


In [None]:
# Generate the dataset profile
# This is a nice and simple way to document the data
profile.to_notebook_iframe()

## Unit tests

### Basic examples - function tests

We will learn how the unit test work on  a simple function. First, we will define a function `square`, which returns the square of a number. Then, we will test it by writing assertions (correct answers) in a test function. 

In [9]:

import pytest
# install the following to be able to run the tests in notebook
import ipytest
# import typeguard
ipytest.config()

{'rewrite_asserts': False,
 'magics': False,
 'clean': '[Tt]est*',
 'addopts': (),
 'run_in_thread': False,
 'defopts': 'auto',
 'display_columns': 100,
 'raise_on_error': False}

In [2]:
# A simple function: calculate square of a number
def square(x):
    return x * x

In [3]:
%%ipytest
# Let's test the function
# Are there any edge cases?
def test_square():
    assert square(2) == 4
    assert square(0) == 0
    assert square(-2) == 4

UsageError: Cell magic `%%ipytest` not found.


### Basic examples - data tests

As we did for the function, we can also write assertions for the data. In the following example we will define a data frame on the fly and thest for the null values in it. 

In [4]:
%%ipytest 

def test_column_is_null():
    df = pd.DataFrame(data = [(1, 0), (2, None)],
                      columns = ['a', 'b'])
    
    assert np.all(pd.notna(df))
    


UsageError: Cell magic `%%ipytest` not found.


## Test the wine data

Previously, we generated the data frame inside the test function. If we want to run multiple tests on the same df, we would rather pass it to each function as an argument (as usual in programming). To do that in testing, we need to define the data as **fixtures**. They look like ordinary function definitions, preceeded by a decorator `@pytest.fixture`. 

### Raw data tests

In [5]:
# Define fixtures
@pytest.fixture
def input_schema():
    # Define range and type for each column
    schema = {
    'fixed acidity': {'min': 1.0, 'max': 17.0, 'type': float},
    'volatile acidity': {'min': 0.0, 'max': 2.0, 'type': float},
    'citric acid': {'min': 0.0, 'max': 2.0, 'type': float},
    'residual sugar': {'min': 0.5, 'max': 17.0, 'type': float},
    'chlorides': {'min': 0.0, 'max': 17.0, 'type': float},
    'free sulfur dioxide': {'min': 0.0, 'max': 80.0, 'type': float},
    'total sulfur dioxide': {'min': 0.0, 'max': 300.0, 'type': float},
    'density': {'min': 0.8, 'max': 1.1, 'type': float},
    'pH': {'min': 1.0, 'max': 10.0, 'type': float},
    'sulphates': {'min': 0.0, 'max': 2.0, 'type': float},
    'alcohol': {'min': 7.0, 'max': 17.0, 'type': float},
    'quality': {'min': 1, 'max': 10, 'type': int},
    }
    return schema


# Download the data
@pytest.fixture
def input_data():
    csv_url =\
    'http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
    data = pd.read_csv(csv_url, sep=';')
    return data

Write the following tests:
- is the number of columns in the data frame the same as in schema definition?
- are the values within defined ranges?
- are the types of the columns correct?

In [6]:
%%ipytest

def test_number_of_columns(input_data, input_schema):
    
    assert len(input_data.columns) == len(input_schema.keys())


def test_input_data_ranges(input_data, input_schema):
    
    for column in input_data.columns:
        min_val = input_data.loc[:, column].min()
        max_val = input_data.loc[:, column].max()

        min_schema = input_schema[column]['min']
        max_schema = input_schema[column]['max']
        
        assert min_val >= min_schema
        assert max_val <= max_schema
        
        
def test_input_types(input_data, input_schema):
    
    for column in input_data.columns:
        input_type = input_data.dtypes[column]
        schema_type = input_schema[column]['type']
        
        assert input_type == schema_type
    

UsageError: Cell magic `%%ipytest` not found.


### Feature engineering tests

**NOTE:** Data transformaton should be done only on test dataset. You fit the transformer on the test dataset and then apply it on the train dataset. Since we are only illustrating the functioning of the unit testing, we will do it on the whole dataset.

In [2]:
from sklearn.preprocessing import StandardScaler
from numpy import mean, std

In [6]:
# Let's transform a column...

# define standard scaler
scaler = StandardScaler()
# transform data
scaled = scaler.fit_transform(data[['alcohol']])
print(scaled)

[[-0.96024611]
 [-0.58477711]
 [-0.58477711]
 ...
 [ 0.54162988]
 [-0.20930812]
 [ 0.54162988]]


In [7]:
# And check the stats...
print('mean:', mean(scaled))
print('std:', std(scaled))

mean: 1.0664806540489309e-16
std: 1.0


In [10]:
@pytest.fixture
def scaled_alcohol():
    csv_url =\
    'http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
    data = pd.read_csv(csv_url, sep=';')
    
    # Define scaler
    scaler = StandardScaler()
    # Transform data
    scaled = scaler.fit_transform(data[['alcohol']])
    return scaled

In [11]:
%%ipytest
# Test: is mean around zero and std around one?

def test_scaled_mean_zero(scaled_alcohol):
    
    mean_val = mean(scaled_alcohol)
    std_val = std(scaled_alcohol)
    
    assert pytest.approx(mean_val) == 0.0
    assert pytest.approx(std_val) == 1.0


UsageError: Cell magic `%%ipytest` not found.


## Additional exercises:

- implement and Test MinMaxScaler
- test null on 'quality'

## Great Expectations

When you need to test your data automatically and have a lot of additional functionalities from the tool (eg. notifications at the end of the tests), you will use  library, dedicated to data testing. Currently, the most popular one is (Great Expectations)[https://greatexpectations.io/]. Below is a simple demo. 

In [13]:
import great_expectations as ge

First, you need to convert the pandas dataframe to Great Expectations dataframe.

In [15]:
ge_df = ge.from_pandas(data)

It still looks the same.

In [16]:
ge_df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


Now, you can run different tests on the dataframe. The value of "success" tells you if the test failed (false) or succeeded (true). For each test you get many other information.

You can explore other expectations here: https://greatexpectations.io/expectations

In [17]:
ge_df.expect_column_to_exist("pH")

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "meta": {},
  "success": true,
  "result": {}
}

In [18]:
ge_df.expect_column_values_to_not_be_null('quality')

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "meta": {},
  "success": true,
  "result": {
    "element_count": 1599,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "partial_unexpected_list": []
  }
}

In [19]:
ge_df.expect_column_values_to_be_of_type('chlorides', 'str')

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "meta": {},
  "success": false,
  "result": {
    "observed_value": "float64"
  }
}