In [1]:
# For virtual environment: uncomment the lines below
# %cd ..
# %env PYTHONPATH=.

# For Docker
%cd /home/tutorial-predict-device-change

/home/tutorial-predict-device-change


# Import

In [2]:
import argparse
import numpy 
import pandas as pd
import re
from typing import Text
import yaml

# import testing tools 
import pytest

In [3]:
config = yaml.safe_load(open('params.yaml'))
config

{'Base': {'random_state': 42, 'log_level': 'DEBUG'},
 'Data': {'user_features_raw': './data/raw/user_features.feather',
  'target_raw': './data/raw/target.feather',
  'scoring_user_features_raw': './data/raw/scoring_user_features.feather',
  'scoring_target_raw': './data/raw/scoring_target.feather'},
 'Features': {'features_path': './data/processed/features.feather',
  'scoring_features_path': './data/processed/scoring_features.feather',
  'predicted_target_path': './data/processed/predicted_target.feather'},
 'Train': {'top_K_coef': 0.05,
  'model_path': './models/model.joblib',
  'train_metrics': './reports/train_metrics.json',
  'model_params': {'iterations': 10,
   'thread_count': 50,
   'has_time': True,
   'allow_writing_files': False,
   'cat_features': ['feature_17',
    'feature_21',
    'feature_11',
    'feature_11',
    'feature_16',
    'feature_22']}}}

# Check your code works 


## Manual / eye sanity checks

In [4]:
# Say Hello to all tests in the World 

list_of_tests = ['manual', 'print', 'unit', 'integration']

print('Hello {}!'.format(', '.join(list_of_tests)))

Hello manual, print, unit, integration!


## Defensive programming: assertions 

In [5]:
# Assertion example

def hello_tests(list_of_tests):
    assert len(list_of_tests) > 0, 'Test list is empty'
    print('Hello {}!'.format(', '.join(list_of_tests)))

    
hello_tests(['manual', 'print', 'unit', 'integration'])

Hello manual, print, unit, integration!


In [6]:
hello_tests([])

AssertionError: Test list is empty

## Test example

In [7]:
# content of test_sample.py
def inc(x):
    return x + 1


def test_answer():
    assert inc(3) == 4
    assert inc(3) != 5

In [8]:
test_answer()

# Tools review

## Working with Pandas DataFrames

In [9]:
import pandas as pd
import numpy as np

df = pd.DataFrame({'test': ['unit', 'print', 'unit', 'integration'],
                   'count': [50, 30, 10, 3],
                   'times': [100, 2050, 2050, 3232]})
df

Unnamed: 0,test,count,times
0,unit,50,100
1,print,30,2050
2,unit,10,2050
3,integration,3,3232


### Checking for duplicates and missing values

In [15]:
# Checking missing values

assert df.notnull().all().all()
assert ~df.isnull().any().any()
assert df.isnull().sum().sum() == 0

In [14]:
# Checking for duplicates

assert ~df.duplicated().any()

In [15]:
if df.duplicated(subset=['test']).any():
    raise ValueError('Duplicate records found for test name')

ValueError: Duplicate records found for test name

### Pandas built in testing utilities 

- https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.testing.assert_frame_equal.html 

In [18]:
from pandas.testing import assert_frame_equal
from pandas.testing import assert_index_equal
from pandas.testing import assert_series_equal

In [16]:
from pandas._testing import assert_frame_equal

df1 = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
df2 = pd.DataFrame({'a': [1, 2], 'b': [3.0, 4.0]})

In [17]:
# df1 equals itself

assert_frame_equal(df1, df1)

In [18]:
# df1 differs from df2 as column ‘b’ is of a different type.

assert_frame_equal(df1, df2)

AssertionError: Attributes of DataFrame.iloc[:, 1] (column name="b") are different

Attribute "dtype" are different
[left]:  int64
[right]: float64

In [19]:
# Ignore differing dtypes in columns with check_dtype.

assert_frame_equal(df1, df2, check_dtype=False)

## Pytest

Features:

- All tests are pre-fixed with test_
- All files containing tests are prefixed with test_


https://docs.pytest.org/en/stable/index.html

Move followint code to `tests/test_sample.py` & run 'pytest' command in terminal

```python
def inc(x):
    return x + 1


def test_answer():
    assert inc(3) == 5
```

Run tests in terminal: 
    
```bash 
pytest
```

## Hypothesis

With Hypothesis https://hypothesis.readthedocs.io/en/latest/

In [25]:
# Hypothesis - Automatic data generation for property based testing
from hypothesis import strategies as st

In [52]:
print('Examples of integers:')
print(st.integers().example())
print(st.integers().example())
print(st.integers().example())

Examples of integers:
0
0
0


In [40]:
st.text().example()

'0'

**Create Hypothesis test example**

- Create file: notebook/tests/demo_hypothesis.py 
- Add code:
demo_hypothesis.py 

```python
from hypothesis import given
from hypothesis import strategies as st


def backwards_allcaps(text):
    return text[::-1].upper()


@given(st.text())
def test_backwards_allcaps(input_string):
    modified = backwards_allcaps(input_string)
    assert input_string.upper() == ''.join(reversed(modified))
```

Example source: https://github.com/jesford/testing-in-data-science/blob/master/intro-to-testing-presentation.ipynb 

In [20]:
# By defualt `pytest` looking for files with 'test_' prefix only

!pytest

platform linux -- Python 3.8.2, pytest-6.2.2, py-1.10.0, pluggy-0.13.1
rootdir: /home/tutorial-predict-device-change
plugins: hypothesis-6.0.3
[1mcollecting ... [0m[1mcollected 0 items                                                              [0m



In [29]:
# Specify test file name to run arbitrary test file 

!pytest tests/test_demo_hypothesis.py

platform linux -- Python 3.8.5, pytest-6.2.2, py-1.10.0, pluggy-0.13.1
rootdir: /home/alex/Dev/Projects/tutorials/mlops/tutorial-predict-device-change
plugins: hypothesis-6.0.3
collected 1 item                                                               [0m

tests/test_demo_hypothesis.py [32m.[0m[32m                                          [100%][0m



In [30]:
# Use `--hypothesis-show-statistics` to show details 

!pytest tests/test_demo_hypothesis.py --hypothesis-show-statistics

platform linux -- Python 3.8.5, pytest-6.2.2, py-1.10.0, pluggy-0.13.1
rootdir: /home/alex/Dev/Projects/tutorials/mlops/tutorial-predict-device-change
plugins: hypothesis-6.0.3
collected 1 item                                                               [0m

tests/test_demo_hypothesis.py [32m.[0m[32m                                          [100%][0m

tests/test_demo_hypothesis.py::test_backwards_allcaps:

  - during reuse phase (0.01 seconds):
    - Typical runtimes: ~ 1ms, ~ 52% in data generation
    - 2 passing examples, 0 failing examples, 0 invalid examples

  - during generate phase (0.50 seconds):
    - Typical runtimes: 1-5 ms, ~ 73% in data generation
    - 98 passing examples, 0 failing examples, 2 invalid examples

  - Stopped because settings.max_examples=100




## Hypothesis + Pandas

In [67]:
from hypothesis import strategies as st
from hypothesis.extra.pandas import data_frames, column, range_indexes, series


# Generate sample scoring results 

data_frames([column('user_id',
                    elements=st.integers(min_value=0, max_value=100_000),
                    dtype=int, unique=True),
             column('prob_score',
                    elements=st.floats(min_value=0, max_value=1),
                    unique=False
                   )
        ]).example()

Unnamed: 0,user_id,prob_score


In [68]:
# Example for features sample

data_frames(index=range_indexes(min_size=5, max_size=10),
            columns = [
                column('user_id', 
                    elements=st.integers(min_value=1, max_value=100000), 
                    dtype=int, 
                    unique=True),
                 column('month', 
                        elements=st.datetimes(
                            min_value=pd.Timestamp(2020, 4, 30),
                            max_value=pd.Timestamp(2020, 8, 31)),
                        unique=True),
                 column('feature_21',
                        elements=st.text(), 
                        unique=True)
            ]).example()


Unnamed: 0,user_id,month,feature_21
0,93661,2020-05-15 05:50:20.878708,õeÐ󔞱񠼋򬍕v
1,3520,2020-07-13 19:19:54.479847,
2,21759,2020-04-30 04:24:34.385906,©
3,17610,2020-04-30 14:21:41.809512,DîX
4,28645,2020-05-04 04:31:44.412731,"dá""Ø0Ì§¯󏨜¥󪺂"
5,13931,2020-08-02 11:13:58.774214,
6,56874,2020-06-23 05:15:58.067385,\r
7,66924,2020-07-06 07:31:21.408287,KDô񩶱ÁgD¥ìåâÐÝ9򛕧÷8p󳓟j


# Unit Testing

## Load feature data

In [54]:
# Data for tutorial
target_csv = 'data/raw/target.feather'
user_features_raw = 'data/raw/user_features.feather'

# Features
categories = ['feature_17', 'feature_21', 'feature_11', 'feature_16', 'feature_22']  

num_features = ['feature_1', 'feature_2', 'feature_3', 'feature_4',
       'feature_5', 'feature_6', 'feature_7', 'feature_8', 'feature_9',
       'feature_10', 'feature_12', 'feature_13', 'feature_14', 'feature_15',
       'feature_18', 'feature_19', 'feature_20', 'feature_23', 'feature_24',
       'feature_25', 'feature_26', 'feature_27', 'feature_28', 'feature_29',
       'feature_30']

In [55]:
target_df = pd.read_feather(target_csv)
target_df.head(1)

Unnamed: 0,user_id,month,target
0,0,2020-04-30,1.0


In [56]:
user_features_df = pd.read_feather(user_features_raw)
user_features_df = user_features_df.loc[user_features_df.user_id.isin(target_df.user_id),]
user_features_df.head(1)

Unnamed: 0,user_id,month,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_21,feature_22,feature_23,feature_24,feature_25,feature_26,feature_27,feature_28,feature_29,feature_30
0,0,2020-04-30,0.993121,-15,2.274309,18,2868,-1.305588,-0.097643,0.617778,...,H4V75OQHSRBLA,AOKOISPPQLWGKK,-3.146305,-1.655508,25,3.032537,-29960,-1.86466,-0.212668,-3.245333


In [57]:
for col in categories:
    print(user_features_df[col].nunique())

42
58
43
8
44


## Testing Feature Engineering code

### New feature extraction feature

- что может пойти не так? 
- будет ли работать в production? 
- как протестировать?

In [58]:
def add_feature31(df: pd.DataFrame) -> pd.DataFrame:
    """Get first letter 'device code"""

    df['feature31'] = df.copy().feature_21.apply(lambda s: s[0])

    return df

df2 = add_feature31(user_features_df)
df2[['user_id', 'month', 'feature_21', 'feature31']].head()

Unnamed: 0,user_id,month,feature_21,feature31
0,0,2020-04-30,H4V75OQHSRBLA,H
1,0,2020-05-31,AUEOMIKY5CRWBNWO7S,A
2,0,2020-06-30,4VPOKVAQSMMTDZQ,4
3,0,2020-07-31,RAGXKIMJHFFGKA,R
4,0,2020-08-31,RAGXKIMJHFFGKA,R


In [59]:
df2.feature31.value_counts()

H    114859
R     72707
J     70910
A     56506
7     52357
D     52002
W     46512
N     46070
L     43986
M     34397
Q     27852
K     27698
6     23481
X     18273
O     14149
4     11053
E     10735
G      7666
V      7618
F      3746
Z      3683
3      2660
I      2530
U       663
C         8
B         4
5         2
T         1
Name: feature31, dtype: int64

In [51]:
# Test case 1: Null values in source data 

user_features_df.feature_21.cat.add_categories([''], inplace=True) # добавляем новую категорию 
user_features_df.feature_21.iloc[:4] = ''  # заменяем значение на '' для первых 4-х пользователей
user_features_df.head().T

Unnamed: 0,0,1,2,3,4
user_id,0,0,0,0,0
month,2020-04-30 00:00:00,2020-05-31 00:00:00,2020-06-30 00:00:00,2020-07-31 00:00:00,2020-08-31 00:00:00
feature_1,0.993121,-1.41691,0.673564,-2.12491,-2.12226
feature_2,-15,-145,23,-15,-50
feature_3,2.27431,-1.08789,0.0166665,1.23482,-0.584464
feature_4,18,-8,-7,18,-8
feature_5,2868,-1763,-4092,9348,1894
feature_6,-1.30559,-1.32201,-0.936663,0.233297,-0.0923148
feature_7,-0.0976427,3.14386,-1.90981,-0.108647,-0.54748
feature_8,0.617778,-0.272231,0.715618,-1.83146,-0.0653226


In [52]:
# Try to add features 

df2 = add_feature31(user_features_df)
df2.head()

IndexError: string index out of range

In [53]:
# Reproduce error 

s = ''
s[0]

IndexError: string index out of range

### Generate test data

In [61]:
# Extract sample data from dataset 

test_data = user_features_df[['user_id', 'month', 'feature_21']][:5].to_dict(orient='list')
test_data

{'user_id': [0, 0, 0, 0, 0],
 'month': [Timestamp('2020-04-30 00:00:00'),
  Timestamp('2020-05-31 00:00:00'),
  Timestamp('2020-06-30 00:00:00'),
  Timestamp('2020-07-31 00:00:00'),
  Timestamp('2020-08-31 00:00:00')],
 'feature_21': ['H4V75OQHSRBLA',
  'AUEOMIKY5CRWBNWO7S',
  '4VPOKVAQSMMTDZQ',
  'RAGXKIMJHFFGKA',
  'RAGXKIMJHFFGKA']}

In [62]:
# Generate test data

test_data = {
    'user_id': [0, 0, 0, 0, 0],
    'month': [
        pd.Timestamp('2020-04-30 00:00:00'),
        pd.Timestamp('2020-05-31 00:00:00'),
        pd.Timestamp('2020-06-30 00:00:00'),
        pd.Timestamp('2020-07-31 00:00:00'),
        pd.Timestamp('2020-08-31 00:00:00')],
     'feature_21': ['RAGXKIMJHFFGKA', '2322341', '!wersrqqw', None, '']
}

test_df = pd.DataFrame(test_data)
test_df.head()

Unnamed: 0,user_id,month,feature_21
0,0,2020-04-30,RAGXKIMJHFFGKA
1,0,2020-05-31,2322341
2,0,2020-06-30,!wersrqqw
3,0,2020-07-31,
4,0,2020-08-31,


### Update function

In [63]:
# Modify add_feature31()

def add_feature31(df): 
    """Add some first letter device code"""
    
    df['feature_31'] = df.feature_21.apply(lambda s: 'None' if s in ['', None] else re.findall(r'[\w]', s)[0])
    
    return df
 
add_feature31(test_df)

Unnamed: 0,user_id,month,feature_21,feature_31
0,0,2020-04-30,RAGXKIMJHFFGKA,R
1,0,2020-05-31,2322341,2
2,0,2020-06-30,!wersrqqw,w
3,0,2020-07-31,,
4,0,2020-08-31,,


### Add test function

In [64]:
def test_add_feature31(test_df):
    
    expected_data = {
        'user_id': [0, 0, 0, 0, 0],
        'month': [
            pd.Timestamp('2020-04-30 00:00:00'),
            pd.Timestamp('2020-05-31 00:00:00'),
            pd.Timestamp('2020-06-30 00:00:00'),
            pd.Timestamp('2020-07-31 00:00:00'),
            pd.Timestamp('2020-08-31 00:00:00')],
         'feature_21': ['RAGXKIMJHFFGKA', '2322341', '!wersrqqw', None, ''],
         'feature_31': ['R', '2', 'w', 'None', 'None']
    }
    
    expected_df = pd.DataFrame(expected_data)   
    calculated_df = add_feature31(test_df)

    assert calculated_df is not None 
    
    # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.testing.assert_frame_equal.html?highlight=assert 
    pd.testing.assert_frame_equal(
        left = calculated_df, 
        right = expected_df, 
        check_dtype=True, 
        check_index_type='equiv', 
        check_column_type='equiv', 
        check_names=True, 
        check_exact=True, 
#         check_datetimelike_compat=True, 
        check_categorical=True, 
    )
    
    
test_add_feature31(test_df) 

### Add new function to src/data/features.py and run feature calculation pipeline 
"""
- add function 
- add tests 

"""

In [58]:
from src.pipelines.featurize import featurize


featurize(CONFIG_PATH)

2021-02-25 14:01:01,195 — FEATURIZE — INFO — Load dataset
2021-02-25 14:01:01,906 — FEATURIZE — INFO — Process dataset
2021-02-25 14:01:02,546 — FEATURIZE — INFO — Add target column
2021-02-25 14:01:05,265 — FEATURIZE — INFO — Process nulls
2021-02-25 14:01:06,124 — FEATURIZE — INFO — Save features
2021-02-25 14:01:06,729 — FEATURIZE — DEBUG — Features path: data/processed/features.feather


In [65]:
df2.feature31.value_counts()

H    114859
R     72707
J     70910
A     56506
7     52357
D     52002
W     46512
N     46070
L     43986
M     34397
Q     27852
K     27698
6     23481
X     18273
O     14149
4     11053
E     10735
G      7666
V      7618
F      3746
Z      3683
3      2660
I      2530
U       663
C         8
B         4
5         2
T         1
Name: feature31, dtype: int64

## Testing Feature  data properties

- All features have same data type 
- Numeric features are scaled in range between 0 and 1
- No missing data 
- Missing data is replaced by some values
- Data distributions meet expectations
- No Outliers


In [71]:
# Generate sample scoring results 

df = data_frames(index=range_indexes(min_size=5, max_size=10),
                 columns = [column('user_id',
                    elements=st.integers(min_value=0, max_value=100_000),
                    dtype=int, unique=True),
                 column('prob_score',
                        elements=st.floats(min_value=0, max_value=1),
                        unique=True
                       )
                ]).example()


df.head()

Unnamed: 0,user_id,prob_score
0,3291,0.979206
1,84617,0.345345
2,74917,0.581338
3,87919,0.594053
4,74592,0.033683


In [72]:
df.shape

(9, 2)

### Test values range 

In [73]:
def test_prob_score_range(df):
    
    # test score values range in [0, 1]
    assert df.prob_score.between(0, 1, inclusive=True).all() 

In [74]:
test_prob_score_range(df)

### Test data type

In [75]:
def test_prob_score_dtype(df):
    # test dtype is float
    
    # most robust and native way to achieve dtype recognition
    assert pd.api.types.is_float_dtype(df.prob_score) 
    
    # test score dtype for a sigle value
    assert isinstance(df.prob_score[0], float)

test_prob_score_dtype(df)

### Test no missing & duplicates 

In [76]:
def test_prob_score_na(df):
    # Test no missing & duplicates
    
    assert df.notnull().all().all()
    assert ~df.duplicated().any()

test_prob_score_na(df)

## Test Inputs 



### Generate data schema 

Best practice is to use a schema. 
A schema is a specification of rules or data properties expected for a set of fields. 

Example schema for Iris dataset: 
```json
iris_schema = {
    'sepal length': {
        'range': {
            'min': 4.0, 
            'max': 8.0
        },
        'dtype': float,
    },
    'sepal width': {
        'range': {
            'min': 1.0,
            'max': 5.0
        },
        'dtype': float,
    },
    'petal length': {
        'range': {
            'min': 1.0,
            'max': 7.0
        },
        'dtype': float,
    },
    'petal width': {
        'range': {
            'min': 0.1,
            'max': 3.0
        },
        'dtype': float,
    }
}
```

In [77]:
import pprint

def gen_category_schema(df, categories, num_features):
    """Generate data schema for category and numeric features"""
    
    schema = {}
    
    cat = df.loc[:, categories].describe()
    cat.loc['unique', :] = df.loc[:, categories].apply(lambda x: x.unique().tolist())
    cat.loc['nunique', :] = df.loc[:, categories].apply(lambda x: x.nunique())
    cat.loc['dtype', :] = 'category'
    schema.update(cat.T.to_dict('index'))
    
    num_df = df.copy().loc[:, num_features]
    num = num_df.describe()
    num = num.append(pd.DataFrame(
        {'dtype': [t.__str__() for t in num_df.dtypes.values]}, 
        index = num_features).T)
    schema.update(num.T.to_dict('index'))
    
    return schema
    
    
dschema = gen_category_schema(user_features_df, categories, num_features)
pprint.pprint(dschema)

{'feature_1': {'25%': -1.1430581298920903,
               '50%': 0.03319521301712278,
               '75%': 1.2199566293957183,
               'count': 752128.0,
               'dtype': 'float64',
               'max': 8.71482774873068,
               'mean': 0.04383947463088342,
               'min': -8.908541643807217,
               'std': 1.774547872522771},
 'feature_10': {'25%': -1.0304815357099264,
                '50%': 0.18618795439381358,
                '75%': 1.4703034668456332,
                'count': 752128.0,
                'dtype': 'float64',
                'max': 10.148381364385477,
                'mean': 0.23737338661451513,
                'min': -9.170694699579633,
                'std': 1.9071496456722656},
 'feature_11': {'count': 752128,
                'dtype': 'category',
                'freq': 69896,
                'nunique': 43,
                'top': 'MHQITDLH6CZQ',
                'unique': ['3NWLPIR2TFFUM62',
                           'JODPUBMS7GRNB

In [66]:
s  = set(df.columns) - set(['count'])
s

{'prob_score', 'user_id'}

In [67]:
for i in s: print(i)

user_id
prob_score


### Test Input data types

In [68]:
def test_input_data_types(df, dschema):

    for feature  in dschema.keys():        
        assert df[feature].dtype.__str__() == dschema[feature]['dtype'], f'Data type test failed for {feature}'
            
test_input_data_types(user_features_df, dschema)

- Break test example

In [69]:
# Create new DF
user_features_df2 = user_features_df.copy()

# Change dtype for 'feature_1'
user_features_df2['feature_1'] = user_features_df2['feature_1'].astype('str')

# Run test
test_input_data_types(user_features_df2, dschema)

AssertionError: Data type test failed for feature_1

### Test numeric data ranges (min, max)

In [70]:
def test_numeric_data_ranges(df, dschema, num_features):

    for feature in num_features:
        # use assertions to ensure the max/min values found in the dataset
        assert df[feature].max() <= dschema[feature]['max'], f'Max value test failed for {feature}'
        assert df[feature].min() >= dschema[feature]['min'], f'Min value test failed for {feature}'

test_numeric_data_ranges(user_features_df, dschema, num_features)

- Try to break test

In [71]:
# Create new DF
user_features_df2 = user_features_df.copy()

# Check max value
user_features_df2['feature_2'].max()
print(f"Old max value: {user_features_df2['feature_2'].max()}")

# Simulate max value change
user_features_df2.loc[:0, 'feature_2'] = user_features_df2['feature_2'].max() * 10
print(f"New max value: {user_features_df2['feature_2'].max()}")

Old max value: 450
New max value: 4500


In [72]:
# Run test for new data

test_numeric_data_ranges(user_features_df2, dschema, num_features)

AssertionError: Max value test failed for feature_2

### Test category data values

In [73]:
def test_category_data_values(df, dschema, categories):

    for feature in categories:
        
        set_a = df.loc[:, feature].unique().tolist()

        # use assertions to ensure the feature categories exist in schema
        cat_dif = list(set(set_a) - set(dschema[feature]['unique']))
        assert len(cat_dif) == 0, f'DF has categories not in shema: {cat_dif}'

test_category_data_values(user_features_df, dschema, categories)

- Break test example

In [74]:
# Add a new category 'Unknown'
feature_17_categories = user_features_df2['feature_17'].tolist()
feature_17_categories[0] = 'Unknown'
user_features_df2['feature_17'] = pd.Categorical(
    user_features_df2['feature_17'], categories=set(feature_17_categories)
)
user_features_df2['feature_17'] = feature_17_categories
user_features_df2['feature_17'].head()


0             Unknown
1    OUH6V7W7UIPZ2AZI
2    OUH6V7W7UIPZ2AZI
3        E62S2GPTI3CU
4        E62S2GPTI3CU
Name: feature_17, dtype: object

In [75]:
# Run test 

test_category_data_values(user_features_df2, dschema, categories)

AssertionError: DF has categories not in shema: ['Unknown']

## Testing Pipeline Config 

In [87]:
config = yaml.safe_load(open('params.yaml'))
pprint.pprint(config)

{'Base': {'log_level': 'DEBUG', 'random_state': 42},
 'Data': {'scoring_target_raw': './data/raw/scoring_target.feather',
          'scoring_user_features_raw': './data/raw/scoring_user_features.feather',
          'target_raw': './data/raw/target.feather',
          'user_features_raw': './data/raw/user_features.feather'},
 'Features': {'features_path': './data/processed/features.feather',
              'predicted_target_path': './data/processed/predicted_target.feather',
              'scoring_features_path': './data/processed/scoring_features.feather'},
 'Train': {'model_params': {'allow_writing_files': False,
                            'cat_features': ['feature_17',
                                             'feature_21',
                                             'feature_11',
                                             'feature_11',
                                             'feature_16',
                                             'feature_22'],
                        

### Test required fields

In [88]:
def test_check_required_fields(config):
       
    assert 'base' in config.keys()
    assert 'data_load' in config.keys()
    assert 'target' in config['data_load'].keys()
    assert 'dataset' in config['data_load'].keys()
    assert 'train' in config.keys()
    
    

test_check_required_fields(config)

AssertionError: 

### Test config value types

In [89]:
def test_config_value_types(config):
    
    assert isinstance(config['base']['random_state'], int)
    assert isinstance(config['data_load']['target'], str)
    assert isinstance(config['data_load']['dataset'], str)
    assert isinstance(config['train']['model_path'], str)
    
test_config_value_types(config)

KeyError: 'base'

### Test Model params

In [90]:
from catboost import CatBoostClassifier 

def test_model_params(config):
    """Test the estimator can be created with params in config"""

    # Then
    # TODO: fix tests or fix params.yaml: now there is just one estimator in train section
    assert CatBoostClassifier(**config['train']['estimators']['catboost'])
    assert config['train']['estimators']['catboost']['loss_function'] == 'Logloss'
    
test_model_params(config)

KeyError: 'train'

# Run all tests

To run all tests 
```bash
pytest
```

To run a specific test  
```bash
pytest tests/test_config.py
```

In [84]:
!pytest

platform linux -- Python 3.8.5, pytest-6.2.2, py-1.10.0, pluggy-0.13.1
rootdir: /home/alex/Dev/Projects/tutorials/mlops/tutorial-predict-device-change
plugins: hypothesis-6.0.3
collected 11 items                                                             [0m[1m

tests/test_config.py [32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m                                           [ 81%][0m
tests/test_demo_hypothesis.py [32m.[0m[32m                                          [ 90%][0m
tests/data/test_features.py [32m.[0m[32m                                            [100%][0m

