In [1]:
import json
import logging
import sys

import click
import pandas as pd

# Notebook-wide logger that writes INFO-and-above messages to the cell output.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
handler = logging.StreamHandler(sys.stdout)
# logging.getLogger returns the same logger object on every call, so simply
# re-running this cell would stack one more handler each time and every
# message would be printed several times. Attach a stdout handler only once.
if not any(
    isinstance(existing, logging.StreamHandler)
    and getattr(existing, "stream", None) is sys.stdout
    for existing in logger.handlers
):
    logger.addHandler(handler)

#### 1. Load pipeline parameters from the config file

In [6]:
from typing import List, Optional
from dataclasses import dataclass, field
from marshmallow_dataclass import class_schema
import yaml

# Fallback values used by the parameter dataclasses below when the config
# file does not provide the corresponding key.
RANDOM_STATE = 42  # default `random_state` of SplittingParams and ModelType
TEST_SIZE = 0.3  # default `test_size` of SplittingParams
N_ESTIMATORS = 100  # default `n_estimators` of ModelType


@dataclass
class DownloadingParams:
    """Where the raw dataset archive comes from and where it is stored.

    ``file_link`` is mandatory; ``output_folder`` and ``name`` fall back to
    the defaults below when they are not supplied.
    """
    file_link: str
    output_folder: str = "data/raw/"
    name: str = "data.zip"
        
@dataclass()
class SplittingParams:
    """Settings of the train/test split: hold-out share and random seed."""
    test_size: float = TEST_SIZE
    random_state: int = RANDOM_STATE
        

@dataclass
class FeatureParams:
    """Column names of the dataset grouped by how they are processed.

    All three fields are required (no defaults); ``target`` may be ``None``.
    """
    # Columns routed to the one-hot encoding branch in build_feature_transformer.
    cat_features: List[str]
    # Columns routed to the min-max scaling branch in build_feature_transformer.
    num_features: List[str]
    # Name of the label column; presumably None for unlabeled data - TODO confirm.
    target: Optional[str]

@dataclass()
class OutliersNulls:
    """Column names used while cleaning the raw data.

    ``outliers`` - column whose zero values mark rows to be dropped,
    ``nulls`` - column whose zero values are replaced by the column median,
    ``target`` - label column that is split off from the features.
    """
    outliers: str = "RestingBP"
    nulls: str = "Cholesterol"
    target: str = "HeartDisease"
        
# Default `min_samples_leaf` of ModelType.
# NOTE(review): the name says MEAN but the value feeds `min_samples_leaf`;
# this looks like a typo for MIN_SAMPLES_LEAF - confirm before renaming.
MEAN_SAMPLES_LEAF = 5
# Default `criterion` of ModelType.
CRITERION = "gini"

@dataclass
class ModelType:
    """Which classifier to train and the hyperparameters configured for it."""
    model_type: str = 'RandomForestClassifier'
    n_estimators: int = N_ESTIMATORS
    min_samples_leaf: int = MEAN_SAMPLES_LEAF
    criterion: str = CRITERION
    random_state: int = RANDOM_STATE


@dataclass
class CustomTransformer:
    """Holds the on/off switch for the custom transformer."""
    # Required flag (no default). The code that reads it is not part of this
    # notebook section - presumably it selects a custom preprocessing step; TODO confirm.
    use_custom_transformer: bool

In [7]:
@dataclass
class TrainingPipelineParams:
    """Top-level container for every setting of the training pipeline.

    All fields are required. An instance is built from a YAML config by
    ``read_training_pipeline_params``.
    """
    # Path of the raw CSV that is read with read_raw_data.
    input_data_path: str
    # Presumably where evaluation metrics are written - TODO confirm.
    metric_path: str
    # Presumably where the fitted model is saved - TODO confirm.
    save_model_path: str
    # Presumably where the fitted feature transformer is saved - TODO confirm.
    save_transformer_path: str
    # Source link and local destination of the dataset archive.
    downloading_params: DownloadingParams
    # Classifier name and its hyperparameters.
    model_type: ModelType
    # Columns used for outlier removal, null imputation and target extraction.
    clean_features: OutliersNulls
    # Switch for the custom transformer.
    custom_transformer: CustomTransformer
    # Categorical / numerical / target column names.
    feature_params: FeatureParams
    # Train/test split settings.
    splitting_params: SplittingParams


TrainingPipelineParamsSchema = class_schema(TrainingPipelineParams)


def read_training_pipeline_params(path: str) -> TrainingPipelineParams:
    """Parse the YAML config at ``path`` into a ``TrainingPipelineParams`` object.

    The file content is read with ``yaml.safe_load`` and then validated and
    deserialized through ``TrainingPipelineParamsSchema``.
    """
    with open(path, "r") as config_file:
        params_schema = TrainingPipelineParamsSchema()
        raw_config = yaml.safe_load(config_file)
    return params_schema.load(raw_config)

In [8]:
config_path = "../../configs/train_config_random_forest.yaml"
training_pipeline_params: TrainingPipelineParams = read_training_pipeline_params(config_path)

training_pipeline_params

TrainingPipelineParams(input_data_path='data/raw/heart.csv', metric_path='models/metrics_train_random_forest.json', save_model_path='models/random_forest.pkl', save_transformer_path='models/transformer_random_forest.pkl', downloading_params=DownloadingParams(file_link='https://drive.google.com/file/d/1FU8p1PG7O_nGKvpjFXuXfoWDPCBvm8xS/view?usp=sharing', output_folder='data/raw/', name='data.zip'), model_type=ModelType(model_type='RandomForestClassifier', n_estimators=200, min_samples_leaf=5, criterion='gini', random_state=42), clean_features=OutliersNulls(outliers='RestingBP', nulls='Cholesterol', target='HeartDisease'), custom_transformer=CustomTransformer(use_custom_transformer=False), feature_params=FeatureParams(cat_features=['Sex', 'ChestPainType', 'FastingBS', 'RestingECG', 'ExerciseAngina', 'ST_Slope'], num_features=['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak'], target='HeartDisease'), splitting_params=SplittingParams(test_size=0.3, random_state=0))

In [9]:
training_pipeline_params.downloading_params

DownloadingParams(file_link='https://drive.google.com/file/d/1FU8p1PG7O_nGKvpjFXuXfoWDPCBvm8xS/view?usp=sharing', output_folder='data/raw/', name='data.zip')

In [10]:
import gdown

In [11]:
def download_data(url: str, output_file_path: str):
    """Download the dataset archive from ``url`` into ``output_file_path``.

    The download is best-effort: network/file errors are logged and swallowed
    so the notebook keeps running. Success is reported only when gdown
    actually returned an output path.

    Args:
        url: link to the file (passed to gdown with ``fuzzy=True``).
        output_file_path: destination path of the downloaded file.

    Returns:
        None.
    """
    logger.info("Loading dataset... ")
    try:
        downloaded_path = gdown.download(url=url, output=output_file_path, fuzzy=True, quiet=True)
    except OSError as error:
        # OSError covers the builtin ConnectionError as well as the exceptions
        # raised by `requests` (they derive from IOError, not from the builtin
        # ConnectionError, so the narrower clause never matched them).
        logger.error("ConnectionError: you should have link to the internet! (%s)", error)
        return
    # NOTE(review): some gdown versions return None on failure instead of
    # raising, so the return value has to be checked before reporting success.
    if downloaded_path is None:
        logger.error("Dataset was not downloaded from %s", url)
        return
    logger.info("Dataset was downloaded")

url = training_pipeline_params.downloading_params.file_link
output = "../../" + training_pipeline_params.downloading_params.output_folder + training_pipeline_params.downloading_params.name
download_data(url, output)

Loading dataset... 
Dataset was downloaded


In [12]:
training_pipeline_params.downloading_params.output_folder

'data/raw/'

In [13]:
import zipfile
def unzip_downloaded_data(path_to_zip_file: str, output_folder: Optional[str] = None):
    """Extract every member of a zip archive.

    Args:
        path_to_zip_file: path of the archive to unpack.
        output_folder: directory to extract into. Defaults to the directory
            that contains the archive.

    The previous version passed the archive path itself as the extraction
    directory, which cannot work because a file with that name already exists.
    """
    import os

    if output_folder is None:
        output_folder = os.path.dirname(os.path.abspath(path_to_zip_file))
    # The archive is fetched from an external link; extractall() is used
    # as-is, so only run this on archives you trust.
    with zipfile.ZipFile(path_to_zip_file, 'r') as zip_ref:
        zip_ref.extractall(output_folder)

In [14]:
def read_raw_data(path: str) -> pd.DataFrame:
    """Load the CSV file at ``path`` into a DataFrame, logging progress and shape."""
    logger.info('Loading dataset from %s...', path)
    raw_frame = pd.read_csv(path)
    for message, argument in (
        ('Loading from %s finished', path),
        ('Data shape %s', raw_frame.shape),
    ):
        logger.info(message, argument)
    return raw_frame
#                                 training_pipeline_params.input_data_path
df: pd.DataFrame = read_raw_data('../../' + training_pipeline_params.input_data_path) 

Loading dataset from ../../data/raw/heart.csv...
Loading from ../../data/raw/heart.csv finished
Data shape (918, 12)


In [15]:
from typing import Tuple, Optional
def prepare_data(df: pd.DataFrame, params) -> Tuple[pd.DataFrame, Optional[pd.Series]]:
    """Clean the raw frame and split off the target column.

    Steps:
      1. drop rows whose ``params.outliers`` column equals 0;
      2. replace zeros in the ``params.nulls`` column with the column median;
      3. if ``params.target`` is a column, remove it from the features.

    The input frame is not modified; a cleaned copy is returned.

    Args:
        df: raw data.
        params: object exposing ``outliers``, ``nulls`` and ``target``
            attributes (column names).

    Returns:
        ``(features, target)``; ``target`` is ``None`` when the target column
        is not present in ``df``.
    """
    logger.info('Preparing dataset...')
    logger.info('Outliers handling...')
    outliers_field = params.outliers
    # Filter with a boolean mask instead of converting index labels to
    # positions: the old `df.drop(df.index[row])` was only correct for a
    # default 0..n-1 index and raised or dropped wrong rows otherwise.
    is_outlier = df[outliers_field] == 0
    df = df.loc[~is_outlier].copy()

    logger.info('Nulls handling...')
    nulls_field = params.nulls
    # NOTE(review): the median is computed with the zero placeholders still
    # included, which pulls it down; confirm whether zeros should be excluded.
    median_values = df[nulls_field].median()
    is_null = df[nulls_field] == 0
    df.loc[is_null, nulls_field] = median_values

    target_name = params.target
    if target_name in df.columns:
        logger.info('Extract and drop target feature...')
        target = df[target_name]
        df = df.drop([target_name], axis=1)
        return df, target
    return df, None

In [19]:
training_pipeline_params.clean_features

OutliersNulls(outliers='RestingBP', nulls='Cholesterol', target='HeartDisease')

In [16]:
df, target = prepare_data(df, training_pipeline_params.clean_features)

Preparing dataset...
Outliers handling...
Nulls handling...
Extract and drop target feature...


In [25]:
training_pipeline_params.splitting_params

SplittingParams(test_size=0.3, random_state=0)

In [30]:
from sklearn.model_selection import train_test_split
def split_data(df: pd.DataFrame,
               target: pd.Series,
               params) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    """Make a stratified train/test split of the features and the target.

    ``params`` supplies ``test_size`` and ``random_state``; stratification is
    done on ``target``. Returns ``(X_train, X_test, y_train, y_test)``.
    """
    logger.info('Splitting data into train and test...')
    split_options = {
        'test_size': params.test_size,
        'random_state': params.random_state,
        'stratify': target,
    }
    features_train, features_test, target_train, target_test = train_test_split(
        df, target, **split_options
    )
    return features_train, features_test, target_train, target_test

In [31]:
df_train, df_test, train_target, test_target = split_data(df, target,
                                                          training_pipeline_params.splitting_params)

Splitting data into train and test...


In [32]:
training_pipeline_params.feature_params

FeatureParams(cat_features=['Sex', 'ChestPainType', 'FastingBS', 'RestingECG', 'ExerciseAngina', 'ST_Slope'], num_features=['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak'], target='HeartDisease')

In [33]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline

def build_feature_transformer(params):
    """Assemble the (unfitted) preprocessing pipeline.

    Columns in ``params.num_features`` are min-max scaled and columns in
    ``params.cat_features`` are one-hot encoded (unknown categories are
    ignored). The resulting ``ColumnTransformer`` is wrapped in a one-step
    ``Pipeline`` named ``'preprocessor'``.
    """
    column_branches = [
        (
            'numerical',
            Pipeline(steps=[('min_max', MinMaxScaler())]),
            params.num_features,
        ),
        (
            'categorical',
            Pipeline(steps=[('one_hot', OneHotEncoder(handle_unknown='ignore'))]),
            params.cat_features,
        ),
    ]
    column_transformer = ColumnTransformer(transformers=column_branches)
    return Pipeline(steps=[('preprocessor', column_transformer)])

In [45]:
def fit_transformer(transformer: ColumnTransformer, df_train: pd.DataFrame):
    """Fit the feature transformer on the training data and return it.

    The same object that was passed in is returned after ``fit`` has been
    called on it.

    NOTE(review): in this notebook the argument is the ``Pipeline`` produced
    by ``build_feature_transformer`` (wrapping a ``ColumnTransformer``), so
    the ``ColumnTransformer`` annotation is narrower than the actual type.
    """
    transformer.fit(df_train)
    return transformer

In [47]:
def get_features(transformer: ColumnTransformer, df: pd.DataFrame):
    """Return ``df`` transformed by an already fitted ``transformer``.

    Only ``transform`` is called here; the transformer must have been fitted
    beforehand (see ``fit_transformer``).
    """
    return transformer.transform(df)

In [46]:
transformer = build_feature_transformer(training_pipeline_params.feature_params)
transformer = fit_transformer(transformer, df_train)

In [48]:
train_features = get_features(transformer, df_train)

In [35]:
training_pipeline_params.feature_params

FeatureParams(cat_features=['Sex', 'ChestPainType', 'FastingBS', 'RestingECG', 'ExerciseAngina', 'ST_Slope'], num_features=['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak'], target='HeartDisease')

In [36]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier


from typing import Union
SklearnClassificationModel = Union[RandomForestClassifier, GradientBoostingClassifier]

def train_model(df: pd.DataFrame, target: pd.Series, train_params) -> SklearnClassificationModel:
    """Build the classifier selected in ``train_params`` and fit it.

    The hyperparameters stored in ``train_params`` (``n_estimators``,
    ``min_samples_leaf``, ``random_state`` and, for the random forest,
    ``criterion``) are passed to the estimator. Previously they were ignored
    and the model was always created with scikit-learn defaults.

    Args:
        df: training features (in this notebook, the transformer output).
        target: training labels.
        train_params: object with a ``model_type`` attribute and, optionally,
            the hyperparameter attributes listed above.

    Returns:
        The fitted estimator.

    Raises:
        NotImplementedError: if ``model_type`` is not a supported classifier.
    """
    logger.info('Start loading %s model...', train_params.model_type)

    # Hyperparameters understood by both estimators. Attributes that the
    # params object does not define are skipped so that objects carrying only
    # `model_type` keep working.
    model_kwargs = {
        name: getattr(train_params, name)
        for name in ('n_estimators', 'min_samples_leaf', 'random_state')
        if hasattr(train_params, name)
    }

    if train_params.model_type == 'RandomForestClassifier':
        # `criterion` values such as 'gini' are valid for the random forest
        # only; gradient boosting uses a different set of criteria.
        if hasattr(train_params, 'criterion'):
            model_kwargs['criterion'] = train_params.criterion
        model = RandomForestClassifier(**model_kwargs)

    elif train_params.model_type == 'GradientBoostingClassifier':
        model = GradientBoostingClassifier(**model_kwargs)

    else:
        # logger.exception is meant for use inside an `except` block; here
        # there is no active exception, so log a plain error instead.
        logger.error('Model is incorrect')
        raise NotImplementedError(f'Unsupported model_type: {train_params.model_type!r}')

    logger.info('Finished loading model')
    logger.info('Start model fitting...')
    model.fit(df, target)
    logger.info('Finished model fitting')
    return model

In [38]:
training_pipeline_params.model_type

ModelType(model_type='RandomForestClassifier', n_estimators=200, min_samples_leaf=5, criterion='gini', random_state=42)

In [49]:
RandomForestClassifier().fit(train_features, train_target)

RandomForestClassifier()

In [51]:
model = train_model(train_features, train_target, training_pipeline_params.model_type )

Start loading RandomForestClassifier model...
Finished loading model
Start model fitting...
Finished model fitting


In [53]:
training_pipeline_params.metric_path

'models/metrics_train_random_forest.json'

In [54]:
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier(n_estimators=200, min_samples_leaf=5, min_samples_split=10)

In [55]:
gbc.fit(train_features, train_target)

GradientBoostingClassifier(min_samples_leaf=5, min_samples_split=10,
                           n_estimators=200)

In [58]:
type(gbc).__name__

'GradientBoostingClassifier'

In [60]:
training_pipeline_params.save_model_path

'models/random_forest.pkl'