In [5]:
import sys
import pandas as pd
import logging
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from dataclasses import dataclass, field
from typing import List, Optional

logger = logging.getLogger(__name__)
handler = logging.StreamHandler(sys.stdout)
logger.setLevel(logging.INFO)
logger.addHandler(handler)

In [12]:

@dataclass()
class FeatureParams:
    categorical_features: List[str]
    numerical_features: List[str]
    target_col: str = field(default="target")


def build_numerical_pipeline() -> Pipeline:
    return Pipeline([("scaler", StandardScaler())])


def build_categorical_pipeline() -> Pipeline:
    return Pipeline([("ohe", OneHotEncoder())])


def build_transformer(params: FeatureParams) -> ColumnTransformer:
    return ColumnTransformer(
        [
            (
                "categorical_pipeline",
                build_categorical_pipeline(),
                params.categorical_features
            ),
            (
                "numerical_pipeline",
                build_numerical_pipeline(),
                params.numerical_features
            )
        ]
    )


def process_categorical_features(df: pd.DataFrame) -> pd.DataFrame:
    categorical_pipeline = build_categorical_pipeline()
    df.fillna("nan", inplace=True)
    
    temp_df = pd.DataFrame(categorical_pipeline.fit_transform(df))
    logger.info("check temp_cat_df")
    logger.info(temp_df.head())
    
    return pd.DataFrame(categorical_pipeline.fit_transform(df).toarray())


def process_numerical_features(df: pd.DataFrame) -> pd.DataFrame:
    numerical_pipeline = build_numerical_pipeline()
    df.fillna(0, inplace=True)
    
    temp_df = pd.DataFrame(numerical_pipeline.fit_transform(df))
    logger.info("check temp_num_df")
    logger.info(temp_df.head())
    
    return pd.DataFrame(numerical_pipeline.fit_transform(df))


def process_features(transformer: ColumnTransformer, df: pd.DataFrame) -> pd.DataFrame:
    return pd.DataFrame(transformer.transform(df))


def extract_target(df: pd.DataFrame, params: FeatureParams) -> pd.Series:
    return df[params.target_col]

In [15]:
import pandas as pd
from dataclasses import dataclass, field
from marshmallow_dataclass import class_schema
import yaml
import logging
import sys
from typing import List, Optional

#from build_features import build_transformer, extract_target, process_features

logger = logging.getLogger(__name__)
handler = logging.StreamHandler(sys.stdout)
logger.setLevel(logging.INFO)
logger.addHandler(handler)


@dataclass()
class SplittingParams:
    val_size: float = field(default=0.2)
    random_state: int = field(default=42)


@dataclass()
class FeatureParams:
    categorical_features: List[str]
    numerical_features: List[str]
    target_col: str = field(default="target")


@dataclass()
class TrainingParams:
    model_type: str = field(default="RandomForestClassifier")
    random_state: int = field(default=42)
    # RF params
    max_depth: int = field(default=5)
    # LR params
    solver: str = field(default="lbfgs")
    C: float = field(default=1.0)


@dataclass()
class TrainingPipelineParams:
    output_data_featurized_path: str
    output_data_train_path: str
    output_data_test_path: str
    output_target_train_path: str
    output_target_test_path: str
    output_model_path: str
    output_transformer_path: str
    metric_path: str
    splitting_params: SplittingParams
    feature_params: FeatureParams
    train_params: TrainingParams
    input_data_path: str = field(default="data/wines_SPA.csv")


TrainingPipelineParamsSchema = class_schema(TrainingPipelineParams)


def read_training_pipeline_params(path: str) -> TrainingPipelineParams:
    with open(path, "r") as input_stream:
        config_dict = yaml.safe_load(input_stream)
        schema = TrainingPipelineParamsSchema().load(config_dict)
        logger.info(f"Check schema: {schema}")
        return schema


def process_features_targets(params: TrainingPipelineParams) -> (pd.DataFrame, pd.DataFrame):
    df = pd.read_csv(params.input_data_path)

    logger.info("input df")
    logger.info(df.head())
    
    transformer = build_transformer(params.feature_params)
    transformer.fit(df)

    train_features = process_features(transformer, df)
    train_target = extract_target(df, params.feature_params)

    return train_features, train_target

In [16]:
config_path = "../configs/train_config_notebook.yaml"

training_pipeline_params = read_training_pipeline_params(config_path)

features, target = process_features_targets(
    training_pipeline_params
)

logger.info(features.head())
logger.info(target.head())
features.to_csv('../data/data_featurized.csv', index=False)

Check schema: TrainingPipelineParams(output_data_featurized_path='../data/data_featurized.csv', output_data_train_path='../data/train.csv', output_data_test_path='../data/test.csv', output_target_train_path='../data/train_target.csv', output_target_test_path='../data/test_target.csv', output_model_path='../models/model.pkl', output_transformer_path='../models/transformer.pkl', metric_path='../models/metrics.json', splitting_params=SplittingParams(val_size=0.2, random_state=42), feature_params=FeatureParams(categorical_features=['winery', 'wine', 'year', 'region', 'type'], numerical_features=['rating', 'body', 'acidity'], target_col='price'), train_params=TrainingParams(model_type='RandomForestClassifier', random_state=42, max_depth=10, solver='lbfgs', C=1.0), input_data_path='../data/wines_SPA.csv')
Check schema: TrainingPipelineParams(output_data_featurized_path='../data/data_featurized.csv', output_data_train_path='../data/train.csv', output_data_test_path='../data/test.csv', output_

          winery           wine  year  rating  num_reviews country  \
0  Teso La Monja          Tinto  2013     4.9           58  Espana   
1         Artadi  Vina El Pison  2018     4.9           31  Espana   
2   Vega Sicilia          Unico  2009     4.8         1793  Espana   
3   Vega Sicilia          Unico  1999     4.8         1705  Espana   
4   Vega Sicilia          Unico  1996     4.8         1309  Espana   

             region   price                  type  body  acidity  
0              Toro  995.00              Toro Red   5.0      3.0  
1    Vino de Espana  313.50           Tempranillo   4.0      2.0  
2  Ribera del Duero  324.95  Ribera Del Duero Red   5.0      3.0  
3  Ribera del Duero  692.96  Ribera Del Duero Red   5.0      3.0  
4  Ribera del Duero  778.06  Ribera Del Duero Red   5.0      3.0  
          winery           wine  year  rating  num_reviews country  \
0  Teso La Monja          Tinto  2013     4.9           58  Espana   
1         Artadi  Vina El Pison  2018