In [5]:
import sys
import pandas as pd
import logging
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from dataclasses import dataclass, field
from typing import List, Optional

logger = logging.getLogger(__name__)
handler = logging.StreamHandler(sys.stdout)
logger.setLevel(logging.INFO)
logger.addHandler(handler)

In [13]:

@dataclass()
class FeatureParams:
    categorical_features: List[str]
    numerical_features: List[str]
    target_col: str = field(default="target")


def build_numerical_pipeline() -> Pipeline:
    return Pipeline([("scaler", StandardScaler())])


def build_categorical_pipeline() -> Pipeline:
    #return Pipeline([("ohe", OneHotEncoder())])
    return Pipeline([("ohe", LabelEncoder())])


def build_transformer(params: FeatureParams) -> ColumnTransformer:
    return ColumnTransformer(
        [
            (
                "categorical_pipeline",
                build_categorical_pipeline(),
                params.categorical_features
            ),
            (
                "numerical_pipeline",
                build_numerical_pipeline(),
                params.numerical_features
            )
        ]
    )


def process_categorical_features(df: pd.DataFrame) -> pd.DataFrame:
    categorical_pipeline = build_categorical_pipeline()
    df.fillna("nan", inplace=True)
    
    temp_df = pd.DataFrame(categorical_pipeline.fit_transform(df))
    logger.info("check temp_cat_df")
    logger.info(temp_df.head())
    
    return pd.DataFrame(categorical_pipeline.fit_transform(df).toarray())


def process_numerical_features(df: pd.DataFrame) -> pd.DataFrame:
    numerical_pipeline = build_numerical_pipeline()
    df.fillna(0, inplace=True)
    
    temp_df = pd.DataFrame(numerical_pipeline.fit_transform(df))
    logger.info("check temp_num_df")
    logger.info(temp_df.head())
    
    return pd.DataFrame(numerical_pipeline.fit_transform(df))


def process_features(transformer: ColumnTransformer, df: pd.DataFrame) -> pd.DataFrame:
    return pd.DataFrame(transformer.transform(df))


def extract_target(df: pd.DataFrame, params: FeatureParams) -> pd.Series:
    return df[params.target_col]

In [16]:
import pandas as pd
from dataclasses import dataclass, field
from marshmallow_dataclass import class_schema
import yaml
import logging
import sys
from typing import List, Optional

#from build_features import build_transformer, extract_target, process_features

logger = logging.getLogger(__name__)
handler = logging.StreamHandler(sys.stdout)
logger.setLevel(logging.INFO)
logger.addHandler(handler)


@dataclass()
class SplittingParams:
    val_size: float = field(default=0.2)
    random_state: int = field(default=42)


@dataclass()
class FeatureParams:
    categorical_features: List[str]
    numerical_features: List[str]
    target_col: str = field(default="target")


@dataclass()
class TrainingParams:
    model_type: str = field(default="RandomForestClassifier")
    random_state: int = field(default=42)
    # RF params
    max_depth: int = field(default=5)
    # LR params
    solver: str = field(default="lbfgs")
    C: float = field(default=1.0)


@dataclass()
class TrainingPipelineParams:
    output_data_featurized_path: str
    output_data_train_path: str
    output_data_test_path: str
    output_target_train_path: str
    output_target_test_path: str
    output_model_path: str
    output_transformer_path: str
    metric_path: str
    splitting_params: SplittingParams
    feature_params: FeatureParams
    train_params: TrainingParams
    input_data_path: str = field(default="data/wines_SPA.csv")


TrainingPipelineParamsSchema = class_schema(TrainingPipelineParams)


def read_training_pipeline_params(path: str) -> TrainingPipelineParams:
    with open(path, "r") as input_stream:
        config_dict = yaml.safe_load(input_stream)
        schema = TrainingPipelineParamsSchema().load(config_dict)
        logger.info(f"Check schema: {schema}")
        return schema


def process_features_targets(params: TrainingPipelineParams) -> (pd.DataFrame, pd.DataFrame):
    df = pd.read_csv(params.input_data_path)
    df.dropna(inplace=True)

    print('Categorical columns: ')
    for col in df.columns:
        if df[col].dtype == 'object':
            print(str(col))
            label = LabelEncoder()
            label.fit(df[col])
            df[col] = label.transform(df[col].astype(str))
    
    logger.info("input df")
    logger.info(df.head())
    
    transformer = build_transformer(params.feature_params)
    transformer.fit(df)

    train_features = process_features(transformer, df)
    train_target = extract_target(df, params.feature_params)

    return train_features, train_target

In [17]:
config_path = "../configs/train_config_notebook.yaml"

training_pipeline_params = read_training_pipeline_params(config_path)

features, target = process_features_targets(
    training_pipeline_params
)

logger.info(features.head())
logger.info(target.head())
features.to_csv('../data/data_featurized.csv', index=False)

Check schema: TrainingPipelineParams(output_data_featurized_path='../data/data_featurized.csv', output_data_train_path='../data/train.csv', output_data_test_path='../data/test.csv', output_target_train_path='../data/train_target.csv', output_target_test_path='../data/test_target.csv', output_model_path='../models/model.pkl', output_transformer_path='../models/transformer.pkl', metric_path='../models/metrics.json', splitting_params=SplittingParams(val_size=0.2, random_state=42), feature_params=FeatureParams(categorical_features=['winery', 'wine', 'year', 'region', 'type'], numerical_features=['rating', 'body', 'acidity'], target_col='price'), train_params=TrainingParams(model_type='RandomForestClassifier', random_state=42, max_depth=10, solver='lbfgs', C=1.0), input_data_path='../data/wines_SPA.csv')
Check schema: TrainingPipelineParams(output_data_featurized_path='../data/data_featurized.csv', output_data_train_path='../data/train.csv', output_data_test_path='../data/test.csv', output_

   winery  wine  year  rating  num_reviews  country  region   price  type  \
0     371   667    61     4.9           58        0      61  995.00    19   
1      30   714    66     4.9           31        0      64  313.50    18   
2     395   683    57     4.8         1793        0      49  324.95    11   
3     395   683    47     4.8         1705        0      49  692.96    11   
4     395   683    44     4.8         1309        0      49  778.06    11   

   body  acidity  
0   5.0      3.0  
1   4.0      2.0  
2   5.0      3.0  
3   5.0      3.0  
4   5.0      3.0  
   winery  wine  year  rating  num_reviews  country  region   price  type  \
0     371   667    61     4.9           58        0      61  995.00    19   
1      30   714    66     4.9           31        0      64  313.50    18   
2     395   683    57     4.8         1793        0      49  324.95    11   
3     395   683    47     4.8         1705        0      49  692.96    11   
4     395   683    44     4.8         

TypeError: fit_transform() takes 2 positional arguments but 3 were given

### Transform pipeline

In [19]:
cat_cols = ['winery', 'wine', 'year', 'region', 'type']
num_cols = ['rating', 'body', 'acidity']
target = 'price'

In [21]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer


categorical_pipeline = Pipeline(
        [
            ('ohe', OneHotEncoder())
        ]
    )

numerical_pipeline = Pipeline(
        [
            ('scaler', StandardScaler())
        ]
    )

# normalize num cols and ohe cat cols
transformer = ColumnTransformer(
    [
        (
            'categorical_pipeline',
            categorical_pipeline,
            cat_cols,
        ), 
        (
            'numerical_pipeline',
            numerical_pipeline,
            num_cols,
        )
    ]
)

In [23]:
config_path = "../configs/train_config_notebook.yaml"
params = read_training_pipeline_params(config_path)

df = pd.read_csv(params.input_data_path)
df.dropna(inplace=True)

print(df.shape)
df.head()

Check schema: TrainingPipelineParams(output_data_featurized_path='../data/data_featurized.csv', output_data_train_path='../data/train.csv', output_data_test_path='../data/test.csv', output_target_train_path='../data/train_target.csv', output_target_test_path='../data/test_target.csv', output_model_path='../models/model.pkl', output_transformer_path='../models/transformer.pkl', metric_path='../models/metrics.json', splitting_params=SplittingParams(val_size=0.2, random_state=42), feature_params=FeatureParams(categorical_features=['winery', 'wine', 'year', 'region', 'type'], numerical_features=['rating', 'body', 'acidity'], target_col='price'), train_params=TrainingParams(model_type='RandomForestClassifier', random_state=42, max_depth=10, solver='lbfgs', C=1.0), input_data_path='../data/wines_SPA.csv')
Check schema: TrainingPipelineParams(output_data_featurized_path='../data/data_featurized.csv', output_data_train_path='../data/train.csv', output_data_test_path='../data/test.csv', output_

Unnamed: 0,winery,wine,year,rating,num_reviews,country,region,price,type,body,acidity
0,Teso La Monja,Tinto,2013,4.9,58,Espana,Toro,995.0,Toro Red,5.0,3.0
1,Artadi,Vina El Pison,2018,4.9,31,Espana,Vino de Espana,313.5,Tempranillo,4.0,2.0
2,Vega Sicilia,Unico,2009,4.8,1793,Espana,Ribera del Duero,324.95,Ribera Del Duero Red,5.0,3.0
3,Vega Sicilia,Unico,1999,4.8,1705,Espana,Ribera del Duero,692.96,Ribera Del Duero Red,5.0,3.0
4,Vega Sicilia,Unico,1996,4.8,1309,Espana,Ribera del Duero,778.06,Ribera Del Duero Red,5.0,3.0


In [29]:
catdf = pd.DataFrame(categorical_pipeline.fit_transform(df[cat_cols]).toarray())
print(catdf.shape)
catdf

(6329, 1321)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1311,1312,1313,1314,1315,1316,1317,1318,1319,1320
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6324,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6325,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6326,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6327,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
numdf = pd.DataFrame(numerical_pipeline.fit_transform(df[num_cols]))
print(numdf.shape)
numdf

(6329, 3)


Unnamed: 0,0,1,2
0,5.153614,1.442967,0.214761
1,5.153614,-0.271420,-3.818549
2,4.349085,1.442967,0.214761
3,4.349085,1.442967,0.214761
4,4.349085,1.442967,0.214761
...,...,...,...
6324,-0.478090,-0.271420,0.214761
6325,-0.478090,-0.271420,0.214761
6326,-0.478090,-0.271420,0.214761
6327,-0.478090,1.442967,0.214761


In [28]:
"""
separately numerical/ categorical pipelines are working + we can pd.concat dfs
simplified pipeline from ColumnTransformer not working
"""
transdf = pd.DataFrame(transformer.fit_transform(df))
print(transdf.shape)
transdf.head()

(6329, 1)


Unnamed: 0,0
0,"(0, 371)\t1.0\n (0, 1092)\t1.0\n (0, 1224)..."
1,"(0, 30)\t1.0\n (0, 1139)\t1.0\n (0, 1229)\..."
2,"(0, 395)\t1.0\n (0, 1108)\t1.0\n (0, 1220)..."
3,"(0, 395)\t1.0\n (0, 1108)\t1.0\n (0, 1210)..."
4,"(0, 395)\t1.0\n (0, 1108)\t1.0\n (0, 1207)..."


In [34]:
transdf = pd.concat([numdf, catdf], axis=1)
print(transdf.shape)
transdf.head()

(6329, 1324)


Unnamed: 0,0,1,2,0.1,1.1,2.1,3,4,5,6,...,1311,1312,1313,1314,1315,1316,1317,1318,1319,1320
0,5.153614,1.442967,0.214761,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,5.153614,-0.27142,-3.818549,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,4.349085,1.442967,0.214761,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4.349085,1.442967,0.214761,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4.349085,1.442967,0.214761,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
