# Advanced Feature Engineering I
<hr style="border:2px solid black">

## Penguin Data

**load packages**

In [1]:
# data analysis stack
import numpy as np
import pandas as pd

# machine-learning stack
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
    OneHotEncoder,
    StandardScaler,
    MinMaxScaler,
    PolynomialFeatures
)
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

# miscellaneous
import warnings
warnings.filterwarnings("ignore")

**read data**

In [None]:
df = pd.read_csv('../data/penguins.csv')
df.head()

### Train-Test split

In [3]:
train,test = train_test_split(df, test_size=0.2, random_state=101)
train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

In [None]:
train.head()

In [None]:
train.info()

### Feature-Target Separation

In [9]:
# target column name
target = 'body_mass_g'

In [10]:
# feature column names
features = list(set(train.columns) - set([target]))
print(features)

['sex', 'island', 'bill_length_mm', 'species', 'flipper_length_mm', 'bill_depth_mm']


In [19]:
# Splitting the DataFrame into Features and Target:
X_train, y_train = train[features], train[target]
X_train.shape, y_train.shape

((273, 6), (273,))

In [20]:
cat_features = list(X_train.select_dtypes(include=['object']).columns)
cat_features

['sex', 'island', 'species']

In [21]:
num_features = list(X_train.select_dtypes(exclude=['object']).columns)
num_features

['bill_length_mm', 'flipper_length_mm', 'bill_depth_mm']

<hr style="border:2px solid black">

## Feature Engineering

- `Pipeline` combines ML steps in series
- `ColumnTransformer` combines ML steps in parallel

### Preprocessing Pipelines

**numerical columns**

In [42]:
num_transformer = make_pipeline(
        SimpleImputer(strategy='mean'),  # Replaces missing values with the mean of the respective column
        StandardScaler(),                # Standardizes the features by removing the mean and scaling to unit variance.   
        PolynomialFeatures(degree=2)     
    )

In [43]:
type(num_transformer)

sklearn.pipeline.Pipeline

**categorical columns**

In [33]:
cat_transformer = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(drop='first', handle_unknown='ignore')
    )

In [34]:
type(cat_transformer)

sklearn.pipeline.Pipeline

**total preprocessing**

In [35]:
preprocessor = make_column_transformer(
        (num_transformer, num_features),
        (cat_transformer, cat_features)
)

In [36]:
type(preprocessor)

sklearn.compose._column_transformer.ColumnTransformer

### Model Building

**instantiate model**

In [37]:
linear_model = make_pipeline(
    preprocessor,
    LinearRegression()
    )

**train model**

In [38]:
linear_model.fit(X_train,y_train)

**model validation**

In [39]:
training_score = linear_model.score(X_train,y_train)
print(f"training r2 score: {round(training_score, 6)}")

training r2 score: 0.879775


### Model Evaluation

**feature-target separation**

In [40]:
X_test, y_test = test[features], test[target]


**model performance**

In [41]:
test_score = linear_model.score(X_test,y_test)
print(f"test r2 score: {round(test_score, 6)}")

test r2 score: 0.845228


## References

- [How to add feature engineering to a scikit-learn pipeline](https://practicaldatascience.co.uk/machine-learning/how-to-add-feature-engineering-to-a-scikit-learn-pipeline)

- [Coding a custom imputer in scikit-learn](https://towardsdatascience.com/coding-a-custom-imputer-in-scikit-learn-31bd68e541de)

<hr style="border:2px solid black">