# Advanced Feature Engineering I
<hr style="border:2px solid black">

## Penguin Data

**load packages**

In [1]:
# data analysis stack
import numpy as np
import pandas as pd

# machine-learning stack
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
    OneHotEncoder,
    StandardScaler,
    MinMaxScaler,
    PolynomialFeatures
)
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

# miscellaneous
import warnings
warnings.filterwarnings("ignore")

**read data**

In [2]:
df = pd.read_csv('../data/penguins.csv')
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
4,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male


### Train-Test split

In [3]:
train,test = train_test_split(df, test_size=0.2, random_state=101)
train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

In [4]:
train.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Dream,36.5,18.0,182.0,3150.0,Female
1,Gentoo,Biscoe,47.2,15.5,215.0,4975.0,Female
2,Gentoo,Biscoe,46.3,15.8,215.0,5050.0,Male
3,Gentoo,Biscoe,50.4,15.3,224.0,5550.0,Male
4,Chinstrap,Dream,45.9,17.1,190.0,3575.0,Female


In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 273 entries, 0 to 272
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            273 non-null    object 
 1   island             273 non-null    object 
 2   bill_length_mm     273 non-null    float64
 3   bill_depth_mm      271 non-null    float64
 4   flipper_length_mm  272 non-null    float64
 5   body_mass_g        273 non-null    float64
 6   sex                266 non-null    object 
dtypes: float64(4), object(3)
memory usage: 15.1+ KB


### Feature-Target Separation

In [6]:
# target column name
target = 'body_mass_g'

In [7]:
# feature column names
features = list(set(train.columns) - set([target]))
print(features)

['bill_depth_mm', 'species', 'bill_length_mm', 'island', 'sex', 'flipper_length_mm']


In [8]:
# Splitting the DataFrame into Features and Target:
X_train, y_train = train[features], train[target]
X_train.shape, y_train.shape

((273, 6), (273,))

In [9]:
cat_features = list(X_train.select_dtypes(include=['object']).columns)
cat_features

['species', 'island', 'sex']

In [10]:
num_features = list(X_train.select_dtypes(exclude=['object']).columns)
num_features

['bill_depth_mm', 'bill_length_mm', 'flipper_length_mm']

<hr style="border:2px solid black">

## Feature Engineering

- `Pipeline` combines ML steps in series
- `ColumnTransformer` combines ML steps in parallel

### Preprocessing Pipelines

**numerical columns**

In [11]:
num_transformer = make_pipeline(
        SimpleImputer(strategy='mean'),  # Replaces missing values with the mean of the respective column
        StandardScaler(),                # Standardizes the features by removing the mean and scaling to unit variance.   
        PolynomialFeatures(degree=2)     
    )

In [12]:
type(num_transformer)

sklearn.pipeline.Pipeline

**categorical columns**

In [13]:
cat_transformer = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(drop='first', handle_unknown='ignore')
    )

In [14]:
type(cat_transformer)

sklearn.pipeline.Pipeline

**total preprocessing**

In [15]:
preprocessor = make_column_transformer(
        (num_transformer, num_features),
        (cat_transformer, cat_features)
)

In [16]:
type(preprocessor)

sklearn.compose._column_transformer.ColumnTransformer

### Model Building

**instantiate model**

In [17]:
linear_model = make_pipeline(
    preprocessor,
    LinearRegression()
    )

**train model**

In [18]:
linear_model.fit(X_train,y_train)

**model validation**

In [19]:
training_score = linear_model.score(X_train,y_train)
print(f"training r2 score: {round(training_score, 6)}")

training r2 score: 0.879775


### Model Evaluation

**feature-target separation**

In [20]:
X_test, y_test = test[features], test[target]


**model performance**

In [21]:
test_score = linear_model.score(X_test,y_test)
print(f"test r2 score: {round(test_score, 6)}")

test r2 score: 0.845228


## References

- [How to add feature engineering to a scikit-learn pipeline](https://practicaldatascience.co.uk/machine-learning/how-to-add-feature-engineering-to-a-scikit-learn-pipeline)

- [Coding a custom imputer in scikit-learn](https://towardsdatascience.com/coding-a-custom-imputer-in-scikit-learn-31bd68e541de)

<hr style="border:2px solid black">