# Advanced Feature Engineering I
<hr style="border:2px solid black">

## Penguin Data

**load packages**

In [None]:
# data analysis stack
import numpy as np
import pandas as pd

# machine-learning stack
from sklearn.metrics import r2_score  
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
    OneHotEncoder,
    StandardScaler,
    MinMaxScaler,
    PolynomialFeatures
)
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.pipeline import make_pipeline, Pipeline

# miscellaneous
import warnings
# suppress all warnings for the rest of the session
warnings.filterwarnings("ignore")

from sklearn import set_config
# make scikit-learn transformers return pandas DataFrames instead of arrays
set_config(transform_output='pandas')

## Business Goal:
> Predict the body mass based on the other features

**read data**

In [None]:
# load the penguin dataset from the local data folder
df = pd.read_csv('./data/penguins.csv')
# display the loaded frame as the cell output
df

## Select columns for X and y

In [None]:
# keep a copy so the following cells work on df_copy and leave df as loaded
df_copy = df.copy()

In [None]:
# features: every column except the target
X = df_copy.drop(columns=['body_mass_g'])
# second feature frame with the same content, built independently
X_ash = df_copy.drop(['body_mass_g'], axis=1)
# target: the body mass column
y = df_copy.loc[:, "body_mass_g"]

### Train-Test split

In [None]:
# random seed passed to train_test_split so the split is reproducible
RSEED = 32

In [None]:
# hold out 20 % of the rows for testing; shuffling is seeded with RSEED
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=RSEED,
)
# shapes of the training features and the training target
Xtrain.shape, ytrain.shape

## EDA
- which variables have missing values?
- which variables are binary, categorical, metric?
- do categorical variables have non-numeric values?
- do metric features vary on different scales?
- ...

In [None]:
# re-attach the target to the training features for exploration
df_train = pd.concat((Xtrain, ytrain), axis="columns")
df_train

In [None]:
# check for missing values

In [None]:
# number of missing entries in each column of the training data
df_train.isna().sum()

In [None]:
# display the training frame (features plus target) again
df_train

In [None]:
# check for correlation
# Pairwise correlation between the numeric columns of the training data
# (features and target). Non-numeric columns are excluded first because
# they cannot enter a correlation matrix.
df_train.select_dtypes(include='number').corr()


<hr style="border:2px solid black">

## Feature Engineering

- `Pipeline` combines ML steps in series
- `ColumnTransformer` combines ML steps in parallel

## Preprocessing


Which columns do we want to transform:
+ sex (categorical nominal) --> imputation, encoding
+ flipper length (numerical continuous) --> imputation (median or mean), then scaling or binning
+ bill depth (numerical continuous) --> imputation (median or mean), then scaling or binning
+ bill length (numerical continuous) --> scaling or binning


In [None]:
# instantiate the transformation: fill missing values with the most frequent value
sex_imputer = SimpleImputer(strategy='most_frequent')

In [None]:
# fit on the training 'sex' column only (double brackets keep it a DataFrame)
sex_imputer.fit(Xtrain[['sex']])

In [None]:
# get the most frequent value learned during fit
sex_imputer.statistics_

In [None]:
# apply the transformation to the training 'sex' column
sex_col_imputed = sex_imputer.transform(Xtrain[['sex']])
sex_col_imputed

In [None]:
# instantiate the transformation: one-hot encoding with the first category
# dropped and dense (non-sparse) output
ohe = OneHotEncoder(drop='first', sparse_output=False)

In [None]:
# fit the encoder on the imputed 'sex' column
ohe.fit(sex_col_imputed[['sex']])

In [None]:
# sex_col_imputed was built from Xtrain[['sex']], so selecting 'sex' again is optional
ohe.fit(sex_col_imputed) # the same as the previous cell 

In [None]:
# getting the categories the encoder found during fit
ohe.categories_

In [None]:
# encode the imputed 'sex' column with the fitted encoder
imputed_ohe = ohe.transform(sex_col_imputed[['sex']])
imputed_ohe

In [None]:
# chain both steps: impute the most frequent category, then one-hot encode
# (first category dropped, dense output)
sex_transf = Pipeline(
    [
        ('sex_imputation', SimpleImputer(strategy='most_frequent')),
        ('sex_ohe', OneHotEncoder(drop='first', sparse_output=False)),
    ]
)

In [None]:
# fit both pipeline steps on the training 'sex' column
sex_transf.fit(Xtrain[['sex']])

In [None]:
# run imputation and encoding in one call through the fitted pipeline
pipeline_sex_encoded_imputed = sex_transf.transform(Xtrain[['sex']])
pipeline_sex_encoded_imputed

In [None]:
# names of the columns the fitted pipeline produces
sex_transf.get_feature_names_out()

In [None]:
# standard scaler, used below for 'bill_length_mm'
scaler = StandardScaler()

In [None]:
# fit the scaler on the training bill length only
scaler.fit(Xtrain[['bill_length_mm']])

In [None]:
# per-feature mean and scaling factor learned during fit
scaler.mean_, scaler.scale_

In [None]:
# scale the training bill length with the fitted scaler
trasformed_bill_lenght = scaler.transform(Xtrain[['bill_length_mm']])
trasformed_bill_lenght

In [None]:
# names of the columns the fitted scaler produces
scaler.get_feature_names_out()

In [None]:
# put the scaled bill length and the encoded sex column side by side
Xtrain_fe = pd.concat(
    (trasformed_bill_lenght, pipeline_sex_encoded_imputed),
    axis="columns",
)
Xtrain_fe

In [None]:
# display the categorical pipeline
sex_transf

In [None]:
# display the scaler
scaler

In [None]:
# number of missing entries in each training feature column
Xtrain.isna().sum()

### ColumnTransformer

In [None]:
# run the categorical pipeline on 'sex' and 'island' and the scaler on
# 'bill_length_mm' side by side; every column not listed is dropped
fe_transformer = ColumnTransformer(
    [
        ('transf_cat', sex_transf, ['sex', 'island']),
        ('bill_lenght', scaler, ['bill_length_mm']),
    ],
    remainder='drop',
)
fe_transformer

In [None]:
# fit all column transformations on the training features only
fe_transformer.fit(Xtrain)

In [None]:
# engineered training features
X_train_transformed = fe_transformer.transform(Xtrain)
X_train_transformed

In [None]:
# engineered test features: transform only, reusing what was fitted on Xtrain
X_test_transformed = fe_transformer.transform(Xtest)
X_test_transformed

### Model Building

**instantiate model**

In [None]:
# linear regression with default settings
model = LinearRegression()

**train model**

In [None]:
# train on the engineered training features and the training target
model.fit(X_train_transformed,ytrain)

### Model Evaluation

**model performance**

In [None]:
# score of the fitted model on the training data and on the held-out test data
print(f"r2 on train {model.score(X_train_transformed, ytrain)}")
print(f"r2 on test {model.score(X_test_transformed, ytest)}")

In [None]:
# model predictions for the training and the test features
y_predict_train = model.predict(X_train_transformed)
y_predict_test = model.predict(X_test_transformed)

In [None]:
import matplotlib.pyplot as plt


In [None]:
# predicted (x-axis) against true (y-axis) target values on the training set
plt.scatter(y_predict_train, ytrain)
plt.xlabel('ypred')
plt.ylabel('ytrue')

## References

- [How to add feature engineering to a scikit-learn pipeline](https://practicaldatascience.co.uk/machine-learning/how-to-add-feature-engineering-to-a-scikit-learn-pipeline)

- [Coding a custom imputer in scikit-learn](https://towardsdatascience.com/coding-a-custom-imputer-in-scikit-learn-31bd68e541de)

<hr style="border:2px solid black">

#### Doing feature engineering and modeling in one go

In [None]:
# build the full pipeline (feature engineering, then regression) only to
# display it; it is neither stored nor fitted here
Pipeline(steps=[
    ('fe_engineering',fe_transformer),
    ('linear_regression', LinearRegression())
])

In [None]:
# feature engineering and regression combined in a single estimator
linear_model_one_line = Pipeline(
    [
        ('fe_engineering', fe_transformer),
        ('linear_regression', LinearRegression()),
    ]
)
# fit on the raw training features; the trailing semicolon hides the cell output
linear_model_one_line.fit(Xtrain, ytrain);

In [None]:
# score on the raw test features; the pipeline applies the feature engineering itself
linear_model_one_line.score(Xtest,ytest)

In [None]:
# predictions for the raw test features
linear_model_one_line.predict(Xtest)

# Exercise: engineering other features