# Custom Transformer

## Imports

In [1]:
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.pipeline import  make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn import set_config; set_config(display='diagram')

In [2]:
# Read the remote CSV into a DataFrame, then preview its first rows.
DATA_URL = "https://wagon-public-datasets.s3.amazonaws.com/05-Machine-Learning/08-Workflow/custom_transformer_data.csv"
data = pd.read_csv(DATA_URL)
data.head()

Unnamed: 0,customer_state,seller_state,product_weight_g,product_length_cm,product_height_cm,product_width_cm,days_until_delivery
0,RJ,SP,1825,53,10,40,9
1,RJ,SP,700,65,18,28,9
2,RJ,SP,1825,53,10,40,11
3,RJ,SP,1825,53,10,40,12
4,RJ,SP,1825,53,10,40,14


## Pipeline

👇 Create a scikit-learn pipeline named `pipe`:

- Engineer a `volume` feature from the dimension features
- Preserve the original product dimension features for training
- Scale all numerical features
- Encode the categorical features
- Add a default `Ridge` regression estimator

**Note:** for this challenge, ignore the holdout method, so no need to `train_test_split`!

In [3]:
# Separate the regression target from the feature columns.
TARGET = "days_until_delivery"
y = data[TARGET]
X = data.drop(columns=TARGET)

In [4]:
def compute_volume(dimensions):
    """Return the product volume as a one-column DataFrame.

    Parameters
    ----------
    dimensions : pandas.DataFrame
        Frame holding the `product_length_cm`, `product_height_cm` and
        `product_width_cm` columns.

    Returns
    -------
    pandas.DataFrame
        A single `volume` column (length * height * width), indexed like
        the input.
    """
    product = (
        dimensions["product_length_cm"]
        * dimensions["product_height_cm"]
        * dimensions["product_width_cm"]
    )
    return product.to_frame(name="volume")


# A named function is used instead of a lambda: scikit-learn documents that a
# FunctionTransformer wrapping a lambda cannot be pickled.
volume = FunctionTransformer(compute_volume)
volume

In [5]:
# Sub-pipeline for the engineered feature: build the volume, then standardise it.
volume_steps = [
    ("volume", volume),
    ("scaled_volume", StandardScaler()),
]
volume_pipe = Pipeline(steps=volume_steps)

volume_pipe

In [6]:
# Column groups routed to each branch of the preprocessor.
dimension_features = ["product_length_cm", "product_height_cm", "product_width_cm"]
numerical_features = ["product_weight_g", *dimension_features]
categorical_features = ["customer_state", "seller_state"]

# Three parallel branches:
#   - num_tr:   scale the raw numerical features (the dimensions are kept as-is)
#   - division: one-hot encode the states; unknown categories at predict time
#               are ignored rather than raising
#   - volume:   derive the volume from the dimensions, then scale it
# `sparse_output=False` (scikit-learn >= 1.2) replaces the removed `sparse`
# argument and keeps the encoder output dense.
preprocessor = ColumnTransformer([
    ("num_tr", StandardScaler(), numerical_features),
    ("division", OneHotEncoder(sparse_output=False, handle_unknown="ignore"), categorical_features),
    ("volume", volume_pipe, dimension_features),
])
preprocessor

In [7]:
# Full model: the preprocessor followed by a default Ridge regressor.
# Nothing is fitted in this cell; the pipeline is only assembled and displayed.
pipe = Pipeline([
    ("preprocessor", preprocessor),
    ("model", Ridge()),
])

pipe



## Train and Predict

👇 Let's imagine `data` is your entire training set.

- `cross_validate` your pipeline on this dataset (❗️ low r2 scores are expected)
- Now, imagine you just received a new order `new_data`: predict its delivery duration and store it in a variable `prediction`

In [8]:
# Read the single new order to predict from the remote CSV and display it.
NEW_ORDER_URL = "https://wagon-public-datasets.s3.amazonaws.com/05-Machine-Learning/08-Workflow/custom_transformer_new_order.csv"
new_data = pd.read_csv(NEW_ORDER_URL)
new_data

Unnamed: 0,customer_state,seller_state,product_weight_g,product_length_cm,product_height_cm,product_width_cm
0,RJ,SP,1825,53,10,40


In [9]:
# Fit the preprocessing steps and the Ridge model on the whole dataset.
pipe.fit(X=X, y=y)



In [10]:
# Predict the delivery duration for the new order with the fitted pipeline.
prediction = pipe.predict(X=new_data)
prediction

array([20.68579164])