* [link](https://nbviewer.org/github/queirozfcom/python-sandbox/blob/master/python3/notebooks/pipelines-custom-steps/main.ipynb#sparse-to-dense-matrix)

In [6]:
# custom transformer on select dataframe columns

import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline

In [4]:
class SelectColumnsTransformer():
    """Pipeline step that keeps only a chosen subset of DataFrame columns.

    Parameters
    ----------
    columns : label, list of labels, or None, default None
        Used to index ``X`` in ``transform`` (``X[columns]``). ``None`` keeps
        every column; without this special case the default would evaluate
        ``X[None]``, which fails on a DataFrame.
    """

    def __init__(self, columns=None):
        self.columns = columns

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; return ``self`` for chaining."""
        return self

    def transform(self, X, **transform_params):
        """Return a copy of the selected columns of ``X``.

        A copy is returned so that later modifications of the result never
        write back into the caller's frame.
        """
        if self.columns is None:
            return X.copy()
        return X[self.columns].copy()

In [7]:
# Toy frame of five people; one age value is missing (NaN).
df = pd.DataFrame(
    dict(
        name=['alice', 'bob', 'charlie', 'david', 'edward'],
        age=[24, 32, np.nan, 38, 20],
    )
)

df.head()

Unnamed: 0,name,age
0,alice,24.0
1,bob,32.0
2,charlie,
3,david,38.0
4,edward,20.0


In [8]:
# Single-step pipeline whose only job is to keep the 'name' column.
pipe = Pipeline(
    steps=[
        ('selector', SelectColumnsTransformer(columns=['name'])),
    ]
)

In [9]:
# Fit the pipeline on df and display the transformed result.
pipe.fit_transform(df)

Unnamed: 0,name
0,alice
1,bob
2,charlie
3,david
4,edward


In [10]:
# imputing missing values 
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

In [11]:
# Mean-impute the 'age' column; remainder='passthrough' leaves every other
# column in the output unchanged.
transformer_step = ColumnTransformer(
    transformers=[
        ('impute_mean', SimpleImputer(strategy='mean'), ['age']),
    ],
    remainder='passthrough',
)

In [12]:
# Wrap the column transformer as the single step of a pipeline.
pipe = Pipeline(
    steps=[
        ('transformer', transformer_step),
    ]
)

In [13]:
# Fit the pipeline's steps on df.
pipe.fit(df)

In [15]:
# ColumnTransformer emits the explicitly transformed columns first and the
# passthrough columns after them, hence the ['age', 'name'] order of the raw
# array. Stacking the string column next to the numeric one yields an
# object-typed array, so 'age' is cast back to float to stay usable as a
# numeric column. index=df.index keeps the rows aligned with the input frame.
pd.DataFrame(
    data=pipe.transform(df),
    columns=['age', 'name'],
    index=df.index,
).astype({'age': 'float64'})[["name", "age"]]

Unnamed: 0,name,age
0,alice,24.0
1,bob,32.0
2,charlie,28.5
3,david,38.0
4,edward,20.0


In [16]:
# column transform with onehot encoder 
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier


In [17]:
# Toy classification frame: one categorical feature, one numeric feature with
# a missing value, and a binary target.
df = pd.DataFrame(
    dict(
        favorite_color=['blue', 'green', 'red', 'green', 'blue'],
        age=[10, 15, 10, np.nan, 10],
        target=[1, 0, 1, 0, 1],
    )
)

In [18]:
# handle_unknown='ignore' encodes a category that was not seen during fit as an
# all-zero row instead of raising an error at predict time.
categorical_preprocessing = Pipeline([('ohe', OneHotEncoder(handle_unknown='ignore'))])
# strategy='mean' is SimpleImputer's default, spelled out here for readability.
numerical_preprocessing = Pipeline([('imputation', SimpleImputer(strategy='mean'))])

# which transformer for which columns
preprocess = ColumnTransformer([
    ('categorical_preprocessing', categorical_preprocessing, ['favorite_color']),
    ('numerical_preprocessing', numerical_preprocessing, ['age'])
])

# pipeline with preprocess and classifier; a fixed random_state makes the
# fitted tree reproducible from run to run
pipeline = Pipeline([
    ('preprocess', preprocess),
    ('clf', DecisionTreeClassifier(random_state=0))
])

# fitting the pipeline using the whole dataframe
df_features = df[['favorite_color','age']]
df_target = df['target']

pipeline.fit(df_features, df_target)

In [23]:
# FunctionTransformer can be used to apply any function to the input data
from sklearn.preprocessing import FunctionTransformer
def log_transform(X): # user-defined function to wrap
    """Return the element-wise natural logarithm of ``X``."""
    return np.log(X)

log_transform_object = FunctionTransformer(log_transform) # wrap the function in a FunctionTransformer object

In [None]:
# NOTE: illustrative only -- X_train, y_train, X_test, y_test (with an 'x2'
# column) are not defined in the cells shown here and must be supplied first.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Apply the log transform to column 'x2' only; wrapping the plain function in
# FunctionTransformer is what lets it be used as a ColumnTransformer step.
log_x2 = ColumnTransformer([('log_x2', log_transform_object, ['x2'])],
                           remainder='passthrough')   # passthrough means leave all other columns unchanged
pipeline = Pipeline(steps=[('preprocessing', log_x2),
                           ('regression', LinearRegression())])
pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)
# The squared=False argument of mean_squared_error is deprecated in recent
# scikit-learn releases, so take the square root explicitly. For a single
# target this is the same RMSE value.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'Test set RMSE: {rmse}')
# Look the regressor up by step name rather than by position.
pipeline.named_steps['regression'].coef_