In [16]:
import numpy as np
import pandas as pd
from custom_transformers import add_combined_column, CombineAttributes, Square
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from warnings import filterwarnings

# Silence every warning for the rest of the session so cell outputs stay clean.
# NOTE(review): this is a blanket filter; it also hides deprecation notices.
filterwarnings(action="ignore")

In [17]:
# Small arbitrary sample (the integers 1 through 5) used as test input.
numbers_array = np.arange(1, 6)

# Wrap the custom Square class in a Pipeline to show that it plugs into the
# scikit-learn API like any other step.
pipeline_quadrado = Pipeline(steps=[("SquaredNumbers", Square())])

pipeline_quadrado.fit_transform(numbers_array)

array([ 1,  4,  9, 16, 25])

In [24]:
def add_ten(values):
    """Return ``values`` increased by 10.

    Written as a named function instead of a lambda: scikit-learn documents
    that a FunctionTransformer built from a lambda is not pickleable, which
    would prevent persisting the pipeline with pickle/joblib.

    Parameters
    ----------
    values : object supporting ``+ 10`` (e.g. a NumPy array)
        The input forwarded by FunctionTransformer.

    Returns
    -------
    The result of ``values + 10``.
    """
    return values + 10


# Single-step pipeline that applies a plain function through FunctionTransformer.
pipeline_discreto = Pipeline([
    ("SumNumbers", FunctionTransformer(add_ten))
])

pipeline_discreto.fit_transform(numbers_array)

array([11, 12, 13, 14, 15])

In [18]:
# Toy dataset with one row per person. It contains exactly one missing "Dorm"
# value (last row) and one missing "Notebooks" value (third row).
data = pd.DataFrame(
    dict(
        Name=[
            "Adriano", "Ana", "Luis", "Guilherme", "José",
            "Maria", "Clara", "Leonardo", "Alice", "Júlia",
        ],
        # Dorms cycle A, B, C three times; the tenth entry is missing.
        Dorm=["A", "B", "C"] * 3 + [np.nan],
        Notebooks=[2, 1, np.nan, 3, 2, 4, 1, 1, 2, 3],
    )
)
data

Unnamed: 0,Name,Dorm,Notebooks
0,Adriano,A,2.0
1,Ana,B,1.0
2,Luis,C,
3,Guilherme,A,3.0
4,José,B,2.0
5,Maria,C,4.0
6,Clara,A,1.0
7,Leonardo,B,1.0
8,Alice,C,2.0
9,Júlia,,3.0


In [19]:
# pandas approach: quick and easy, well suited to exploratory analysis.
# NOTE(review): judging by the displayed result, the helper appears to add a
# "Notebook per Dorm" column holding the per-dorm total of "Notebooks"
# (a missing Dorm yields NaN) -- confirm against custom_transformers.
data_merged = add_combined_column(data, "Dorm", "Notebook per Dorm")
data_merged

Unnamed: 0,Name,Dorm,Notebooks,Notebook per Dorm
0,Adriano,A,2.0,6.0
1,Ana,B,1.0,4.0
2,Luis,C,,6.0
3,Guilherme,A,3.0,6.0
4,José,B,2.0,4.0
5,Maria,C,4.0,6.0
6,Clara,A,1.0,6.0
7,Leonardo,B,1.0,4.0
8,Alice,C,2.0,6.0
9,Júlia,,3.0,


In [20]:
# scikit-learn approach: a transformer that can be integrated into a Pipeline,
# which is better practice and easier to deploy.
# NOTE(review): to_integer=False presumably keeps the new column as float, as
# in the displayed output -- confirm against custom_transformers.
Combiner = CombineAttributes(to_integer=False)
Combiner.transform(data)

Unnamed: 0,Name,Dorm,Notebooks,Notebook per Dorm
0,Adriano,A,2.0,6.0
1,Ana,B,1.0,4.0
2,Luis,C,,6.0
3,Guilherme,A,3.0,6.0
4,José,B,2.0,4.0
5,Maria,C,4.0,6.0
6,Clara,A,1.0,6.0
7,Leonardo,B,1.0,4.0
8,Alice,C,2.0,6.0
9,Júlia,,3.0,


In [21]:
# Full pipeline: constant-value imputation followed by the custom transformer.
main_pipe = Pipeline(
    steps=[
        (
            "Missing Inputing",
            # Fill missing "Dorm" with "D" and missing "Notebooks" with 0.
            # Columns not listed are dropped, feature names are not prefixed,
            # and the step is configured to output a pandas DataFrame.
            ColumnTransformer(
                transformers=[
                    (
                        "ImputerDorm",
                        SimpleImputer(fill_value="D", strategy="constant"),
                        ["Dorm"],
                    ),
                    (
                        "ImputerNotebook",
                        SimpleImputer(fill_value=0, strategy="constant"),
                        ["Notebooks"],
                    ),
                ],
                verbose_feature_names_out=False,
                remainder="drop",
            ).set_output(transform="pandas"),
        ),
        ("Transformador personalizado", CombineAttributes()),
    ]
)

main_pipe

In [22]:
# Fit every pipeline step on the toy data and display the transformed result.
main_pipe.fit_transform(data)

Unnamed: 0,Dorm,Notebooks,Notebook per Dorm
0,A,2.0,6
1,B,1.0,4
2,C,0.0,6
3,A,3.0,6
4,B,2.0,4
5,C,4.0,6
6,A,1.0,6
7,B,1.0,4
8,C,2.0,6
9,D,3.0,3
