https://feature-engine.trainindata.com/en/latest/


In [18]:
!pip install feature-engine --quiet

In [21]:
import pandas as pd
from feature_engine.imputation import MeanMedianImputer
from sklearn.pipeline import Pipeline

# Toy frame with missing entries in both numeric columns.
df = pd.DataFrame(
    data={
        'Age': [22, 35, None, 58, 42, None],
        'Salary': [25000, 40000, 38000, None, 52000, None],
    }
)

# Replace the gaps in each listed column with that column's mean.
imputer = MeanMedianImputer(variables=['Age', 'Salary'], imputation_method='mean')

# Single-step pipeline wrapping the imputer.
pipe = Pipeline(steps=[('imputer', imputer)])

# fit() hands back the fitted pipeline, so the transform can be chained;
# the transformed frame is the value the cell displays.
pipe.fit(df).transform(df)


Unnamed: 0,Age,Salary
0,22.0,25000.0
1,35.0,40000.0
2,39.25,38000.0
3,58.0,38750.0
4,42.0,52000.0
5,39.25,38750.0


In [30]:
from feature_engine.imputation import CategoricalImputer, RandomSampleImputer

# Two categorical columns and one numeric column, each with missing entries.
df = pd.DataFrame(
    data={
        'Color': ['Red', 'Blue', None, 'Green', 'Blue', None],
        'Size': ['S', 'M', 'M', 'L', None, None],
        'Income': [40000, None, 60000, 45000, None, 70000],
    }
)

# Categorical gaps are filled with the 'frequent' strategy.
cat_imp = CategoricalImputer(variables=['Color', 'Size'], imputation_method='frequent')
# Numeric gaps are filled by random sampling, seeded for repeatability.
rand_imp = RandomSampleImputer(random_state=42, variables=['Income'])

# Run the categorical step first, then the numeric one.
pipe = Pipeline([
    ('cat_impute', cat_imp),
    ('num_impute', rand_imp),
])

df_imputed = pipe.fit_transform(df)

# Show the original frame followed by the imputed one.
display(df, df_imputed)

Unnamed: 0,Color,Size,Income
0,Red,S,40000.0
1,Blue,M,
2,,M,60000.0
3,Green,L,45000.0
4,Blue,,
5,,,70000.0


Unnamed: 0,Color,Size,Income
0,Red,S,40000.0
1,Blue,M,45000.0
2,Blue,M,60000.0
3,Green,L,45000.0
4,Blue,M,70000.0
5,Blue,M,70000.0


In [31]:
from feature_engine.encoding import OneHotEncoder, CountFrequencyEncoder

# Small frame with one categorical column to encode and one binary column.
df = pd.DataFrame({
    'City': ['London', 'Paris', 'London', 'Berlin', 'Berlin', 'Paris'],
    'Brought': [1, 0, 0, 1, 0, 1]
})

# One-hot encoding of City, configured with drop_last=True.
onehot = OneHotEncoder(variables=['City'], drop_last=True)
# Frequency encoding of City (encoding_method='frequency').
count_enc = CountFrequencyEncoder(encoding_method='frequency', variables=['City'])

# Each encoder is fitted and applied to `df` independently.
# Use rich display rather than print() so the frames render as tables,
# matching how the other cells in this notebook show their DataFrames.
display(onehot.fit_transform(df))
display(count_enc.fit_transform(df))


   Brought  City_London  City_Paris
0        1            1           0
1        0            0           1
2        0            1           0
3        1            0           0
4        0            0           0
5        1            0           1
       City  Brought
0  0.333333        1
1  0.333333        0
2  0.333333        0
3  0.333333        1
4  0.333333        0
5  0.333333        1


In [41]:
from sklearn.preprocessing import OrdinalEncoder

# Education levels to be mapped onto an explicit rank order.
df2 = pd.DataFrame({
    'Education': ["High School", "Bachelor", "Master", "PhD", "Bachelor", "Master"]
})

# One inner list per encoded column, listed from lowest to highest rank.
order = [["High School", "Bachelor", "Master", "PhD"]]

# dtype=int makes the encoder emit integer codes directly, so no extra
# .astype(int) is needed on the result.
enc = OrdinalEncoder(categories=order, dtype=int)

# Codes start at 0 for the first category; add 1 so ranks start at 1.
df2['Education_ord'] = enc.fit_transform(df2[['Education']]) + 1

# Bare last expression -> rich table display instead of print() text.
df2

     Education  Education_ord
0  High School              1
1     Bachelor              2
2       Master              3
3          PhD              4
4     Bachelor              2
5       Master              3


In [None]:
from feature_engine.transformation import LogTransformer
from feature_engine.discretisation import EqualFrequencyDiscretiser

# Strictly positive income values for the transformation demos.
df = pd.DataFrame({
    'Income': [10_000, 30_000, 50_000, 100_000, 200_000, 500_000]
})

# NOTE: despite its name, `log_df` holds the transformer object, not a frame.
log_df = LogTransformer(variables=['Income'])
# The result is not assigned anywhere, and only a cell's last expression is
# rendered automatically, so display it explicitly.
display(log_df.fit_transform(df))

# Reference kept from the original cell (note: it points at the equal-width
# discretiser's documentation, while this cell uses the equal-frequency one):
# https://feature-engine.trainindata.com/en/latest/api_doc/discretisation/EqualWidthDiscretiser.html#feature_engine.discretisation.EqualWidthDiscretiser
#
# The reference is a comment rather than a trailing string literal so that the
# discretised frame below stays the cell's last expression and is displayed.
discretiser = EqualFrequencyDiscretiser(q=3, variables=['Income'])
discretiser.fit_transform(df)

Unnamed: 0,Income
0,0
1,0
2,1
3,1
4,2
5,2
