In [154]:
import numpy as np
import pandas as pd

In [175]:
# Read the demo table, then keep a separate copy that the scaler cells
# below overwrite column by column (copy() is deep by default).
df = pd.read_csv('data/scaling.csv')
df_scaled = df.copy()
df

Unnamed: 0,FOR_MINMAX1,FOR_ROBOUST,FOR_STANDARD,FOR_MINMAX_2
0,10000000,6000000,420,420
1,10000000,5500000,1111,450
2,5000000,3500000,322,320
3,3000000,1000000,2342,165
4,2500000,42000,11223,120
5,1000000,800000,1232,150


In [176]:
# Column groups: each list names the columns handed to one kind of scaler.
columns_to_minmax = [
    'FOR_MINMAX1',
    'FOR_MINMAX_2',
]
columns_to_standard = ['FOR_STANDARD']
columns_to_roboust = ['FOR_ROBOUST']

# MinMaxScaler
Нормализация. Приведение значений признака в интервал [0, 1]
$$
x = \frac{x - \min(x)}{\max(x) - \min(x)}
$$

In [177]:
from sklearn.preprocessing import MinMaxScaler

# Fit on the selected columns first, then transform those same columns and
# write the result into the matching columns of df_scaled.
scaler = MinMaxScaler().fit(df[columns_to_minmax])
df_scaled[columns_to_minmax] = scaler.transform(df[columns_to_minmax])

# StandardScaler
Стандартизация. Приведение значений признака к нулевому среднему и стандартному отклонению, равному единице.
$$
x = \frac{x - \mu}{\sigma}
$$
$\mu$ - среднее арифметическое\
$\sigma$ - стандартное отклонение

In [178]:
from sklearn.preprocessing import StandardScaler

# Fit on the selected column first, then transform that same column and
# write the result into the matching column of df_scaled.
scaler = StandardScaler().fit(df[columns_to_standard])
df_scaled[columns_to_standard] = scaler.transform(df[columns_to_standard])

# RobustScaler

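Робастное масштабирование. Минимизация влияния выбросов: из значения признака вычитается медиана, а результат делится на межквартильный размах (IQR = Q3 − Q1).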

$$
x = \frac{x - \mathrm{median}(x)}{\mathrm{IQR}(x)}
$$

In [179]:
from sklearn.preprocessing import RobustScaler

In [180]:
# Fit on the selected column first, then transform that same column, write
# it into df_scaled and show the frame with all three scalings applied.
scaler = RobustScaler().fit(df[columns_to_roboust])
df_scaled[columns_to_roboust] = scaler.transform(df[columns_to_roboust])
df_scaled

Unnamed: 0,FOR_MINMAX1,FOR_ROBOUST,FOR_STANDARD,FOR_MINMAX_2
0,1.0,0.903614,-0.613991,0.909091
1,1.0,0.783133,-0.433835,1.0
2,0.444444,0.301205,-0.639541,0.606061
3,0.222222,-0.301205,-0.112891,0.136364
4,0.166667,-0.532048,2.202546,0.0
5,0.0,-0.349398,-0.402288,0.090909


# ColumnTransformer

In [407]:
from sklearn.compose import ColumnTransformer

# One (name, scaler, columns) entry per column group. The entry name shows
# up as the "<name>__" prefix on the output column names.
Transformer = ColumnTransformer(
    transformers=[
        ('MinMax', MinMaxScaler(), columns_to_minmax),
        ('Standard', StandardScaler(), columns_to_standard),
        ('Roboust', RobustScaler(), columns_to_roboust),
    ],
    remainder='passthrough',
)
# Ask for pandas DataFrames from transform()/fit_transform(); set_output
# configures the estimator in place and returns it.
Transformer.set_output(transform='pandas')

Transformer

In [408]:
# Fit all configured scalers on df and transform it in a single call.
# This rebinds df_scaled, replacing the frame filled in by the per-scaler
# cells; the resulting columns carry "<transformer name>__" prefixes.
df_scaled = Transformer.fit_transform(df)
df_scaled

Unnamed: 0,MinMax__FOR_MINMAX1,MinMax__FOR_MINMAX_2,Standard__FOR_STANDARD,Roboust__FOR_ROBOUST
0,1.0,1.0,-0.613991,0.735751
1,1.0,0.916079,-0.433835,0.860104
2,0.444444,0.580396,-0.639541,0.321244
3,0.222222,0.160792,-0.112891,-0.321244
4,0.166667,0.0,2.202546,-0.507772
5,0.0,0.127224,-0.402288,-0.38342


In [409]:
# Sanity check of the standardized column: expect mean ~0 and std ~1.
# The mean is rounded to the nearest integer before printing, so this only
# shows that it is close to zero, not how close.
# ddof=0 requests the population standard deviation (pandas defaults to
# ddof=1), matching the divisor StandardScaler uses.
print('mean:', df_scaled['Standard__FOR_STANDARD'].mean().round() )
print('std:', df_scaled['Standard__FOR_STANDARD'].std(ddof=0) )

mean: 0.0
std: 1.0


In [410]:
# Show the input column names seen during fit next to the prefixed output
# names, one rich display per argument.
display(
    Transformer.feature_names_in_,
    Transformer.get_feature_names_out(),
)

array(['FOR_MINMAX1', 'FOR_MINMAX_2', 'FOR_STANDARD', 'FOR_ROBOUST'],
      dtype=object)

array(['MinMax__FOR_MINMAX1', 'MinMax__FOR_MINMAX_2',
       'Standard__FOR_STANDARD', 'Roboust__FOR_ROBOUST'], dtype=object)

`fit` — запоминает статистики переданной выборки для трансформации\
`transform` — трансформирует выборку, исходя из запомненных статистик\
`fit_transform` — объединяет два шага

Для тестовой выборки — только `transform`! Во избежание утечки данных предсказание должно делаться исходя из тех статистик, на которых проходило обучение.

In [411]:
def rename_after_ColumnTransformer(df, inplace=False):
    indices = df.columns
    split_names = pd.Index([x[1] for x in np.char.split(np.array(indices).astype('str'), '__')])
    if inplace:
        df.columns=split_names
        return df
    else:
        df_copy = df.copy(deep=True)
        df_copy.columns=split_names
        return df_copy

In [412]:
# Rename the columns of df_scaled in place; the returned frame is the same
# object, so the return value is discarded here. After this cell the
# prefixed column names are no longer present on df_scaled.
rename_after_ColumnTransformer(df_scaled, inplace=True)
df_scaled

Unnamed: 0,FOR_MINMAX1,FOR_MINMAX_2,FOR_STANDARD,FOR_ROBOUST
0,1.0,1.0,-0.613991,0.735751
1,1.0,0.916079,-0.433835,0.860104
2,0.444444,0.580396,-0.639541,0.321244
3,0.222222,0.160792,-0.112891,-0.321244
4,0.166667,0.0,2.202546,-0.507772
5,0.0,0.127224,-0.402288,-0.38342
