In [None]:
import pandas as pd
import numpy as np

# Read the sample CSV and keep only its numeric columns in a single chained
# expression; the scaling cells further down all consume this frame.
df = pd.read_csv("SampleFile.csv").select_dtypes(include=np.number)

In [None]:
# Peek at five rows; the fixed random_state makes the same rows come back on every run.
df.sample(n=5, random_state=42)

Unnamed: 0,LotArea,MSSubClass
924,10240,20
1335,9650,20
659,9937,20
617,7227,45
1093,9230,20


# Max Absolute Scaling

- Formula: new_value = original_value / max_abs_value
- Output range: [-1, 1]
- Preserves sparsity: each feature is only divided by its maximum absolute value and never shifted, so zeros stay zero
- Helpful when features contain many zeros

In [None]:
from sklearn.preprocessing import MaxAbsScaler

# Fit the max-abs scaler on the numeric frame and transform it in one step.
scaler = MaxAbsScaler()
scaled_array = scaler.fit_transform(df)

# fit_transform hands back a bare array by default, so rebuild a DataFrame and
# carry over both the column labels and the row index of df. Without
# index=df.index the result would get a fresh 0..n-1 index and would stop
# lining up with df whenever df's index is not the default one.
df_scaled = pd.DataFrame(scaled_array, columns=df.columns, index=df.index)

df_scaled.head()

       LotArea  MSSubClass
0     0.039258    0.315789
1     0.044600    0.105263
2     0.052266    0.315789
3     0.044368    0.368421
4     0.066250    0.315789
...        ...         ...
1455  0.036781    0.315789
1456  0.061209    0.105263
1457  0.042008    0.368421
1458  0.045144    0.105263
1459  0.046166    0.105263

[1460 rows x 2 columns]


# Min-Max Scaling

- Formula: new_value = (original_value - min_value) / (max_value - min_value)
- Output range: [0, 1]
- Keeps relative ordering but is sensitive to outliers
- Useful when a model expects bounded features

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Fit the min-max scaler on the numeric frame and transform it in one step.
scaler = MinMaxScaler()
scaled_array = scaler.fit_transform(df)

# fit_transform hands back a bare array by default, so rebuild a DataFrame and
# carry over both the column labels and the row index of df. Without
# index=df.index the result would get a fresh 0..n-1 index and would stop
# lining up with df whenever df's index is not the default one.
df_scaled = pd.DataFrame(scaled_array, columns=df.columns, index=df.index)

df_scaled.head()

       LotArea  MSSubClass
0     0.033420    0.235294
1     0.038795    0.000000
2     0.046507    0.235294
3     0.038561    0.294118
4     0.060576    0.235294
...        ...         ...
1455  0.030929    0.235294
1456  0.055505    0.000000
1457  0.036187    0.294118
1458  0.039342    0.000000
1459  0.040370    0.000000

[1460 rows x 2 columns]


# Standardization (Z-Score)

- Formula: new_value = (original_value - mean) / standard_deviation
- Produces features with mean 0 and standard deviation 1
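- Works well for models that expect centered features on comparable scales (e.g. linear models, SVMs, PCA); it rescales the data but does not make it normally distributed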
- Moderately sensitive to outliers

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the standard (z-score) scaler on the numeric frame and transform it in one step.
scaler = StandardScaler()
scaled_array = scaler.fit_transform(df)

# fit_transform hands back a bare array by default, so rebuild a DataFrame and
# carry over both the column labels and the row index of df. Without
# index=df.index the result would get a fresh 0..n-1 index and would stop
# lining up with df whenever df's index is not the default one.
df_scaled = pd.DataFrame(scaled_array, columns=df.columns, index=df.index)

df_scaled.head()

       LotArea  MSSubClass
0    -0.207142    0.073375
1    -0.091886   -0.872563
2     0.073480    0.073375
3    -0.096897    0.309859
4     0.375148    0.073375
...        ...         ...
1455 -0.260560    0.073375
1456  0.266407   -0.872563
1457 -0.147810    0.309859
1458 -0.080160   -0.872563
1459 -0.058112   -0.872563

[1460 rows x 2 columns]


# Robust Scaling

- Formula: new_value = (original_value - median) / IQR
- Median corresponds to Q2 and IQR = Q3 - Q1
- Reduces the influence of outliers by using percentiles
- Useful when feature distributions contain extreme values

In [None]:
from sklearn.preprocessing import RobustScaler

# Fit the robust (median / IQR) scaler on the numeric frame and transform it in one step.
scaler = RobustScaler()
scaled_array = scaler.fit_transform(df)

# fit_transform hands back a bare array by default, so rebuild a DataFrame and
# carry over both the column labels and the row index of df. Without
# index=df.index the result would get a fresh 0..n-1 index and would stop
# lining up with df whenever df's index is not the default one.
df_scaled = pd.DataFrame(scaled_array, columns=df.columns, index=df.index)

df_scaled.head()

       LotArea  MSSubClass
0    -0.254076         0.2
1     0.030015        -0.6
2     0.437624         0.2
3     0.017663         0.4
4     1.181201         0.2
...        ...         ...
1455 -0.385746         0.2
1456  0.913167        -0.6
1457 -0.107831         0.4
1458  0.058918        -0.6
1459  0.113266        -0.6

[1460 rows x 2 columns]
