### Robust scaling normalization

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import RobustScaler

In [None]:
# Robust scaling centres each feature on its median and divides by its
# interquartile range, so a few extreme values barely move the fitted scale.
# That makes it a good choice when the data contains outliers.

# Fixed seed so that "Restart Kernel -> Run All" regenerates the same data
# (and therefore the same split, model and accuracy) every time.
SEED = 42
rng = np.random.default_rng(SEED)

# 30 synthetic customers. Generator.integers excludes the upper bound, so the
# ranges are Age 18-59, Salary 20000-119999 and Purchased in {0, 1}.
# NOTE: 'Purchased' is drawn independently of the features, so there is no
# real signal for a classifier to learn from this data.
df = pd.DataFrame({
    'Age': rng.integers(18, 60, 30),
    'Salary': rng.integers(20000, 120000, 30),
    'Purchased': rng.integers(0, 2, 30)
})

# a few hand-picked extreme rows, far outside the ranges sampled above
outliers = pd.DataFrame({
    'Age': [5, 90, 95],
    'Salary': [1000, 500000, 700000],
    'Purchased': [0, 1, 1]
})

# append the outliers after the sampled rows and renumber the index 0..32
df = pd.concat([df, outliers], ignore_index=True)

# the last five rows show two sampled customers followed by the three outliers
print("Data with Outliers:")
print(df.tail())

# separate the two numeric features from the binary target
X = df[['Age', 'Salary']]
y = df['Purchased']

# hold out 30% of the rows for evaluation; random_state pins the shuffle
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Fit the scaler on the training rows only, then reuse the fitted statistics
# on the test rows, so nothing about the held-out data leaks into the
# preprocessing step.
scaler = RobustScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

# logistic regression on the scaled features
model = LogisticRegression()
model.fit(x_train_scaled, y_train)

# predicted class labels for the held-out rows
y_pred = model.predict(x_test_scaled)

# Convert to a percentage *before* rounding. Rounding first and multiplying
# afterwards can reintroduce floating-point noise
# (e.g. 0.57 * 100 == 56.99999999999999).
# NOTE: the 'Purchased' labels are generated at random, independently of Age
# and Salary, so this score is expected to hover around chance level and says
# nothing about the quality of the scaler.
accuracy_pct = np.round(accuracy_score(y_test, y_pred) * 100, 2)
print(f"\nAccuracy Score: {accuracy_pct} %")

Data with Outliers:
    Age  Salary  Purchased
28   23   62599          1
29   58  101488          0
30    5    1000          0
31   90  500000          1
32   95  700000          1

Accuracy Score: 30.0 %
