In [5]:
# all_scalers_model_with_accuracy
# Compare multiple scaling techniques on a regression model

import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import (
    MinMaxScaler, StandardScaler, MaxAbsScaler, RobustScaler,
    Normalizer, PowerTransformer, QuantileTransformer, FunctionTransformer
)
from sklearn.metrics import r2_score, mean_squared_error

# 1. Load dataset
data = fetch_california_housing(as_frame=True)
X = data.data
y = data.target

# 2. Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Precompute variance of target for accuracy formula
y_var = np.var(y_test)

# 3. Define scalers
scalers = {
    "MinMaxScaler": MinMaxScaler(),
    "StandardScaler": StandardScaler(),
    "MaxAbsScaler": MaxAbsScaler(),
    "RobustScaler": RobustScaler(),
    "Normalizer (L2)": Normalizer(norm="l2"),
    "Log Transformation": FunctionTransformer(
        lambda x: np.log1p(np.maximum(x, 0))
    ),
    "PowerTransformer (Yeo-Johnson)": PowerTransformer(method="yeo-johnson"),
    "QuantileTransformer (Normal)": QuantileTransformer(output_distribution="normal", random_state=42)
}

results = []

# 4. Train model for each scaler
for name, scaler in scalers.items():
    try:
        # Fit scaler only on training data
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # Train linear regression
        model = LinearRegression()
        model.fit(X_train_scaled, y_train)

        # Predict and evaluate
        y_pred = model.predict(X_test_scaled)
        r2 = r2_score(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)

        # Accuracy as 1 - (MSE / Variance of y)
        accuracy = max(0, 1 - (mse / y_var)) * 100

        results.append({
            "Scaler": name,
            "R2 Score": round(r2, 4),
            "MSE": round(mse, 4),
            "Accuracy (%)": round(accuracy, 2)
        })
    except Exception as e:
        results.append({
            "Scaler": name,
            "R2 Score": None,
            "MSE": None,
            "Accuracy (%)": None,
            "Error": str(e)
        })

# 5. Convert to DataFrame
results_df = pd.DataFrame(results)

# 6. Display results
print("\nðŸ“Š Results of Different Scaling Techniques:\n")
print(results_df)



ðŸ“Š Results of Different Scaling Techniques:

                           Scaler  R2 Score     MSE  Accuracy (%)
0                    MinMaxScaler    0.5758  0.5559         57.58
1                  StandardScaler    0.5758  0.5559         57.58
2                    MaxAbsScaler    0.5758  0.5559         57.58
3                    RobustScaler    0.5758  0.5559         57.58
4                 Normalizer (L2)    0.2523  0.9798         25.23
5              Log Transformation    0.5545  0.5837         55.45
6  PowerTransformer (Yeo-Johnson)    0.5613  0.5749         56.13
7    QuantileTransformer (Normal)    0.5913  0.5356         59.13
