In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 

In [None]:
######################################################################################
# Preprocessing
######################################################################################

In [None]:
# ===================================================================================
# Concept: Feature Scaling
# ===================================================================================
# Goal: Transform features to have a similar scale or range (e.g., 0 to 1).
#
# Intuition:
# Without scaling, features with large numerical values (e.g., house price in
# millions) dominate features with small values (e.g., number of rooms),
# even if the small-value feature is more important.
#
# Benefits:
# 1. Removes the effect of measurement units.
# 2. Ensures fair contribution of all features to model calculations,
#    crucial for distance-based algorithms like KNN.
# ===================================================================================
# Types of scaling:
# ===================================================================================
# - Min-Max scaling (normalization)
# - Standard scaling (standardization)
# - Z-score scaling
# ===================================================================================

In [None]:
# ==============================================================================
# Concept: Min-Max Scaling (Normalization)
# ==============================================================================
# Goal: Scales data to a fixed range, typically 0 to 1.
# Formula: X_scaled = (X - X_min) / (X_max - X_min)
#
# Intuition:
# It maps the minimum value in the data to 0 and the maximum value to 1,
# rescaling everything else proportionally in between.
#
# Technical Requirement:
# Scikit-learn scalers require 2D input (n_samples, n_features).
# - Use .reshape(-1, 1) to convert 1D data into a 2D array with 
#   one column and multiple rows before scaling.
# ==============================================================================

from sklearn.preprocessing import MinMaxScaler

# Toy data set: column 0 holds small values (1-4) and column 1 holds very
# large values (10-40 million). It is kept under its own name so the unscaled
# values stay available for comparison after scaling.
x_raw = np.array([
    [1, 10000000],
    [2, 20000000],
    [3, 30000000],
    [4, 40000000]
])

# fit_transform() learns each column's min/max and rescales that column in a
# single call; the input array is not modified, a new array is returned.
scaler = MinMaxScaler()
x = scaler.fit_transform(x_raw)

# ==============================================================================
# Concept: Scikit-learn Scaling Workflow (fit vs transform)
# ==============================================================================
# - fit():    Calculates the minimum and maximum values for each feature
#             needed for scaling. It does NOT change the data.
# - transform(): Uses the calculated min/max values to scale the data
#                to the target range.
# - fit_transform(): Performs both steps efficiently in one go.
#
# Custom Range:
# To change the scale range (default is 0-1), use:
# scaler = MinMaxScaler(feature_range=(new_min, new_max))
# ==============================================================================

In [None]:
# ==============================================================================
# Concept: Standard Scaling (Standardization)
# ==============================================================================
# Goal: Transform data to have a mean of 0 and a standard deviation of 1.
#
# Intuition:
# It centers the data around zero and scales it based on its variance.
# Unlike Min-Max, it does not bind data to a specific range (like 0-1),
# but makes features comparable based on their distribution.
#
# Requirement:
# Input must be a 2D array (n_samples, n_features) for scikit-learn.
# ==============================================================================

from sklearn.preprocessing import StandardScaler

# Two explicit steps instead of fit_transform(): first learn the per-column
# statistics from x, then apply them to produce the standardized array.
scaler = StandardScaler()
scaler.fit(x)
x = scaler.transform(x)

In [None]:
# ==============================================================================
# Concept: Z-score Scaling (Statistical Definition)
# ==============================================================================
# - Goal: Similar to Standard Scaling, centers data to mean 0, std 1.
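# - Difference: A z-score can be computed with either the population standard
#   deviation (divide by N, ddof=0) or the sample standard deviation (divide
#   by N-1, ddof=1). StandardScaler always uses the population form (ddof=0),
#   which is also the default of scipy.stats.zscore.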
# - Practice: In ML, these terms are often used interchangeably.
#
# SciPy Implementation:
# - Use scipy.stats.zscore(x, ddof=1) to explicitly use Sample Standard
#   Deviation (dividing by N-1 instead of N).
# ==============================================================================

from scipy.stats import zscore

# Z-score each column (axis=0) of x; ddof=1 makes the divisor the sample
# standard deviation (N-1 in the denominator) rather than the population one.
x = zscore(x, axis=0, ddof=1)