# Linear Regression
We will test OLS and SGD on randomly generated data sets, using the same seed for consistency.


In [1]:
import numpy as np
from matplotlib import pyplot as plt

In [None]:
# --- Synthetic data set configuration ---
n_samples = 2000
seed = 42
noise_std = 3.0  # standard deviation of the additive Gaussian noise
rng = np.random.RandomState(seed)  # seeded generator: every run draws the same data

# Ground-truth line that the regressors are expected to recover
true_w = 3.0  # slope
true_b = 5.0  # intercept

# One feature column, drawn uniformly from [-10, 10]
X = rng.uniform(low=-10.0, high=10.0, size=(n_samples, 1))

# Noisy targets: y = true_w * x + true_b + epsilon, with epsilon ~ N(0, noise_std**2).
# The noise is drawn after X so the sequence of random draws stays fixed.
noise = rng.normal(loc=0.0, scale=noise_std, size=(n_samples, 1))
y = (true_w * X + true_b + noise).ravel()  # flatten (n_samples, 1) -> (n_samples,)

# Sanity-check the shapes, then eyeball the generated cloud of points
print("X shape:", X.shape, "y shape:", y.shape)
plt.figure(figsize=(7, 4))
plt.scatter(X, y, s=8, alpha=0.6)
plt.title(f"Generated data with noise (n={n_samples})")
plt.xlabel("X")
plt.ylabel("y")
plt.grid(alpha=0.3)
plt.show()

Fit the line with OLS and SGD

In [None]:
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

# Two approaches to fitting the same linear model: closed-form OLS and iterative SGD.
# Both are fitted on the full data set and compared against the true parameters.

# 1. OLS using LinearRegression, fitted directly on the raw (unscaled) data
ols_model = LinearRegression()
ols_model.fit(X, y)

# 2. SGD using SGDRegressor

# SGD is sensitive to the scale of the data, so we standardize both X and y
scaler_y = StandardScaler()
scaler_X = StandardScaler()
X_scaled = scaler_X.fit_transform(X)
# StandardScaler works on 2D input, so y is reshaped to a column and flattened again
y_scaled = scaler_y.fit_transform(y.reshape(-1, 1)).ravel()

# Initialize and train the SGD model
# max_iter and tol bound the number of epochs / define the convergence criterion
sgd_model = SGDRegressor(
    loss='squared_error',
    alpha=0.0001,
    max_iter=1000,
    tol=1e-3,
    random_state=42,
)
# Fit the model on scaled data
sgd_model.fit(X_scaled, y_scaled)

# The SGD coefficients are for the *scaled* data. We need to convert them back
# to the original scale for comparison.
sgd_w_scaled = sgd_model.coef_[0]
sgd_b_scaled = sgd_model.intercept_[0]

# Unscale the weights (w_orig = w_scaled * (std_y / std_x)).
# scale_ and mean_ are length-1 arrays here, so the results are length-1 arrays too.
sgd_w_orig = sgd_w_scaled * (scaler_y.scale_ / scaler_X.scale_)
# Unscale the intercept (b_orig = b_scaled * std_y + mean_y - w_orig * mean_x)
sgd_b_orig = sgd_b_scaled * scaler_y.scale_ + scaler_y.mean_ - sgd_w_orig * scaler_X.mean_

# --- Results ---
# Prepare predictions for plotting and metrics (both expressed on the original scale)
y_pred_ols = ols_model.predict(X)

# For SGD, use the unscaled parameters to calculate predictions on unscaled X
y_pred_sgd = sgd_w_orig * X.ravel() + sgd_b_orig

# RMSE is computed as sqrt(MSE) instead of mean_squared_error(..., squared=False):
# the `squared` keyword was deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas this form works on every version.
rmse_ols = mean_squared_error(y, y_pred_ols) ** 0.5
rmse_sgd = mean_squared_error(y, y_pred_sgd) ** 0.5

print("## ðŸ“Š Model Comparison")
print("| Metric | True Value | OLS (LinearRegression) | SGDRegressor (Unscaled) |")
print("| :--- | :--- | :--- | :--- |")
print(f"| **Slope (w)** | {true_w:.4f} | {ols_model.coef_[0]:.4f} | {sgd_w_orig[0]:.4f} |")
print(f"| **Intercept (b)** | {true_b:.4f} | {ols_model.intercept_:.4f} | {sgd_b_orig[0]:.4f} |")
print(f"| **RMSE** | N/A | {rmse_ols:.4f} | {rmse_sgd:.4f} |")

# Draw the fitted lines in ascending-x order so each line is a single clean
# left-to-right stroke rather than a path that retraces itself between random points.
order = X.ravel().argsort()
X_line = X[order]

plt.figure(figsize=(8, 5))
plt.scatter(X, y, s=8, alpha=0.5, label='Generated Data')
plt.plot(X_line, true_w * X_line + true_b, color='black', linestyle='--', linewidth=2, label='True Line')
plt.plot(X_line, y_pred_ols[order], color='red', linestyle='-', linewidth=2, label=f'OLS Fit (w={ols_model.coef_[0]:.2f})')
plt.plot(X_line, y_pred_sgd[order], color='green', linestyle='-', linewidth=1.5, alpha=0.7, label=f'SGD Fit (w={sgd_w_orig[0]:.2f})')

plt.xlabel("X")
plt.ylabel("y")
plt.title("Comparison of OLS and SGD Linear Fits")
plt.legend()
plt.grid(alpha=0.3)
plt.show()