# Linear Regression
This notebook tests OLS and SGD on randomly generated data sets, using the same seed throughout for consistency.


In [1]:
import numpy as np
from matplotlib import pyplot as plt

In [None]:
# Experiment configuration
seed = 42
n_samples = 2000
noise_std = 3.0  # sigma of the additive Gaussian noise
rng = np.random.RandomState(seed)

# Ground-truth line the samples are scattered around
true_w = 3.0    # slope
true_b = 5.0    # intercept

# Draw order is kept (features first, then noise) so the seeded stream
# produces the same samples.
X = rng.uniform(-10.0, 10.0, size=(n_samples, 1))  # 1D feature in [-10, 10]
noise = rng.normal(0.0, noise_std, size=(n_samples, 1))

# y = w*x + b + epsilon, flattened to a 1-D target vector
y = (true_w * X + true_b + noise).ravel()

# Sanity check: report the shapes and eyeball the point cloud
print("X shape:", X.shape, "y shape:", y.shape)
plt.figure(figsize=(7, 4))
plt.scatter(X, y, s=8, alpha=0.6)
plt.title(f"Generated data with noise (n={n_samples})")
plt.xlabel("X")
plt.ylabel("y")
plt.grid(alpha=0.3)
plt.show()

Fit the line with OLS and SGD

In [None]:
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

# Two approaches to linear regression on the same data: OLS and SGD

# 1. OLS using LinearRegression, fitted on the raw (unscaled) data
ols_model = LinearRegression()
ols_model.fit(X, y)

# 2. SGD using SGDRegressor

# SGD is sensitive to the scale of the data, so we standardize X and y
scaler_y = StandardScaler()
scaler_X = StandardScaler()
X_scaled = scaler_X.fit_transform(X)
# The scaler takes a 2-D array, so y is reshaped to a column and flattened back
y_scaled = scaler_y.fit_transform(y.reshape(-1, 1)).ravel()

# Initialize and train the SGD model
# max_iter and tol bound the optimization
sgd_model = SGDRegressor(
    loss='squared_error',
    alpha=0.0001,
    max_iter=1000,
    tol=1e-3,
    random_state=42,
)
# Fit the model on scaled data
sgd_model.fit(X_scaled, y_scaled)

# The SGD coefficients are for the *scaled* data. Convert them back to the
# original scale so they can be compared with the OLS fit.
sgd_w_scaled = sgd_model.coef_[0]
sgd_b_scaled = sgd_model.intercept_[0]

# Unscale the weight: w_orig = w_scaled * (std_y / std_x)
# (the scaler attributes are length-1 arrays, so the results are too)
sgd_w_orig = sgd_w_scaled * (scaler_y.scale_ / scaler_X.scale_)
# Unscale the intercept: b_orig = b_scaled * std_y + mean_y - w_orig * mean_x
sgd_b_orig = sgd_b_scaled * scaler_y.scale_ + scaler_y.mean_ - sgd_w_orig * scaler_X.mean_

# --- Results ---
# Predictions on the original (unscaled) X for both models
y_pred_ols = ols_model.predict(X)
y_pred_sgd = sgd_w_orig * X.ravel() + sgd_b_orig

# RMSE as sqrt(MSE): the `squared=False` keyword of mean_squared_error was
# deprecated in scikit-learn 1.4 and removed in 1.6.
ols_rmse = np.sqrt(mean_squared_error(y, y_pred_ols))
sgd_rmse = np.sqrt(mean_squared_error(y, y_pred_sgd))

print("## 📊 Model Comparison")
print("| Metric | True Value | OLS (LinearRegression) | SGDRegressor (Unscaled) |")
print("| :--- | :--- | :--- | :--- |")
print(f"| **Slope (w)** | {true_w:.4f} | {ols_model.coef_[0]:.4f} | {sgd_w_orig[0]:.4f} |")
print(f"| **Intercept (b)** | {true_b:.4f} | {ols_model.intercept_:.4f} | {sgd_b_orig[0]:.4f} |")
print(f"| **RMSE** | N/A | {ols_rmse:.4f} | {sgd_rmse:.4f} |")

plt.figure(figsize=(8, 5))
plt.scatter(X, y, s=8, alpha=0.5, label='Generated Data')
plt.plot(X, true_w * X + true_b, color='black', linestyle='--', linewidth=2, label='True Line')
plt.plot(X, y_pred_ols, color='red', linestyle='-', linewidth=2, label=f'OLS Fit (w={ols_model.coef_[0]:.2f})')
plt.plot(X, y_pred_sgd, color='green', linestyle='-', linewidth=1.5, alpha=0.7, label=f'SGD Fit (w={sgd_w_orig[0]:.2f})')

plt.xlabel("X")
plt.ylabel("y")
plt.title("Comparison of OLS and SGD Linear Fits")
plt.legend()
plt.grid(alpha=0.3)
plt.show()

Add 20 randomly distributed outlier data points, placing them and their target values far away from the original data points.

In [None]:
# Outliers with offsets on both sides of the true line (+40 / -30).
# Relies on rng, noise_std, X, y, true_w, true_b and ols_model from the cells
# above. The outliers reuse the inliers' noise_std for their random component,
# so it is not redefined here.

# Parameters for Outliers
n_outliers = 20

# 1. Generate Outlier Features (X_outlier), from a wider range than the inliers
X_outliers = rng.uniform(-15.0, 15.0, size=(n_outliers, 1))

# 2. Generate Outlier Targets (y_outlier)
# Value the true line gives at each outlier x
y_outliers_base = true_w * X_outliers + true_b

# Intentional offset: each outlier is pushed either +40 or -30 off the line
intentional_offset = rng.choice([40.0, -30.0], size=(n_outliers, 1))

# Random Gaussian noise (epsilon) for the outliers
noise_outlier = rng.normal(0.0, noise_std, size=(n_outliers, 1))

# Final outlier target: y = (Mx + b) + Intentional_Offset + Epsilon_Noise
y_outliers = (y_outliers_base + intentional_offset + noise_outlier).ravel()

# 3. Combine Datasets, concatenating the original and outlier data
X_combined = np.vstack((X, X_outliers))
y_combined = np.hstack((y, y_outliers))

print(f"Original dataset size: {len(X)}")
print(f"Combined dataset size: {len(X_combined)} ({len(X)} original + {n_outliers} outliers)")

# Fit OLS on the combined data
ols_outlier_model = LinearRegression()
ols_outlier_model.fit(X_combined, y_combined)

# Clean OLS fit parameters (from the earlier OLS cell, for comparison)
ols_clean_w = ols_model.coef_[0]
ols_clean_b = ols_model.intercept_

# --- Results ---
ols_outlier_w = ols_outlier_model.coef_[0]
ols_outlier_b = ols_outlier_model.intercept_

# RMSE as sqrt(MSE): the `squared=False` keyword of mean_squared_error was
# deprecated in scikit-learn 1.4 and removed in 1.6.
y_pred_outlier = ols_outlier_model.predict(X_combined)
ols_outlier_rmse = np.sqrt(mean_squared_error(y_combined, y_pred_outlier))
ols_clean_rmse = np.sqrt(mean_squared_error(y, ols_model.predict(X)))

# The "Change" column is the contaminated fit's deviation from the TRUE value
print("\n## ⚠️ Impact of Outliers on OLS")
print(f"| Metric | True Value | Clean OLS Fit | OLS Fit with {n_outliers} Outliers | Change |")
print("| :--- | :--- | :--- | :--- | :--- |")
print(f"| **Slope (w)** | {true_w:.4f} | {ols_clean_w:.4f} | **{ols_outlier_w:.4f}** | {(ols_outlier_w - true_w) / true_w * 100:.1f}% |")
print(f"| **Intercept (b)** | {true_b:.4f} | {ols_clean_b:.4f} | **{ols_outlier_b:.4f}** | {(ols_outlier_b - true_b) / true_b * 100:.1f}% |")
print(f"| **RMSE** | N/A | {ols_clean_rmse:.4f} | **{ols_outlier_rmse:.4f}** | N/A |")

plt.figure(figsize=(9, 6))
plt.scatter(X, y, s=8, alpha=0.5, label=f'Original Data ({len(X)} points)')
plt.scatter(X_outliers, y_outliers, s=25, color='red', marker='x', label=f'Outliers ({n_outliers} points)')

# Plot the clean model (for context)
plt.plot(X_combined, ols_clean_w * X_combined + ols_clean_b,
         color='green', linestyle='--', linewidth=2, label=f'Clean OLS Fit (w={ols_clean_w:.2f})')

# Plot the outlier-contaminated model
plt.plot(X_combined, ols_outlier_w * X_combined + ols_outlier_b,
         color='blue', linestyle='-', linewidth=2, label=f'Contaminated OLS Fit (w={ols_outlier_w:.2f})')

plt.xlabel("X")
plt.ylabel("y")
plt.title("Effect of Outliers on Ordinary Least Squares (OLS)")
plt.legend()
plt.grid(alpha=0.3)
plt.show()

In [None]:
# Biased outliers: only positive offsets (+40 / +80), so every outlier sits
# above the true line.
# Relies on rng, noise_std, X, y, true_w, true_b and ols_model from the cells
# above. The outliers reuse the inliers' noise_std for their random component,
# so it is not redefined here.

# Parameters for Outliers
n_outliers = 20

# 1. Generate Outlier Features (X_outlier), from a wider range than the inliers
X_outliers = rng.uniform(-15.0, 15.0, size=(n_outliers, 1))

# 2. Generate Outlier Targets (y_outlier)
# Value the true line gives at each outlier x
y_outliers_base = true_w * X_outliers + true_b

# Intentional offset: each outlier is pushed either +40 or +80 above the line
intentional_offset = rng.choice([40.0, 80.0], size=(n_outliers, 1))

# Random Gaussian noise (epsilon) for the outliers
noise_outlier = rng.normal(0.0, noise_std, size=(n_outliers, 1))

# Final outlier target: y = (Mx + b) + Intentional_Offset + Epsilon_Noise
y_outliers = (y_outliers_base + intentional_offset + noise_outlier).ravel()

# 3. Combine Datasets, concatenating the original and outlier data
X_combined = np.vstack((X, X_outliers))
y_combined = np.hstack((y, y_outliers))

print(f"Original dataset size: {len(X)}")
print(f"Combined dataset size: {len(X_combined)} ({len(X)} original + {n_outliers} outliers)")

# Fit OLS on the combined data
ols_outlier_model = LinearRegression()
ols_outlier_model.fit(X_combined, y_combined)

# Clean OLS fit parameters (from the earlier OLS cell, for comparison)
ols_clean_w = ols_model.coef_[0]
ols_clean_b = ols_model.intercept_

# --- Results ---
ols_outlier_w = ols_outlier_model.coef_[0]
ols_outlier_b = ols_outlier_model.intercept_

# RMSE as sqrt(MSE): the `squared=False` keyword of mean_squared_error was
# deprecated in scikit-learn 1.4 and removed in 1.6.
y_pred_outlier = ols_outlier_model.predict(X_combined)
ols_outlier_rmse = np.sqrt(mean_squared_error(y_combined, y_pred_outlier))
ols_clean_rmse = np.sqrt(mean_squared_error(y, ols_model.predict(X)))

# The "Change" column is the contaminated fit's deviation from the TRUE value
print("\n## ⚠️ Impact of Outliers on OLS")
print(f"| Metric | True Value | Clean OLS Fit | OLS Fit with {n_outliers} Outliers | Change |")
print("| :--- | :--- | :--- | :--- | :--- |")
print(f"| **Slope (w)** | {true_w:.4f} | {ols_clean_w:.4f} | **{ols_outlier_w:.4f}** | {(ols_outlier_w - true_w) / true_w * 100:.1f}% |")
print(f"| **Intercept (b)** | {true_b:.4f} | {ols_clean_b:.4f} | **{ols_outlier_b:.4f}** | {(ols_outlier_b - true_b) / true_b * 100:.1f}% |")
print(f"| **RMSE** | N/A | {ols_clean_rmse:.4f} | **{ols_outlier_rmse:.4f}** | N/A |")

plt.figure(figsize=(9, 6))
plt.scatter(X, y, s=8, alpha=0.5, label=f'Original Data ({len(X)} points)')
plt.scatter(X_outliers, y_outliers, s=25, color='red', marker='x', label=f'Outliers ({n_outliers} points)')

# Plot the clean model (for context)
plt.plot(X_combined, ols_clean_w * X_combined + ols_clean_b,
         color='green', linestyle='--', linewidth=2, label=f'Clean OLS Fit (w={ols_clean_w:.2f})')

# Plot the outlier-contaminated model
plt.plot(X_combined, ols_outlier_w * X_combined + ols_outlier_b,
         color='blue', linestyle='-', linewidth=2, label=f'Contaminated OLS Fit (w={ols_outlier_w:.2f})')

plt.xlabel("X")
plt.ylabel("y")
plt.title("Effect of Outliers on Ordinary Least Squares (OLS)")
plt.legend()
plt.grid(alpha=0.3)
plt.show()

In [None]:
# Biased outliers: only positive offsets (+40 / +80), so every outlier sits
# above the true line.
# NOTE(review): this cell repeats the preceding biased-outlier cell. Because
# the shared rng has advanced, it draws a different outlier sample and then
# overwrites X_outliers, X_combined, y_combined and the ols_outlier_* results.
# Confirm the repetition is intended.
# Relies on rng, noise_std, X, y, true_w, true_b and ols_model from the cells
# above. The outliers reuse the inliers' noise_std for their random component,
# so it is not redefined here.

# Parameters for Outliers
n_outliers = 20

# 1. Generate Outlier Features (X_outlier), from a wider range than the inliers
X_outliers = rng.uniform(-15.0, 15.0, size=(n_outliers, 1))

# 2. Generate Outlier Targets (y_outlier)
# Value the true line gives at each outlier x
y_outliers_base = true_w * X_outliers + true_b

# Intentional offset: each outlier is pushed either +40 or +80 above the line
intentional_offset = rng.choice([40.0, 80.0], size=(n_outliers, 1))

# Random Gaussian noise (epsilon) for the outliers
noise_outlier = rng.normal(0.0, noise_std, size=(n_outliers, 1))

# Final outlier target: y = (Mx + b) + Intentional_Offset + Epsilon_Noise
y_outliers = (y_outliers_base + intentional_offset + noise_outlier).ravel()

# 3. Combine Datasets, concatenating the original and outlier data
X_combined = np.vstack((X, X_outliers))
y_combined = np.hstack((y, y_outliers))

print(f"Original dataset size: {len(X)}")
print(f"Combined dataset size: {len(X_combined)} ({len(X)} original + {n_outliers} outliers)")

# Fit OLS on the combined data
ols_outlier_model = LinearRegression()
ols_outlier_model.fit(X_combined, y_combined)

# Clean OLS fit parameters (from the earlier OLS cell, for comparison)
ols_clean_w = ols_model.coef_[0]
ols_clean_b = ols_model.intercept_

# --- Results ---
ols_outlier_w = ols_outlier_model.coef_[0]
ols_outlier_b = ols_outlier_model.intercept_

# RMSE as sqrt(MSE): the `squared=False` keyword of mean_squared_error was
# deprecated in scikit-learn 1.4 and removed in 1.6.
y_pred_outlier = ols_outlier_model.predict(X_combined)
ols_outlier_rmse = np.sqrt(mean_squared_error(y_combined, y_pred_outlier))
ols_clean_rmse = np.sqrt(mean_squared_error(y, ols_model.predict(X)))

# The "Change" column is the contaminated fit's deviation from the TRUE value
print("\n## ⚠️ Impact of Outliers on OLS")
print(f"| Metric | True Value | Clean OLS Fit | OLS Fit with {n_outliers} Outliers | Change |")
print("| :--- | :--- | :--- | :--- | :--- |")
print(f"| **Slope (w)** | {true_w:.4f} | {ols_clean_w:.4f} | **{ols_outlier_w:.4f}** | {(ols_outlier_w - true_w) / true_w * 100:.1f}% |")
print(f"| **Intercept (b)** | {true_b:.4f} | {ols_clean_b:.4f} | **{ols_outlier_b:.4f}** | {(ols_outlier_b - true_b) / true_b * 100:.1f}% |")
print(f"| **RMSE** | N/A | {ols_clean_rmse:.4f} | **{ols_outlier_rmse:.4f}** | N/A |")

plt.figure(figsize=(9, 6))
plt.scatter(X, y, s=8, alpha=0.5, label=f'Original Data ({len(X)} points)')
plt.scatter(X_outliers, y_outliers, s=25, color='red', marker='x', label=f'Outliers ({n_outliers} points)')

# Plot the clean model (for context)
plt.plot(X_combined, ols_clean_w * X_combined + ols_clean_b,
         color='green', linestyle='--', linewidth=2, label=f'Clean OLS Fit (w={ols_clean_w:.2f})')

# Plot the outlier-contaminated model
plt.plot(X_combined, ols_outlier_w * X_combined + ols_outlier_b,
         color='blue', linestyle='-', linewidth=2, label=f'Contaminated OLS Fit (w={ols_outlier_w:.2f})')

plt.xlabel("X")
plt.ylabel("y")
plt.title("Effect of Outliers on Ordinary Least Squares (OLS)")
plt.legend()
plt.grid(alpha=0.3)
plt.show()

In [None]:
# Standalone script for the Outlier Damage Threshold Analysis (OLS only). Do not run this cell as part of the notebook; it is for testing only.

import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

# NOTE: this cell rebinds rng, true_w, true_b, noise_std, X_outliers,
# X_combined, y_combined and ols_model, which the notebook cells above also
# define, so running it inside the notebook replaces their values.

# 1. Data Setup (Fixed Inliers)
n_inliers = 2000
true_w = 3.0
true_b = 5.0
noise_std = 3.0
rng = np.random.RandomState(42)

# Inlier data (generated once and reused for every outlier count)
X_inliers = rng.uniform(-10.0, 10.0, size=(n_inliers, 1))
noise_inliers = rng.normal(0.0, noise_std, size=(n_inliers, 1))
y_inliers = true_w * X_inliers + true_b + noise_inliers
y_inliers = y_inliers.ravel()

# 2. Simulation Parameters
outlier_counts = [20, 50, 100, 150, 200, 300, 400, 500, 1000, 1500, 2000]
results = []
target_w_deviation = 0.1  # relative slope deviation of interest (10% -> w = 3.3)

# 3. Simulation Loop (OLS Only)
for N_outlier in outlier_counts:
    # 3a. Generate a fresh set of outliers from the shared rng stream
    X_outliers = rng.uniform(-15.0, 15.0, size=(N_outlier, 1))
    y_outliers_base = true_w * X_outliers + true_b
    # Shift each outlier by a large offset, +300 or -300 chosen at random
    # (no Gaussian noise is added to the outliers here)
    y_outliers = y_outliers_base.ravel() + rng.choice([300.0, -300.0], size=N_outlier)

    # 3b. Combine Data
    X_combined = np.vstack((X_inliers, X_outliers))
    y_combined = np.hstack((y_inliers, y_outliers))

    # 3c. Fit OLS (LinearRegression)
    ols_model = LinearRegression()
    ols_model.fit(X_combined, y_combined)
    ols_w_fit = ols_model.coef_[0]

    # 3d. Store results
    results.append({
        'N_outlier': N_outlier,
        'OLS_w': ols_w_fit
    })

# 4. Plotting Results
n_outlier_array = np.array([r['N_outlier'] for r in results])
ols_w_array = np.array([r['OLS_w'] for r in results])

plt.figure(figsize=(9, 6))

# Plot OLS results
plt.plot(n_outlier_array, ols_w_array, marker='o', linestyle='-', color='blue', label='OLS Fitted Slope')

# Plot True Value and Deviation Thresholds
upper_w_threshold = true_w * (1 + target_w_deviation)
lower_w_threshold = true_w * (1 - target_w_deviation)
plt.axhline(y=true_w, color='green', linestyle='-', linewidth=2, label=f'True Slope (w={true_w})')
plt.axhline(y=upper_w_threshold, color='red', linestyle=':', label=f'{target_w_deviation:.0%} Deviation Threshold (w={upper_w_threshold:.2f})')
# The offsets are symmetric, so the fitted slope can also fall below the true
# value; draw the matching lower bound (unlabelled to keep the legend compact).
plt.axhline(y=lower_w_threshold, color='red', linestyle=':')

plt.xlabel("Number of Outliers Added")
plt.ylabel("Fitted OLS Slope Coefficient (w)")
plt.title(f"OLS Slope Deviation vs. Outlier Count (N_inliers = {n_inliers})")
plt.legend()
plt.grid(alpha=0.3)
plt.show()

# 5. Summary Table
print("\n## 🎯 OLS Threshold Analysis Summary")
print("| N_Outlier | OLS Slope (w) | % Change from True |")
print("| :--- | :--- | :--- |")
for r in results:
    ols_w_change_percent = (r['OLS_w'] - true_w) / true_w * 100
    print(f"| {r['N_outlier']:<9} | {r['OLS_w']:.4f} | {ols_w_change_percent:.2f}% |")

Fitting the model with the RANSAC algorithm

In [None]:
from sklearn.linear_model import RANSACRegressor

# Robust fit with RANSAC on the contaminated data, compared against OLS.
# Uses X_combined, y_combined, ols_clean_w, ols_clean_b, ols_outlier_w,
# ols_outlier_b, true_w and true_b from previous cells.
# NOTE(review): X_combined / y_combined hold whatever the most recently run
# cell assigned; make sure they match the data the ols_outlier_* values were
# fitted on before comparing.

# --- Fit RANSAC Model ---
# residual_threshold is the largest residual at which a point still counts as
# an inlier. 2 * noise_std = 6.0 was the starting point; it is set a bit
# higher (8.0) to leave room for noise.
ransac_model = RANSACRegressor(
    min_samples=2,
    residual_threshold=8.0,
    random_state=42,
)
ransac_model.fit(X_combined, y_combined)

# Extract RANSAC coefficients from the final estimator
ransac_w = ransac_model.estimator_.coef_[0]
ransac_b = ransac_model.estimator_.intercept_

# Inlier / outlier masks (only needed by the optional scatter below)
inlier_mask = ransac_model.inlier_mask_
outlier_mask = np.logical_not(inlier_mask)

# --- Comparison Results ---
print("## 📊 RANSAC vs. OLS Fit Comparison")
print("| Metric | True Value | Contaminated OLS | RANSAC Robust Fit |")
print("| :--- | :--- | :--- | :--- |")
print(f"| **Slope (w)** | {true_w:.4f} | {ols_outlier_w:.4f} | **{ransac_w:.4f}** |")
print(f"| **Intercept (b)** | {true_b:.4f} | {ols_outlier_b:.4f} | **{ransac_b:.4f}** |")

# --- Visualization ---
plt.figure(figsize=(10, 6))

# Scatter of the data split by RANSAC's classification; currently disabled,
# uncomment to overlay the points on the fitted lines.
# plt.scatter(X_combined[inlier_mask], y_combined[inlier_mask], s=10, alpha=0.5, color='gray', label='RANSAC Inliers')
# plt.scatter(X_combined[outlier_mask], y_combined[outlier_mask], s=40, color='red', marker='x', label='RANSAC Outliers Ignored')

# Evenly spaced x-grid spanning the combined data, used for all three lines
X_plot = np.linspace(X_combined.min(), X_combined.max(), 100).reshape(-1, 1)

# Plot the Contaminated OLS Fit (Sensitive Model)
y_ols_contam = ols_outlier_w * X_plot.ravel() + ols_outlier_b
plt.plot(X_plot, y_ols_contam, color='blue', linestyle='-', linewidth=2, label=f'OLS Fit (Sensitive, w={ols_outlier_w:.2f})')

# Plot the RANSAC Fit (Robust Model)
y_ransac = ransac_w * X_plot.ravel() + ransac_b
plt.plot(X_plot, y_ransac, color='orange', linestyle='-', linewidth=3, label=f'RANSAC Fit (Robust, w={ransac_w:.2f})')

# Plot the clean OLS fit for reference
y_clean = ols_clean_w * X_plot.ravel() + ols_clean_b
plt.plot(X_plot, y_clean, color='green', linestyle='--', linewidth=2, label=f'Clean OLS Fit (Ideal, w={ols_clean_w:.2f})')

# Pad the y-axis by 50 beyond the extremes of the combined targets
plt.ylim(np.min(y_combined) - 50, np.max(y_combined) + 50)
plt.xlabel("X")
plt.ylabel("y")
plt.title("RANSAC vs. OLS: Robustness to Outliers")
plt.legend()
plt.grid(alpha=0.3)
plt.show()