In [None]:
import argparse
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error
import textwrap
import sys

def detect_column(cols, key):
    """Return the label in ``cols`` that matches ``key``, or ``None``.

    An exact match is preferred. Otherwise labels are compared
    case-insensitively on their string form, so non-string labels (for
    example integer column names) no longer raise ``AttributeError``.

    Args:
        cols: Iterable of column labels (e.g. ``df.columns`` or a list).
        key: Requested column name.

    Returns:
        The matching label exactly as it appears in ``cols``, or ``None``
        when nothing matches.
    """
    labels = list(cols)
    # An exact hit must win over a label that differs only by case.
    if key in labels:
        return key
    # When several labels differ only by case, the last one wins.
    lower_map = {str(label).lower(): label for label in labels}
    return lower_map.get(str(key).lower())

def fit_models(X, y, ridge_alpha=1.0, lasso_alpha=1.0):
    """Fit ordinary, Ridge and Lasso linear regressions on the same data.

    Args:
        X: Feature matrix passed to each estimator's ``fit``.
        y: Target values passed to each estimator's ``fit``.
        ridge_alpha: ``alpha`` given to ``Ridge``.
        lasso_alpha: ``alpha`` given to ``Lasso``.

    Returns:
        Tuple ``(linear, ridge, lasso)`` of fitted estimators.
    """
    estimators = (
        LinearRegression(),
        Ridge(alpha=ridge_alpha),
        # Raised iteration cap for the Lasso solver.
        Lasso(alpha=lasso_alpha, max_iter=20000),
    )
    linear_model, ridge_model, lasso_model = [est.fit(X, y) for est in estimators]
    return linear_model, ridge_model, lasso_model

def model_eq_text(model, feature_names):
    """Render a fitted linear model as a human-readable equation.

    Args:
        model: Object exposing ``intercept_`` and ``coef_``.
        feature_names: Names paired positionally with the flattened
            coefficients.

    Returns:
        Tuple ``(equation, intercept, coefs)`` where ``equation`` is the
        formatted string, ``intercept`` a float and ``coefs`` a list of
        floats.
    """
    intercept = float(model.intercept_)
    coefs = list(map(float, np.ravel(model.coef_)))
    # One "(coef*name)" term per feature, each shown with 6 decimals.
    rhs = " + ".join(
        f"({weight:.6f}*{label})" for weight, label in zip(coefs, feature_names)
    )
    return f"ŷ = {intercept:.6f} + {rhs}", intercept, coefs

def manual_prediction_row(intercept, coefs, feature_names, row_values):
    """Spell out the arithmetic of one linear prediction.

    Args:
        intercept: Model intercept.
        coefs: Coefficients, in the same order as ``feature_names``.
        feature_names: Feature names (used only to align the iteration).
        row_values: Feature values, in the same order as ``feature_names``.

    Returns:
        Tuple ``(arithmetic, pred)``: the " + "-joined arithmetic string
        and the resulting predicted value.
    """
    total = intercept
    pieces = [f"{intercept:.6f}"]
    for weight, _label, value in zip(coefs, feature_names, row_values):
        contribution = weight * value
        pieces.append(f"({weight:.6f} * {value:.6f} = {contribution:.6f})")
        # Add term by term so the result matches the displayed arithmetic.
        total += contribution
    return " + ".join(pieces), total

def save_summary(path, lines):
    """Write ``lines`` to ``path`` joined by newlines and report the location.

    The file is written as UTF-8 explicitly because the summary contains
    non-ASCII characters (the "ŷ" symbol), which can fail or be mis-encoded
    under a platform default encoding that is not UTF-8.

    Args:
        path: Destination file path; an existing file is overwritten.
        lines: Iterable of strings, written without a trailing newline.
    """
    with open(path, "w", encoding="utf-8") as f:
        f.write("\n".join(lines))
    print(f"\nSummary saved to: {path}")

def main(argv=None):
    """Fit Linear, Ridge and Lasso regressions on a CSV and report the results.

    Prints the fitted equations, the step-by-step prediction arithmetic for
    the first ``--n_manual`` rows, and the MSE of each model over the whole
    dataset, then writes the same summary to the ``--save`` file.

    Args:
        argv: Optional list of command-line arguments. When ``None``, an
            empty list is used inside a Jupyter/Colab kernel (whose own
            ``sys.argv`` entries would confuse argparse) and
            ``sys.argv[1:]`` is used otherwise.

    Exits with status 1 when the CSV is missing, a requested feature column
    is not found, the target is also listed as a feature, or no rows remain
    after dropping NaNs.
    """
    p = argparse.ArgumentParser(description="Linear / Ridge / Lasso regression demo (Boston housing)")
    p.add_argument("--csv", default="BostonHousing.csv", help="Path to CSV file (default BostonHousing.csv)")
    p.add_argument("--feature", default="rm", help="Feature column name (default rm). For multiple features, provide comma-separated names.")
    p.add_argument("--target", default="medv", help="Target column name (default medv)")
    p.add_argument("--n_manual", type=int, default=10, help="Number of rows to show manual arithmetic for (default 10)")
    p.add_argument("--alpha", type=float, default=1.0, help="Regularization alpha for Ridge & Lasso (default 1.0)")
    p.add_argument("--save", default="results_summary.txt", help="Summary output file (default results_summary.txt)")
    if argv is None:
        # Inside a notebook kernel sys.argv holds kernel-internal arguments,
        # so fall back to the defaults there; honour the real CLI elsewhere.
        argv = [] if "ipykernel" in sys.modules else sys.argv[1:]
    args = p.parse_args(argv)

    # Load CSV from the path the user asked for (also the one reported on error).
    try:
        df = pd.read_csv(args.csv)
    except FileNotFoundError:
        print(f"ERROR: CSV file not found at '{args.csv}'. Provide correct path with --csv")
        sys.exit(1)

    # Determine feature(s) and target column names (case-insensitive)
    feature_names_requested = [f.strip() for f in args.feature.split(",") if f.strip()]
    if not feature_names_requested:
        print("ERROR: no feature column given. Provide at least one name with --feature")
        sys.exit(1)

    feature_names = []
    for f in feature_names_requested:
        detected = detect_column(df.columns, f)
        if detected is None:
            print(f"ERROR: feature column '{f}' not found in CSV columns: {list(df.columns)}")
            sys.exit(1)
        feature_names.append(detected)

    target_name = detect_column(df.columns, args.target)
    if target_name is None:
        # fallback to last column
        target_name = df.columns[-1]
        print(f"WARNING: target '{args.target}' not found; using last column '{target_name}' as target.")

    if target_name in feature_names:
        # Selecting the same column twice would make data[target_name] a
        # DataFrame and the regression meaningless.
        print(f"ERROR: target column '{target_name}' cannot also be used as a feature.")
        sys.exit(1)

    # Prepare data
    data = df[feature_names + [target_name]].dropna().copy()
    if data.shape[0] == 0:
        print("No rows available after dropping NaNs. Exiting.")
        sys.exit(1)

    X = data[feature_names].values
    y = data[target_name].values

    # Fit models
    lin, ridge, lasso = fit_models(X, y, ridge_alpha=args.alpha, lasso_alpha=args.alpha)

    # Equations
    lin_eq, lin_inter, lin_coefs = model_eq_text(lin, feature_names)
    ridge_eq, ridge_inter, ridge_coefs = model_eq_text(ridge, feature_names)
    lasso_eq, lasso_inter, lasso_coefs = model_eq_text(lasso, feature_names)

    # Print header
    print("\n--- Regression equations ---\n")
    print("Linear Regression :")
    print(lin_eq)
    print("\nRidge Regression (alpha={:.6f}):".format(args.alpha))
    print(ridge_eq)
    print("\nLasso Regression (alpha={:.6f}):".format(args.alpha))
    print(lasso_eq)

    # Manual predictions (first n_manual rows); a negative request means none.
    n_manual = max(0, min(args.n_manual, len(data)))

    print(f"\n--- Manual prediction arithmetic for first {n_manual} rows ---\n")
    lines_out = [
        "Regression Equations:",
        "Linear: " + lin_eq,
        "Ridge : " + ridge_eq,
        "Lasso : " + lasso_eq,
        "",
        f"Manual predictions for first {n_manual} rows (feature(s): {', '.join(feature_names)}; target: {target_name})",
        "",
    ]

    # (label, intercept, coefficients) for each model, in display order.
    fitted = [
        ("Linear", lin_inter, lin_coefs),
        ("Ridge", ridge_inter, ridge_coefs),
        ("Lasso", lasso_inter, lasso_coefs),
    ]

    # X and y are row-aligned with `data`, so their first n_manual entries
    # are exactly the first n_manual rows.
    for row_no, (x_row, y_val) in enumerate(zip(X[:n_manual], y[:n_manual]), start=1):
        row_vals = [float(v) for v in x_row]
        actual = float(y_val)
        print(f"Row {row_no}:")
        vals_str = ", ".join(f"{fname}={val:.6f}" for fname, val in zip(feature_names, row_vals))
        print("  " + vals_str)
        lines_out.append(f"Row {row_no}: {vals_str}  | actual {target_name} = {actual:.6f}")

        sq_err = {}
        for label, inter, coefs in fitted:
            arith, pred = manual_prediction_row(inter, coefs, feature_names, row_vals)
            # Labels are padded to 6 characters so the colons line up.
            print(f"  {label:<6}: {arith} = {pred:.6f}  (actual {actual:.6f})")
            lines_out.append(f"  {label} prediction: {arith} = {pred:.6f}  (actual {actual:.6f})")
            sq_err[label] = (actual - pred) ** 2

        # squared errors (optional inclusion); the trailing newline leaves a
        # blank line between rows in both the console and the saved file.
        se_text = (
            f"    Squared errors: Linear={sq_err['Linear']:.6f}, "
            f"Ridge={sq_err['Ridge']:.6f}, Lasso={sq_err['Lasso']:.6f}\n"
        )
        print(se_text)
        lines_out.append(se_text)

    # Compute MSE on the entire dataset
    mse_lin = mean_squared_error(y, lin.predict(X))
    mse_ridge = mean_squared_error(y, ridge.predict(X))
    mse_lasso = mean_squared_error(y, lasso.predict(X))

    print("\n--- Mean Squared Error (MSE) on entire dataset ---\n")
    print(f"Linear MSE : {mse_lin:.6f}")
    print(f"Ridge  MSE : {mse_ridge:.6f}  (alpha={args.alpha})")
    print(f"Lasso  MSE : {mse_lasso:.6f}  (alpha={args.alpha})")

    lines_out.append("MSE (entire dataset):")
    lines_out.append(f"Linear: {mse_lin:.6f}")
    lines_out.append(f"Ridge : {mse_ridge:.6f} (alpha={args.alpha})")
    lines_out.append(f"Lasso : {mse_lasso:.6f} (alpha={args.alpha})")

    # Best model by MSE
    mse_dict = {"Linear": mse_lin, "Ridge": mse_ridge, "Lasso": mse_lasso}
    best = min(mse_dict, key=mse_dict.get)
    print(f"\nBest model by MSE: {best} (MSE = {mse_dict[best]:.6f})")
    lines_out.extend([
        "",
        f"Best model by MSE: {best} (MSE = {mse_dict[best]:.6f})",
        "",
        "Notes for observation book:",
        " - Record the regression equations (intercept and coefficients).",
        " - For each sample row, write the arithmetic: ŷ = intercept + sum(coef_i * feature_i).",
        " - Show one or two squared-error calculations then compute MSE using the formula:",
        "     MSE = (1/n) * sum_{i=1..n} (y_i - ŷ_i)^2",
        " - Conclude which model is best by lowest MSE. You may vary alpha and repeat to observe changes.",
    ])

    # Save summary to file for upload/printing
    save_summary(args.save, lines_out)

# Run the demo only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()


--- Regression equations ---

Linear Regression :
ŷ = -34.684058 + (9.109202*rm)

Ridge Regression (alpha=1.000000):
ŷ = -34.455009 + (9.072755*rm)

Lasso Regression (alpha=1.000000):
ŷ = -22.035919 + (7.096559*rm)

--- Manual prediction arithmetic for first 10 rows ---

Row 1:
  rm=6.575000
  Linear: -34.684058 + (9.109202 * 6.575000 = 59.893004) = 25.208946  (actual 24.000000)
  Ridge : -34.455009 + (9.072755 * 6.575000 = 59.653361) = 25.198352  (actual 24.000000)
  Lasso : -22.035919 + (7.096559 * 6.575000 = 46.659873) = 24.623954  (actual 24.000000)
    Squared errors: Linear=1.461551, Ridge=1.436048, Lasso=0.389318

Row 2:
  rm=6.421000
  Linear: -34.684058 + (9.109202 * 6.421000 = 58.490187) = 23.806129  (actual 21.600000)
  Ridge : -34.455009 + (9.072755 * 6.421000 = 58.256157) = 23.801148  (actual 21.600000)
  Lasso : -22.035919 + (7.096559 * 6.421000 = 45.567003) = 23.531084  (actual 21.600000)
    Squared errors: Linear=4.867005, Ridge=4.845053, Lasso=3.729085

Row 3:
  r