In [2]:
# Cell 1 — Imports
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import matplotlib.pyplot as plt



In [3]:
# Cell 2 — Load manufacturing dataset
# The path is resolved relative to the current working directory. If the file
# was downloaded into the sandbox instead, point csv_path at
# "/mnt/data/manufacturing_yield_regression.csv".
csv_path = "manufacturing_yield_regression.csv"
df = pd.read_csv(filepath_or_buffer=csv_path)

# Preview the first rows to confirm the file parsed as expected.
df.head()


Unnamed: 0,temp,pressure,time,flow,defects,tool_age,yield
0,463.7,109.0,40.6,48.2,0.474,172,91.6
1,447.5,108.8,43.1,48.4,1.493,145,89.86
2,469.0,107.2,34.8,47.6,0.974,77,84.25
3,471.3,111.7,39.0,46.5,0.763,75,86.29
4,436.6,110.9,42.9,46.4,0.893,207,79.63


In [4]:
# Cell 3 — Features/target split
target = "yield"          # adjust if your column name differs

# Response vector: the single target column.
y = df[target]
# Design matrix: every column except the target.
X = df.drop(columns=target)

# Sanity-check the dimensions before modelling.
print("X shape:", X.shape)
print("y shape:", y.shape)


X shape: (5000, 6)
y shape: (5000,)


In [5]:
# Cell 4 — Train/test split
# Hold out 25% of the rows for testing; the fixed random_state keeps the
# partition identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.25)


In [6]:
# Cell 5 — Helper: evaluation
def eval_regression(name, model, X_tr, y_tr, X_te, y_te):
    """Fit ``model`` on the training split and score it on both splits.

    Parameters
    ----------
    name : label stored under the ``"Model"`` key of the returned metrics.
    model : object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted
        in place on ``(X_tr, y_tr)``.
    X_tr, y_tr : training features and targets.
    X_te, y_te : held-out features and targets.

    Returns
    -------
    (metrics, test_predictions)
        ``metrics`` is a dict holding the model label plus RMSE, MAE and R2
        for the train and test splits (all as Python floats);
        ``test_predictions`` is whatever ``model.predict(X_te)`` returned.
    """

    def _rmse(actual, predicted):
        # RMSE is the square root of the mean squared error.
        return float(np.sqrt(mean_squared_error(actual, predicted)))

    model.fit(X_tr, y_tr)
    fitted_train = model.predict(X_tr)
    fitted_test = model.predict(X_te)

    # Keys are inserted in the order the comparison table displays them.
    report = {"Model": name}
    report["Train_RMSE"] = _rmse(y_tr, fitted_train)
    report["Test_RMSE"] = _rmse(y_te, fitted_test)
    report["Train_MAE"] = float(mean_absolute_error(y_tr, fitted_train))
    report["Test_MAE"] = float(mean_absolute_error(y_te, fitted_test))
    report["Train_R2"] = float(r2_score(y_tr, fitted_train))
    report["Test_R2"] = float(r2_score(y_te, fitted_test))
    return report, fitted_test


In [14]:
# Cell 6 — Baseline: Multiple Linear Regression (OLS)
# Standardize the features, then fit an unregularized least-squares model.
ols = Pipeline(steps=[("scaler", StandardScaler()), ("model", LinearRegression())])

ols_metrics, ols_pred = eval_regression(
    name="OLS (LinearRegression)",
    model=ols,
    X_tr=X_train,
    y_tr=y_train,
    X_te=X_test,
    y_te=y_test,
)
ols_metrics


{'Model': 'OLS (LinearRegression)',
 'Train_RMSE': 7.144330190476522,
 'Test_RMSE': 7.73896756253621,
 'Train_MAE': 5.033012532286505,
 'Test_MAE': 5.261869413611465,
 'Train_R2': 0.264583245768119,
 'Test_R2': 0.28186777180756606}

In [15]:
# Cell 7 — Ridge (L2): use scaling + cross-validated alpha
# Candidate penalties: 50 log-spaced values from 1e-4 to 1e4.
ridge_alphas = np.logspace(start=-4, stop=4, num=50)

# RidgeCV picks the best candidate with 5-fold cross-validation on the
# standardized training features.
ridge = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("model", RidgeCV(alphas=ridge_alphas, cv=5)),
])

ridge_metrics, ridge_pred = eval_regression(
    name="RidgeCV (L2)",
    model=ridge,
    X_tr=X_train,
    y_tr=y_train,
    X_te=X_test,
    y_te=y_test,
)
ridge_metrics


{'Model': 'RidgeCV (L2)',
 'Train_RMSE': 7.144563542246639,
 'Test_RMSE': 7.743237618819566,
 'Train_MAE': 5.033567341147524,
 'Test_MAE': 5.263068713695368,
 'Train_R2': 0.26453520386652196,
 'Test_R2': 0.281075079246042}

In [16]:
# Cell 8 — Lasso (L1): use scaling + cross-validated alpha
# max_iter is raised to 20000 to avoid convergence issues.
# NOTE(review): alphas=100 is passed as an integer (a count of candidate
# penalties rather than an explicit grid); presumably this needs a recent
# scikit-learn release, so verify against the installed version.
lasso = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("model", LassoCV(alphas=100, cv=5, random_state=42, max_iter=20000)),
])

lasso_metrics, lasso_pred = eval_regression(
    name="LassoCV (L1)",
    model=lasso,
    X_tr=X_train,
    y_tr=y_train,
    X_te=X_test,
    y_te=y_test,
)
lasso_metrics


{'Model': 'LassoCV (L1)',
 'Train_RMSE': 7.146240607577709,
 'Test_RMSE': 7.742873203555163,
 'Train_MAE': 5.034412643423465,
 'Test_MAE': 5.260047108296675,
 'Train_R2': 0.264189887524369,
 'Test_R2': 0.2811427463029261}

In [17]:
# Cell 9 — Compare metrics side-by-side
# One row per model, built from the metric dicts returned by eval_regression.
results = pd.DataFrame([ols_metrics, ridge_metrics, lasso_metrics])

# Pretty formatting: round each metric to 4 decimals with the built-in round(),
# applied value by value (keeps the original choice of not calling .round()).
metric_columns = (
    "Train_RMSE", "Test_RMSE",
    "Train_MAE", "Test_MAE",
    "Train_R2", "Test_R2",
)
for col in metric_columns:
    results[col] = results[col].map(lambda value: round(value, 4))

results



Unnamed: 0,Model,Train_RMSE,Test_RMSE,Train_MAE,Test_MAE,Train_R2,Test_R2
0,OLS (LinearRegression),7.1443,7.739,5.033,5.2619,0.2646,0.2819
1,RidgeCV (L2),7.1446,7.7432,5.0336,5.2631,0.2645,0.2811
2,LassoCV (L1),7.1462,7.7429,5.0344,5.26,0.2642,0.2811


In [18]:
# Cell 10 — Report selected alphas (hyperparameters)
# Read the penalty stored in `alpha_` on the "model" step of each fitted pipeline.
ridge_alpha, lasso_alpha = (
    pipe.named_steps["model"].alpha_ for pipe in (ridge, lasso)
)

print("Ridge selected alpha:", ridge_alpha)
print("Lasso selected alpha:", lasso_alpha)


Ridge selected alpha: 51.79474679231202
Lasso selected alpha: 0.07415581516572292


In [20]:
# Cell 11 — Coefficients comparison (scaled-model coefficients)
# All three pipelines (OLS, Ridge, Lasso) begin with a StandardScaler, so every
# coefficient below is w.r.t. standardized features and the three columns are
# directly comparable. The OLS column keeps its "OLS_coef" label so that any
# code referring to that column name keeps working.


coef_df = pd.DataFrame({
    "feature": X.columns,
    "OLS_coef": ols.named_steps["model"].coef_,
    "Ridge_coef (stdX)": ridge.named_steps["model"].coef_,
    "Lasso_coef (stdX)": lasso.named_steps["model"].coef_,
})

# Rank features by |Lasso coefficient|, largest first. Sorting through `key`
# orders rows by the absolute values without adding and then dropping a
# temporary helper column.
coef_df = coef_df.sort_values(
    "Lasso_coef (stdX)",
    key=lambda coefs: coefs.abs(),
    ascending=False,
)

coef_df.head(15)


Unnamed: 0,feature,OLS_coef,Ridge_coef (stdX),Lasso_coef (stdX)
0,temp,-3.677488,-3.627495,-3.603702
4,defects,-1.57779,-1.556756,-1.506202
5,tool_age,-0.994145,-0.980654,-0.921511
2,time,0.780857,0.770973,0.708008
3,flow,0.734273,0.725415,0.665098
1,pressure,-0.024965,-0.023331,-0.0
