In [None]:
# Step 1: Load and save raw data
# Imports the project helper and calls it with no arguments; the return value
# (if any) is discarded, so this cell is run purely for its side effects.
# NOTE(review): the data source and output location are decided inside
# src.make_dataset and are not visible from this notebook — confirm there.
from src.make_dataset import save_raw_data
save_raw_data()

In [None]:
# Step 2: Preprocess
# The helper is called with no arguments and its result is unpacked into four
# objects, bound here as train/test features (X_*) and train/test targets (y_*).
# These four names are what the later cells train and evaluate on.
# NOTE(review): split ratio, scaling and the container types (ndarray vs
# DataFrame) are determined inside src.preprocess — confirm there.
from src.preprocess import load_and_preprocess_data
X_train, X_test, y_train, y_test = load_and_preprocess_data()

In [None]:
# Step 2b: Corrupt training data (simulate outlier)
# One training sample is turned into an outlier by adding a constant offset to
# all of its features. The shift is deterministic (not random noise), so the
# cell gives the same result on every run.
import numpy as np

OUTLIER_ROW = 0      # positional index of the training sample to corrupt
OUTLIER_SHIFT = 10   # constant added to every feature of that sample

# Work on a copy so X_train itself stays untouched and the cell is re-runnable.
X_train_corrupted = X_train.copy()
if hasattr(X_train_corrupted, "iloc"):
    # pandas container: plain [0] would be a column-label lookup, not the
    # first row, so select the sample by position instead.
    X_train_corrupted.iloc[OUTLIER_ROW] += OUTLIER_SHIFT
else:
    # NumPy array: integer indexing on the first axis already selects a row.
    X_train_corrupted[OUTLIER_ROW] += OUTLIER_SHIFT


In [None]:
# Step 3: Train and compare models on original and corrupted data
# Both calls receive the same targets (y_train); only the feature matrix
# differs, so the two results can be compared model by model.
# NOTE(review): the plotting cell calls .keys()/.values() on these results and
# labels them "MSE (5-fold CV)", so run_models presumably returns a dict of
# model name -> cross-validated MSE — confirm in src.regression_models.
from src.regression_models import run_models
results_original = run_models(X_train, y_train)
results_corrupted = run_models(X_train_corrupted, y_train)

In [None]:
# Bar charts of the per-model results: original vs. corrupted training data.
import matplotlib.pyplot as plt

# sharey=True puts both panels on one scale so bar heights can be compared
# directly across the two panels instead of each panel auto-scaling on its own.
fig, axes = plt.subplots(1, 2, figsize=(12, 5), sharey=True)

# One (results, title) pair per panel, drawn left to right.
panels = [
    (results_original, "Model Comparison (Original Data)"),
    (results_corrupted, "Model Comparison (Corrupted Data)"),
]
for ax, (results, title) in zip(axes, panels):
    # Materialise the dict views so bar() receives plain sequences for the
    # category labels and the bar heights.
    ax.bar(list(results.keys()), list(results.values()))
    ax.set_title(title)
    ax.tick_params(axis="x", labelrotation=45)
    ax.grid(True)

# The y axis is shared, so a single label on the left panel describes both.
axes[0].set_ylabel("MSE (5-fold CV)")

fig.tight_layout()
plt.show()

In [None]:
# Step 4: Fit each candidate on the original training split, then report its
# performance on the test split.
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from src.evaluation import evaluate_model

# Display name -> unfitted estimator, in the order they are evaluated.
models = dict(
    [
        ("Linear Regression", LinearRegression()),
        ("Ridge", Ridge(alpha=1.0)),
        ("LASSO", Lasso(alpha=0.1)),
        ("Elastic Net", ElasticNet(alpha=0.1, l1_ratio=0.5)),
    ]
)

for name in models:
    model = models[name]
    # Uses the uncorrupted training data; the estimator is fitted in place.
    model.fit(X_train, y_train)
    print(f"Evaluating {name} on test set:")
    evaluate_model(model, X_test, y_test, title=name)
