In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import json
import joblib
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings('ignore')

# Create results directory using relative paths
PROJECT_DIR = Path('../')
RESULTS_DIR = PROJECT_DIR / 'results'
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

print("Setup complete!")

Setup complete!


In [None]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("arunjangir245/boston-housing-dataset")

print("Path to dataset files:", path)

KaggleApiHTTPError: 403 Client Error.

You don't have permission to access resource at URL: https://api.kaggle.com/v1/datasets.DatasetApiService/GetDataset. Please make sure you are authenticated if you are trying to access a private resource or a resource requiring consent.

In [None]:
# Load the dataset using the path from kagglehub
df = pd.read_csv(path + '/BostonHousing.csv')

# Handle missing values - fill all numeric columns with median
df = df.fillna(df.median())

# Verify no missing values
print(f"Missing values after cleaning: {df.isnull().sum().sum()}")

# Define features and target
X = df.drop('medv', axis=1)
y = df['medv']

# Train/test split (70/30)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")

# Save split data
X_train.to_csv(RESULTS_DIR / 'X_train.csv', index=False)
X_test.to_csv(RESULTS_DIR / 'X_test.csv', index=False)
y_train.to_csv(RESULTS_DIR / 'y_train.csv', index=False)
y_test.to_csv(RESULTS_DIR / 'y_test.csv', index=False)
print("Data saved to results/")

Missing values after cleaning: 0
Training set: 354 samples
Test set: 152 samples
Data saved to results/


In [None]:
from typing import Any, Dict, Tuple

def evaluate_model(
    model: Any,
    X_train: Any,
    X_test: Any,
    y_train: Any,
    y_test: Any,
    model_name: str
) -> Tuple[Dict[str, Any], Any]:
    """Evaluate model and return metrics"""
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    metrics: Dict[str, Any] = {
        'model_name': model_name,
        'train_mse': mean_squared_error(y_train, y_train_pred),
        'test_mse': mean_squared_error(y_test, y_test_pred),
        'train_rmse': np.sqrt(mean_squared_error(y_train, y_train_pred)),
        'test_rmse': np.sqrt(mean_squared_error(y_test, y_test_pred)),
        'train_mae': mean_absolute_error(y_train, y_train_pred),
        'test_mae': mean_absolute_error(y_test, y_test_pred),
        'train_r2': r2_score(y_train, y_train_pred),
        'test_r2': r2_score(y_test, y_test_pred),
    }
    
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='r2')
    metrics['cv_r2_mean'] = cv_scores.mean()
    metrics['cv_r2_std'] = cv_scores.std()
    
    return metrics, y_test_pred

def print_metrics(metrics: Dict[str, Any]) -> None:
    """Print model metrics"""
    print(f"\n{'='*50}")
    print(f"Model: {metrics['model_name']}")
    print(f"{'='*50}")
    print(f"Train MSE: {metrics['train_mse']:.4f} | Test MSE: {metrics['test_mse']:.4f}")
    print(f"Train RMSE: {metrics['train_rmse']:.4f} | Test RMSE: {metrics['test_rmse']:.4f}")
    print(f"Train MAE: {metrics['train_mae']:.4f} | Test MAE: {metrics['test_mae']:.4f}")
    print(f"Train R¬≤: {metrics['train_r2']:.4f} | Test R¬≤: {metrics['test_r2']:.4f}")
    print(f"CV R¬≤ (mean¬±std): {metrics['cv_r2_mean']:.4f} ¬± {metrics['cv_r2_std']:.4f}")

    # Overfitting/Underfitting analysis
    diff = metrics['train_r2'] - metrics['test_r2']
    if diff > 0.1:
        print(f"‚ö†Ô∏è  Overfitting detected (train-test R¬≤ gap: {diff:.4f})")
    elif metrics['train_r2'] < 0.5 and metrics['test_r2'] < 0.5:
        print(f"‚ö†Ô∏è  Underfitting detected (low R¬≤ on both sets)")
    else:
        print(f"‚úÖ Good fit")

print("Evaluation functions defined!")

Evaluation functions defined!


In [None]:
# 1. UNIVARIATE LINEAR REGRESSION
# Using only 'rm' (rooms) - strongest correlation with target
print("="*60)
print("1. UNIVARIATE LINEAR REGRESSION (rm ‚Üí medv)")
print("="*60)

X_train_uni = X_train[['rm']]
X_test_uni = X_test[['rm']]

lr_uni = LinearRegression()
lr_uni.fit(X_train_uni, y_train)

metrics_uni, pred_uni = evaluate_model(lr_uni, X_train_uni, X_test_uni, y_train, y_test, "Linear Regression (Univariate)")
print_metrics(metrics_uni)

# Save model and predictions
joblib.dump(lr_uni, str(RESULTS_DIR) + '/linear_univariate.joblib')
np.save(str(RESULTS_DIR) + '/pred_linear_univariate.npy', pred_uni)

# Save metrics
with open(str(RESULTS_DIR) + '/metrics_linear_univariate.json', 'w') as f:
    json.dump(metrics_uni, f, indent=2)

print("\n‚úÖ Model and predictions saved!")

1. UNIVARIATE LINEAR REGRESSION (rm ‚Üí medv)

Model: Linear Regression (Univariate)
Train MSE: 44.9427 | Test MSE: 40.3866
Train RMSE: 6.7039 | Test RMSE: 6.3550
Train MAE: 4.5033 | Test MAE: 4.3207
Train R¬≤: 0.4887 | Test R¬≤: 0.4580
CV R¬≤ (mean¬±std): 0.4524 ¬± 0.1773
‚ö†Ô∏è  Underfitting detected (low R¬≤ on both sets)

‚úÖ Model and predictions saved!


In [None]:
# 2. MULTIVARIATE LINEAR REGRESSION
# Using all features
print("="*60)
print("2. MULTIVARIATE LINEAR REGRESSION (all features)")
print("="*60)

lr_multi = LinearRegression()
lr_multi.fit(X_train, y_train)

metrics_multi, pred_multi = evaluate_model(lr_multi, X_train, X_test, y_train, y_test, "Linear Regression (Multivariate)")
print_metrics(metrics_multi)

# Feature importance (coefficients)
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'coefficient': lr_multi.coef_
}).sort_values('coefficient', key=abs, ascending=False)

print("\nTop 5 Most Important Features:")
print(feature_importance.head())

# Save model and predictions
joblib.dump(lr_multi, str(RESULTS_DIR) + '/linear_multivariate.joblib')
np.save(str(RESULTS_DIR) + '/pred_linear_multivariate.npy', pred_multi)

# Save metrics
with open(str(RESULTS_DIR) + '/metrics_linear_multivariate.json', 'w') as f:
    json.dump(metrics_multi, f, indent=2)

print("\n‚úÖ Model and predictions saved!")

2. MULTIVARIATE LINEAR REGRESSION (all features)

Model: Linear Regression (Multivariate)
Train MSE: 22.5704 | Test MSE: 21.6188
Train RMSE: 4.7508 | Test RMSE: 4.6496
Train MAE: 3.3590 | Test MAE: 3.1761
Train R¬≤: 0.7432 | Test R¬≤: 0.7099
CV R¬≤ (mean¬±std): 0.6880 ¬± 0.0923
‚úÖ Good fit

Top 5 Most Important Features:
    feature  coefficient
4       nox   -15.423388
5        rm     4.056626
3      chas     3.121412
7       dis    -1.379212
10  ptratio    -0.912924

‚úÖ Model and predictions saved!


In [None]:
# 3. FEATURE SELECTION - Using top correlated features
print("="*60)
print("3. FEATURE SELECTION - Top Correlated Features")
print("="*60)

# Select top features based on correlation with target
correlations = df.corr()['medv'].drop('medv').abs().sort_values(ascending=False)
top_features = correlations.head(6).index.tolist()
print(f"Selected features: {top_features}")

X_train_fs = X_train[top_features]
X_test_fs = X_test[top_features]

lr_fs = LinearRegression()
lr_fs.fit(X_train_fs, y_train)

metrics_fs, pred_fs = evaluate_model(lr_fs, X_train_fs, X_test_fs, y_train, y_test, "Linear Regression (Feature Selection)")
print_metrics(metrics_fs)

# Save model and predictions
joblib.dump(lr_fs, str(RESULTS_DIR) + '/linear_feature_selection.joblib')
np.save(str(RESULTS_DIR) + '/pred_linear_feature_selection.npy', pred_fs)

with open(str(RESULTS_DIR) + '/metrics_linear_feature_selection.json', 'w') as f:
    json.dump(metrics_fs, f, indent=2)

print("\n‚úÖ Model and predictions saved!")

3. FEATURE SELECTION - Top Correlated Features
Selected features: ['lstat', 'rm', 'ptratio', 'indus', 'tax', 'nox']

Model: Linear Regression (Feature Selection)
Train MSE: 27.4869 | Test MSE: 26.0001
Train RMSE: 5.2428 | Test RMSE: 5.0990
Train MAE: 3.6675 | Test MAE: 3.5702
Train R¬≤: 0.6873 | Test R¬≤: 0.6511
CV R¬≤ (mean¬±std): 0.6512 ¬± 0.0902
‚úÖ Good fit

‚úÖ Model and predictions saved!


In [None]:
# 4. POLYNOMIAL REGRESSION
print("="*60)
print("4. POLYNOMIAL REGRESSION")
print("="*60)

results_poly = {}

for degree in [2, 3]:
    print(f"\n--- Degree {degree} ---")
    
    # Create polynomial features
    poly = PolynomialFeatures(degree=degree, include_bias=False)
    X_train_poly = poly.fit_transform(X_train_uni)
    X_test_poly = poly.transform(X_test_uni)
    
    lr_poly = LinearRegression()
    lr_poly.fit(X_train_poly, y_train)
    
    metrics_poly, pred_poly = evaluate_model(
        lr_poly, X_train_poly, X_test_poly, y_train, y_test, 
        f"Polynomial Regression (degree={degree})"
    )
    print_metrics(metrics_poly)
    
    results_poly[degree] = {
        'model': lr_poly,
        'metrics': metrics_poly,
        'predictions': pred_poly,
        'poly': poly
    }
    
    # Save model
    joblib.dump(lr_poly, f'{RESULTS_DIR}/polynomial_degree{degree}.joblib')
    joblib.dump(poly, f'{RESULTS_DIR}/polynomial_transformer_degree{degree}.joblib')
    np.save(f'{RESULTS_DIR}/pred_polynomial_degree{degree}.npy', pred_poly)
    
    with open(f'{RESULTS_DIR}/metrics_polynomial_degree{degree}.json', 'w') as f:
        json.dump(metrics_poly, f, indent=2)

# Compare degrees
print("\n" + "="*60)
print("POLYNOMIAL DEGREE COMPARISON")
print("="*60)
for degree, data in results_poly.items():
    m = data['metrics']
    print(f"Degree {degree}: Train R¬≤={m['train_r2']:.4f}, Test R¬≤={m['test_r2']:.4f}, CV R¬≤={m['cv_r2_mean']:.4f}")

print("\n‚úÖ Polynomial models saved!")

4. POLYNOMIAL REGRESSION

--- Degree 2 ---

Model: Polynomial Regression (degree=2)
Train MSE: 40.7690 | Test MSE: 32.2518
Train RMSE: 6.3851 | Test RMSE: 5.6791
Train MAE: 4.2943 | Test MAE: 4.0268
Train R¬≤: 0.5362 | Test R¬≤: 0.5672
CV R¬≤ (mean¬±std): 0.4829 ¬± 0.2243
‚úÖ Good fit

--- Degree 3 ---

Model: Polynomial Regression (degree=3)
Train MSE: 39.6337 | Test MSE: 31.1067
Train RMSE: 6.2955 | Test RMSE: 5.5773
Train MAE: 4.2922 | Test MAE: 3.9042
Train R¬≤: 0.5491 | Test R¬≤: 0.5825
CV R¬≤ (mean¬±std): 0.4908 ¬± 0.2045
‚úÖ Good fit

POLYNOMIAL DEGREE COMPARISON
Degree 2: Train R¬≤=0.5362, Test R¬≤=0.5672, CV R¬≤=0.4829
Degree 3: Train R¬≤=0.5491, Test R¬≤=0.5825, CV R¬≤=0.4908

‚úÖ Polynomial models saved!


In [None]:
# 5. GRADIENT DESCENT (SGDRegressor)
print("="*60)
print("5. GRADIENT DESCENT OPTIMIZATION (SGDRegressor)")
print("="*60)

# Scale features for gradient descent
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# SGDRegressor with different configurations
sgd_configs = [
    {'loss': 'squared_error', 'learning_rate': 'constant', 'eta0': 0.01, 'name': 'SGD (constant)'},
    {'loss': 'squared_error', 'learning_rate': 'adaptive', 'eta0': 0.01, 'name': 'SGD (adaptive)'},
]

results_sgd = {}

for config in sgd_configs:
    print(f"\n--- {config['name']} ---")
    
    sgd = SGDRegressor(
        loss=config['loss'],
        learning_rate=config['learning_rate'],
        eta0=config['eta0'],
        max_iter=1000,
        random_state=42,
        early_stopping=True,
        validation_fraction=0.1
    )
    
    sgd.fit(X_train_scaled, y_train)
    
    metrics_sgd, pred_sgd = evaluate_model(
        sgd, X_train_scaled, X_test_scaled, y_train, y_test,
        config['name']
    )
    print_metrics(metrics_sgd)
    
    results_sgd[config['name']] = {
        'model': sgd,
        'metrics': metrics_sgd,
        'predictions': pred_sgd
    }
    
    # Save model and scaler
    safe_name = config['name'].replace(' ', '_').replace('(', '').replace(')', '')
    joblib.dump(sgd, f'{RESULTS_DIR}/{safe_name}.joblib')
    np.save(f'{RESULTS_DIR}/pred_{safe_name}.npy', pred_sgd)
    
    with open(f'{RESULTS_DIR}/metrics_{safe_name}.json', 'w') as f:
        json.dump(metrics_sgd, f, indent=2)

# Save scaler
joblib.dump(scaler, f'{RESULTS_DIR}/scaler.joblib')

print("\n‚úÖ Gradient descent models saved!")

5. GRADIENT DESCENT OPTIMIZATION (SGDRegressor)

--- SGD (constant) ---

Model: SGD (constant)
Train MSE: 23.3203 | Test MSE: 22.7980
Train RMSE: 4.8291 | Test RMSE: 4.7747
Train MAE: 3.4790 | Test MAE: 3.2964
Train R¬≤: 0.7347 | Test R¬≤: 0.6940
CV R¬≤ (mean¬±std): 0.6657 ¬± 0.0882
‚úÖ Good fit

--- SGD (adaptive) ---

Model: SGD (adaptive)
Train MSE: 22.6660 | Test MSE: 21.5913
Train RMSE: 4.7609 | Test RMSE: 4.6466
Train MAE: 3.3544 | Test MAE: 3.1626
Train R¬≤: 0.7421 | Test R¬≤: 0.7102
CV R¬≤ (mean¬±std): 0.6901 ¬± 0.0900
‚úÖ Good fit

‚úÖ Gradient descent models saved!


In [None]:
# 6. CROSS-VALIDATION ANALYSIS
print("="*60)
print("6. CROSS-VALIDATION ANALYSIS (5-Fold)")
print("="*60)

models_to_cv = {
    'Linear (Uni)': (lr_uni, X_train_uni),
    'Linear (Multi)': (lr_multi, X_train),
    'Linear (FS)': (lr_fs, X_train_fs),
}

cv_results = {}

for name, (model, X_data) in models_to_cv.items():
    # R¬≤ cross-validation
    cv_r2 = cross_val_score(model, X_data, y_train, cv=5, scoring='r2')
    
    # Negative MSE cross-validation
    cv_mse = cross_val_score(model, X_data, y_train, cv=5, scoring='neg_mean_squared_error')
    
    cv_results[name] = {
        'r2_mean': cv_r2.mean(),
        'r2_std': cv_r2.std(),
        'mse_mean': -cv_mse.mean(),
        'mse_std': cv_mse.std()
    }
    
    print(f"\n{name}:")
    print(f"  R¬≤: {cv_r2.mean():.4f} ¬± {cv_r2.std():.4f}")
    print(f"  MSE: {-cv_mse.mean():.4f} ¬± {cv_mse.std():.4f}")

# Save CV results
with open(str(RESULTS_DIR) + '/cv_results.json', 'w') as f:
    json.dump(cv_results, f, indent=2)

print("\n‚úÖ Cross-validation results saved!")

6. CROSS-VALIDATION ANALYSIS (5-Fold)

Linear (Uni):
  R¬≤: 0.4524 ¬± 0.1773
  MSE: 46.0120 ¬± 11.4529

Linear (Multi):
  R¬≤: 0.6880 ¬± 0.0923
  MSE: 25.9884 ¬± 4.7246

Linear (FS):
  R¬≤: 0.6512 ¬± 0.0902
  MSE: 29.4805 ¬± 6.5223

‚úÖ Cross-validation results saved!


In [None]:
# 7. FINAL MODEL COMPARISON
print("="*60)
print("7. FINAL MODEL COMPARISON")
print("="*60)

all_metrics = [
    metrics_uni,
    metrics_multi,
    metrics_fs,
    results_poly[2]['metrics'],
    results_poly[3]['metrics'],
]

comparison_df = pd.DataFrame(all_metrics)
comparison_df = comparison_df[['model_name', 'train_r2', 'test_r2', 'train_rmse', 'test_rmse', 'cv_r2_mean', 'cv_r2_std']]
comparison_df = comparison_df.sort_values('test_r2', ascending=False)

print("\nModel Performance Ranking (by Test R¬≤):")
print(comparison_df.to_string(index=False))

# Save comparison
comparison_df.to_csv(str(RESULTS_DIR) + '/model_comparison.csv', index=False)

best_model = comparison_df.iloc[0]['model_name']
best_r2 = comparison_df.iloc[0]['test_r2']

print(f"\nüèÜ Best Model: {best_model}")
print(f"   Test R¬≤: {best_r2:.4f}")

print("\n‚úÖ Comparison saved!")

7. FINAL MODEL COMPARISON

Model Performance Ranking (by Test R¬≤):
                           model_name  train_r2  test_r2  train_rmse  test_rmse  cv_r2_mean  cv_r2_std
     Linear Regression (Multivariate)  0.743216 0.709866    4.750834   4.649599    0.688038   0.092316
Linear Regression (Feature Selection)  0.687281 0.651066    5.242799   5.099033    0.651222   0.090185
     Polynomial Regression (degree=3)  0.549087 0.582533    6.295528   5.577342    0.490782   0.204544
     Polynomial Regression (degree=2)  0.536170 0.567166    6.385063   5.679065    0.482943   0.224297
       Linear Regression (Univariate)  0.488686 0.457993    6.703935   6.355044    0.452441   0.177267

üèÜ Best Model: Linear Regression (Multivariate)
   Test R¬≤: 0.7099

‚úÖ Comparison saved!
