# Importing necessary libraries

In [135]:
import copy, math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import Lasso, Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt

# Loading the data

In [27]:
# Read the housing dataset from a CSV file given by a relative path
# (resolved against the current working directory).
housing_data = pd.read_csv('California_Houses.csv')

# Information about the dataset


In [29]:
# Quick structural overview of the raw dataset: shape, sample rows,
# column summary and descriptive statistics.
print("Dataset Shape:", housing_data.shape)
print("\nFirst few rows of the dataset:")
print(housing_data.head())
print("\nColumn information:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line after the report.
housing_data.info()
print("\nSummary statistics:")
print(housing_data.describe())

Dataset Shape: (20640, 14)

First few rows of the dataset:
   Median_House_Value  Median_Income  Median_Age  Tot_Rooms  Tot_Bedrooms  \
0            452600.0         8.3252          41        880           129   
1            358500.0         8.3014          21       7099          1106   
2            352100.0         7.2574          52       1467           190   
3            341300.0         5.6431          52       1274           235   
4            342200.0         3.8462          52       1627           280   

   Population  Households  Latitude  Longitude  Distance_to_coast  \
0         322         126     37.88    -122.23        9263.040773   
1        2401        1138     37.86    -122.22       10225.733072   
2         496         177     37.85    -122.24        8259.085109   
3         558         219     37.85    -122.25        7768.086571   
4         565         259     37.85    -122.25        7768.086571   

   Distance_to_LA  Distance_to_SanDiego  Distance_to_SanJose  \

# Check for missing values

In [59]:
# Count the missing entries in each column.
print(housing_data.isnull().sum())

Median_House_Value          0
Median_Income               0
Median_Age                  0
Tot_Rooms                   0
Tot_Bedrooms                0
Population                  0
Households                  0
Latitude                    0
Longitude                   0
Distance_to_coast           0
Distance_to_LA              0
Distance_to_SanDiego        0
Distance_to_SanJose         0
Distance_to_SanFrancisco    0
dtype: int64


# Split the data into features and target


In [35]:
# Target: the column the models are trained to predict.
y_housing = housing_data['Median_House_Value']
# Features: every column except the target.
X_housing = housing_data.drop(columns='Median_House_Value')

# Split the data into training (70%), validation (15%), and testing (15%)

In [41]:
from sklearn.model_selection import train_test_split

# Stage 1: hold out 30% of the rows; the remaining 70% is the training set.
X_train_h, X_temp_h, y_train_h, y_temp_h = train_test_split(
    X_housing,
    y_housing,
    test_size=0.3,
    random_state=42,
)
# Stage 2: halve the held-out rows into validation and test sets
# (15% of the full dataset each).
X_val_h, X_test_h, y_val_h, y_test_h = train_test_split(
    X_temp_h,
    y_temp_h,
    test_size=0.5,
    random_state=42,
)

# Data Split Sizes

In [46]:
# Report the number of rows that ended up in each split.
print(f"Training set: {X_train_h.shape[0]} samples")
print(f"Validation set: {X_val_h.shape[0]} samples")
print(f"Testing set: {X_test_h.shape[0]} samples")

Training set: 14448 samples
Validation set: 3096 samples
Testing set: 3096 samples



### z-score normalization 
After z-score normalization, all features will have a mean of 0 and a standard deviation of 1.

All values are adjusted as shown in this formula:
$$x^{(i)}_j = \dfrac{x^{(i)}_j - \mu_j}{\sigma_j} $$ 
where $j$ selects a feature (a column) in the $\mathbf{X}$ matrix, $\mu_j$ is the mean of all the values for feature $j$, and $\sigma_j$ is the standard deviation of feature $j$.
$$
\begin{align}
\mu_j &= \frac{1}{m} \sum_{i=0}^{m-1} x^{(i)}_j \\
\sigma^2_j &= \frac{1}{m} \sum_{i=0}^{m-1} (x^{(i)}_j - \mu_j)^2  
\end{align}
$$

In [49]:
def zscore_normalize_features(X):
    """
    computes  X, z-score normalized by column

    Args:
      X (ndarray (m,n))     : input data, m examples, n features

    Returns:
      X_norm (ndarray (m,n)): input normalized by column
      mu (ndarray (n,))     : mean of each feature
      sigma (ndarray (n,))  : scale applied to each feature: the population
                              standard deviation (np.std default, ddof=0),
                              with exact zeros replaced by 1

    A constant feature has a standard deviation of 0; dividing by it would
    fill the column with NaN (0/0) here, and with NaN/inf when the returned
    sigma is reused on other data. Such columns are scaled by 1 instead, so
    they normalize to 0 and the returned sigma is always safe to divide by.
    """
    # find the mean of each column/feature
    mu = np.mean(X, axis=0)                     # mu will have shape (n,)
    # find the standard deviation of each column/feature
    sigma = np.std(X, axis=0)                   # sigma will have shape (n,)
    # Replace exact zeros with 1. Adding the boolean mask (True counts as 1)
    # keeps sigma's container type, whether numpy array or pandas Series.
    # NOTE: only exact zeros are caught; a near-constant column whose std is
    # tiny but non-zero is still divided by that tiny value.
    sigma = sigma + (sigma == 0)
    # element-wise, subtract mu for that column from each example, divide by the scale for that column
    X_norm = (X - mu) / sigma

    return (X_norm, mu, sigma)
 
#check our work
#from sklearn.preprocessing import scale
#scale(X_orig, axis=0, with_mean=True, with_std=True, copy=True)

In [51]:
# Fit the normalization statistics on the training set only and normalize it.
X_train_h_scaled, mu, sigma = zscore_normalize_features(X_train_h)

# Apply the same transformation to validation and test sets using training data statistics,
# so nothing computed from the held-out splits enters the preprocessing.
X_val_h_scaled = (X_val_h - mu) / sigma
X_test_h_scaled = (X_test_h - mu) / sigma

# Applying Linear Regression
## Define gradient descent function for linear regression

In [114]:
def compute_cost(X, y, w, b):
    """
    Squared-error cost of the linear model X·w + b against targets y.

    Returns the sum of squared residuals divided by 2*m, where m is the
    number of rows in X (i.e. half the mean squared error).
    """
    residuals = (np.dot(X, w) + b) - y
    n_examples = X.shape[0]
    return np.sum(residuals ** 2) / (2 * n_examples)

In [116]:
def compute_gradient(X, y, w, b):
    """
    Gradient of the squared-error cost with respect to w and b.

    Returns (dj_dw, dj_db): the residual-weighted column sums of X divided
    by m, and the mean residual, where m is the number of rows in X.
    """
    n_examples = X.shape[0]
    residuals = (np.dot(X, w) + b) - y
    grad_w = np.dot(X.T, residuals) / n_examples
    grad_b = np.sum(residuals) / n_examples
    return grad_w, grad_b

In [118]:
def gradient_descent(X, y, w, b, alpha, num_iters):
    """
    Perform batch gradient descent to optimize w and b.

    Args:
      X         : feature matrix, one row per training example
      y         : target values, one per training example
      w         : initial weights (left unmodified; updates build new arrays)
      b         : initial bias
      alpha     : learning rate
      num_iters : number of gradient steps to take

    Returns:
      w            : weights after the final step
      b            : bias after the final step
      cost_history : list with the cost after every step (length num_iters)
    """
    cost_history = []
    # Log roughly ten times over the run. The max() guard keeps the interval
    # at >= 1 so runs with fewer than 10 iterations do not divide by zero.
    log_every = max(1, num_iters // 10)
    for i in range(num_iters):
        dj_dw, dj_db = compute_gradient(X, y, w, b)
        # Rebind instead of using `-=`: an in-place update on a numpy array
        # would overwrite the caller's initial weight array.
        w = w - alpha * dj_dw
        b = b - alpha * dj_db
        cost = compute_cost(X, y, w, b)
        cost_history.append(cost)

        if i % log_every == 0:
            print(f"Iteration {i}: Cost {cost:.4f}")

    return w, b, cost_history

In [121]:
def predict(X, w, b):
    """
    Evaluate the linear model: the dot product of X with w, plus the bias b.
    """
    linear_term = np.dot(X, w)
    return linear_term + b

In [129]:
# Initialize model parameters
w_init = np.zeros(X_train_h_scaled.shape[1])
b_init = 0.0
alpha = 0.01  # Learning rate
iterations = 1000  # Number of iterations

# Train the model
w_final, b_final, cost_history = gradient_descent(X_train_h_scaled, y_train_h, w_init, b_init, alpha, iterations)

# Make predictions on the validation set
y_val_pred = predict(X_val_h_scaled, w_final, b_final)

# Compute performance metrics
mse_val = np.mean((y_val_pred - y_val_h) ** 2)
mae_val = np.mean(np.abs(y_val_pred - y_val_h))
rmse_val = np.sqrt(mse_val)

print("\nValidation Set:")
print(f"Mean Squared Error: {mse_val:.2f}")
print(f"Mean Absolute Error: {mae_val:.2f}")
print(f"Root Mean Squared Error: {rmse_val:.2f}")

# Test the model on the test set
y_test_pred = predict(X_test_h_scaled, w_final, b_final)

# Compute test performance metrics
mse_test = np.mean((y_test_pred - y_test_h) ** 2)
mae_test = np.mean(np.abs(y_test_pred - y_test_h))
rmse_test = np.sqrt(mse_test)

print("\nTest Set:")
print(f"Mean Squared Error: {mse_test:.2f}")
print(f"Mean Absolute Error: {mae_test:.2f}")
print(f"Root Mean Squared Error: {rmse_test:.2f}")

Iteration 0: Cost 27577059994.2481
Iteration 100: Cost 5787675817.6725
Iteration 200: Cost 3016082442.8218
Iteration 300: Cost 2622011229.5397
Iteration 400: Cost 2544073247.7819
Iteration 500: Cost 2513840526.1874
Iteration 600: Cost 2494182868.4871
Iteration 700: Cost 2479040929.6061
Iteration 800: Cost 2466857016.7979
Iteration 900: Cost 2456889302.2467

Validation Set:
Mean Squared Error: 5099618793.24
Mean Absolute Error: 52316.48
Root Mean Squared Error: 71411.62

Test Set:
Mean Squared Error: 4585260876.30
Mean Absolute Error: 50393.83
Root Mean Squared Error: 67714.55


"""
# Applying Lasso Regression (L1 Regularization)

Lasso regression adds L1 regularization to linear regression, which penalizes large coefficients using the sum of their absolute values. This promotes sparsity in the model and can be used for feature selection.

$$ \text{Cost} = \frac{1}{2}\,MSE + \alpha \sum_{j=1}^{n} |w_j| $$

Where:
- MSE is the mean squared error
- $\alpha$ is the regularization strength
- $w_j$ are the feature weights
"""

In [137]:
def train_lasso_models(X_train, y_train, X_val, y_val, alpha_values=None):
    """
    Fit one Lasso model per regularization strength and pick the best one
    by validation MSE.

    Args:
      X_train, y_train : data each model is fitted on
      X_val, y_val     : data each model is scored on
      alpha_values     : iterable of Lasso `alpha` values to try; defaults to
                         [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]

    Returns:
      best_alpha    : the alpha with the lowest validation MSE (on ties the
                      earliest value in alpha_values wins)
      lasso_results : list of dicts, one per alpha, with keys 'alpha',
                      'validation_mse', 'validation_mae', 'validation_rmse'

    Raises:
      ValueError: if alpha_values is empty.
    """
    if alpha_values is None:
        alpha_values = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
    alpha_values = list(alpha_values)
    if not alpha_values:
        # Without candidates best_alpha would stay None and break callers.
        raise ValueError("alpha_values must contain at least one value")

    best_alpha = None
    best_val_mse = float('inf')
    lasso_results = []

    for alpha in alpha_values:
        lasso_model = Lasso(alpha=alpha, max_iter=10000)
        lasso_model.fit(X_train, y_train)
        y_val_pred = lasso_model.predict(X_val)

        val_mse = mean_squared_error(y_val, y_val_pred)
        val_mae = mean_absolute_error(y_val, y_val_pred)
        val_rmse = np.sqrt(val_mse)

        lasso_results.append({
            'alpha': alpha,
            'validation_mse': val_mse,
            'validation_mae': val_mae,
            'validation_rmse': val_rmse
        })

        print(f"Lasso (alpha={alpha}):")
        print(f"  Validation MSE: {val_mse:.2f}")
        print(f"  Validation MAE: {val_mae:.2f}")
        print(f"  Validation RMSE: {val_rmse:.2f}")

        # Strict comparison: a later alpha must beat, not tie, the best so far.
        if val_mse < best_val_mse:
            best_val_mse = val_mse
            best_alpha = alpha

    print(f"\nBest Lasso alpha: {best_alpha} with Validation MSE: {best_val_mse:.2f}")
    return best_alpha, lasso_results

In [139]:
# Model selection: pick the Lasso alpha with the lowest validation MSE
best_lasso_alpha, lasso_results = train_lasso_models(
    X_train_h_scaled, y_train_h, X_val_h_scaled, y_val_h
)

# Refit a Lasso model at the selected alpha (the search returns only the
# alpha and the per-alpha metrics, not the fitted models)
best_lasso_model = Lasso(alpha=best_lasso_alpha, max_iter=10000)
best_lasso_model.fit(X_train_h_scaled, y_train_h)

# Validation set: predictions and error metrics
y_val_pred_lasso = best_lasso_model.predict(X_val_h_scaled)
mse_val_lasso = mean_squared_error(y_val_h, y_val_pred_lasso)
mae_val_lasso = mean_absolute_error(y_val_h, y_val_pred_lasso)
rmse_val_lasso = np.sqrt(mse_val_lasso)

# Test set: predictions and error metrics
y_test_pred_lasso = best_lasso_model.predict(X_test_h_scaled)
mse_test_lasso = mean_squared_error(y_test_h, y_test_pred_lasso)
mae_test_lasso = mean_absolute_error(y_test_h, y_test_pred_lasso)
rmse_test_lasso = np.sqrt(mse_test_lasso)

print(f"\nLasso Regression (alpha={best_lasso_alpha}) - Final Results:")
print("\nValidation Set:")
print(f"Mean Squared Error: {mse_val_lasso:.2f}")
print(f"Mean Absolute Error: {mae_val_lasso:.2f}")
print(f"Root Mean Squared Error: {rmse_val_lasso:.2f}")

print("\nTest Set:")
print(f"Mean Squared Error: {mse_test_lasso:.2f}")
print(f"Mean Absolute Error: {mae_test_lasso:.2f}")
print(f"Root Mean Squared Error: {rmse_test_lasso:.2f}")


Lasso (alpha=0.001):
  Validation MSE: 4907212012.46
  Validation MAE: 50790.06
  Validation RMSE: 70051.50
Lasso (alpha=0.01):
  Validation MSE: 4907212148.34
  Validation MAE: 50790.07
  Validation RMSE: 70051.50
Lasso (alpha=0.1):
  Validation MSE: 4907213520.76
  Validation MAE: 50790.13
  Validation RMSE: 70051.51
Lasso (alpha=1.0):
  Validation MSE: 4907228606.53
  Validation MAE: 50790.78
  Validation RMSE: 70051.61
Lasso (alpha=10.0):
  Validation MSE: 4907515542.92
  Validation MAE: 50797.43
  Validation RMSE: 70053.66
Lasso (alpha=100.0):
  Validation MSE: 4923933009.24
  Validation MAE: 50975.80
  Validation RMSE: 70170.74

Best Lasso alpha: 0.001 with Validation MSE: 4907212012.46

Lasso Regression (alpha=0.001) - Final Results:

Validation Set:
Mean Squared Error: 4907212012.46
Mean Absolute Error: 50790.06
Root Mean Squared Error: 70051.50

Test Set:
Mean Squared Error: 4400953043.36
Mean Absolute Error: 48782.03
Root Mean Squared Error: 66339.68


In [None]:
def train_ridge_models(X_train, y_train, X_val, y_val, alpha_values=None):
    """
    Fit one Ridge model per regularization strength and pick the best one
    by validation MSE.

    Args:
      X_train, y_train : data each model is fitted on
      X_val, y_val     : data each model is scored on
      alpha_values     : iterable of Ridge `alpha` values to try; defaults to
                         [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]

    Returns:
      best_alpha    : the alpha with the lowest validation MSE (on ties the
                      earliest value in alpha_values wins)
      ridge_results : list of dicts, one per alpha, with keys 'alpha',
                      'validation_mse', 'validation_mae', 'validation_rmse'

    Raises:
      ValueError: if alpha_values is empty.
    """
    if alpha_values is None:
        alpha_values = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
    alpha_values = list(alpha_values)
    if not alpha_values:
        # Without candidates best_alpha would stay None and break callers.
        raise ValueError("alpha_values must contain at least one value")

    best_alpha = None
    best_val_mse = float('inf')
    ridge_results = []

    for alpha in alpha_values:
        ridge_model = Ridge(alpha=alpha, max_iter=10000)
        ridge_model.fit(X_train, y_train)
        y_val_pred = ridge_model.predict(X_val)

        val_mse = mean_squared_error(y_val, y_val_pred)
        val_mae = mean_absolute_error(y_val, y_val_pred)
        val_rmse = np.sqrt(val_mse)

        ridge_results.append({
            'alpha': alpha,
            'validation_mse': val_mse,
            'validation_mae': val_mae,
            'validation_rmse': val_rmse
        })

        print(f"Ridge (alpha={alpha}):")
        print(f"  Validation MSE: {val_mse:.2f}")
        print(f"  Validation MAE: {val_mae:.2f}")
        print(f"  Validation RMSE: {val_rmse:.2f}")

        # Strict comparison: a later alpha must beat, not tie, the best so far.
        if val_mse < best_val_mse:
            best_val_mse = val_mse
            best_alpha = alpha

    print(f"\nBest Ridge alpha: {best_alpha} with Validation MSE: {best_val_mse:.2f}")
    return best_alpha, ridge_results

# Model selection: pick the Ridge alpha with the lowest validation MSE
best_ridge_alpha, ridge_results = train_ridge_models(
    X_train_h_scaled, y_train_h, X_val_h_scaled, y_val_h
)

# Refit a Ridge model at the selected alpha (the search returns only the
# alpha and the per-alpha metrics, not the fitted models)
best_ridge_model = Ridge(alpha=best_ridge_alpha, max_iter=10000)
best_ridge_model.fit(X_train_h_scaled, y_train_h)

# Validation set: predictions and error metrics
y_val_pred_ridge = best_ridge_model.predict(X_val_h_scaled)
mse_val_ridge = mean_squared_error(y_val_h, y_val_pred_ridge)
mae_val_ridge = mean_absolute_error(y_val_h, y_val_pred_ridge)
rmse_val_ridge = np.sqrt(mse_val_ridge)

# Test set: predictions and error metrics
y_test_pred_ridge = best_ridge_model.predict(X_test_h_scaled)
mse_test_ridge = mean_squared_error(y_test_h, y_test_pred_ridge)
mae_test_ridge = mean_absolute_error(y_test_h, y_test_pred_ridge)
rmse_test_ridge = np.sqrt(mse_test_ridge)

print(f"\nRidge Regression (alpha={best_ridge_alpha}) - Final Results:")
print("\nValidation Set:")
print(f"Mean Squared Error: {mse_val_ridge:.2f}")
print(f"Mean Absolute Error: {mae_val_ridge:.2f}")
print(f"Root Mean Squared Error: {rmse_val_ridge:.2f}")

print("\nTest Set:")
print(f"Mean Squared Error: {mse_test_ridge:.2f}")
print(f"Mean Absolute Error: {mae_test_ridge:.2f}")
print(f"Root Mean Squared Error: {rmse_test_ridge:.2f}")