## Neutral Network Model for CA Housing Dataset

#### 1. Prepare Data
Model specific pre-processing, including creating datasets and scaling data for neural network.

In [109]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Import dataset
dataset = pd.read_csv('datasets/preprocessed_dataset.csv')

# Step 1: First split (Train: 80%, Test: 20%)
train_set, test_set = train_test_split(dataset, test_size=0.2, random_state=42)

# Check sizes
print(f"Train size: {len(train_set)}")
print(f"Test size: {len(test_set)}")

# Select the columns to scale (last two columns)
cols_to_scale = ['longitude', 'latitude', 'median_income', 'median_house_value']

# Initialize scaler and fit on training data only
scaler = StandardScaler()
train_scaled_values = scaler.fit_transform(train_set[cols_to_scale])
test_scaled_values = scaler.transform(test_set[cols_to_scale])

# Create new column names
scaled_col_names = [f"{col}_scaled" for col in cols_to_scale]

# Add scaled columns and drop originals
for df, scaled_data in zip([train_set, test_set], [train_scaled_values, test_scaled_values]):
    for orig_col, scaled_col in zip(cols_to_scale, scaled_col_names):
        df[scaled_col] = scaled_data[:, scaled_col_names.index(scaled_col)]

print(train_set.head())

# Select features for model
feature_cols = ['longitude_scaled', 'latitude_scaled', 'ocean_proximity_encoded', 'median_income_scaled',]
target_col = 'median_house_value_scaled'

#feature_cols = ['longitude_scaled', 'latitude_scaled']
#target_col = 'ocean_proximity_encoded'

# Extract features and targets and convert to numpy
X_train = train_set[feature_cols].to_numpy()
y_train = train_set[target_col].to_numpy()
X_test = test_set[feature_cols].to_numpy()
y_test = test_set[target_col].to_numpy()

Train size: 14703
Test size: 3676
       longitude  latitude  median_income  median_house_value ocean_proximity  \
1310     -122.10     37.89         5.2079            310300.0        NEAR BAY   
16156    -122.10     37.40         4.3077            293500.0        NEAR BAY   
10301    -118.07     33.80         7.1221            384500.0       <1H OCEAN   
8823     -121.07     39.09         2.8864            143100.0          INLAND   
10428    -120.07     39.24         4.9620            169500.0          INLAND   

       ocean_proximity_encoded  longitude_scaled  latitude_scaled  \
1310                         2         -1.318283         1.069327   
16156                        2         -1.318283         0.840931   
10301                        3          0.716643        -0.837078   
8823                         4         -0.798191         1.628663   
10428                        4         -0.293246         1.698580   

       median_income_scaled  median_house_value_scaled  
1310   

#### Define Model Parameters
1. Define **Hyperparameter** Set for **Grid Search**
2. Define model --> use grid search to **optimize/train model**
3. **Test model** and report results


In [122]:
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_percentage_error

# Define hyperparameters to search over
param_grid = {
    'hidden_layer_sizes': [ # Many iterations were run to find a good set of layer sizes to test
        (16, 32, 32, 16),   # (Stoped once improvments became marginal)
        (16,32,64,32,16), 
        (16,64,128,64,16), 
        (32,64,128,64,32)
    ], 
    'activation': ['relu', 'tanh'], # Tested different Activation Functions
    'alpha': [0.0001, 0.001],       # Regularizarion values to test
    'solver': ['adam'],             # Always use adam optimization model
    'max_iter': [1000],             # Alwayys use 1000 iterations
    'early_stopping': [True],       # Use validation set to benchmark model performance and limit overfitting
    'n_iter_no_change' : [50],      # High Validation Patience to overcome it stopping too early
    'validation_fraction': [0.2]    # 20% of training data used as validation during training
}

# Define base model
mlp = MLPRegressor(random_state=42,)

# Perform Grid Search with 3-fold cross-validation
grid_search = GridSearchCV(
    mlp,
    param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=2
)

# Fit only on the training set
grid_search.fit(X_train, y_train)

from sklearn.metrics import accuracy_score

# Evaluate best model on the validation set
best_model = grid_search.best_estimator_
test_preds = best_model.predict(X_test)
test_mse = mean_squared_error(y_test, test_preds)
test_mape = mean_absolute_percentage_error(y_test, test_preds)
test_r2 = r2_score(y_test, test_preds)

print("Best parameters found:", grid_search.best_params_)
print(f"Test MSE with best model: {test_mse:.4f}")
print(f"Test MAPE with best model : {test_mape:.4f}")
print(f"Test R² score: {test_r2:.4f}")

Fitting 3 folds for each of 16 candidates, totalling 48 fits
Best parameters found: {'activation': 'tanh', 'alpha': 0.001, 'early_stopping': True, 'hidden_layer_sizes': (16, 64, 128, 64, 16), 'max_iter': 1000, 'n_iter_no_change': 50, 'solver': 'adam', 'validation_fraction': 0.2}
Test MSE with best model: 0.2387
Test MAPE with best model : 1.6932
Test R² score: 0.7573
