## Neural Network Model for CA Housing Dataset

#### 1. Prepare Data
Model specific pre-processing, including creating datasets and scaling data for neural network.

In [42]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the preprocessed dataset
dataset = pd.read_csv('datasets/preprocessed_dataset.csv')

# Step 1: First split (Train: 60%, Temp: 40%)
train_set, temp_set = train_test_split(dataset, test_size=0.4, random_state=42)

# Step 2: Second split (Validation: 20%, Test: 20% from the 40%)
val_set, test_set = train_test_split(temp_set, test_size=0.5, random_state=42)

# Check sizes
print(f"Train size: {len(train_set)}")
print(f"Validation size: {len(val_set)}")
print(f"Test size: {len(test_set)}")

# Columns to standardize and the names their scaled versions will get
cols_to_scale = ['median_income', 'median_house_value']
scaled_col_names = [f"{col}_scaled" for col in cols_to_scale]

# Fit the scaler on the training split only, then apply the same
# transformation to validation and test so no statistics leak from them.
scaler = StandardScaler()
train_scaled_values = scaler.fit_transform(train_set[cols_to_scale])
val_scaled_values = scaler.transform(val_set[cols_to_scale])
test_scaled_values = scaler.transform(test_set[cols_to_scale])

# Replace the original columns with their scaled versions.
# New frames are built (drop + assign) instead of mutating the frames returned
# by train_test_split in place: writing into those slices is what triggers
# pandas' SettingWithCopyWarning, and inplace edits make the cell harder to
# reason about. Column order is unchanged: remaining columns first, then the
# *_scaled columns in the order of cols_to_scale.
train_set, val_set, test_set = [
    frame.drop(columns=cols_to_scale).assign(
        # scaled_values is (n_rows, len(cols_to_scale)); transposing yields one
        # 1-D array per scaled column, assigned positionally.
        **dict(zip(scaled_col_names, scaled_values.T))
    )
    for frame, scaled_values in zip(
        [train_set, val_set, test_set],
        [train_scaled_values, val_scaled_values, test_scaled_values],
    )
]

# Select features for model
# NOTE(review): longitude, latitude and ocean_proximity_encoded are passed to
# the model without scaling; only median_income is standardized. Confirm this
# is intended for the neural network.
feature_cols = ['longitude', 'latitude', 'ocean_proximity_encoded', 'median_income_scaled',]
target_col = 'median_house_value_scaled'

# Extract features and targets and convert to numpy
X_train = train_set[feature_cols].to_numpy()
y_train = train_set[target_col].to_numpy()
X_val = val_set[feature_cols].to_numpy()
y_val = val_set[target_col].to_numpy()
X_test = test_set[feature_cols].to_numpy()
y_test = test_set[target_col].to_numpy()

Train size: 11027
Validation size: 3676
Test size: 3676


#### 2. Define Model Parameters
Define the MLP regressor and the hyperparameter grid to search with 3-fold cross-validation on the training set.


In [None]:
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid explored by the search. 'solver' and 'max_iter' are
# single-valued, so they are fixed settings that still show up in best_params_.
param_grid = dict(
    hidden_layer_sizes=[(8,), (16, 8), (32, 16, 8)],
    activation=['relu', 'tanh'],
    alpha=[0.0001, 0.001],  # L2 regularization strength
    solver=['adam'],
    max_iter=[1000],
)

# Base estimator; the seed keeps weight initialization reproducible
mlp = MLPRegressor(random_state=42)

# Exhaustive search over param_grid, scored by negative MSE with
# 3-fold cross-validation, using all available CPU cores
grid_search = GridSearchCV(
    mlp,
    param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)

# Fit on the training split only (X_train / y_train come from the
# data-preparation cell, which must have been run first)
grid_search.fit(X_train, y_train)



NameError: name 'X_train' is not defined