In [1]:
from pyspark.sql import SparkSession
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, LeakyReLU
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l1_l2
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
import os
import tensorflow as tf

2025-05-15 09:36:56.407399: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-05-15 09:36:56.616802: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2025-05-15 09:36:56.616822: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2025-05-15 09:36:56.648974: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-15 09:36:57.605329: W tensorflow/stream_executor/platform/de

In [None]:
# Seed the NumPy and TensorFlow global generators with the same value
for set_seed in (np.random.seed, tf.random.set_seed):
    set_seed(42)

# Obtain the Spark session for this notebook (an existing one is reused)
session_builder = SparkSession.builder.appName("ImprovedNeuralNetwork")
spark = session_builder.getOrCreate()

# Read the preprocessed CSV; the first row is the header and column types
# are inferred from the data
csv_reader = spark.read
df = csv_reader.csv("data_preprocessed.csv", header=True, inferSchema=True)

In [3]:
# Pull the whole Spark DataFrame into a pandas DataFrame
pandas_df = df.toPandas()

# Report the dataset size; every column except 'price' counts as a feature
n_rows, n_cols = pandas_df.shape
print(f"Total samples: {n_rows}")
print(f"Number of features: {n_cols - 1}")

# Target is the 'price' column, features are all remaining columns
y = pandas_df['price']
X = pandas_df.drop(columns='price')


Total samples: 14416
Number of features: 36


In [4]:
# Robust-scale the feature matrix.
# NOTE(review): both scalers are fitted on every row before the folds are
# split, so validation rows contribute to the scaling statistics - confirm
# this is acceptable for the reported cross-validation scores.
feature_scaler = RobustScaler()
X_scaled = feature_scaler.fit(X).transform(X)

# Robust-scale the target as a single column, then flatten back to 1-D
target_scaler = RobustScaler()
y_column = y.values[:, np.newaxis]
y_scaled = target_scaler.fit(y_column).transform(y_column).flatten()

# Shuffled k-fold splitter with a fixed random_state
k_folds = 5
kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)

In [5]:
# Accumulators filled once per fold by the cross-validation loop
r2_scores, rmse_scores, mae_scores = [], [], []   # per-fold metrics
fold_predictions, fold_actuals = [], []           # per-fold predictions / ground truth
fold_models = []                                  # per-fold trained models

In [6]:
def create_improved_model(input_dim):
    """Build and compile the regression network trained on each CV fold.

    The network has four hidden stages of 64, 32, 16 and 8 units. Each stage
    is Dense (he_normal init, L1/L2 kernel regularisation) -> LeakyReLU ->
    BatchNormalization -> Dropout, followed by a single linear output unit.

    Args:
        input_dim: Number of input features per sample.

    Returns:
        A Sequential model compiled with Adam (learning rate 0.0005) and
        Huber loss.
    """
    # (units, dropout rate) for each hidden stage, widest first
    hidden_stages = [(64, 0.3), (32, 0.3), (16, 0.2), (8, 0.1)]

    model = Sequential()
    for position, (units, drop_rate) in enumerate(hidden_stages):
        dense_kwargs = {
            'kernel_initializer': 'he_normal',
            'kernel_regularizer': l1_l2(l1=0.0001, l2=0.0005),
        }
        if position == 0:
            # Only the first layer declares the input shape
            dense_kwargs['input_shape'] = (input_dim,)

        model.add(Dense(units, **dense_kwargs))
        model.add(LeakyReLU(alpha=0.1))
        model.add(BatchNormalization())
        model.add(Dropout(drop_rate))

    # Single linear unit for the regression output
    model.add(Dense(1))

    model.compile(optimizer=Adam(learning_rate=0.0005), loss='huber')
    return model

In [None]:
# Train models across folds
print(f"Training {k_folds} models with cross-validation...")

# Empty the accumulators first so that re-running this cell replaces the
# previous run instead of appending to it. The ensemble cells index
# fold_models / rmse_scores by fold number, so leftover entries from an
# earlier run would silently pair stale models with fresh weights.
for accumulator in (r2_scores, rmse_scores, mae_scores,
                    fold_predictions, fold_actuals, fold_models):
    accumulator.clear()

for fold, (train_idx, val_idx) in enumerate(kf.split(X_scaled)):
    print(f"\nFold {fold+1}/{k_folds}")

    # Split data
    X_train, X_val = X_scaled[train_idx], X_scaled[val_idx]
    y_train, y_val = y_scaled[train_idx], y_scaled[val_idx]

    # Create a fresh model for this fold
    model = create_improved_model(X_scaled.shape[1])

    # Stop after 30 epochs without val_loss improvement, keeping the best weights
    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=30,
        restore_best_weights=True,
        verbose=1
    )

    # Multiply the learning rate by 0.2 after 10 stagnant epochs, down to min_lr
    reduce_lr = ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.2,
        patience=10,
        min_lr=0.000005,
        verbose=1
    )

    # Write the model to disk whenever val_loss reaches a new best
    model_checkpoint = ModelCheckpoint(
        f'best_model_fold_{fold+1}.h5',
        monitor='val_loss',
        save_best_only=True,
        verbose=1
    )

    history = model.fit(
        X_train, y_train,
        epochs=500,  # Upper bound; early stopping normally ends training sooner
        batch_size=32,
        validation_data=(X_val, y_val),
        callbacks=[early_stopping, reduce_lr, model_checkpoint],
        verbose=1
    )

    # Load the best model from checkpoint
    model = load_model(f'best_model_fold_{fold+1}.h5')
    fold_models.append(model)

    # Predict on the validation fold and map both predictions and targets
    # back to the original price scale
    y_pred_scaled = model.predict(X_val)
    y_pred = target_scaler.inverse_transform(y_pred_scaled)
    y_true = target_scaler.inverse_transform(y_val.reshape(-1, 1))

    # Calculate metrics
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    # Store results
    r2_scores.append(r2)
    rmse_scores.append(rmse)
    mae_scores.append(mae)

    # Store predictions and actuals for ensemble
    fold_predictions.append(y_pred)
    fold_actuals.append(y_true)

    # Print fold results
    print(f"Fold {fold+1} - RMSE: {rmse:.2f}, MAE: {mae:.2f}, R²: {r2:.4f}")

In [None]:
# Report mean ± standard deviation of each metric across the folds
print("\nCross-validation results:")
for label, scores, digits in (
    ("R²", r2_scores, 4),
    ("RMSE", rmse_scores, 2),
    ("MAE", mae_scores, 2),
):
    print(f"Average {label}: {np.mean(scores):.{digits}f} ± {np.std(scores):.{digits}f}")


In [None]:
# IMPROVED: Create weighted ensemble prediction
print("\nCreating weighted ensemble prediction...")

def ensemble_predict(models, X_data, weights=None):
    """Create weighted ensemble predictions from multiple models.

    Args:
        models: Sequence of fitted models; each must expose
            ``predict(X_data)`` returning one value per row of ``X_data``.
        X_data: Feature rows, passed unchanged to every model.
        weights: Optional sequence with one weight per model. The weights
            are rescaled to sum to 1 before averaging, so passing a subset
            of a larger normalised weight vector still yields a true
            weighted average. ``None`` means equal weights.

    Returns:
        numpy.ndarray of shape (n_samples, 1) with the weighted average of
        the individual model predictions.

    Raises:
        ValueError: If ``models`` is empty, if ``weights`` does not contain
            exactly one entry per model, or if the weights do not sum to a
            finite non-zero value.
    """
    models = list(models)
    if not models:
        raise ValueError("ensemble_predict requires at least one model")

    # Shape (n_models, n_samples): one flattened prediction vector per model
    predictions = np.array(
        [np.asarray(model.predict(X_data)).flatten() for model in models]
    )

    if weights is not None:
        weights = np.asarray(weights, dtype=float).ravel()
        if weights.shape[0] != len(models):
            raise ValueError(
                f"expected {len(models)} weights (one per model), got {weights.shape[0]}"
            )
        weight_total = weights.sum()
        if not np.isfinite(weight_total) or weight_total == 0:
            raise ValueError("weights must sum to a finite, non-zero value")

    # np.average divides by the sum of the weights, so the result is a proper
    # weighted mean even when the weights do not already sum to 1.
    weighted_predictions = np.average(predictions, axis=0, weights=weights)
    return weighted_predictions.reshape(-1, 1)

# Weight each fold's model by 1 / validation RMSE, then rescale so the
# weights sum to 1 (a lower RMSE gives a larger weight)
inverse_rmse = [1 / fold_rmse for fold_rmse in rmse_scores]
inverse_total = sum(inverse_rmse)
weights = np.array(inverse_rmse) / inverse_total
print(f"Ensemble weights based on validation performance: {weights}")

In [None]:
# Combine validation sets and make predictions for ensemble evaluation
# This is for evaluation only - in practice you'd use a separate test set
# NOTE(review): for validation fold i the ensemble is built from the models
# of the other folds, and each of those was trained on fold i's rows. The
# scores below are therefore measured on data the ensemble members have
# already seen; treat them as optimistic.
all_preds = []
all_actuals = []

for i, (_, val_idx) in enumerate(kf.split(X_scaled)):
    X_val = X_scaled[val_idx]
    y_val = y_scaled[val_idx]

    # Ensemble members: every fold model except the one belonging to fold i
    other_folds = [j for j in range(k_folds) if j != i]

    # The global weights sum to 1 over all k models; after dropping fold i
    # they sum to less than 1, which would shrink the combined prediction.
    # Renormalise the subset so it is a true weighted average.
    fold_weights = np.asarray([weights[j] for j in other_folds], dtype=float)
    fold_weights = fold_weights / fold_weights.sum()

    # Get ensemble prediction for this validation fold
    y_pred_scaled = ensemble_predict([fold_models[j] for j in other_folds],
                                     X_val,
                                     weights=fold_weights)

    # Convert back to original scale
    y_pred = target_scaler.inverse_transform(y_pred_scaled)
    y_true = target_scaler.inverse_transform(y_val.reshape(-1, 1))

    all_preds.append(y_pred)
    all_actuals.append(y_true)

# Concatenate all validation predictions and actuals
all_preds = np.concatenate(all_preds)
all_actuals = np.concatenate(all_actuals)

In [None]:
# Score the concatenated ensemble predictions against the actual prices
ensemble_r2 = r2_score(all_actuals, all_preds)
ensemble_mae = mean_absolute_error(all_actuals, all_preds)
ensemble_mse = mean_squared_error(all_actuals, all_preds)
ensemble_rmse = np.sqrt(ensemble_mse)

print("\nWeighted Ensemble model results:")
for label, value, digits in (
    ("RMSE", ensemble_rmse, 2),
    ("MAE", ensemble_mae, 2),
    ("R²", ensemble_r2, 4),
):
    print(f"Ensemble {label}: {value:.{digits}f}")

In [None]:
def create_final_model(input_dim):
    """Build and compile the final regression network.

    The layout is four hidden stages of 96, 48, 24 and 12 units (Dense with
    he_normal init and L1/L2 kernel regularisation -> LeakyReLU ->
    BatchNormalization -> Dropout), followed by one linear output unit.

    Args:
        input_dim: Number of input features per sample.

    Returns:
        A Sequential model compiled with Adam (learning rate 0.0005) and
        Huber loss.
    """
    def hidden_stage(units, drop_rate, **dense_extra):
        # One Dense -> LeakyReLU -> BatchNormalization -> Dropout group
        return [
            Dense(
                units,
                kernel_initializer='he_normal',
                kernel_regularizer=l1_l2(l1=0.0001, l2=0.0003),
                **dense_extra,
            ),
            LeakyReLU(alpha=0.1),
            BatchNormalization(),
            Dropout(drop_rate),
        ]

    layer_stack = (
        hidden_stage(96, 0.3, input_shape=(input_dim,))  # first stage declares the input shape
        + hidden_stage(48, 0.3)
        + hidden_stage(24, 0.2)
        + hidden_stage(12, 0.1)
        + [Dense(1)]                                     # regression output
    )
    model = Sequential(layer_stack)

    model.compile(optimizer=Adam(learning_rate=0.0005), loss='huber')
    return model

final_model = create_final_model(X_scaled.shape[1])

In [None]:

# Callbacks for the final training run; all three watch the validation loss.

# Stop after 40 epochs without improvement and restore the best weights
early_stopping = EarlyStopping(monitor='val_loss', patience=40, restore_best_weights=True)

# Multiply the learning rate by 0.2 after 15 stagnant epochs, never below 1e-6
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=15, min_lr=0.000001)

# Keep only the best-scoring model on disk
model_checkpoint = ModelCheckpoint('best_final_model.h5', monitor='val_loss', save_best_only=True)

In [None]:
# Hold out a fixed fraction of the rows for validation
val_split = 0.2
n_val = int(len(X_scaled) * val_split)
indices = np.random.permutation(len(X_scaled))

# The first n_val shuffled positions form the validation set, the rest train
val_rows, train_rows = indices[:n_val], indices[n_val:]
X_train_final, y_train_final = X_scaled[train_rows], y_scaled[train_rows]
X_val_final, y_val_final = X_scaled[val_rows], y_scaled[val_rows]

# Fit the final model with the callbacks defined for the final run
final_history = final_model.fit(
    x=X_train_final,
    y=y_train_final,
    validation_data=(X_val_final, y_val_final),
    epochs=500,
    batch_size=32,
    callbacks=[early_stopping, reduce_lr, model_checkpoint],
    verbose=1,
)

In [None]:
# Replace the in-memory model with the best checkpoint written during training
final_model = load_model('best_final_model.h5')

# Predict on the held-out rows and undo the target scaling on both
# predictions and ground truth
y_pred_scaled = final_model.predict(X_val_final)
y_true = target_scaler.inverse_transform(y_val_final.reshape(-1, 1))
y_pred = target_scaler.inverse_transform(y_pred_scaled)

final_r2 = r2_score(y_true, y_pred)
final_mae = mean_absolute_error(y_true, y_pred)
final_mse = mean_squared_error(y_true, y_pred)
final_rmse = np.sqrt(final_mse)

print("\nFinal model validation results:")
for label, value, digits in (
    ("RMSE", final_rmse, 2),
    ("MAE", final_mae, 2),
    ("R²", final_r2, 4),
):
    print(f"Final {label}: {value:.{digits}f}")

In [None]:
# Persist the final model and report where it was written
final_model_path = 'improved_final_model.h5'
final_model.save(final_model_path)
print(f"Final model saved as '{final_model_path}'")

In [None]:
# Shut down the Spark session created at the top of the notebook.
spark.stop()