# ML Baseline Models
Objectives:
- Establish baseline predictive models for estimating subsurface pressure behavior within the CO₂ storage reservoir.
- Train a suite of classical regression models - Linear Regression, Random Forest, and Gradient Boosting - to provide interpretable baseline performance benchmarks.
- Introduce a neural-network–based baseline (MLP) to assess the advantages of nonlinear, high-capacity function approximators for modeling complex reservoir dynamics.
- Apply consistent training and validation workflows, including data scaling, early stopping, and standardized evaluation metrics (MSE, R²).

**Source:** Society of Petroleum Engineers (SPE)  
**Dataset:** SPE Comparative Solution Project - Model 11C (3D CO₂ Injection)

In [1]:
# Import libraries
import os
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Load data
# Directory holding the pre-split, pre-scaled NumPy arrays.
# Set the SPE11C_PROCESSED_DIR environment variable to run the notebook on
# another machine; the fallback is the original hard-coded local path.
PROCESSED = Path(
    os.environ.get(
        "SPE11C_PROCESSED_DIR",
        r"C:\Users\tetec\Documents\Data Project Coding\.vscode\Project data\spe11c\data\processed",
    )
)

X_train_scaled = np.load(PROCESSED / 'X_train_scaled.npy')
X_test_scaled = np.load(PROCESSED / 'X_test_scaled.npy')
y_train = np.load(PROCESSED / 'y_train.npy')
y_test = np.load(PROCESSED / 'y_test.npy')

# Fail fast if the saved arrays are misaligned instead of surfacing the
# problem later as an opaque error inside model.fit / model.predict.
assert X_train_scaled.shape[0] == y_train.shape[0], "train features/targets row mismatch"
assert X_test_scaled.shape[0] == y_test.shape[0], "test features/targets row mismatch"
assert X_train_scaled.shape[1] == X_test_scaled.shape[1], "train/test feature count mismatch"

print(X_train_scaled.shape, y_train.shape)


(3200, 27) (3200,)


## 1. Train Classical Regression Models

In [3]:
# Classical baseline regressors, keyed by the label that is later used in
# log messages, the results table and the saved-model filenames.
# Tree ensembles share a fixed random_state so repeated runs give the same fit.
models = dict(
    LinearRegression=LinearRegression(),
    RandomForest=RandomForestRegressor(
        n_estimators=100,
        random_state=42,
    ),
    GradientBoosting=GradientBoostingRegressor(
        n_estimators=100,
        learning_rate=0.1,
        random_state=42,
    ),
)

In [4]:
# Directory where fitted baseline estimators are persisted.
# Override with the SPE11C_MODELS_DIR environment variable; the fallback is
# the original hard-coded local path.
models_dir = Path(
    os.environ.get(
        "SPE11C_MODELS_DIR",
        r"C:\Users\tetec\Documents\Data Project Coding\.vscode\Project data\spe11c\models",
    )
)
models_dir.mkdir(parents=True, exist_ok=True)

# Train and Evaluate
# One record per model: {'model': label, 'MSE': test MSE, 'R2': test R^2}.
results = []

for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train_scaled, y_train)

    # Score on the held-out test split, in the target's original units.
    y_pred = model.predict(X_test_scaled)

    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"{name} - MSE: {mse:.4f}, R2: {r2:.4f}\n")

    # Save under models_dir (the directory created above) rather than a second
    # hard-coded copy of the path, so the two can never point at different places.
    joblib.dump(model, models_dir / f"{name}_baseline.pkl")

    results.append({'model': name, 'MSE': mse, 'R2': r2})

# Summary dataframe
results_df = pd.DataFrame(results)
results_df

Training LinearRegression...
LinearRegression - MSE: 18.0200, R2: 1.0000

Training RandomForest...
RandomForest - MSE: 357547962.5000, R2: 1.0000

Training GradientBoosting...
GradientBoosting - MSE: 529043140.6517, R2: 1.0000



Unnamed: 0,model,MSE,R2
0,LinearRegression,18.02,1.0
1,RandomForest,357548000.0,0.999966
2,GradientBoosting,529043100.0,0.99995


## 2. Neural Network Baseline Model

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping

# Standardize the regression target for the network. The scaler is fitted on
# the training targets only and then applied unchanged to the test targets.
# StandardScaler needs 2-D input, hence the column reshape and the flatten back.
y_scaler = StandardScaler().fit(y_train.reshape(-1, 1))
y_train_scaled = y_scaler.transform(y_train.reshape(-1, 1)).flatten()
y_test_scaled = y_scaler.transform(y_test.reshape(-1, 1)).flatten()

# MLP factory
def build_mlp(input_dim):
    """Build the (uncompiled) fully connected baseline network.

    Architecture: input -> Dense(128, relu) -> Dense(128, relu)
    -> Dense(64, relu) -> Dense(1, linear).

    Parameters
    ----------
    input_dim : int
        Number of input features per sample.

    Returns
    -------
    keras.Sequential
        The model; the caller is responsible for compiling it.
    """
    hidden_widths = (128, 128, 64)

    stack = [layers.Input(shape=(input_dim,))]
    stack.extend(layers.Dense(width, activation='relu') for width in hidden_widths)
    # Single linear unit: unbounded scalar regression output.
    stack.append(layers.Dense(1, activation='linear'))

    return keras.Sequential(stack)

# Seed the Python, NumPy and backend RNGs before the model is built so weight
# initialisation and batch shuffling are repeatable across Restart & Run All.
# 42 matches the random_state used by the classical baselines.
# NOTE(review): some GPU ops may still be non-deterministic — confirm if exact
# run-to-run equality is required.
keras.utils.set_random_seed(42)

mlp = build_mlp(X_train_scaled.shape[1])

mlp.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    loss='mse',
    metrics=['mse']
)

# Early stopping: halt once validation loss has not improved for 10 epochs
# and roll back to the best-scoring weights.
es = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True
)

# Train on the standardized target.
# Per the Keras `fit` docs, validation_split holds out the *last* 20% of the
# training arrays (taken before shuffling); the test split is never seen here.
history = mlp.fit(
    X_train_scaled,
    y_train_scaled,
    validation_split=0.2,
    epochs=300,
    batch_size=32,
    callbacks=[es],
    verbose=1
)

# Predict in standardized space, then map back to the target's original units
# so the metrics are directly comparable with the classical baselines.
y_pred_scaled = mlp.predict(X_test_scaled).flatten()
y_pred = y_scaler.inverse_transform(y_pred_scaled.reshape(-1,1)).flatten()

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"NN MSE: {mse:.4f}")
print(f"NN R2: {r2:.4f}")

Epoch 1/300
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - loss: 0.0548 - mse: 0.0548 - val_loss: 0.0029 - val_mse: 0.0029
Epoch 2/300
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0015 - mse: 0.0015 - val_loss: 0.0019 - val_mse: 0.0019
Epoch 3/300
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 4.7930e-04 - mse: 4.7930e-04 - val_loss: 0.0014 - val_mse: 0.0014
Epoch 4/300
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 9.8497e-04 - mse: 9.8497e-04 - val_loss: 8.8218e-04 - val_mse: 8.8218e-04
Epoch 5/300
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0058 - mse: 0.0058 - val_loss: 0.0022 - val_mse: 0.0022
Epoch 6/300
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 6.7291e-04 - mse: 6.7291e-04 - val_loss: 5.0508e-04 - val_mse: 5.0508e-04
Epoch 7/300
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0

In [6]:
# Persist the trained network next to the classical baselines, reusing the
# models_dir defined in the training cell instead of repeating the path.
mlp.save(str(models_dir / "MLP_baseline.h5"))

# The network was trained on standardized targets, so its raw outputs are in
# scaled units. Save the fitted target scaler alongside it; without it the
# saved model cannot produce predictions in the original units.
joblib.dump(y_scaler, models_dir / "MLP_baseline_y_scaler.pkl")

