In [3]:
import pandas as pd
import numpy as np
import scipy.io
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# --- Configuration ---
# Fixed seed value for reproducibility.
# NOTE(review): nothing in the visible code reads this constant — confirm that
# a later cell/section uses it, otherwise it is dead configuration.
RANDOM_STATE: int = 42

print("Phase 1: Loading and Restructuring Spatio-Temporal Data...")

# 1. Load MAT File
# The path is resolved relative to the current working directory, so the
# dataset is expected at '../data/water_dataset.mat'.
try:
    mat = scipy.io.loadmat('../data/water_dataset.mat')
except FileNotFoundError:
    # Tell the user where the file should live, then re-raise so execution
    # stops here instead of failing later on a missing `mat` variable.
    print("ERROR: water_dataset.mat not found. Please ensure it is in the '../data/' folder.")
    raise
else:
    # Reached only when loadmat returned without raising.
    print("MATLAB file loaded successfully.")

# 2. Extract Raw Data Arrays
# Expected layout (as described by the dataset notes — TODO confirm against the
# actual .mat file):
#   X_*: stored as a (1, n_dates) array whose entries are per-date matrices of
#        shape (37 locations, 11 features); indexing with [0] drops the leading
#        singleton axis and leaves a 1-D sequence of matrices.
#   Y_*: a (37 locations, n_dates) matrix of target pH values.
X_tr_raw, Y_tr_raw = mat['X_tr'][0], mat['Y_tr']
X_te_raw, Y_te_raw = mat['X_te'][0], mat['Y_te']
print("Extracted X_tr, Y_tr, X_te, Y_te arrays.")


# 3. Flatten Data (Crucial Step: Reshape to (Observations, Features))
# Each row of the flattened arrays represents one Location-Date pair. Features
# and targets must be flattened in the SAME order; otherwise a feature row is
# silently paired with the wrong pH value.


def _stack_date_matrices(per_date_matrices):
    """Stack a sequence of per-date feature matrices row-wise into one 2D array.

    Rows come out date-major: every row of the first matrix, then every row of
    the second, and so on.
    """
    # list() materialises the outer container directly; no identity
    # comprehension is needed to hand np.concatenate a sequence of arrays.
    return np.concatenate(list(per_date_matrices), axis=0)


def _targets_to_column(target_matrix):
    """Flatten a 2D target matrix column by column into an (N, 1) array.

    order='F' (column-major) emits a whole column before moving to the next
    one. Assuming columns index dates and rows index locations (TODO confirm),
    this reproduces the date-major row order of `_stack_date_matrices`.
    """
    return target_matrix.flatten(order='F').reshape(-1, 1)


def _require_same_rows(split_name, features, targets):
    """Raise ValueError when features and targets disagree on row count."""
    n_feature_rows = features.shape[0]
    n_target_rows = targets.shape[0]
    if n_feature_rows != n_target_rows:
        raise ValueError(
            f"{split_name} data is misaligned after flattening: "
            f"{n_feature_rows} feature rows vs {n_target_rows} target rows."
        )


# Concatenate all per-date feature matrices over time.
X_train_flat = _stack_date_matrices(X_tr_raw)
X_test_flat = _stack_date_matrices(X_te_raw)

# Flatten the target pH matrices into single columns, matching the feature order.
Y_train_flat = _targets_to_column(Y_tr_raw)
Y_test_flat = _targets_to_column(Y_te_raw)

# Fail fast, naming the offending split, rather than letting a shape mismatch
# surface later inside model fitting or scoring.
_require_same_rows("Training", X_train_flat, Y_train_flat)
_require_same_rows("Test", X_test_flat, Y_test_flat)

print(f"Data restructuring complete. Total training samples: {X_train_flat.shape[0]}")


# 4. Scale, then Convert to DataFrame
# The data already arrives as separate train and test sets, so no additional
# split is performed here. The scaling statistics are learned from the training
# features only and then reused unchanged on the test features.
scaler = StandardScaler().fit(X_train_flat)

# Scale Features
X_train_scaled = scaler.transform(X_train_flat)
X_test_scaled = scaler.transform(X_test_flat)

# Convert to DataFrame for model input
df_X_train, df_X_test = map(pd.DataFrame, (X_train_scaled, X_test_scaled))
df_Y_train, df_Y_test = (
    pd.DataFrame(flat_targets, columns=['pH_Target'])
    for flat_targets in (Y_train_flat, Y_test_flat)
)


print("\nPhase 2: Linear Regression Baseline Training...")

# 5. Linear Regression Baseline Model (Simple Benchmark)
# fit() returns the estimator itself, so construction and training are chained
# into a single expression.
lr_regressor = LinearRegression().fit(df_X_train, df_Y_train)

# 6. Prediction and Evaluation
# Score the fitted baseline on the held-out test features.
Y_test_pred = lr_regressor.predict(df_X_test)

# Calculate Metrics
# RMSE is derived from MSE via a square root, which puts the error back on the
# same scale as the target values.
mse = mean_squared_error(df_Y_test, Y_test_pred)
rmse = np.sqrt(mse)
r2 = r2_score(df_Y_test, Y_test_pred)

# One print call with a newline separator emits the same four report lines.
print(
    "\n--- LINEAR REGRESSION BASELINE PERFORMANCE (pH) ---",
    f"R-squared Score (R2): {r2:.4f}",
    f"Root Mean Squared Error (RMSE): {rmse:.4f}",
    "-" * 50,
    sep="\n",
)


Phase 1: Loading and Restructuring Spatio-Temporal Data...
MATLAB file loaded successfully.
Extracted X_tr, Y_tr, X_te, Y_te arrays.
Data restructuring complete. Total training samples: 15651

Phase 2: Linear Regression Baseline Training...

--- LINEAR REGRESSION BASELINE PERFORMANCE (pH) ---
R-squared Score (R2): 0.8329
Root Mean Squared Error (RMSE): 0.0120
--------------------------------------------------
