In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
from sklearn.model_selection import train_test_split
from pysindy import SINDy
from sklearn.preprocessing import StandardScaler
from pysindy.optimizers import STLSQ
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix 

In [2]:
# Load the preprocessed well dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("processed_well_data.csv")

In [3]:
# Report how many missing values each column contains before any modelling is done.
missing_per_column = df.isna().sum()
print(missing_per_column)

Well no                    0
Dia                        0
Dev(deg)                   0
LiquidFlowrate             0
Gasflowrate                0
Area (m2)                  0
z                          0
GasDens                    0
Condesnate Presence        0
Water presence             0
LiquidDens                 0
GasVis                     0
LiqVis                     0
g (m/s2)                   0
Test Vg                    0
P/T                        0
Test status                0
Vsg                        0
Vsl                        0
Rel                        0
Reg                        0
film thickness             0
d(0,90)                    0
d(15,90)                   0
d(30,90)                   0
d(45,90)                   0
d(60,90)                   0
d(90,90)                   0
d(120,90)                  0
d(150,90)                  0
d(180,90)                  0
d(210,90)                  0
d(270,90)                  0
d(360,90)                  0
friction_facto

# Data preparation 

In [4]:
# Model inputs, the continuous regression target (Qcr), and the measured gas
# flow rate that predictions are later compared against for classification.
feature_columns = [
    'Dia', 'Dev(deg)', 'Area (m2)', 'z', 'GasDens', 'LiquidDens',
    'P/T', 'friction_factor', 'critical_film_thickness',
]
X = df[feature_columns]
y = df['Qcr']
gsflow = df['Gasflowrate']

# Encode the test status as an integer class label:
# 'Unloaded' -> -1, 'Near L.U' -> 0, anything else -> 1.
status_to_label = {'Unloaded': -1, 'Near L.U': 0}
loading_class = df['Test status'].apply(
    lambda status: status_to_label.get(status, 1)
).to_numpy()

# One call splits every array, so features, target, gas flow rate and class
# labels stay row-aligned in both the training and the test partitions.
(
    X_train, X_test,
    y_train, y_test,
    gsflow_train, gsflow_test,
    loading_train, loading_test,
) = train_test_split(X, y, gsflow, loading_class, test_size=0.2, random_state=42)

# Feature scaler: statistics come from the training rows only and are then
# applied unchanged to the test rows and to the complete feature table.
scaler_X = StandardScaler()
scaler_X.fit(X_train)
X_train_scaled = scaler_X.transform(X_train)
X_test_scaled = scaler_X.transform(X_test)
X_scaled = scaler_X.transform(X)

# Target scaler: StandardScaler expects a 2-D input, hence the single-column reshape.
scaler_y = StandardScaler()
y_train_column = y_train.to_numpy().reshape(-1, 1)
y_test_column = y_test.to_numpy().reshape(-1, 1)
scaler_y.fit(y_train_column)
y_train_scaled = scaler_y.transform(y_train_column)
y_test_scaled = scaler_y.transform(y_test_column)

# Sample index standing in for the "time" axis of the training rows.
t_train = np.arange(y_train_scaled.shape[0])

# Keep the held-out quantities as plain NumPy arrays.
loading_test = np.array(loading_test)
gsflow_test = np.array(gsflow_test)
y_test = np.array(y_test)

In [5]:
# Classification accuracy and confusion matrix derived from the regression output
def calculate_accuracy(y_pred, gsflow, loading_actual, interval):
    """Turn predicted values into loading classes and score them.

    A sample is assigned class 1 when ``y_pred`` exceeds ``gsflow + interval``,
    class -1 when ``y_pred`` is below ``gsflow - interval``, and class 0 in
    every other case.

    Parameters
    ----------
    y_pred : array-like
        Predicted values, compared element-wise against ``gsflow``.
    gsflow : array-like
        Reference values the predictions are compared with.
    loading_actual : array-like
        True class labels drawn from {-1, 0, 1}.
    interval : float
        Half-width of the band around ``gsflow`` that maps to class 0.

    Returns
    -------
    tuple
        ``(accuracy, confusion)`` where ``confusion`` has its rows and
        columns ordered as the labels ``[-1, 0, 1]``.
    """
    above_band = y_pred > gsflow + interval
    below_band = y_pred < gsflow - interval
    # The "above" test takes precedence; the "below" test is only consulted
    # for samples that are not above the band.
    predicted_class = np.where(above_band, 1, np.where(below_band, -1, 0))

    accuracy = accuracy_score(loading_actual, predicted_class)
    confusion = confusion_matrix(loading_actual, predicted_class, labels=[-1, 0, 1])
    return accuracy, confusion

In [6]:
# K-fold cross validation
def evaluate_sindy(params, X, y, gsflow, loading_class, cv_splits=5):
    """Return the mean K-fold classification accuracy for one hyperparameter set.

    For every fold, the feature and target scalers are fitted on the training
    part only, a SINDy model with an STLSQ optimizer is fitted on the scaled
    training data, and the validation predictions are mapped back to the
    original target scale before being scored with ``calculate_accuracy``.

    Parameters
    ----------
    params : tuple
        ``(alpha, threshold, interval)``. ``alpha`` and ``threshold`` are
        passed to ``STLSQ``; ``interval`` is passed to ``calculate_accuracy``.
    X : array-like, one row per sample
        Unscaled feature matrix.
    y : array-like
        Unscaled continuous target, one value per sample.
    gsflow : array-like
        Reference values the predictions are compared with, one per sample.
    loading_class : array-like
        True class labels, one per sample.
    cv_splits : int, default 5
        Number of folds.

    Returns
    -------
    float
        Accuracy averaged over all folds.
    """
    alpha, threshold, interval = params

    # KFold yields *positional* indices. Indexing a pandas object with
    # ``obj[idx]`` is label-based instead, which either raises or silently
    # picks the wrong rows when the index is not 0..n-1 (e.g. after a shuffled
    # train/test split). Converting to NumPy arrays makes the selection
    # positional for every accepted input type.
    X = np.asarray(X)
    y = np.asarray(y).reshape(-1)
    gsflow = np.asarray(gsflow).reshape(-1)
    loading_class = np.asarray(loading_class).reshape(-1)

    kf = KFold(n_splits=cv_splits, shuffle=True, random_state=42)
    acc_scores = []

    for train_idx, val_idx in kf.split(X):
        # Divide into training/validation sets
        X_train_cv, X_val_cv = X[train_idx], X[val_idx]
        y_train_cv = y[train_idx]
        gsflow_val_cv = gsflow[val_idx]
        loading_val_cv = loading_class[val_idx]

        # Scalers are re-fitted per fold on the training part only, so no
        # validation statistics leak into the model.
        scaler_X = StandardScaler()
        X_train_cv_scaled = scaler_X.fit_transform(X_train_cv)
        X_val_cv_scaled = scaler_X.transform(X_val_cv)

        scaler_y = StandardScaler()
        y_train_cv_scaled = scaler_y.fit_transform(y_train_cv.reshape(-1, 1))

        # Define optimizer for SINDy
        optimizer = STLSQ(
            alpha=alpha,
            threshold=threshold,
            max_iter=10000,
            normalize_columns=True
        )

        # Train model: the scaled target is supplied as x_dot and a plain
        # sample index is supplied as t.
        model = SINDy(optimizer=optimizer)
        model.fit(X_train_cv_scaled, t=np.arange(len(y_train_cv_scaled)),
                  x_dot=y_train_cv_scaled)

        # Predict on the validation fold and undo the target scaling
        y_val_pred_scaled = model.predict(X_val_cv_scaled)
        y_val_pred_cv = scaler_y.inverse_transform(y_val_pred_scaled).flatten()

        # Only the accuracy is aggregated; the confusion matrix is not needed here.
        acc, _ = calculate_accuracy(y_val_pred_cv, gsflow_val_cv, loading_val_cv, interval)
        acc_scores.append(acc)

    return np.mean(acc_scores)


In [7]:

def optimize_sindy_hyperparameters(X_train, y_train, gsflow_train, loading_train, param_grid):
    """Exhaustively search ``param_grid`` for the best cross-validated accuracy.

    Every combination of ``param_grid['alpha']``, ``param_grid['threshold']``
    and ``param_grid['interval']`` is scored with ``evaluate_sindy``. The
    first combination that reaches the highest score is kept; later ties do
    not replace it.

    Parameters
    ----------
    X_train, y_train, gsflow_train, loading_train : array-like
        Training data; each is converted to a NumPy array before the search,
        and ``y_train`` is additionally flattened to one dimension.
    param_grid : mapping
        Must provide iterables under the keys 'alpha', 'threshold' and
        'interval'.

    Returns
    -------
    tuple
        ``(best_params, best_score)``. ``best_params`` is a dict with keys
        'alpha', 'threshold' and 'interval', or ``None`` when no combination
        scored above the initial value of -1.
    """
    features = np.array(X_train)
    target = np.array(y_train).flatten()
    gas_rates = np.array(gsflow_train)
    labels = np.array(loading_train)

    top_score, top_params = -1, None

    print("Starting hyperparameter optimization...")

    # Lazily enumerate the grid in the order alpha -> threshold -> interval.
    candidates = (
        (alpha, threshold, interval)
        for alpha in param_grid['alpha']
        for threshold in param_grid['threshold']
        for interval in param_grid['interval']
    )

    for alpha, threshold, interval in candidates:
        cv_accuracy = evaluate_sindy(
            (alpha, threshold, interval), features, target, gas_rates, labels
        )
        # Strict comparison: an equal score never displaces an earlier winner.
        if cv_accuracy > top_score:
            top_score = cv_accuracy
            top_params = {'alpha': alpha, 'threshold': threshold, 'interval': interval}

    return top_params, top_score

# Search space for the grid search: 10 candidate values per hyperparameter.
# 'alpha' and 'threshold' are spaced logarithmically, 'interval' linearly.
param_grid = dict(
    alpha=np.logspace(-4, 0, num=10),
    threshold=np.logspace(-4, 0.25, num=10),
    interval=np.linspace(0, 10, num=10),
)

In [8]:
# Hyperparameter search: cross-validated on the training split only, so the
# test split plays no part in model selection.
best_params, best_score = optimize_sindy_hyperparameters(
    X_train, y_train, gsflow_train, loading_train, param_grid
)

print("\nBest parameters found:")
print(best_params)
print(f"Best CV accuracy: {best_score*100:.2f}%")

# Refit on the full training split with the selected optimizer settings.
stlsq_settings = dict(
    alpha=best_params['alpha'],
    threshold=best_params['threshold'],
    max_iter=10000,
    normalize_columns=True,
)
final_optimizer = STLSQ(**stlsq_settings)

final_model = SINDy(optimizer=final_optimizer)
final_model.fit(X_train_scaled, t=t_train, x_dot=y_train_scaled)

# Predictions on the training split, mapped back to the original target scale.
y_pred_train_scaled = final_model.predict(X_train_scaled)
y_pred_train = scaler_y.inverse_transform(y_pred_train_scaled).flatten()

# Predictions on the held-out test split, mapped back the same way.
y_pred_test_scaled = final_model.predict(X_test_scaled)
y_pred_test = scaler_y.inverse_transform(y_pred_test_scaled).flatten()

# Classification metrics for both splits, using the interval chosen by the search.
selected_interval = best_params['interval']
train_acc, train_cm = calculate_accuracy(
    y_pred_train, gsflow_train, loading_train, selected_interval
)
test_acc, test_cm = calculate_accuracy(
    y_pred_test, gsflow_test, loading_test, selected_interval
)

print("\n=== Final Model Performance ===")
# Prints the fitted model's equation(s).
final_model.print()
print(f"\nTraining Set Classification Accuracy: {train_acc*100:.2f}%")
print(f"Test Set Classification Accuracy: {test_acc*100:.2f}%")
print("Confusion Matrix for Test Set Classification")
print(test_cm)

Starting hyperparameter optimization...

Best parameters found:
{'alpha': 0.016681005372000592, 'threshold': 0.06812920690579609, 'interval': 0.0}
Best CV accuracy: 68.06%

=== Final Model Performance ===
(x0)' = -375093273090.210 1 + -109.192 x0 + 417301475450.859 x1 + -0.411 x2 + 5.974 x3 + 6.399 x4 + 401432552579.569 x5 + 1.324 x6 + -171.110 x7 + 121.712 x8 + -1.688 x0^2 + -3.595 x0 x1 + -0.367 x0 x2 + 1.015 x0 x3 + -1.721 x0 x4 + -3.586 x0 x5 + -1.211 x0 x6 + -88.999 x0 x7 + 2.397 x0 x8 + 1.094 x1^2 + 1.007 x1 x2 + -6.327 x1 x3 + 3.719 x1 x4 + -446604640816.048 x1 x5 + -0.483 x1 x6 + -4.796 x1 x7 + 2.595 x1 x8 + 2.622 x2^2 + -0.469 x2 x3 + 5.206 x2 x4 + 0.410 x2 x5 + 1.353 x2 x6 + 7.070 x2 x7 + -3.014 x2 x8 + -0.075 x3^2 + -7.505 x3 x4 + 0.639 x3 x5 + 1.038 x3 x6 + 1.961 x3 x7 + -1.030 x3 x8 + -0.106 x4^2 + 5.185 x4 x5 + 3.001 x4 x6 + -9.013 x4 x7 + 18.816 x4 x8 + 1.085 x5^2 + -1.147 x5 x6 + -4.170 x5 x7 + 1.433 x5 x8 + -0.269 x6^2 + 0.346 x6 x7 + 0.426 x7^2 + -0.703 x7 x8 + 0.459 