In [1]:
# Model agnostic 
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
import pandas as pd
from dft_utils import DataMstr  # run `pip install .` in the home directory if the module does not exist

# Model specific 
from xgboost import XGBRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline

# Data preparation 

In [2]:
# Location of the processed well-data CSV handed to DataMstr.
# NOTE(review): this is an absolute path on one user's machine, so the notebook
# will not run elsewhere as-is — consider a path relative to the repository.
path = "/Users/deanmsweeney/Documents/UM-Google Drive/Classes/ChE 696/project/symbolic-machine-learning-liquidloading/datasets/processed_well_data.csv"
# Column names forwarded to DataMstr as `drop_cols`
# (presumably removed from the dataset before modelling — confirm in dft_utils).
drop_cols = ['Dia', 'Dev(deg)','Area (m2)', 'z','GasDens','LiquidDens', 'P/T','friction_factor', 'critical_film_thickness', 'Test status']
# Notebook-level data object; later cells call its methods (e.g. D.custom_CV).
D = DataMstr(path=path, drop_cols=drop_cols)
# Presumably partitions the loaded data for later evaluation — TODO confirm in dft_utils.
D.split_data()

In [3]:
# XGBoost pipeline: importance-based feature selection followed by regression
def xgboost():
    """Cross-validate an XGBoost feature-selection + regression pipeline.

    Builds a two-step scikit-learn ``Pipeline`` ("feature_sel" -> "model") and
    hands it to the notebook-level ``D`` object for cross-validation.

    Returns
    -------
    Whatever ``D.custom_CV`` returns for this pipeline with ``k_folds=5``
    (the original code names it ``avg_acc``).
    """
    # Regressor hyperparameters; importance_type="gain" makes the reported
    # feature importances split-gain based.
    booster = XGBRegressor(
        objective="reg:squarederror",
        n_estimators=200,
        learning_rate=0.1,
        random_state=42,
        importance_type="gain",
    )

    # The same estimator object is used both as the selector's base estimator
    # and as the final model step.
    steps = [
        (
            "feature_sel",
            # threshold="mean": keep features with importance >= mean importance.
            # prefit=False: the selector is fitted inside the pipeline.
            SelectFromModel(estimator=booster, threshold="mean", prefit=False),
        ),
        ("model", booster),
    ]

    # custom_CV drives the k-fold cross-validation loop over the pipeline.
    return D.custom_CV(model=Pipeline(steps), k_folds=5)

xgboost()


0.9397504456327986

In [None]:

def optimize_sindy_hyperparameters(X_train, y_train, gsflow_train, loading_train, param_grid):
    """Grid-search the SINDy hyperparameters and return the best combination.

    Every combination of ``alpha``, ``threshold``, ``interval``, ``n`` and ``f``
    taken from ``param_grid`` is scored with ``evaluate_sindy``; the combination
    with the strictly highest score is kept (the first one wins on ties).

    Parameters
    ----------
    X_train, y_train, gsflow_train, loading_train : array-like
        Training inputs. Each is converted with ``np.array``; ``y_train`` is
        additionally flattened. They are forwarded unchanged to
        ``evaluate_sindy``, which defines their meaning.
    param_grid : mapping
        Must provide iterables under the keys ``'alpha'``, ``'threshold'``,
        ``'interval'``, ``'n'`` and ``'f'``.

    Returns
    -------
    tuple
        ``(best_params, best_score)``. ``best_params`` is a dict with keys
        ``'alpha'``, ``'threshold'``, ``'interval'``, ``'n'`` and ``'f'``, or
        ``None`` when no combination scored above the initial ``-1`` (which
        includes the case of an empty grid). ``best_score`` is the matching
        score, or ``-1``.

    Raises
    ------
    KeyError
        If ``param_grid`` lacks one of the five required keys.
    """
    from itertools import product

    best_score = -1
    best_params = None

    X_train = np.array(X_train)
    y_train = np.array(y_train).flatten()
    gsflow_train = np.array(gsflow_train)
    loading_train = np.array(loading_train)

    print("Begin training and hyperparameter optimization...")
    # product() visits combinations in the same order as nested loops would
    # (alpha outermost, f innermost).
    combinations = product(
        param_grid['alpha'],
        param_grid['threshold'],
        param_grid['interval'],
        param_grid['n'],
        param_grid['f'],
    )
    for alpha, threshold, interval, n, f in combinations:
        score = evaluate_sindy((alpha, threshold, interval, n, f),
                               X_train, y_train, gsflow_train, loading_train)

        # Compare after *every* evaluation. Previously the check ran only once
        # per `n`, so all but the last `f` value were silently ignored.
        if score > best_score:
            best_score = score
            # Record `f` as well, so the winning configuration can be reproduced.
            best_params = {'alpha': alpha, 'threshold': threshold,
                           'interval': interval, 'n': n, 'f': f}

    return best_params, best_score