In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
from sklearn.model_selection import train_test_split
from pysindy import SINDy
from sklearn.preprocessing import StandardScaler
from pysindy.optimizers import STLSQ
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold 
from pysindy.feature_library import PolynomialLibrary, FourierLibrary, GeneralizedLibrary

# Model agnostic 
from typing import Optional, List, Callable, Dict, Any, List
from pathlib import Path
from itertools import islice
from dft_utils import ChiefBldr  # custom model for data handling/model trianing

# Model specific 
from typing import Optional, List 

In [2]:
# Locate the dataset relative to the current working directory.
# Path.cwd() is the notebook's folder only if the kernel was started there.
nb_dir = Path.cwd()
project_root = nb_dir.parents[0]  # one level above the working directory
data_path = project_root.joinpath("datasets", "processed_well_data.csv")

# Column names handed to ChiefBldr through its `includ_cols` argument.
includ_cols = [
    'Dia',
    'Dev(deg)',
    'Area (m2)',
    'z',
    'GasDens',
    'LiquidDens',
    'P/T',
    'friction_factor',
    'critical_film_thickness',
]

# Project helper from dft_utils; 20% of the data is requested as the test split.
D = ChiefBldr(
    path=data_path,
    includ_cols=includ_cols,
    test_size=0.20,
)

In [3]:
def sindy(
        hparams: Dict[str, Any]
):
    """Build an unfitted SINDy model from a flat hyperparameter dict.

    Entries of ``hparams`` are routed to the component that consumes them by
    key name, so the result does not depend on the dict's insertion order:

    * ``alpha``, ``threshold`` -> ``STLSQ`` optimizer
    * ``degree``               -> ``PolynomialLibrary``
    * ``n_frequencies``        -> ``FourierLibrary``

    A key that is absent is simply not passed on, so the corresponding
    constructor's own default applies.

    Parameters
    ----------
    hparams : Dict[str, Any]
        One hyperparameter combination, keyed by the names listed above.

    Returns
    -------
    SINDy
        Model configured with the STLSQ optimizer and a GeneralizedLibrary
        made of the polynomial and Fourier libraries (in that order).
        The model is returned without being fitted.

    Raises
    ------
    ValueError
        If ``hparams`` contains a key that none of the components consumes.
    """
    optimizer_keys = ('alpha', 'threshold')
    poly_keys = ('degree',)
    fourier_keys = ('n_frequencies',)

    # Fail loudly on unexpected keys instead of silently dropping them or
    # forwarding them to the wrong constructor.
    unknown = set(hparams) - set(optimizer_keys + poly_keys + fourier_keys)
    if unknown:
        raise ValueError(
            f"Unsupported hyperparameter(s) for sindy: {sorted(unknown, key=str)}"
        )

    # Partition by key name rather than by position in the dict.
    hparams_opt = {k: hparams[k] for k in optimizer_keys if k in hparams}
    hparams_poly = {k: hparams[k] for k in poly_keys if k in hparams}
    hparams_fourier = {k: hparams[k] for k in fourier_keys if k in hparams}

    # Define optimizer for SINDy
    optimizer = STLSQ(
        max_iter=10000,
        normalize_columns=True,
        **hparams_opt,
    )

    # Specify feature library: polynomial terms combined with Fourier terms
    poly_library = PolynomialLibrary(**hparams_poly)
    fourier_library = FourierLibrary(**hparams_fourier)
    lib = GeneralizedLibrary([poly_library, fourier_library])

    model = SINDy(optimizer=optimizer, feature_library=lib)

    return model

# Hyperparameter search space. `sindy` forwards alpha/threshold to STLSQ,
# degree to PolynomialLibrary and n_frequencies to FourierLibrary.
# 10 x 10 x 4 x 4 = 1600 combinations if the grid is searched exhaustively.
# NOTE(review): in the recorded run the best alpha and threshold both sit on
# the upper edge of their ranges — consider widening the grid.
hparam_grid = dict(
    alpha=np.logspace(-4, 0.25, num=10),
    threshold=np.logspace(-4, -1, num=10),
    degree=[1, 2, 3, 4],
    n_frequencies=[1, 2, 3, 4],
)

# Fit the model, tuning hyperparameters over the grid with 5 folds.
trained_model = D.evolv_model(
    build_model=sindy,
    hparam_grid=hparam_grid,
    k_folds=5,
)


Training model and optimizing hyperparameters via k-fold CV...
Done. Best score = 0.7463458110516934
Best hyperparameters: {'alpha': 1.7782794100389228, 'threshold': 0.1, 'degree': 2, 'n_frequencies': 2}
Retraining optimized model on full training set
Training set score: 0.7650602409638554
Test set score: 0.6666666666666666
