In [1]:
# Model agnostic 
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
import pandas as pd
from typing import Optional, List, Callable, Dict, Any, List
from pathlib import Path
from dft_utils import ChiefBldr  # custom model for data handling/model training

# Model specific 
from xgboost import XGBRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from typing import Optional, List 

In [2]:
# Locate the dataset relative to the kernel's current working directory.
# Path.cwd() equals the notebook's folder only when the kernel was started there.
nb_dir = Path.cwd()
project_root = nb_dir.parents[0]  # one level above the working directory
data_path = project_root.joinpath("datasets", "processed_well_data.csv")

# Column names handed to ChiefBldr through its `drop_cols` argument.
drop_cols = [
    'Dia',
    'Dev(deg)',
    'Area (m2)',
    'z',
    'GasDens',
    'LiquidDens',
    'P/T',
    'friction_factor',
    'critical_film_thickness',
    'Test status',
    'Qcr',
    'Gasflowrate',
    'ΔQ',
]

# Project helper for data handling / model training.
# NOTE(review): test_size=0.20 is presumably the held-out fraction - confirm in dft_utils.
D = ChiefBldr(
    path=data_path,
    drop_cols=drop_cols,
    test_size=0.20,
)

In [3]:
# define xgboost pipeline
def xgboost(
        hparams: Dict[str,Any]
) -> Pipeline:
    """Build an unfitted XGBoost regression pipeline with embedded feature selection.

    The pipeline has two steps:

    * ``"feature_sel"`` - a ``SelectFromModel`` wrapping an ``XGBRegressor``;
      it keeps the features whose gain-based importance is at least the mean
      importance (``threshold="mean"``).
    * ``"model"`` - a separate ``XGBRegressor`` built with the same settings.

    The two steps deliberately use independent estimator objects, so that
    configuring or fitting one step can never affect the other.

    Parameters
    ----------
    hparams : dict
        Keyword arguments forwarded to both ``XGBRegressor`` instances
        (e.g. ``n_estimators``, ``learning_rate``, ``max_depth``). Entries
        override the defaults set here (``objective``, ``random_state``,
        ``importance_type``). An empty dict or ``None`` uses the defaults only.

    Returns
    -------
    Pipeline
        The unfitted two-step pipeline described above.
    """
    # Defaults first, caller-supplied values last so they take precedence.
    params: Dict[str, Any] = {
        "objective": "reg:squarederror",
        "random_state": 42,
        "importance_type": "gain",
        **(hparams or {}),
    }

    selector = SelectFromModel(
        estimator=XGBRegressor(**params),  # dedicated instance for selection
        threshold="mean",                  # keep features with importance >= mean importance
        prefit=False,                      # the selector is fitted inside the pipeline
    )

    return Pipeline([
        ("feature_sel", selector),
        ("model",       XGBRegressor(**params)),  # independent final regressor
    ])

# Candidate values for each tuned hyperparameter.
hparam_grid = {
    "n_estimators": [25, 40, 50],
    "learning_rate": [0.01, 0.05, 0.1],
    "max_depth": [10, 15, 50],
}

# Train the model and optimize hyperparameters via grid search
# (delegated to the project helper; 5 folds requested).
trained_model = D.evolv_model(
    build_model=xgboost,
    hparam_grid=hparam_grid,
    k_folds=5,
)

# Boolean support mask from the "feature_sel" step of the returned model,
# then used to index the column labels of D.X.
mask = trained_model.named_steps["feature_sel"].get_support()
selected_features = D.X.columns[mask]

# Header and feature list on separate lines.
print(
    "Features kept by SelectFromModel:",
    selected_features.tolist(),
    sep="\n",
)

Training model and optimizing hyperparameters via k-fold CV...
Done. Best score = 0.7885918003565063
Best hyperparameters: {'n_estimators': 50, 'learning_rate': 0.1, 'max_depth': 15}
Retraining optimized model on full training set
Training set score: 0.9457831325301205
Test set score: 0.8095238095238095
Features kept by SelectFromModel:
['LiquidFlowrate', 'Vsg', 'Vsl', 'Reg', 'd(0,90)', 'd(120,90)']
