## Import Library

In [32]:
#!mlflow

In [33]:
# Data processing, JSON-handling, & visualization libraries
import joblib
import pandas as pd
import json
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns


# Sklearn libraries, preprocessing steps, & decision tree model
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer

# Manual Grid Search
from sklearn.model_selection import (
    train_test_split,
    ParameterGrid
)

# XGBoost Classifier Algorithm
from xgboost import XGBClassifier
# MLflow Library Imports and OS Directory Handling
import os
from mlflow import (
    set_tracking_uri, get_tracking_uri, set_experiment, start_run, 
    log_params, log_metric
)
from mlflow.models.signature import infer_signature
import mlflow.sklearn

# Metric to track and log for MLflow
from sklearn.metrics import accuracy_score



In [34]:
import pandas as pd, json

# Locations of the artifacts this notebook consumes (relative to the notebook)
PREPROCESSOR_PATH = "../data/processed/14_processed_df.pkl"
BUSINESS_CSV_PATH = "../data/processed/11_biz_merged_clean.csv"

# ColumnTransformer saved earlier (described as already fitted by its author).
# NOTE(review): joblib.load unpickles the file, so only load trusted artifacts.
preprocessor = joblib.load(PREPROCESSOR_PATH)

# Table from which the target and the feature matrix are built further down
df = pd.read_csv(BUSINESS_CSV_PATH)

In [35]:
# Display the loaded ColumnTransformer so its sub-transformers and settings can be inspected
preprocessor

0,1,2
,transformers,"[('cont', ...), ('ord', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,categories,"[[1, 2, ...], ['quiet', 'average', ...], ...]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,-1
,min_frequency,
,max_categories,

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'missing'
,copy,True
,add_indicator,False
,keep_empty_features,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,False
,copy,True
,add_indicator,False
,keep_empty_features,False


Ordinal encoding (category order → integer code):
PriceRange2: [1,2,3,4] → [0,1,2,3]
NoiseLevel: ["quiet","average","loud","very_loud"] → [0,1,2,3]
Attire: ["casual","dressy","formal"] → [0,1,2]

In [36]:
# Preview the first rows of the loaded table
df.head()

Unnamed: 0,business_id,city,state,latitude,longitude,review_count,is_open,review_count_log1p,attr_ByAppointmentOnly,attr_BusinessAcceptsCreditCards,...,cat__Caterers,cat__Specialty Food,cat__Bakeries,cat__Desserts,rev_count_2019,avg_stars_2019,first_review_2019,last_review_2019,rl_word_mean,rl_share_short24
0,MTSW4McQd7CbVtyjqoe9mw,Philadelphia,PA,39.955505,-75.155564,80.0,True,4.394449,False,False,...,0,0,1,0,20,4.55,2019-03-12 17:04:09,2021-11-01 18:22:07,81.45,0.05
1,CF33F8-E6oudUQ46HnavjQ,Ashland City,TN,36.269593,-87.058943,6.0,True,1.94591,False,True,...,0,0,0,0,3,1.333333,2020-06-26 19:22:36,2021-03-06 07:18:00,70.0,0.0
2,bBDDEgkFA1Otx9Lfe7BZUQ,Nashville,TN,36.208102,-86.76817,10.0,True,2.397895,False,True,...,0,0,0,0,5,1.8,2019-01-05 01:28:55,2021-04-15 19:16:33,111.2,0.0
3,eEOYSgkmpB90uNA7lDOMRA,Tampa Bay,FL,27.955269,-82.45632,10.0,True,2.397895,,,...,0,0,0,0,8,4.25,2019-01-16 18:22:34,2022-01-03 01:18:29,91.875,0.0
4,il_Ro8jwPlHresjw9EGmBg,Indianapolis,IN,39.637133,-86.127217,28.0,True,3.367296,,True,...,0,0,0,0,12,2.25,2019-01-01 19:58:17,2021-04-22 13:58:42,97.833336,0.0


In [37]:
# Column dtypes and non-null counts as read from the CSV (before any type conversion)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36261 entries, 0 to 36260
Data columns (total 61 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   business_id                      36261 non-null  object 
 1   city                             36261 non-null  object 
 2   state                            36261 non-null  object 
 3   latitude                         36261 non-null  float64
 4   longitude                        36261 non-null  float64
 5   review_count                     36261 non-null  float64
 6   is_open                          36261 non-null  bool   
 7   review_count_log1p               36261 non-null  float64
 8   attr_ByAppointmentOnly           3139 non-null   object 
 9   attr_BusinessAcceptsCreditCards  31372 non-null  object 
 10  attr_BikeParking                 26853 non-null  object 
 11  attr_RestaurantsPriceRange2      29672 non-null  float64
 12  attr_RestaurantsTa

In [38]:
import pandas as pd
import numpy as np

# Normalise column dtypes of `df` in place.
# Every step is written so that re-running this cell on the already-converted
# frame gives the same result.

# --- 1. Convert datetime columns (unparseable values become NaT) ---
datetime_cols = ["first_review_2019", "last_review_2019"]
for col in datetime_cols:
    df[col] = pd.to_datetime(df[col], errors="coerce")

# --- 2. Convert boolean columns (True/False or Yes/No) to nullable "boolean" ---
bool_cols = [
    "is_open", "attr_ByAppointmentOnly", "attr_BusinessAcceptsCreditCards",
    "attr_BikeParking", "attr_RestaurantsTakeOut", "attr_RestaurantsDelivery",
    "attr_Caters", "attr_WheelchairAccessible", "attr_HappyHour",
    "attr_OutdoorSeating", "attr_HasTV", "attr_RestaurantsReservations",
    "attr_DogsAllowed", "attr_GoodForKids", "attr_RestaurantsTableService",
    "attr_RestaurantsGoodForGroups", "attr_DriveThru", "has_hours_info"
]

# Text tokens that carry a truth value.
BOOL_TOKENS = {"True": True, "False": False, "Yes": True, "No": False}
# String forms of a missing value after astype(str): None, float NaN and pd.NA.
# "<NA>" is what an already-converted "boolean" column produces, so including
# it keeps this cell re-runnable.
MISSING_TOKENS = {"None", "nan", "<NA>"}

for col in bool_cols:
    if col in df.columns:
        tokens = df[col].astype(str).str.strip()

        # Fail loudly, naming the column, instead of letting astype("boolean")
        # raise a generic error on a leftover string.
        unexpected = set(tokens.unique()) - set(BOOL_TOKENS) - MISSING_TOKENS
        if unexpected:
            raise ValueError(
                f"Unexpected values in boolean column {col!r}: {sorted(unexpected)}"
            )

        # .map leaves every token that is not a key (the missing markers) as
        # NaN, which astype("boolean") turns into <NA>. Unlike .replace with a
        # mixed-type dict, this does not trigger pandas' downcasting warning.
        df[col] = tokens.map(BOOL_TOKENS).astype("boolean")

# --- 3. Convert category columns ---
category_cols = [
    "attr_RestaurantsPriceRange2", "attr_WiFi", "attr_Alcohol",
    "attr_RestaurantsAttire", "attr_NoiseLevel", "attr_Smoking"
]
for col in category_cols:
    if col in df.columns:
        df[col] = df[col].astype("category")

# --- 4. Convert small integer columns to int8 for memory efficiency ---
int8_cols = [
    "cat__Sandwiches", "cat__American (Traditional)", "cat__Pizza",
    "cat__Fast Food", "cat__Breakfast & Brunch", "cat__American (New)",
    "cat__Burgers", "cat__Mexican", "cat__Italian", "cat__Coffee & Tea",
    "cat__Seafood", "cat__Chinese", "cat__Salad", "cat__Chicken Wings",
    "cat__Cafes", "cat__Delis", "cat__Caterers", "cat__Specialty Food",
    "cat__Bakeries", "cat__Desserts"
]
for col in int8_cols:
    if col in df.columns:
        df[col] = df[col].astype("int8")

# --- 5. Convert others explicitly to float if not already ---
float_cols = [
    "latitude", "longitude", "review_count", "review_count_log1p",
    "total_weekly_hours", "days_open", "weekend_hours", "avg_daily_hours",
    "avg_stars_2019", "rl_word_mean", "rl_share_short24"
]
for col in float_cols:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors="coerce").astype("float64")

# --- 6. Make rev_count_2019 an explicit int64 (no downcast is applied here) ---
df["rev_count_2019"] = df["rev_count_2019"].astype("int64")




  df[col] = df[col].astype(str).str.strip().replace(
  df[col] = df[col].astype(str).str.strip().replace(


In [39]:
# Re-inspect dtypes and non-null counts now that the columns have been converted
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36261 entries, 0 to 36260
Data columns (total 61 columns):
 #   Column                           Non-Null Count  Dtype         
---  ------                           --------------  -----         
 0   business_id                      36261 non-null  object        
 1   city                             36261 non-null  object        
 2   state                            36261 non-null  object        
 3   latitude                         36261 non-null  float64       
 4   longitude                        36261 non-null  float64       
 5   review_count                     36261 non-null  float64       
 6   is_open                          36261 non-null  boolean       
 7   review_count_log1p               36261 non-null  float64       
 8   attr_ByAppointmentOnly           3139 non-null   boolean       
 9   attr_BusinessAcceptsCreditCards  31372 non-null  boolean       
 10  attr_BikeParking                 26853 non-null  boolean  

In [40]:
# Continuous target: the avg_stars_2019 column as float
y = df["avg_stars_2019"].astype(float)

# Columns that must not be used as predictors (the target itself plus the
# identifier, location-label, review-count and review-date columns listed here)
exclude = {
    "business_id", "city", "state",
    "avg_stars_2019", "review_count",
    "rev_count_2019", "first_review_2019", "last_review_2019",
}

# Feature matrix: every non-excluded column, in the frame's original order
is_feature = ~df.columns.isin(exclude)
feature_cols = df.columns[is_feature].tolist()
X = df.loc[:, feature_cols].copy()



In [41]:
#!pip install xgboost

## Define Target
Make a binary target (≥4★ = 1, else 0)

In [42]:
# 1 when the star rating is at least 4.0, otherwise 0
y_cls = y.ge(4.0).astype(int)

# Class proportions, rounded to three decimals, to gauge the imbalance
y_cls.value_counts(normalize=True).round(3)

avg_stars_2019
0    0.614
1    0.386
Name: proportion, dtype: float64

In [45]:
# Sklearn imports for optimization
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible
split_parts = train_test_split(X, y_cls, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Peek at the training predictors
X_train.head(5)

Unnamed: 0,latitude,longitude,is_open,review_count_log1p,attr_ByAppointmentOnly,attr_BusinessAcceptsCreditCards,attr_BikeParking,attr_RestaurantsPriceRange2,attr_RestaurantsTakeOut,attr_RestaurantsDelivery,...,cat__Salad,cat__Chicken Wings,cat__Cafes,cat__Delis,cat__Caterers,cat__Specialty Food,cat__Bakeries,cat__Desserts,rl_word_mean,rl_share_short24
8442,38.738936,-90.397281,True,4.574711,,True,False,2.0,True,True,...,0,0,0,0,0,0,0,0,71.375,0.208333
5934,40.209943,-75.225566,True,4.812184,,True,True,2.0,True,True,...,1,0,0,0,0,0,0,0,106.0625,0.09375
31281,39.752035,-75.541795,True,2.079442,,True,False,1.0,True,True,...,0,0,0,0,0,0,0,0,66.0,0.0
18393,53.517787,-113.50945,True,4.025352,,,False,2.0,True,True,...,0,0,0,0,0,0,0,0,145.42857,0.0
21544,30.02008,-90.2506,True,2.397895,,True,False,1.0,True,True,...,0,0,0,0,0,0,0,0,65.85714,0.142857


## Configure MLflow Tracking URI

In [46]:
# Send all MLflow runs from this notebook to the tracking server at this address
TRACKING_URI = "http://127.0.0.1:5000"
set_tracking_uri(TRACKING_URI)

In [47]:
# Group every run logged below under this experiment
mlflow.set_experiment(experiment_name="BDA602 - yelp project - XGBoost")

<Experiment: artifact_location='mlflow-artifacts:/474447315718635531', creation_time=1759931587428, experiment_id='474447315718635531', last_update_time=1759931587428, lifecycle_stage='active', name='BDA602 - yelp project - XGBoost', tags={'mlflow.experimentKind': 'custom_model_development'}>

In [48]:
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, train_test_split
from scipy.stats import loguniform, randint, uniform
from xgboost import XGBClassifier
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd


## Randomized Hyperparameter Search (RandomizedSearchCV) with MLflow Tracking

In [49]:
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, train_test_split
from scipy.stats import loguniform, randint, uniform
from xgboost import XGBClassifier
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd

In [None]:


from mlflow.models.signature import infer_signature
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score

# Base estimator: every hyperparameter not listed in `param_distributions`
# keeps the value set here.
# NOTE(review): both the classifier and the search use n_jobs=-1; nested
# parallelism can oversubscribe CPU cores -- confirm this is intended.
base_xgb = XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    tree_method="hist",      # or "gpu_hist" if you have a GPU
    n_jobs=-1,
    random_state =42
)

# Preprocessing and model in one estimator, so the search fits the
# preprocessor inside every CV fold.
pipe = Pipeline(steps=[
    ("preprocessor", preprocessor),     # your saved ColumnTransformer
    ("classifier", base_xgb)
])

# Randomized search over sensible ranges (sampled, not exhaustive)
param_distributions = {
    "classifier__n_estimators": randint(400, 1400),
    "classifier__learning_rate": loguniform(1e-2, 3e-1),  # ~0.01–0.3
    "classifier__max_depth": randint(3, 9),               # 3–8
    "classifier__min_child_weight": randint(1, 9),
    "classifier__subsample": uniform(0.6, 0.4),           # 0.6–1.0
    "classifier__colsample_bytree": uniform(0.6, 0.4),
    "classifier__reg_alpha": loguniform(1e-4, 5),
    "classifier__reg_lambda": loguniform(1e-1, 10),
    # "classifier__max_bin": randint(256, 513),           # only useful with hist; optional
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

search = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_distributions,
    n_iter=80,                # 50–100 is a good first pass
    scoring="roc_auc",        # prefer AUC over accuracy for imbalance
    cv=cv,
    n_jobs=-1,
    verbose=1,
    random_state=42,
    refit=True                # refit on full training set with best params
)

with mlflow.start_run(run_name="xgb_random_search"):
    search.fit(X_train, y_train)

    # Evaluate once on the untouched test set
    y_proba = search.predict_proba(X_test)[:, 1]
    y_pred  = (y_proba >= 0.5).astype(int)

    metrics = {
        "test_auc": roc_auc_score(y_test, y_proba),
        "test_f1": f1_score(y_test, y_pred),
        "test_accuracy": accuracy_score(y_test, y_pred),
    }

    # Cross-validated AUC of the winning candidate, logged under the same
    # metric names the grid-search runs use so the runs can be compared
    # side by side in MLflow.
    cv_metrics = {
        "cv_auc_mean": float(search.best_score_),
        "cv_auc_std": float(search.cv_results_["std_test_score"][search.best_index_]),
    }

    # Log best params + metrics + model
    mlflow.log_params(search.best_params_)
    for k, v in {**cv_metrics, **metrics}.items():
        mlflow.log_metric(k, float(v))

    # Log model with signature.
    # The signature describes the whole pipeline: its input is the raw
    # (not yet preprocessed) feature frame, its output the 0/1 class labels.
    signature = infer_signature(X_test, y_pred)
    mlflow.sklearn.log_model(search.best_estimator_, "model", signature=signature)

print("Best CV AUC:", search.best_score_)
print("Test metrics:", metrics)
print("Best params:", search.best_params_)
print(f"Test ROC AUC (unseen data): {metrics['test_auc']:.4f}")

Fitting 5 folds for each of 80 candidates, totalling 400 fits




🏃 View run xgb_random_search at: http://127.0.0.1:5000/#/experiments/474447315718635531/runs/3dc08c9062d648e583a12f4262456a44
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/474447315718635531
Best CV AUC: 0.8049220522922074
Test metrics: {'test_auc': 0.7915561296715169, 'test_f1': 0.6513136555303276, 'test_accuracy': 0.7035709361643457}
Best params: {'classifier__colsample_bytree': 0.6826737439506981, 'classifier__learning_rate': 0.025390649134098428, 'classifier__max_depth': 6, 'classifier__min_child_weight': 7, 'classifier__n_estimators': 701, 'classifier__reg_alpha': 0.00012179341618141621, 'classifier__reg_lambda': 0.1544848592475234, 'classifier__subsample': 0.8732027093665427}
Test ROC AUC (unseen data): 0.7916


## XGBoost Grid Search

In [51]:
# XGBoost grid search (same reporting format as the randomized search).
# For every grid point: 5-fold CV on the training set, one refit on the full
# training set, one evaluation on the test set, and one nested MLflow run.
# The winner is chosen by mean CV AUC only; test metrics are for reporting.

import copy
import time

import numpy as np, pandas as pd
import mlflow, mlflow.sklearn
from mlflow import start_run, log_params, log_metric
from mlflow.models.signature import infer_signature
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score
from sklearn.model_selection import StratifiedKFold, cross_validate, ParameterGrid

# Grid (use classifier__* because we're tuning inside a Pipeline)
param_grid = {
    "classifier__n_estimators": [50, 200, 800],
    "classifier__max_depth": [3, 10],
    "classifier__learning_rate": [0.01, 0.1, 0.5],
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

all_rows = []
best_cv_auc = -np.inf
best_cfg = None
best_pipe = None

# Settings that are not part of the grid; read once because they are the same
# for every candidate.
fixed_xgb_params = base_xgb.get_xgb_params()

with start_run(run_name="xgb_grid_search"):
    for cfg in ParameterGrid(param_grid):
        t0 = time.time()

        # `pipe` is reused (and mutated) for every grid point. Anything that
        # must outlive this iteration has to be copied -- see best_pipe below.
        pipe.set_params(**cfg)

        # ----- CV on TRAIN only -----
        scores = cross_validate(
            pipe, X_train, y_train,
            scoring={"auc": "roc_auc", "acc": "accuracy", "f1": "f1"},
            cv=cv, n_jobs=-1, return_estimator=False
        )

        row = {
            **cfg,
            "cv_auc_mean": float(np.mean(scores["test_auc"])),
            "cv_auc_std":  float(np.std(scores["test_auc"])),
            "cv_acc_mean": float(np.mean(scores["test_acc"])),
            "cv_f1_mean":  float(np.mean(scores["test_f1"])),
            # Wall-clock time of the cross-validation step for this config
            "fit_seconds": time.time() - t0,
        }

        # Fit on full TRAIN, then evaluate once on TEST
        pipe.fit(X_train, y_train)
        y_proba = pipe.predict_proba(X_test)[:, 1]
        y_pred  = (y_proba >= 0.5).astype(int)

        row.update({
            "test_auc": roc_auc_score(y_test, y_proba),
            "test_accuracy": accuracy_score(y_test, y_pred),
            "test_f1": f1_score(y_test, y_pred),
        })
        all_rows.append(row)

        # ---- MLflow child run per config ----
        # Make a compact name that shows the region
        d = cfg["classifier__max_depth"]
        eta = cfg["classifier__learning_rate"]
        trees = cfg["classifier__n_estimators"]
        with start_run(run_name=f"grid_cfg:d{d}_eta{eta}_trees{trees}", nested=True):
            # Log only the 3 tuned params (clean) + fixed base for reproducibility
            log_params({
                "classifier__n_estimators": trees,
                "classifier__max_depth": d,
                "classifier__learning_rate": eta,
            })
            mlflow.log_params({
                "fixed__objective": fixed_xgb_params["objective"],
                "fixed__eval_metric": fixed_xgb_params["eval_metric"],
                "fixed__tree_method": fixed_xgb_params["tree_method"],
                "fixed__scale_pos_weight": fixed_xgb_params.get("scale_pos_weight", None),
                "fixed__random_state": 42,
            })

            # CV metrics
            log_metric("cv_auc_mean", row["cv_auc_mean"])
            log_metric("cv_auc_std",  row["cv_auc_std"])
            log_metric("cv_accuracy_mean", row["cv_acc_mean"])
            log_metric("cv_f1_mean",  row["cv_f1_mean"])
            log_metric("fit_seconds", row["fit_seconds"])

            # Test metrics
            log_metric("test_auc", row["test_auc"])
            log_metric("test_accuracy", row["test_accuracy"])
            log_metric("test_f1", row["test_f1"])

            # Log candidate model (pipeline)
            signature = infer_signature(X_test, y_pred)
            mlflow.sklearn.log_model(pipe, "candidate_model", signature=signature)

        # Track best by CV AUC
        if row["cv_auc_mean"] > best_cv_auc:
            best_cv_auc = row["cv_auc_mean"]
            best_cfg = cfg.copy()
            # Snapshot the fitted pipeline. Pipeline(pipe.steps) would share
            # the very same estimator objects with `pipe`, so the next
            # iteration's set_params()/fit() would silently overwrite the
            # "best" model with a later configuration.
            best_pipe = copy.deepcopy(pipe)

# --- Summary table (sorted by CV AUC) ---
df_results = pd.DataFrame(all_rows).sort_values("cv_auc_mean", ascending=False).reset_index(drop=True)
print("\n=== Grid Search Results (sorted by CV AUC) ===")
print(df_results[[
    "classifier__n_estimators", "classifier__max_depth", "classifier__learning_rate",
    "cv_auc_mean", "cv_acc_mean", "cv_f1_mean",
    "test_auc", "test_accuracy", "test_f1"
]].to_string(index=False))

# --- Final metrics for the best config ---
y_proba_best = best_pipe.predict_proba(X_test)[:, 1]
y_pred_best  = (y_proba_best >= 0.5).astype(int)
final_test_auc = roc_auc_score(y_test, y_proba_best)
final_test_acc = accuracy_score(y_test, y_pred_best)
final_test_f1  = f1_score(y_test, y_pred_best)

print("\n=== Best Configuration (by CV AUC) ===")
print(best_cfg)
print("\n=== Final Test Metrics (Best Config) ===")
print(f"Test ROC AUC (unseen data): {final_test_auc:.4f}")
print(f"Test Accuracy:              {final_test_acc:.4f}")
print(f"Test F1-score:              {final_test_f1:.4f}")





🏃 View run grid_cfg:d3_eta0.01_trees50 at: http://127.0.0.1:5000/#/experiments/474447315718635531/runs/e960cdd9451c4ae5b92f2af1404a7171
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/474447315718635531




🏃 View run grid_cfg:d3_eta0.01_trees200 at: http://127.0.0.1:5000/#/experiments/474447315718635531/runs/2c5c5978a3bd46cf9a164a5bb1755155
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/474447315718635531




🏃 View run grid_cfg:d3_eta0.01_trees800 at: http://127.0.0.1:5000/#/experiments/474447315718635531/runs/33186b8adeea4fe9be47351f6c6c361c
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/474447315718635531




🏃 View run grid_cfg:d10_eta0.01_trees50 at: http://127.0.0.1:5000/#/experiments/474447315718635531/runs/cde51e12f2e14944bc469ac28e4d72ff
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/474447315718635531




🏃 View run grid_cfg:d10_eta0.01_trees200 at: http://127.0.0.1:5000/#/experiments/474447315718635531/runs/bd99aacae481479bae28b4dd034bf2b5
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/474447315718635531




🏃 View run grid_cfg:d10_eta0.01_trees800 at: http://127.0.0.1:5000/#/experiments/474447315718635531/runs/c675d0ca49d84b959caee7743ef3f7bb
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/474447315718635531




🏃 View run grid_cfg:d3_eta0.1_trees50 at: http://127.0.0.1:5000/#/experiments/474447315718635531/runs/d939e9b5085140ca88c20920610d93eb
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/474447315718635531




🏃 View run grid_cfg:d3_eta0.1_trees200 at: http://127.0.0.1:5000/#/experiments/474447315718635531/runs/32aaa85b0dc24025b0861c9add221c5a
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/474447315718635531




🏃 View run grid_cfg:d3_eta0.1_trees800 at: http://127.0.0.1:5000/#/experiments/474447315718635531/runs/e369306272404dac9d814f738ab45513
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/474447315718635531




🏃 View run grid_cfg:d10_eta0.1_trees50 at: http://127.0.0.1:5000/#/experiments/474447315718635531/runs/f8e2c198b6bc4eab95669ff17da34b46
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/474447315718635531




🏃 View run grid_cfg:d10_eta0.1_trees200 at: http://127.0.0.1:5000/#/experiments/474447315718635531/runs/ea0c68b4a27c4fc19a43dc2a6679f40a
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/474447315718635531




🏃 View run grid_cfg:d10_eta0.1_trees800 at: http://127.0.0.1:5000/#/experiments/474447315718635531/runs/eb8b107403284d20b32cdeeb836d1c42
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/474447315718635531




🏃 View run grid_cfg:d3_eta0.5_trees50 at: http://127.0.0.1:5000/#/experiments/474447315718635531/runs/c90175a4746f47ecab92b8cd00c9d49f
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/474447315718635531




🏃 View run grid_cfg:d3_eta0.5_trees200 at: http://127.0.0.1:5000/#/experiments/474447315718635531/runs/b181bbc01d42499087e573f1814ae20c
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/474447315718635531




🏃 View run grid_cfg:d3_eta0.5_trees800 at: http://127.0.0.1:5000/#/experiments/474447315718635531/runs/e8d73b9bc803478481b832d857a7814f
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/474447315718635531




🏃 View run grid_cfg:d10_eta0.5_trees50 at: http://127.0.0.1:5000/#/experiments/474447315718635531/runs/778bfa2c89f2424792c4dbbbf0befd97
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/474447315718635531




🏃 View run grid_cfg:d10_eta0.5_trees200 at: http://127.0.0.1:5000/#/experiments/474447315718635531/runs/f58347f5c0fc4866aaedb8bb8def4fb9
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/474447315718635531




🏃 View run grid_cfg:d10_eta0.5_trees800 at: http://127.0.0.1:5000/#/experiments/474447315718635531/runs/2ac37a1fdd6f413997170b3eaace3e4f
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/474447315718635531
🏃 View run xgb_grid_search at: http://127.0.0.1:5000/#/experiments/474447315718635531/runs/3507b7cb143b4e0391583a239ab5266d
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/474447315718635531

=== Grid Search Results (sorted by CV AUC) ===
 classifier__n_estimators  classifier__max_depth  classifier__learning_rate  cv_auc_mean  cv_acc_mean  cv_f1_mean  test_auc  test_accuracy  test_f1
                      800                      3                       0.10     0.800896     0.722076    0.675130  0.789433       0.706604 0.653646
                      200                      3                       0.10     0.795598     0.711666    0.670084  0.785089       0.698470 0.651918
                      800                     10                       0.01     0.795550     

In [53]:
# Display the pipeline (preprocessor + classifier) and its current parameter values
pipe

0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cont', ...), ('ord', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,categories,"[[1, 2, ...], ['quiet', 'average', ...], ...]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,-1
,min_frequency,
,max_categories,

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'missing'
,copy,True
,add_indicator,False
,keep_empty_features,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,False
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False
