In [14]:
# ===============================================
# STEP 0 — Imports, config, and speed knobs
# ===============================================
import warnings, time, numpy as np, pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.pipeline import Pipeline as SkPipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from mlflow.models.signature import infer_signature
from sklearn.metrics import (
    accuracy_score, f1_score, roc_auc_score, confusion_matrix,
    classification_report, precision_recall_curve
)
import joblib

# ---- paths / label (EDIT as needed) ----
CSV_PATH     = "../data/processed/11_biz_merged_clean.csv"
PKL_PATH    = "../data/processed/14_processed_df.pkl"


TARGET_COL   = "avg_stars_2019"
POSITIVE_MIN = 4  # binarize: >= 4 stars => 1

# ---- speed knobs ----
CV_FOLDS             = 5
EN_N_ITER            = 12
RF_N_ITER            = 16
SUBSAMPLE_FOR_TUNING = None   # e.g. 25000 to tune on subset, then refit on full; or None
OHE_MIN_FREQ         = 20     # merge rare levels to reduce feature count
SEARCH_N_JOBS        = -1
VERBOSE              = 2

warnings.filterwarnings("ignore", message="Skipping features without any observed values")

In [15]:
# ===============================================
# STEP 1 — Load data
# ===============================================
df = pd.read_csv(CSV_PATH)
print("Loaded:", df.shape)

Loaded: (36261, 61)


In [16]:
# Load the *fitted* ColumnTransformer you saved earlier
preprocessor = joblib.load(PKL_PATH)

In [17]:
# Configure this week into central ML repository for course
import mlflow
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")
# Set experiment name for this module
mlflow.set_experiment("BDA602 Yelp project")

<Experiment: artifact_location='mlflow-artifacts:/786926549055850120', creation_time=1759981678118, experiment_id='786926549055850120', last_update_time=1759981678118, lifecycle_stage='active', name='BDA602 Yelp project', tags={'mlflow.experimentKind': 'custom_model_development'}>

In [18]:
preprocessor

0,1,2
,transformers,"[('cont', ...), ('ord', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,categories,"[[1, 2, ...], ['quiet', 'average', ...], ...]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,-1
,min_frequency,
,max_categories,

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'missing'
,copy,True
,add_indicator,False
,keep_empty_features,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,False
,copy,True
,add_indicator,False
,keep_empty_features,False


## Coercing

In [19]:
import pandas as pd
import numpy as np

# --- 1. Convert datetime columns ---
datetime_cols = ["first_review_2019", "last_review_2019"]
for col in datetime_cols:
    df[col] = pd.to_datetime(df[col], errors="coerce")

# --- 2. Convert boolean columns (True/False or Yes/No or t/f) ---
bool_cols = [
    "is_open", "attr_ByAppointmentOnly", "attr_BusinessAcceptsCreditCards",
    "attr_BikeParking", "attr_RestaurantsTakeOut", "attr_RestaurantsDelivery",
    "attr_Caters", "attr_WheelchairAccessible", "attr_HappyHour",
    "attr_OutdoorSeating", "attr_HasTV", "attr_RestaurantsReservations",
    "attr_DogsAllowed", "attr_GoodForKids", "attr_RestaurantsTableService",
    "attr_RestaurantsGoodForGroups", "attr_DriveThru", "has_hours_info"
]

for col in bool_cols:
    if col in df.columns:
        df[col] = df[col].astype(str).str.strip().replace(
            {"True": True, "False": False, "Yes": True, "No": False, "None": np.nan, "nan": np.nan}
        )
        df[col] = df[col].astype("boolean")

# --- 3. Convert category columns ---
category_cols = [
    "attr_RestaurantsPriceRange2", "attr_WiFi", "attr_Alcohol",
    "attr_RestaurantsAttire", "attr_NoiseLevel", "attr_Smoking"
]
for col in category_cols:
    if col in df.columns:
        df[col] = df[col].astype("category")

# --- 4. Convert small integer columns to int8 for memory efficiency ---
int8_cols = [
    "cat__Sandwiches", "cat__American (Traditional)", "cat__Pizza",
    "cat__Fast Food", "cat__Breakfast & Brunch", "cat__American (New)",
    "cat__Burgers", "cat__Mexican", "cat__Italian", "cat__Coffee & Tea",
    "cat__Seafood", "cat__Chinese", "cat__Salad", "cat__Chicken Wings",
    "cat__Cafes", "cat__Delis", "cat__Caterers", "cat__Specialty Food",
    "cat__Bakeries", "cat__Desserts"
]
for col in int8_cols:
    if col in df.columns:
        df[col] = df[col].astype("int8")

# --- 5. Convert others explicitly to float if not already ---
float_cols = [
    "latitude", "longitude", "review_count", "review_count_log1p",
    "total_weekly_hours", "days_open", "weekend_hours", "avg_daily_hours",
    "avg_stars_2019", "rl_word_mean", "rl_share_short24"
]
for col in float_cols:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors="coerce").astype("float64")

# --- 6. Optional: compress integers like rev_count_2019 ---
df["rev_count_2019"] = df["rev_count_2019"].astype("int64")


  df[col] = df[col].astype(str).str.strip().replace(
  df[col] = df[col].astype(str).str.strip().replace(


## Define Target and Predictor

In [20]:
# Target
y = df["avg_stars_2019"].astype(float)

# Columns to exclude from predictors
exclude = {
    "business_id", "city", "state",
    "avg_stars_2019", "review_count",
    "rev_count_2019", "first_review_2019", "last_review_2019",
}

# Build X (everything except target + excluded)

feature_cols = [c for c in df.columns if c not in exclude]
X = df[feature_cols].copy()

In [21]:
# Binary classification target
y_cls = (y >= 4.0).astype(int)
y_cls.value_counts(normalize=True).round(3)  # quick class balance check

avg_stars_2019
0    0.614
1    0.386
Name: proportion, dtype: float64

## Split X train and y train

In [22]:
# Sklearn imports for optimization
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

# Partition predictors & response into training & testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y_cls,
    test_size = 0.2,   # Reserve 20% for "hold-out" data, 
    random_state=42
)

X_train.head(5)

Unnamed: 0,latitude,longitude,is_open,review_count_log1p,attr_ByAppointmentOnly,attr_BusinessAcceptsCreditCards,attr_BikeParking,attr_RestaurantsPriceRange2,attr_RestaurantsTakeOut,attr_RestaurantsDelivery,...,cat__Salad,cat__Chicken Wings,cat__Cafes,cat__Delis,cat__Caterers,cat__Specialty Food,cat__Bakeries,cat__Desserts,rl_word_mean,rl_share_short24
8442,38.738936,-90.397281,True,4.574711,,True,False,2.0,True,True,...,0,0,0,0,0,0,0,0,71.375,0.208333
5934,40.209943,-75.225566,True,4.812184,,True,True,2.0,True,True,...,1,0,0,0,0,0,0,0,106.0625,0.09375
31281,39.752035,-75.541795,True,2.079442,,True,False,1.0,True,True,...,0,0,0,0,0,0,0,0,66.0,0.0
18393,53.517787,-113.50945,True,4.025352,,,False,2.0,True,True,...,0,0,0,0,0,0,0,0,145.42857,0.0
21544,30.02008,-90.2506,True,2.397895,,True,False,1.0,True,True,...,0,0,0,0,0,0,0,0,65.85714,0.142857


In [23]:
# ===============================================
# STEP 5 — Elastic Net (fast search) + STEP 5B refit
# ===============================================
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score

pipe_en = SkPipeline([
    ("preprocessor", preprocessor),
    ("clf", LogisticRegression(
        penalty="elasticnet", solver="saga",
        max_iter=2000, tol=1e-3,  # looser for search
        class_weight="balanced", n_jobs=-1,
        random_state=42
    )),
])

param_dist_en = {
    "clf__C": np.logspace(-2, 0, 5).tolist(),   # 0.01 .. 1.0
    "clf__l1_ratio": [0.2, 0.4, 0.6, 0.8],
}


t0 = time.time()
rs_en = RandomizedSearchCV(
    estimator=pipe_en,
    param_distributions=param_dist_en,
    n_iter=EN_N_ITER,
    scoring={"f1": "f1", "roc_auc": "roc_auc"},
    refit="f1",
    cv=5,
    n_jobs=SEARCH_N_JOBS,
    verbose=VERBOSE,
    random_state=42,
    error_score="raise"
)

rs_en.fit(X_train, y_train)

grid_search_cv = rs_en


# ---- Best CV metrics
best_index   = grid_search_cv.best_index_
best_params  = grid_search_cv.best_params_
best_cv_f1   = float(grid_search_cv.best_score_)  # because refit="f1"
best_cv_auc  = float(grid_search_cv.cv_results_["mean_test_roc_auc"][best_index])

print(f"[EN] Done. Best params: {best_params}")
print(f"[EN] Best CV F1:       {best_cv_f1:.3f}")
print(f"[EN] Best CV ROC AUC:  {best_cv_auc:.3f}")


# ---- Refit best on full training set with tighter convergence
pipe_en.set_params(**best_params)
pipe_en.set_params(clf__max_iter=5000, clf__tol=1e-4)
pipe_en.fit(X_train, y_train)
print("[EN] Refit complete.")





Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END .....................clf__C=0.01, clf__l1_ratio=0.2; total time=   0.6s
[CV] END .....................clf__C=0.01, clf__l1_ratio=0.2; total time=   0.6s
[CV] END .....................clf__C=0.01, clf__l1_ratio=0.2; total time=   0.5s
[CV] END .....................clf__C=0.01, clf__l1_ratio=0.2; total time=   0.5s
[CV] END .....................clf__C=0.01, clf__l1_ratio=0.2; total time=   0.5s
[CV] END ......clf__C=0.31622776601683794, clf__l1_ratio=0.8; total time=   0.4s
[CV] END ......................clf__C=1.0, clf__l1_ratio=0.4; total time=   0.9s
[CV] END ......................clf__C=1.0, clf__l1_ratio=0.4; total time=   0.9s
[CV] END ......................clf__C=1.0, clf__l1_ratio=0.4; total time=   0.9s
[CV] END ......................clf__C=1.0, clf__l1_ratio=0.4; total time=   0.8s
[CV] END ......................clf__C=1.0, clf__l1_ratio=0.4; total time=   0.9s
[CV] END ......clf__C=0.31622776601683794, clf__

In [None]:
# ---- Evaluate on test set
y_pred  = pipe_en.predict(X_test)
y_proba = pipe_en.predict_proba(X_test)[:, 1]

metrics = {
    "test_auc": float(roc_auc_score(y_test, y_proba)),
    "test_f1": float(f1_score(y_test, y_pred)),
    "test_accuracy": float(accuracy_score(y_test, y_pred)),
    # useful extras:
    "cv_best_f1": best_cv_f1,
    "cv_best_auc": best_cv_auc,
}

# ---- Log to MLflow in your requested format
with mlflow.start_run(run_name="elasticnet_randomsearch"):
    # Log best params in SAME shape you tuned (here: clf__*)
    mlflow.log_params(grid_search_cv.best_params_)

    # Log metrics with consistent keys
    for k, v in metrics.items():
        mlflow.log_metric(k, v)

    # Log the fitted pipeline as "model"
    signature = infer_signature(X_test, y_pred)  # input -> predicted labels
    mlflow.sklearn.log_model(pipe_en, "model", signature=signature)

print("Best CV AUC:", best_cv_auc)
print("Test metrics:", metrics)
print("Best params:", grid_search_cv.best_params_)
print(f"Test ROC AUC (unseen data): {metrics['test_auc']:.4f}")



🏃 View run elasticnet_randomsearch at: http://127.0.0.1:5000/#/experiments/786926549055850120/runs/c04080ff3fa44e06a6775e6972a9414b
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/786926549055850120
Best CV AUC: 0.7679199505727505
Test metrics: {'test_auc': 0.760826966289228, 'test_f1': 0.632473253618628, 'test_accuracy': 0.6779263752929822, 'cv_best_f1': 0.646851567632533, 'cv_best_auc': 0.7679199505727505}
Best params: {'clf__l1_ratio': 0.6, 'clf__C': 1.0}
Test ROC AUC (unseen data): 0.7608


In [25]:
pipe_en

0,1,2
,steps,"[('preprocessor', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cont', ...), ('ord', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,categories,"[[1, 2, ...], ['quiet', 'average', ...], ...]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,-1
,min_frequency,
,max_categories,

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'missing'
,copy,True
,add_indicator,False
,keep_empty_features,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,False
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,penalty,'elasticnet'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,42
,solver,'saga'
,max_iter,5000


In [26]:
# ===============================================
# STEP 6 — Evaluate Elastic Net (best-F1 threshold)
# ===============================================
def best_f1_threshold(y_true, probas):
    pr, rc, thr = precision_recall_curve(y_true, probas)
    f1s = 2 * (pr[:-1] * rc[:-1]) / (pr[:-1] + rc[:-1] + 1e-12)
    i = int(np.nanargmax(f1s))
    return float(thr[i])

proba_train_en = pipe_en.predict_proba(X_train)[:, 1]
proba_test_en  = pipe_en.predict_proba(X_test)[:, 1]
thr_en = best_f1_threshold(y_train, proba_train_en)

y_pred_en = (proba_test_en >= thr_en).astype(int)
acc_en = accuracy_score(y_test, y_pred_en)
f1_en  = f1_score(y_test, y_pred_en)
auc_en = roc_auc_score(y_test, proba_test_en)
cm_en  = confusion_matrix(y_test, y_pred_en)

print("\n=== ElasticNet-LogReg (test) ===")
print(f"Threshold: {thr_en:.3f}")
print(f"Accuracy:  {acc_en:.3f} | F1: {f1_en:.3f} | ROC-AUC: {auc_en:.3f}")
print("Confusion matrix (TN FP / FN TP):\n", cm_en)
print("\n" + classification_report(y_test, y_pred_en, digits=3))

en_results = {"model":"ElasticNet-LogReg","threshold":thr_en,"accuracy":acc_en,"f1":f1_en,"roc_auc":auc_en}




=== ElasticNet-LogReg (test) ===
Threshold: 0.430
Accuracy:  0.650 | F1: 0.640 | ROC-AUC: 0.761
Confusion matrix (TN FP / FN TP):
 [[2451 2060]
 [ 482 2260]]

              precision    recall  f1-score   support

           0      0.836     0.543     0.659      4511
           1      0.523     0.824     0.640      2742

    accuracy                          0.650      7253
   macro avg      0.679     0.684     0.649      7253
weighted avg      0.718     0.650     0.652      7253

