In [7]:
import numpy as np
from typing import Union
import pandas as pd
import streamlit as st
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error, r2_score

In [2]:
def auto_problem_type(y:pd.Series, max_classes:int=20)-> str:
    """Guess whether a target column describes a classification or regression task.

    Parameters
    ----------
    y : pd.Series
        The target column.
    max_classes : int, default 20
        A target that is not label-typed is still treated as a set of class
        labels when it has at most this many distinct non-null values.

    Returns
    -------
    str
        ``"classification"`` or ``"regression"``.
    """
    dtype = y.dtype
    # Text-like and categorical targets are always labels. The dedicated pandas
    # string dtype has to be checked explicitly: it is not equal to "object",
    # so a string target with many distinct values would otherwise fall through
    # to "regression" and later fail when cast to float.
    is_label_dtype = (
        dtype == "object"
        or str(dtype).startswith("category")
        or isinstance(dtype, pd.StringDtype)
    )
    if is_label_dtype:
        return "classification"

    # Low-cardinality targets (e.g. 0/1 flags, small integer codes) are treated
    # as classes; missing values do not count as a class.
    if y.nunique(dropna=True) <= max_classes:
        return "classification"
    return "regression"

In [8]:
def prepare_features(df:pd.DataFrame, target_col:str)->tuple[pd.DataFrame, Union[pd.Series, np.ndarray],str,dict]:
    """Split a frame into model-ready features and an encoded target.

    The target column is removed from the features, the task type is inferred
    with ``auto_problem_type``, and every ``object``/``category`` feature column
    is one-hot encoded with ``pd.get_dummies`` (``dummy_na=False``).

    Parameters
    ----------
    df : pd.DataFrame
        Input data containing both the features and the target.
    target_col : str
        Name of the target column in ``df``.

    Returns
    -------
    tuple
        ``(features, target, problem, encoders)`` where ``features`` is the
        one-hot encoded frame, ``target`` is the label-encoded array for
        classification or a float array for regression, ``problem`` is the
        inferred task name, and ``encoders`` holds the fitted encoder under
        ``"label_encoder"`` for classification (empty dict for regression).
    """
    features = df.drop(columns=[target_col])
    target = df[target_col]
    task = auto_problem_type(target)

    categorical_columns = list(
        features.select_dtypes(include=["object", "category"]).columns
    )
    encoded_features = pd.get_dummies(
        features, columns=categorical_columns, dummy_na=False
    )

    # Regression targets only need a float cast; no encoder to hand back.
    if task != "classification":
        return encoded_features, target.astype(float).values, task, {}

    # Classification targets are mapped to integer codes; the fitted encoder is
    # returned so callers can translate predictions back to the original labels.
    label_encoder = LabelEncoder()
    encoded_target = label_encoder.fit_transform(target)
    return encoded_features, encoded_target, task, {"label_encoder": label_encoder}

In [12]:
def train_xgb(X: pd.DataFrame, y:Union[pd.Series, np.ndarray], problem:str, params:dict):
    """Fit an XGBoost model for the given task and return it.

    Parameters
    ----------
    X : pd.DataFrame
        Training features.
    y : pd.Series or np.ndarray
        Training target.
    problem : str
        ``"classification"`` builds an ``XGBClassifier``; any other value
        builds an ``XGBRegressor``.
    params : dict
        Optional hyper-parameter overrides. Recognised keys: ``n_estimators``,
        ``max_depth``, ``learning_rate``, ``subsample``, ``colsample_bytree``,
        ``random_state``, ``tree_method`` and ``reg_lambda``. Missing keys fall
        back to the per-task defaults below.

    Returns
    -------
    The fitted ``XGBClassifier`` or ``XGBRegressor``.
    """
    if problem=="classification":
        model=xgb.XGBClassifier(
            n_estimators=params.get("n_estimators", 100),
            max_depth=params.get("max_depth",4),
            learning_rate=params.get("learning_rate",0.1),
            # Row sampling and column sampling are independent settings. The
            # "subsample" key used to be routed into colsample_bytree, so row
            # sampling was never applied and "colsample_bytree" was ignored.
            subsample=params.get("subsample",1.0),
            colsample_bytree=params.get("colsample_bytree",1.0),
            random_state=params.get("random_state",42),
            n_jobs=-1,
            tree_method=params.get("tree_method","auto"),
            reg_lambda=params.get("reg_lambda",1.0),
        )
    else:
        model = xgb.XGBRegressor(
            n_estimators=params.get("n_estimators", 200),
            max_depth=params.get("max_depth", 5),
            learning_rate=params.get("learning_rate", 0.07),
            subsample=params.get("subsample", 1.0),
            colsample_bytree=params.get("colsample_bytree", 1.0),
            random_state=params.get("random_state", 42),
            n_jobs=-1,
            tree_method=params.get("tree_method", "auto"),
            reg_lambda=params.get("reg_lambda", 1.0),
        )

    model.fit(X,y)
    # Pin the booster's feature names to the training columns and clear its
    # feature types.
    booster=model.get_booster()
    booster.feature_names=list(X.columns)
    booster.feature_types=None
    return model


In [13]:
def eval_mode(model, X_test, y_test, problem:str):
    """Score a fitted model on held-out data.

    Parameters
    ----------
    model
        Any fitted estimator exposing ``predict(X)``.
    X_test
        Held-out features passed to ``model.predict``.
    y_test
        True target values for ``X_test``.
    problem : str
        ``"classification"`` reports accuracy and weighted F1; any other value
        reports RMSE and R^2.

    Returns
    -------
    dict
        ``{"accuracy", "f1_weighted"}`` for classification, otherwise
        ``{"rmse", "r2"}``.
    """
    y_pred=model.predict(X_test)
    if problem=="classification":
        acc=accuracy_score(y_test,y_pred)
        f1=f1_score(y_test, y_pred, average="weighted")
        return {"accuracy": acc, "f1_weighted": f1}

    # The ``squared=False`` keyword of mean_squared_error was deprecated and
    # later removed from scikit-learn, so take the square root of the plain MSE
    # instead; this works on both old and new releases for a single target.
    rmse=float(np.sqrt(mean_squared_error(y_test, y_pred)))
    r2=r2_score(y_test, y_pred)
    return {"rmse":rmse, "r2": r2}


In [14]:
def get_num_trees(model) -> int:
    """Return how many trees the fitted model's booster contains.

    The count is the number of entries in the booster's ``get_dump()`` output.
    """
    tree_dumps = model.get_booster().get_dump()
    return len(tree_dumps)