# CatBoost Model: Drift & Segmentation Analysis
This notebook demonstrates how to train a CatBoost model, analyze drift, and perform segmentation analysis with interactive Plotly visualizations using the `tab-right` package.

In [None]:
# Install dependencies if running in Colab or a fresh environment.
# Uncomment the line below; %pip (rather than !pip) installs into the environment of the running kernel.
# %pip install catboost plotly pandas scikit-learn tab-right

In [None]:
import plotly.io as pio
from catboost import CatBoostClassifier
from sklearn.datasets import fetch_openml
from sklearn.metrics import accuracy_score

from tab_right.plotting import plot_segmentations
from tab_right.segmentations.base import SegmentationStats

# Make "notebook" the default Plotly renderer, so the fig.show() call further down uses it.
pio.renderers.default = "notebook"

## Load Example Dataset
We'll use the UCI Adult dataset (census income) from OpenML.

In [None]:
# Fetch the "adult" dataset (version 2) from OpenML as a pandas DataFrame.
# fetch_openml is imported in the notebook's top import cell.
data = fetch_openml("adult", version=2, as_frame=True)

# Build the modelling frame in one chain so the cell is idempotent and never
# mutates data.frame: every step below returns a new DataFrame.
df = (
    data.frame
    .sample(frac=1, random_state=42)  # Shuffle reproducibly
    .reset_index(drop=True)
    .dropna()  # Drop rows with missing values for simplicity
    # Binary label: 1 when the income class is ">50K", else 0.
    .assign(target=lambda frame: (frame["class"] == ">50K").astype(int))
    .drop(columns=["class"])  # The raw label column is replaced by "target"
)
df.head()

## Split Data: Reference vs. Current
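We'll split the shuffled rows by position (first 70% as reference, last 30% as current). Note that the rows were shuffled at random above, so this is effectively a random split that stands in for a real time-based split; expect little actual drift between the two parts.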

In [None]:
# Positional 70/30 cut: rows before split_idx form the reference set,
# the remaining rows form the current set. Both get a fresh 0..n-1 index.
split_idx = int(0.7 * len(df))
df_ref, df_cur = (
    part.reset_index(drop=True)
    for part in (df.iloc[:split_idx], df.iloc[split_idx:])
)
print(f"Reference: {df_ref.shape}, Current: {df_cur.shape}")

## Train CatBoost Model
We'll train on the reference data and predict on the current data.

In [None]:
# Separate the features from the label for both splits.
X_ref = df_ref.drop(columns=["target"])
y_ref = df_ref["target"]
X_cur = df_cur.drop(columns=["target"])
y_cur = df_cur["target"]

# Categorical inputs are every category/object column of the feature frame.
# Selecting from X_ref (not df_ref) keeps "target" out by construction, and
# select_dtypes preserves column order, so the list is the same on every run
# (the earlier list(set(...)) round-trip gave an arbitrary order).
cat_features = X_ref.select_dtypes(include=["category", "object"]).columns.tolist()

# random_seed is pinned explicitly so a Restart & Run All reproduces the same
# model; verbose=100 logs every 100th iteration instead of every iteration.
model = CatBoostClassifier(cat_features=cat_features, random_seed=42, verbose=100)
model.fit(X_ref, y_ref)

# Score the model trained on the reference split against the current split.
y_pred = model.predict(X_cur)
print("Accuracy on current:", accuracy_score(y_cur, y_pred))

## Segmentation Analysis
Let's segment the predictions by a feature (e.g., 'education-num') and visualize the results.

In [None]:
# Attach the predictions to a copy of the current split; df_cur itself is left untouched.
seg_df = df_cur.assign(pred=y_pred)

# Score accuracy of "pred" against "target" within segments of the
# "education-num" feature, which is treated as non-categorical here.
seg = SegmentationStats(
    seg_df,
    label_col="target",
    pred_col="pred",
    feature="education-num",
    metric=accuracy_score,
    is_categorical=False,
)
seg_result = seg.run(bins=8)
seg_result

### Plot Segmentation Results

In [None]:
# Turn the segmentation result computed above into a figure and display it.
# The figure is kept in `fig` so it can be inspected or re-shown afterwards.
fig = plot_segmentations(seg_result)
fig.show()