# Banana Dataset

- [Link to dataset](https://sci2s.ugr.es/keel/dataset.php?cod=182)
- [Link to PMLB](https://epistasislab.github.io/pmlb/profile/banana.html)

Task: binary classification.


In [None]:
import json
from pathlib import Path
import altair as alt
import numpy as np
import lightgbm as lgb
import pandas as pd
from pmlb import fetch_data
from sklearn.model_selection import train_test_split
from pdpilot import PDPilotWidget

In [None]:
def sample(df, n, objective):
    """Return ``n`` rows of ``df``, drawn with a fixed seed.

    When ``objective`` is ``"binary"`` the draw is stratified on the
    ``"target"`` column; for any other objective it is a plain random
    sample.
    """
    if objective != "binary":
        return df.sample(n, random_state=1)

    # train_test_split yields (train, test); only the train part is wanted.
    kept, _ = train_test_split(
        df, train_size=n, random_state=1, stratify=df["target"]
    )
    return kept


def load_dataset(dataset_info, datasets_dir):
    """Fetch a dataset with ``fetch_data`` and split it into features and target.

    Args:
        dataset_info: Mapping with the keys ``"name"``, ``"objective"``,
            ``"exclude_features"`` (list of column names to drop) and
            ``"nominal_features"`` (list of column names).
        datasets_dir: ``Path`` passed to ``fetch_data`` as the local cache
            directory.

    Returns:
        A tuple ``(dataset, objective, df_X, X, y, features,
        nominal_features)`` where ``df_X`` is the feature frame, ``X`` its
        NumPy form, ``y`` the ``"target"`` column as an array, ``features``
        the remaining column names, and ``nominal_features`` the entries of
        ``dataset_info["nominal_features"]`` that are still present.
    """
    dataset = dataset_info["name"]
    objective = dataset_info["objective"]
    exclude_features = dataset_info["exclude_features"]

    df_all = fetch_data(dataset, local_cache_dir=datasets_dir.as_posix())

    # datasets with more than 200,000 rows are down-sampled to that size
    df_reduced = (
        df_all if df_all.shape[0] <= 200_000 else sample(df_all, 200_000, objective)
    )

    df_X = df_reduced.drop(columns=["target"] + exclude_features)
    y = df_reduced["target"].to_numpy()

    # drop columns that only have one unique value
    nunique = df_X.nunique()
    df_X = df_X.drop(columns=nunique[nunique == 1].index)

    features = list(df_X.columns)
    nominal_features = [f for f in dataset_info["nominal_features"] if f in features]

    # convert columns that contain only integer values to an integer dtype
    for feature in features:
        try:
            as_int = df_X[feature].astype(int)
        except (ValueError, TypeError):
            # NaN/inf or non-numeric values cannot be cast to int; such a
            # column is left as it is instead of aborting the whole load.
            continue
        if np.array_equal(df_X[feature], as_int):
            df_X[feature] = as_int

    X = df_X.to_numpy()

    return dataset, objective, df_X, X, y, features, nominal_features

In [None]:
# Dataset shown in this notebook and the result group it belongs to.
dataset_group = "big"
dataset = "banana"

# Look up this dataset's entry in the dataset registry.
datasets_dir = Path(f"../../data/results/{dataset_group}/datasets")
datasets = json.loads(Path("../../data/datasets.json").read_bytes())
dataset_info = [
    entry for entry in datasets[dataset_group] if entry["name"] == dataset
][0]

# Stored artifacts for this dataset: importances table, extra results,
# the PDPilot data file path, and the LightGBM model.
importances = pd.read_csv(
    f"../../data/results/{dataset_group}/importances/{dataset}.csv"
)
stuff = json.loads(
    Path(f"../../data/results/{dataset_group}/stuff/{dataset}.json").read_bytes()
)
pd_data = Path(f"../../data/results/{dataset_group}/pdpilot/{dataset}.json")
booster = lgb.Booster(
    model_file=f"../../data/results/{dataset_group}/models/{dataset}.txt"
)

dataset, objective, df_X, X, y, features, nominal_features = load_dataset(
    dataset_info, datasets_dir
)

# Features and target in one frame so that both are sampled together.
df_Xy = df_X.assign(target=y)

# Use at most 2000 rows for the widget.
if df_Xy.shape[0] > 2000:
    df_Xy_sample = sample(df_Xy, 2000, objective)
else:
    df_Xy_sample = df_Xy

y_pd = df_Xy_sample["target"].to_numpy()
df_pd = df_Xy_sample.drop(columns=["target"])

In [None]:
# Sanity check: the sampled rows must be exactly the rows recorded under
# "pdpilot_indices" in the stored results.
# NOTE(review): presumably these are the rows the PDPilot data file was
# computed on — confirm.
assert list(df_pd.index) == stuff["pdpilot_indices"]

In [None]:
# Build the PDPilot widget from the booster's predict method, the sampled
# feature rows with their labels, and the PDPilot data file.
w = PDPilotWidget(
    predict=booster.predict,
    df=df_pd,
    labels=y_pd,
    pd_data=pd_data,
    seed=56,
    height=650,
)

# Last expression of the cell, so the notebook renders the widget.
w

In [None]:
# Display the feature-importance table read from the importances CSV.
importances

In [None]:
# One bar chart per importance score, repeated down the rows. Bars are
# ordered by descending x value and filled per feature, with no legend.
alt.Chart(importances).mark_bar().encode(
    x=alt.X(alt.repeat("row"), type="quantitative"),
    y=alt.Y("feature", sort="-x"),
    fill=alt.Fill("feature", legend=None),
).repeat(
    row=["score_ice", "score_pdp", "score_perm", "score_shap", "score_lgb"]
)

In [None]:
# Display the "mean_score" entry of the stored "cv_results".
stuff["cv_results"]["mean_score"]