# XGBoost and Ordinal Encoding

In [1]:
import pathlib
import pprint
import warnings

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn import metrics, preprocessing

from ml_misc import feature_engineering
from ml_misc import run_model

In [2]:
def run_ordinal(fold_id: int) -> float:
    df = pd.read_csv("../data/adult_5_folds.csv")

    num_vars = [
        "Unnamed: 0",
        "age",
    ]
    df = df.drop(num_vars, axis=1)

    target_mapping = {
        "<=50K": 0,
        ">50K": 1,
    }
    df.loc[:, "income"] = df.income.map(target_mapping)
    features = [
        col for col in df.columns if col not in ("fold_id", "income")
    ]

    for feature in features:
        df.loc[:, feature] = df[feature].astype(str).fillna("NONE")

    for feature in features:
        enc = preprocessing.OrdinalEncoder()
        enc.fit(df.loc[:, feature].values.reshape(-1, 1))
        df.loc[:, feature] = enc.transform(df.loc[:, feature].values.reshape(-1, 1))

    df_train = df[df.fold_id != fold_id].reset_index(drop=True)
    df_valid = df[df.fold_id == fold_id].reset_index(drop=True)

    X_train = df_train.loc[:, features].values
    X_valid = df_valid.loc[:, features].values
    y_train = df_train.loc[:, "income"].values.astype(np.int64)
    y_valid = df_valid.loc[:, "income"].values.astype(np.int64)
    
    model = xgb.XGBClassifier(n_jobs=-1)
    model.fit(X_train, y_train)
    
    valid_preds = model.predict_proba(X_valid)[:, 1]
    auc = metrics.roc_auc_score(y_valid, valid_preds)
    print(f"Fold = {fold_id}, AUC = {auc}")
    return auc

I abstracted all this out into a universal `train_cv` function.

In [3]:
NON_VARS = ["Unnamed: 0"]
NUM_VARS = ["age", "fnlwgt", "educational-num", "capital-gain", "hours-per-week",]
TARGET_VAR = "income"
DEFAULT_MODEL_HYPERPARAMS = {
    "n_jobs": -1,
}
TARGET_MAPPING = {
    "<=50K": 0,
    ">50K": 1,
}

In [6]:
pprint.pprint(categorical_ordinal)

{0: 0.8870640689898823,
 1: 0.8844360704739664,
 2: 0.883638042232153,
 3: 0.8923951732346085,
 4: 0.8902355529090424}


In [5]:
categorical_ordinal = {}

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for fold_id in range(5):
        df = pd.read_csv("../data/adult_5_folds.csv")
        auc = run_model.train_cv(
            df=df,
            fold_id=fold_id,
            non_vars=[*NON_VARS, *NUM_VARS],
            num_vars=[],
            cat_vars=[column for column in df.columns if column not in (*NON_VARS, *NUM_VARS, TARGET_VAR)],
            target_var=TARGET_VAR,
            model_class=xgb.XGBClassifier,
            model_hyperparams=DEFAULT_MODEL_HYPERPARAMS,
            target_mapping=TARGET_MAPPING,
        )
        categorical_ordinal[fold_id] = auc
        del df

## Changing some hyperparameters
(Yes, this code is nowhere near as modular as it could be. I would clean it up for serious production purposes.)

In [11]:
diff_hps = {}
hyperparams = {
    "n_estimators": 200, "max_depth": 7, **DEFAULT_MODEL_HYPERPARAMS,
}

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for fold_id in range(5):
        df = pd.read_csv("../data/adult_5_folds.csv")
        auc = run_model.train_cv(
            df=df,
            fold_id=fold_id,
            non_vars=NON_VARS,
            num_vars=[],
            cat_vars=[column for column in df.columns if column not in (*NON_VARS, *NUM_VARS, TARGET_VAR)],
            target_var=TARGET_VAR,
            model_class=xgb.XGBClassifier,
            model_hyperparams=hyperparams,
            target_mapping=TARGET_MAPPING,
        )
        diff_hps[fold_id] = auc
        del df

In [12]:

pprint.pprint(categorical_ordinal)
pprint.pprint(diff_hps)

{0: 0.8870640689898823,
 1: 0.8844360704739664,
 2: 0.883638042232153,
 3: 0.8923951732346085,
 4: 0.8902355529090424}
{0: 0.880394928465924,
 1: 0.8775112845996109,
 2: 0.8777903769306056,
 3: 0.8869889389457607,
 4: 0.884154993303965}


In [13]:
results = pd.DataFrame.from_dict(
    data={
        "Categorical features only, OrdinalEncoder": categorical_ordinal,
        "Categorical features only, OrdinalEncoder, different hyperparameters": diff_hps
    },
    orient="index",
)
results.head()

Unnamed: 0,0,1,2,3,4
"Categorical features only, OrdinalEncoder",0.887064,0.884436,0.883638,0.892395,0.890236
"Categorical features only, OrdinalEncoder, different hyperparameters",0.880395,0.877511,0.87779,0.886989,0.884155


Minimal change. How about using the numeric features, too?

In [14]:
categorical_and_numeric = {}

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for fold_id in range(5):
        df = pd.read_csv("../data/adult_5_folds.csv")
        auc = run_model.train_cv(
            df=df,
            fold_id=fold_id,
            non_vars=NON_VARS,
            num_vars=NUM_VARS,
            cat_vars=[column for column in df.columns if column not in (*NON_VARS, *NUM_VARS, TARGET_VAR)],
            target_var=TARGET_VAR,
            model_class=xgb.XGBClassifier,
            model_hyperparams=DEFAULT_MODEL_HYPERPARAMS,
            target_mapping=TARGET_MAPPING,
        )
        categorical_and_numeric[fold_id] = auc
        del df

In [15]:
results = pd.DataFrame.from_dict(
    data={
        "Categorical features only, OrdinalEncoder": categorical_ordinal,
        "Categorical features only, OrdinalEncoder, different hyperparameters": diff_hps,
        "Categorical and numeric features, OrdinalEncoder on cats, standard hyperparameters": categorical_and_numeric,
    },
    orient="index",
)
results.head()

Unnamed: 0,0,1,2,3,4
"Categorical features only, OrdinalEncoder",0.887064,0.884436,0.883638,0.892395,0.890236
"Categorical features only, OrdinalEncoder, different hyperparameters",0.880395,0.877511,0.87779,0.886989,0.884155
"Categorical and numeric features, OrdinalEncoder on cats, standard hyperparameters",0.927691,0.924895,0.925445,0.929203,0.928832


Now we will try a naive feature engineering attempt where we create a new feature for each pair of categorical features.

The new feature's values are just the concatenation of the two components', with an underscore between each.

In [None]:
df = pd.read_csv("../data/adult_5_folds.csv")
numeric_features = ["age", "finalwgt", "educational-num", "capital-gain", "hours-per-week",]
df = df.drop(["Unnamed: 0",], axis=1)
categorical_features = [column for column in df.columns if column not in ("Unnamed: 0", "fold_id", "income", *numeric_features)]
pprint.pprint(categorical_features)


In [None]:
df = feature_engineering.categorical_features_pairwise(df, categorical_features)

In [None]:
df.head()

In [None]:
df.to_csv("../data/adult_5_folds_naive_fe.csv", index=False)

In [17]:
pairwise_fe = {}

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for fold_id in range(5):
        df = pd.read_csv("../data/adult_5_folds_naive_fe.csv")
        auc = run_model.train_cv(
            df=df,
            fold_id=fold_id,
            non_vars=NON_VARS,
            num_vars=NUM_VARS,
            cat_vars=[column for column in df.columns if column not in (*NON_VARS, *NUM_VARS, TARGET_VAR)],
            target_var=TARGET_VAR,
            model_class=xgb.XGBClassifier,
            model_hyperparams=DEFAULT_MODEL_HYPERPARAMS,
            target_mapping=TARGET_MAPPING,
        )
        pairwise_fe[fold_id] = auc
        del df

In [18]:
results = pd.DataFrame.from_dict(
    data={
        "Categorical features only, OrdinalEncoder": categorical_ordinal,
        "Categorical features only, OrdinalEncoder, different hyperparameters": diff_hps,
        "Categorical and numeric features, OrdinalEncoder on cats, standard hyperparameters": categorical_and_numeric,
        "Adding pairwise categorical features, OrdinalEncoder on cats, standard hyperparameters": pairwise_fe,
    },
    orient="index",
)
results.head()

Unnamed: 0,0,1,2,3,4
"Categorical features only, OrdinalEncoder",0.887064,0.884436,0.883638,0.892395,0.890236
"Categorical features only, OrdinalEncoder, different hyperparameters",0.880395,0.877511,0.87779,0.886989,0.884155
"Categorical and numeric features, OrdinalEncoder on cats, standard hyperparameters",0.927691,0.924895,0.925445,0.929203,0.928832
"Adding pairwise categorical features, OrdinalEncoder on cats, standard hyperparameters",0.927807,0.924908,0.925472,0.927451,0.928664


Okay; let's try the different hyperparameters too!

In [19]:
pairwise_fe_hyperparams = {}

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for fold_id in range(5):
        df = pd.read_csv("../data/adult_5_folds_naive_fe.csv")
        auc = run_model.train_cv(
            df=df,
            fold_id=fold_id,
            non_vars=NON_VARS,
            num_vars=NUM_VARS,
            cat_vars=[column for column in df.columns if column not in (*NON_VARS, *NUM_VARS, TARGET_VAR)],
            target_var=TARGET_VAR,
            model_class=xgb.XGBClassifier,
            model_hyperparams=hyperparams,
            target_mapping=TARGET_MAPPING,
        )
        pairwise_fe_hyperparams[fold_id] = auc
        del df

results = pd.DataFrame.from_dict(
    data={
        "Categorical features only, OrdinalEncoder": categorical_ordinal,
        "Categorical features only, OrdinalEncoder, different hyperparameters": diff_hps,
        "Categorical and numeric features, OrdinalEncoder on cats, standard hyperparameters": categorical_and_numeric,
        "Adding pairwise categorical features, OrdinalEncoder on cats, standard hyperparameters": pairwise_fe,
        "Pairwise categorical features, different hyperparams": pairwise_fe_hyperparams
    },
    orient="index",
)
results.head()

Unnamed: 0,0,1,2,3,4
"Categorical features only, OrdinalEncoder",0.887064,0.884436,0.883638,0.892395,0.890236
"Categorical features only, OrdinalEncoder, different hyperparameters",0.880395,0.877511,0.87779,0.886989,0.884155
"Categorical and numeric features, OrdinalEncoder on cats, standard hyperparameters",0.927691,0.924895,0.925445,0.929203,0.928832
"Adding pairwise categorical features, OrdinalEncoder on cats, standard hyperparameters",0.927807,0.924908,0.925472,0.927451,0.928664
"Pairwise categorical features, different hyperparams",0.923226,0.919218,0.919084,0.923459,0.92288


I skipped the TargetEncoder example; it was just a bit too much much "more of the same"