The two most popular classification objectives are:

    binary:logistic - binary classification (the target contains only two classes, e.g., cat or dog)

    multi:softprob - multi-class classification (the target contains more than two classes, e.g., apple/orange/banana)


In [1]:
import seaborn as sns

import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

import warnings


# Suppress every warning so the cell outputs below stay uncluttered.
warnings.filterwarnings(action="ignore")

# Load seaborn's "diamonds" example dataset and preview its first five rows.
diamonds = sns.load_dataset(name="diamonds")
diamonds.head(n=5)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [2]:
# Summarise only the non-numeric columns (count / unique / top / freq).
diamonds.describe(exclude=[np.number])

Unnamed: 0,cut,color,clarity
count,53940,53940,53940
unique,5,7,8
top,Ideal,G,SI1
freq,21551,11292,13065


In [6]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split

# Separate the predictors from the target. The target is kept as a one-column
# DataFrame (double brackets) so it can be handed to OrdinalEncoder directly.
X = diamonds.drop(columns="cut")
y = diamonds[["cut"]]

print(X)

print('-----------------')

print(y)

# Turn the cut labels into numeric codes
y_encoded = OrdinalEncoder().fit_transform(y)

print('-----------------')
print(y_encoded)

# Names of the non-numeric feature columns
cats = X.select_dtypes(exclude=np.number).columns.tolist()

# Cast those columns to the pandas category dtype in a single call
X = X.astype({col: "category" for col in cats})

# Hold out a test set, keeping the class proportions equal in both parts
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    random_state=1,
    stratify=y_encoded,
)

       carat color clarity  depth  table  price     x     y     z
0       0.23     E     SI2   61.5   55.0    326  3.95  3.98  2.43
1       0.21     E     SI1   59.8   61.0    326  3.89  3.84  2.31
2       0.23     E     VS1   56.9   65.0    327  4.05  4.07  2.31
3       0.29     I     VS2   62.4   58.0    334  4.20  4.23  2.63
4       0.31     J     SI2   63.3   58.0    335  4.34  4.35  2.75
...      ...   ...     ...    ...    ...    ...   ...   ...   ...
53935   0.72     D     SI1   60.8   57.0   2757  5.75  5.76  3.50
53936   0.72     D     SI1   63.1   55.0   2757  5.69  5.75  3.61
53937   0.70     D     SI1   62.8   60.0   2757  5.66  5.68  3.56
53938   0.86     H     SI2   61.0   58.0   2757  6.15  6.12  3.74
53939   0.75     D     SI2   62.2   55.0   2757  5.83  5.87  3.64

[53940 rows x 9 columns]
-----------------
             cut
0          Ideal
1        Premium
2           Good
3        Premium
4           Good
...          ...
53935      Ideal
53936       Good
53937  Very

In [5]:
import xgboost as xgb

# Wrap each split in an XGBoost DMatrix. enable_categorical=True is passed so
# the category-typed feature columns are accepted without manual encoding.
dtrain_clf = xgb.DMatrix(data=X_train, label=y_train, enable_categorical=True)
dtest_clf = xgb.DMatrix(data=X_test, label=y_test, enable_categorical=True)

In [7]:
# Booster settings: per-class probabilities for 5 classes, "hist" tree method
params = {
    "objective": "multi:softprob",
    "tree_method": "hist",
    "num_class": 5,
}
n = 1000  # number of boosting rounds

# 5-fold cross-validation on the training matrix, tracking three metrics
results = xgb.cv(
    params=params,
    dtrain=dtrain_clf,
    num_boost_round=n,
    nfold=5,
    metrics=["mlogloss", "auc", "merror"],
)

In [8]:
# List the metric columns produced by the cross-validation run
results.columns

Index(['train-mlogloss-mean', 'train-mlogloss-std', 'train-auc-mean',
       'train-auc-std', 'train-merror-mean', 'train-merror-std',
       'test-mlogloss-mean', 'test-mlogloss-std', 'test-auc-mean',
       'test-auc-std', 'test-merror-mean', 'test-merror-std'],
      dtype='object')

In [9]:
# Highest value in the 'test-auc-mean' column of the CV results
results.loc[:, "test-auc-mean"].max()

0.9403142643933986