In [13]:
!pip install catboost



In [14]:
# Author: Frank Gao, NETID: QXG190000
# Github Link: https://github.com/FrankQixiangGao/Coding-Challenge-S22

# Two External Resources
# [1] https://catboost.ai/en/docs/concepts/parameter-tuning
# [2] https://www.kaggle.com/mitribunskiy/tutorial-catboost-overview


import numpy as np # Numerical / math functions
import pandas as pd # Data loading and processing
from catboost import CatBoostClassifier # Gradient boosting on decision trees (the author regards it as an optimized alternative to XGBoost)
from sklearn.model_selection import train_test_split # Splits arrays or matrices into random train and test subsets; with no size arguments, 25% of the data goes to the test subset



In [15]:
# Load the mushroom dataset. The first column holds the class label; the
# remaining columns are the features.
Train_Instance = pd.read_csv("mushrooms.csv")

X = Train_Instance.iloc[:, 1:]  # every column except the first (features)
Y = Train_Instance.iloc[:, 0]   # first column (class label)

# Hold out a validation subset (default ratio: 25% of the rows).
# random_state is fixed so the split -- and therefore every score reported
# below -- is identical on each run; 2 is the same value as the SEED that is
# passed to the model later in the notebook.
X_train, X_valid, y_train, y_valid = train_test_split(X, Y, random_state=2)

In [16]:
# These three lines are adapted, with minor modifications, from an external
# resource (Tutorial: CatBoost Overview).

# Build the positional indices of the categorical columns. Every column of X is
# listed, i.e. all features are treated as categorical (the printed result
# covers every column position).
cat_features_names = X.columns
cat_features = list(map(X.columns.get_loc, cat_features_names))
print(cat_features)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]


In [22]:

SEED = 2  # Random seed passed to CatBoost

import sys
from catboost import Pool

# Wrap the validation data in a CatBoost Pool (features, labels and the indices
# of the categorical columns); it is used below as the eval_set.
pool = Pool(X_valid, y_valid, cat_features=cat_features)

# The following code is mostly written by me, but I acknowledge that I got the
# basic idea of how to construct a CatBoostClassifier from
# "Tutorial: CatBoost Overview". After reading the CatBoost documentation, I
# found a couple of parameters that can optimize the model.

params = {'loss_function': 'Logloss',  # probability-based classification loss that is optimized
          'eval_metric': 'AUC',  # metric reported on the eval set ("test:" / "best:" in the log)
          'verbose': 100,  # log every 100 iterations
          'random_seed': SEED,
          'max_depth': 8,  # maximum depth of each decision tree
          'early_stopping_rounds': 200,  # overfitting detector: stop after 200 iterations without improvement
          'cat_features': cat_features  # column indices that hold categorical values
         }

cbc_1 = CatBoostClassifier(**params)

# Indices of all feature columns; every one of them is a candidate for selection.
all_features = list(range(X.shape[1]))

# NOTE: per the CatBoost documentation, select_features() returns a summary of
# the selection (selected / eliminated features), not a new model. Because
# train_final_model=True, the fitted estimator is cbc_1 itself.
cbc_2 = cbc_1.select_features(
                X_train,
                y=y_train,
                eval_set=pool,
                features_for_select=all_features,  # features that are eligible for selection
                # Keep every feature: no fake or useless feature was found in the
                # mushroom dataset. Derived from the data instead of hard-coding 22
                # so the cell stays correct if the number of columns changes.
                # With nothing to eliminate, CatBoost reduces the number of
                # selection steps to 0 and only trains the final model.
                num_features_to_select=len(all_features),
                algorithm="RecursiveByShapValues",  # built-in selection algorithm that ranks features by SHAP values
                steps=1,
                shap_calc_type="Approximate",
                train_final_model=True,  # fit cbc_1 on the selected features after selection
                logging_level="Verbose",
                plot=True,
                log_cout=sys.stdout,
                log_cerr=sys.stderr)


The number of features selection steps (1) is greater than the number of features to eliminate (0). The number of steps was reduced to 0.


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.049518
Train final model
0:	test: 0.9868458	best: 0.9868458 (0)	total: 16.5ms	remaining: 16.5s
100:	test: 1.0000000	best: 1.0000000 (10)	total: 5.12s	remaining: 45.6s
200:	test: 1.0000000	best: 1.0000000 (10)	total: 11.4s	remaining: 45.3s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 1
bestIteration = 10

Shrink model to first 11 iterations.
