CatBoost uses different parameter names than scikit-learn's models.  
Here is a nice guide to CatBoost: https://www.kaggle.com/code/mitribunskiy/tutorial-catboost-overview/notebook  
Here is another nice guide: https://coderzcolumn.com/tutorials/machine-learning/catboost-an-in-depth-guide-python#2  
Otherwise, the official documentation has more information: https://catboost.ai/en/docs/  


In [1]:
#Import
# Data wrangling
import numpy as np
from sklearn.model_selection import train_test_split
# Model
from catboost import CatBoostClassifier
# performance measure
from sklearn.metrics import accuracy_score
# Grid search
from sklearn.model_selection import GridSearchCV

In [2]:
# ------ Load data ------
# Unlabelled test features (y_test is withheld, so test results cannot be checked locally)
X_test = np.load("../Common/data/X_test.npy")
# Labelled data: read the features and labels first, then split them 50/50
# into a "train" half and a "val" half (fixed seed for reproducibility)
all_features = np.load("../Common/data/X_train.npy")
all_labels = np.load("../Common/data/y_train.npy")
X_train, X_val, y_train, y_val = train_test_split(
    all_features,
    all_labels,
    test_size=0.5,
    random_state=42,
)


## Playing around  
This is just some quick experimentation; in the next section I will do a grid search.

In [26]:
# ------ Create and test model ------
# Build a CatBoost classifier with hand-picked settings, train it on one half
# of the labelled data and report the accuracy on the other half.
model = CatBoostClassifier(
                           n_estimators = 1000, # Number of boosting iterations, i.e. how many weak learners (trees) to build (alias of `iterations`)
                           max_leaves = 31, # Maximum number of leaves in a tree (NOT sklearn's max_features); per the CatBoost docs this only applies to the Lossguide grow policy
                           min_data_in_leaf = 1, # Similar to sklearn's min_samples_leaf (minimum number of samples in a leaf); per the CatBoost docs only for Lossguide/Depthwise grow policies
                           max_depth  = 6, # The max depth of each weak learner (alias of `depth`)
                           learning_rate= 0.3, # Step size of the gradient boosting updates (documented fallback default is 0.03)
                           verbose = 100, # Print training process every 100 iteration
                           random_state = 42, # For reproducibility
                           early_stopping_rounds = 10, # Stop when the eval metric has not improved for 10 iterations - prevents overfitting
                           bootstrap_type= "No",  # Disable bootstrapping of the training samples
                           eval_metric= 'AUC', # Metric tracked on eval_set for early stopping / best-iteration selection (the missing comma here was a SyntaxError)
                           loss_function= "MultiClass" # For 2-class classification use 'Logloss' or 'CrossEntropy'. For multiclass use 'MultiClass'
                           )

# NOTE: the roles of the two halves are swapped relative to their names: the
# model is trained on (X_val, y_val) and early-stopped on (X_train, y_train).
# The accuracy below is measured on that same eval_set half, so it is slightly
# optimistic compared with a truly unseen set.
model.fit(X_val, y_val, eval_set=(X_train, y_train), plot = True)
y_train_hat = model.predict(X_train)
accuracy = accuracy_score(y_train, y_train_hat)
print(accuracy * 100)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	test: 0.7793054	best: 0.7793054 (0)	total: 37.1ms	remaining: 9.23s
100:	test: 0.9494087	best: 0.9494087 (100)	total: 2.63s	remaining: 3.88s
Stopped by overfitting detector  (10 iterations wait)

bestTest = 0.9496872283
bestIteration = 111

Shrink model to first 112 iterations.
87.57083333333333


# Grid search  
Here I will perform grid search for the final model.  
Here you can read more: https://effectiveml.com/using-grid-search-to-optimise-catboost-parameters.html  


In [3]:
# Hyper-parameter grid for the search: every combination of the candidate
# values below is tried (3 values for each of the 6 parameters).
model_params_search = dict(
    n_estimators=[250, 500, 1000],  # Number of boosting iterations (trees)
    max_leaves=[10, 31, 50],  # Maximum number of leaves per tree
    min_data_in_leaf=[1, 2, 3],  # Minimum number of samples in a leaf (cf. sklearn's min_samples_leaf)
    max_depth=[1, 2, 6],  # Maximum depth of each tree
    learning_rate=[0.001, 0.01, 0.3],  # Step size of the boosting updates
    bootstrap_type=["Bayesian", "Bernoulli", "MVS"],  # How training samples are re-weighted/sub-sampled per tree
)

In [4]:
# Base estimator: only the settings shared by every grid candidate are fixed
# here (seed for reproducibility, sparse logging, early stopping patience).
model = CatBoostClassifier(random_state=42, verbose=1000, early_stopping_rounds=10)

# Grid search
# found the guide here: https://youtu.be/N4rqz8Z4XOM
# Settings: single worker, 3-fold cross-validation scored on accuracy; a
# candidate whose fit raises an error is given a score of 0 instead of aborting.
search_settings = {
    "estimator": model,
    "param_grid": model_params_search,
    "n_jobs": 1,
    "cv": 3,
    "scoring": "accuracy",
    "error_score": 0,
}
grid_search = GridSearchCV(**search_settings)

# Run the search on the (X_val, y_val) half of the labelled data
grid_result = grid_search.fit(X_val, y_val)

KeyboardInterrupt: 

In [None]:
# Report the hyper-parameter combination that scored best in the grid search
best_found = grid_result.best_params_
print(best_found)

### Test model  

In [None]:
# Apply the best hyper-parameters found by the grid search to the base model.
# set_params only accepts keyword arguments, so the dict has to be unpacked
# with ** (passing it positionally raises a TypeError).
final_model = model.set_params(
    **grid_result.best_params_
)
# NOTE: to train on all labelled data instead, concatenate the two halves
# (np.concatenate takes a tuple of arrays):
#   final_model.fit(np.concatenate((X_train, X_val)), np.concatenate((y_train, y_val)))
# Train on the (X_val, y_val) half and measure accuracy on the other half.
final_model.fit(X_val, y_val)
final_model_pred = final_model.predict(X_train)
accuracy = accuracy_score(y_train, final_model_pred)
print(accuracy * 100)

'nan_mode': Min
'eval_metric': MultiClass
'iterations': 720
'sampling_frequency': PerTree
'leaf_estimation_method': Newton
'od_pval': 0
'grow_policy': SymmetricTree
'penalties_coefficient': 1
'boosting_type': Plain
'model_shrink_mode': Constant
'feature_border_type': GreedyLogSum
'bayesian_matrix_reg': 0.10000000149011612
'eval_fraction': 0
'force_unit_auto_pair_weights': False
'l2_leaf_reg': 3
'random_strength': 1
'od_type': Iter
'rsm': 1
'boost_from_average': False
'model_size_reg': 0.5
'pool_metainfo_options': {'tags': {}}
'use_best_model': True
'od_wait': 6
'class_names': [0, 1, 2]
'random_seed': 42
'depth': 2
'posterior_sampling': False
'border_count': 254
'bagging_temperature': 1
'classes_count': 0
'auto_class_weights': None
'sparse_features_conflict_fraction': 0
'leaf_estimation_backtracking': AnyImprovement
'best_model_min_trees': 1
'model_shrink_rate': 0
'min_data_in_leaf': 2
'loss_function': MultiClass
'learning_rate': 0.30000001192092896
'score_function': Cosine
'task_type':

# Create final Model

In [None]:
#Import
# Data wrangling
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
# Model
from catboost import CatBoostClassifier
# performance measure
from sklearn.metrics import accuracy_score
# Grid search
from sklearn.model_selection import GridSearchCV

# ------ Load data ------
# Unlabelled test features (y_test is withheld, so test results cannot be checked locally)
X_test = np.load("../Common/data/X_test.npy")
# Full labelled training set (no validation split for the final model)
X_train = np.load("../Common/data/X_train.npy")
y_train = np.load("../Common/data/y_train.npy")

# ------ Create and train model ------
final_model = CatBoostClassifier(
                           verbose = 100, # Print training process every 100 iteration
                           random_state = 42, # For reproducibility
                           early_stopping_rounds = 10 # NOTE(review): no eval_set is passed to fit below, so this presumably has no effect - confirm
                           )

# Requires the grid-search cell to have finished, so that grid_result exists
best_params = grid_result.best_params_

# set_params only accepts keyword arguments, so the dict must be unpacked with **
final_model = final_model.set_params(**best_params)
final_model.fit(X_train, y_train)

# Accuracy can only be computed where labels exist, i.e. on the training data
# (the test labels are withheld). This is a sanity check, not a generalisation estimate.
accuracy = accuracy_score(y_train, final_model.predict(X_train))
print(accuracy * 100)

# ------ Create and save predictions ------
final_model_pred = final_model.predict(X_test)
# Flatten to 1-D: a DataFrame column must be one-dimensional, and a multiclass
# CatBoost model may return predictions shaped (n_samples, 1) - TODO confirm
y_test_hat = np.ravel(final_model_pred)
y_test_hat_pd = pd.DataFrame({
    'Id': list(range(len(y_test_hat))),
    'Category': y_test_hat,
})


# After you make your predictions, you should submit them on the Kaggle webpage for our competition.
# Below is a small check that your output has the right type and shape
assert isinstance(y_test_hat_pd, pd.DataFrame)
assert all(y_test_hat_pd.columns == ['Id', 'Category'])

# If you pass the checks, the file is saved.
y_test_hat_pd.to_csv('y_test_hat.csv', index=False)