# Import modules

In [1]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV

In [2]:
X, y = make_classification(n_samples=1000, n_features = 20, n_informative=14, n_redundant=6)

In [3]:
X

array([[  0.6138852 ,   3.01107618,  -4.64105458, ...,   6.08585398,
          2.18231426,   1.03730784],
       [ -7.37379438,   2.65491096,  -3.4477221 , ...,   8.26155796,
         -1.83247619,   1.26928983],
       [-10.02052523,   1.66526139,   0.19168797, ...,  -6.07836172,
         -5.23794193,   3.5857609 ],
       ...,
       [ -1.96825208,   0.06131655,  -1.04752908, ...,   0.49961164,
         -2.36922019,   0.26618006],
       [ -3.65753601,   2.35198974,  -0.32481643, ...,   5.13529286,
         -0.09210246,   1.47995876],
       [  5.03728643,  -1.80006082,   1.38211876, ...,   2.95568053,
          0.33320538,   1.41727526]], shape=(1000, 20))

In [4]:
y

array([1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1,
       1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0,
       1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1,
       0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1,
       0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1,
       1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1,

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

In [6]:
model = CatBoostClassifier()

In [7]:
model.fit(X_train, y_train)

Learning rate set to 0.009366
0:	learn: 0.6844055	total: 126ms	remaining: 2m 5s
1:	learn: 0.6765862	total: 135ms	remaining: 1m 7s
2:	learn: 0.6702076	total: 142ms	remaining: 47.2s
3:	learn: 0.6644011	total: 152ms	remaining: 37.8s
4:	learn: 0.6573358	total: 160ms	remaining: 31.9s
5:	learn: 0.6510738	total: 170ms	remaining: 28.2s
6:	learn: 0.6438976	total: 177ms	remaining: 25.2s
7:	learn: 0.6383613	total: 186ms	remaining: 23s
8:	learn: 0.6316196	total: 192ms	remaining: 21.1s
9:	learn: 0.6261782	total: 198ms	remaining: 19.6s
10:	learn: 0.6205294	total: 202ms	remaining: 18.2s
11:	learn: 0.6135348	total: 206ms	remaining: 16.9s
12:	learn: 0.6084995	total: 211ms	remaining: 16s
13:	learn: 0.6036794	total: 214ms	remaining: 15.1s
14:	learn: 0.5974651	total: 217ms	remaining: 14.2s
15:	learn: 0.5922390	total: 220ms	remaining: 13.5s
16:	learn: 0.5871828	total: 222ms	remaining: 12.9s
17:	learn: 0.5827068	total: 226ms	remaining: 12.3s
18:	learn: 0.5784584	total: 229ms	remaining: 11.8s
19:	learn: 0.57

<catboost.core.CatBoostClassifier at 0x12ba2953a50>

In [8]:
y_pred = model.predict(X_test)

In [9]:
accuracy_score(y_test, y_pred)

0.94

In [10]:
# overfit test
train_pred = model.predict(X_train)

print("Train Accuracy:", accuracy_score(y_train, train_pred))
print("Test Accuracy:", accuracy_score(y_test, y_pred))

Train Accuracy: 1.0
Test Accuracy: 0.94


# Hyperparameter Tuning

* iterations
* learning_rate
* depth
* l2_leaf_reg
* border_count

In [20]:
param_grid = {
    'iterations': [100, 200],
    'learning_rate': [0.01, 0.1],
    'depth': [4, 6, 8],
    'l2_leaf_reg': [1, 3, 5],
    'border_count': [32, 64, 128]
}

In [None]:
# grid_search = GridSearchCV(estimator=CatBoostClassifier(), param_grid=param_grid, cv=3)

In [None]:
# grid_search.fit(X_train, y_train)

In [23]:
model = CatBoostClassifier(verbose=0)
tuned_model = model.grid_search(param_grid, X_train, y_train, cv=3)


bestTest = 0.4292845698
bestIteration = 99

0:	loss: 0.4292846	best: 0.4292846 (0)	total: 164ms	remaining: 17.5s

bestTest = 0.156283248
bestIteration = 95

1:	loss: 0.1562832	best: 0.1562832 (1)	total: 258ms	remaining: 13.7s

bestTest = 0.4317474398
bestIteration = 99

2:	loss: 0.4317474	best: 0.1562832 (1)	total: 357ms	remaining: 12.5s

bestTest = 0.1771088435
bestIteration = 99

3:	loss: 0.1771088	best: 0.1562832 (1)	total: 505ms	remaining: 13.1s

bestTest = 0.436948501
bestIteration = 99

4:	loss: 0.4369485	best: 0.1562832 (1)	total: 716ms	remaining: 14.8s

bestTest = 0.1660247282
bestIteration = 99

5:	loss: 0.1660247	best: 0.1562832 (1)	total: 814ms	remaining: 13.8s

bestTest = 0.3292699838
bestIteration = 199

6:	loss: 0.3292700	best: 0.1562832 (1)	total: 1.05s	remaining: 15.1s

bestTest = 0.1150489337
bestIteration = 192

7:	loss: 0.1150489	best: 0.1150489 (7)	total: 1.33s	remaining: 16.7s

bestTest = 0.3392948182
bestIteration = 199

8:	loss: 0.3392948	best: 0.1150489 (7)	tot

In [26]:
print(tuned_model['params'])

{'border_count': 128, 'depth': 6, 'learning_rate': 0.1, 'l2_leaf_reg': 1, 'iterations': 200}


In [27]:
best_params = tuned_model['params']

best_model = CatBoostClassifier(**best_params, verbose=0)
best_model.fit(X_train, y_train)

<catboost.core.CatBoostClassifier at 0x12ba45f8e90>

In [28]:
y_pred = best_model.predict(X_test)

In [29]:
accuracy_score(y_test, y_pred)

0.945