### **Testing Model With CatBoost**




In [0]:
#Download and Install CatBoost Module
!pip install catboost

In [0]:
#Import Necessary Libraries
import catboost as cb
import pandas as pd 
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn import metrics
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [7]:
# Upload the previously created dataset through google.colab's files.upload().
# NOTE(review): google.colab is only importable inside Google Colab, so this
# cell will not run in a plain Jupyter environment.
# `uploaded` is later indexed by file name ('data_extract.csv') to obtain the
# raw bytes of the uploaded file.
from google.colab import files
uploaded = files.upload()


Saving data_extract.csv to data_extract.csv


In [0]:
# Load the uploaded CSV into a DataFrame.
# The upload result holds the file content as bytes, so it is wrapped in an
# in-memory buffer before being handed to pandas. The first CSV column is
# used as the DataFrame index.
import io

csv_bytes = uploaded['data_extract.csv']
csv_buffer = io.BytesIO(csv_bytes)
dataset = pd.read_csv(csv_buffer, index_col=0)

In [0]:
# Drop every row that contains at least one missing value.
# dropna() already defaults to row-wise removal with how='any'. `how` and
# `thresh` must not be passed together: current pandas raises a TypeError
# when both are supplied, even if thresh is None.
dataset = dataset.dropna()

# Split the dataset into the dependent variable (soil type) and the
# independent variables (every remaining column).
y = dataset["Soil type"]
X = dataset.drop("Soil type", axis = 1)

# Split into train and test sets, holding out 30% of the rows for testing.
# random_state is fixed so the split is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 50)


### Tuning CatBoost's Hyper-parameters using GridSearch Algorithm 

In [23]:
# Base estimator for the search; silent=True suppresses CatBoost's training log.
model = cb.CatBoostClassifier(silent=True)

# Candidate hyper-parameter values: 3 depths x 3 L2 regularisation strengths
# x 3 learning rates, with the number of iterations fixed at 300.
gridParams = {
    'depth': [4, 7, 10],
    'iterations': [300],
    'l2_leaf_reg': [1, 4, 9],
    'learning_rate': [0.03, 0.1, 0.15],
}

# Exhaustive search over the grid with 4-fold cross-validation, run on two
# parallel jobs.
grid = GridSearchCV(
    estimator=model,
    param_grid=gridParams,
    cv=4,
    n_jobs=2,
    verbose=0,
)

# Only the training split is used for tuning, so the test split stays unseen.
grid.fit(X_train, y_train)
print(grid.best_params_)

{'depth': 10, 'iterations': 300, 'l2_leaf_reg': 1, 'learning_rate': 0.15}


The best parameters from the set are listed in the dictionary above. 


*   Depth - 10
*   Number of Iterations - 300
*   l2_leaf_reg - 1
*   Learning rate - 0.15





In [24]:
# Train CatBoost on the training split with the hyper-parameters reported as
# best by the grid search, then predict the soil type for the test split.
clf = cb.CatBoostClassifier(
    depth=10,
    iterations=300,
    l2_leaf_reg=1,
    learning_rate=0.15,
    silent=True,
)
clf.fit(X_train, y_train)
Y_sum = clf.predict(X_test)

# Score the predictions against the true test labels. Precision, recall and
# F1 use macro averaging (unweighted mean of the per-class scores).
accuracy = metrics.accuracy_score(y_test, Y_sum)
precision = metrics.precision_score(y_test, Y_sum, average='macro')
recall = metrics.recall_score(y_test, Y_sum, average='macro')
f1 = metrics.f1_score(y_test, Y_sum, average='macro')

# Report the four scores: accuracy, precision, recall, F1.
for report_line, score in (
    ('Accuracy: %f', accuracy),
    ('Precision: %f', precision),
    ('Recall: %f', recall),
    ('F1 score: %f', f1),
):
    print(report_line % score)

Accuracy: 0.979233
Precision: 0.944002
Recall: 0.874019
F1 score: 0.902632


The results are very similar to those of the LightGBM algorithm; the difference is almost negligible. 

In [25]:
# 10-fold cross-validation of the tuned classifier on the complete dataset
# (X, y); the reported figure is the mean of the per-fold scores.
cv_scores = cross_val_score(clf, X, y, cv=10)
print('Cross-Validation Score: %f' % cv_scores.mean())

Cross-Validation Score: 0.736281


The cross-validation score is about 1% higher than LightGBM's, so the results are very similar. However, CatBoost takes much longer to train, while LightGBM is much faster and gives almost the same result. In my opinion, LightGBM would be the better choice.