# XGBoost

This notebook is inspired by this [article](https://www.kdnuggets.com/2017/03/simple-xgboost-tutorial-iris-dataset.html), which showcases XGBoost on the Iris dataset.

In [1]:
import numpy as np
import xgboost as xgb
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.datasets import dump_svmlight_file
from sklearn.metrics import precision_score, accuracy_score
import pickle


In [2]:
# Load the Iris dataset bundled with scikit-learn and split it into
# the feature matrix (X) and the class labels (y).
iris = datasets.load_iris()
X, y = iris.data, iris.target

In [3]:
# Hold out 20% of the samples for testing; the fixed random_state makes
# the split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Wrap the in-memory train/test arrays (with their labels) in DMatrix
# objects, the data container consumed by xgb.train and Booster.predict.
dtrain, dtest = (
    xgb.DMatrix(features, label=labels)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

In [5]:
# Alternative input path: write the same train/test split to disk in
# svmlight/libsvm text format, then let XGBoost load the files directly.
dump_svmlight_file(X_train, y_train, 'dtrain.svm', zero_based=True)
dump_svmlight_file(X_test, y_test, 'dtest.svm', zero_based=True)

# State the file format explicitly in the URI: recent XGBoost releases no
# longer guess the format of text input and refuse to load a bare path.
dtrain_svm = xgb.DMatrix('dtrain.svm?format=libsvm')
dtest_svm = xgb.DMatrix('dtest.svm?format=libsvm')

[15:46:58] 120x4 matrix with 480 entries loaded from dtrain.svm
[15:46:58] 30x4 matrix with 120 entries loaded from dtest.svm


In [6]:
# XGBoost training parameters shared by both training runs below.
param = {
    'max_depth': 3,  # maximum depth of each tree
    'eta': 0.3,  # learning rate (step-size shrinkage applied at each boosting round)
    # 'verbosity' replaces the deprecated 'silent' flag; 0 means quiet logging.
    'verbosity': 0,
    'objective': 'multi:softprob',  # multiclass objective; predictions are per-class probabilities
    'num_class': 3,  # number of classes in this dataset
}
num_round = 20  # number of boosting rounds (training iterations)

In [7]:
#------------- numpy array ------------------
# Fit a booster on the array-backed training DMatrix, then score the
# array-backed test DMatrix with it.
bst = xgb.train(params=param, dtrain=dtrain, num_boost_round=num_round)
preds = bst.predict(dtest)


In [8]:
# For every test sample pick the class with the highest predicted score
# (row-wise argmax), then report macro-averaged precision and accuracy.
best_preds = np.argmax(preds, axis=1)

macro_precision = precision_score(y_test, best_preds, average='macro')
print("Numpy array precision: {0:.2f}".format(macro_precision))

accuracy = accuracy_score(y_test, best_preds)
print("Numpy array accuracy: {0:.2f}".format(accuracy))

Numpy array precision: 1.00
Numpy array accuracy: 1.00


In [9]:
# ------------- svm file ---------------------
# training and testing - svm file
bst_svm = xgb.train(param, dtrain_svm, num_round)
# Predict with the booster trained on the svm-file data (bst_svm), not the
# array-trained `bst`; otherwise the svm-file model is never evaluated.
preds = bst_svm.predict(dtest_svm)

In [10]:
# Pick the highest-scoring class for each row of `preds` and report the
# macro-averaged precision against the held-out labels.
best_preds_svm = list(map(np.argmax, preds))
svm_precision = precision_score(y_test, best_preds_svm, average='macro')
print("Svm file precision: {0:.2f}".format(svm_precision))
# --------------------------------------------

Svm file precision: 1.00


In [11]:
# dump the models
bst.dump_model('dump.raw.txt')
bst_svm.dump_model('dump_svm.raw.txt')

In [12]:
# Persist both trained boosters with pickle. The `with` blocks make sure
# each file handle is flushed and closed, even if pickling raises.
with open('bst_model.pkl', 'wb') as model_file:
    pickle.dump(bst, model_file)
with open('bst_svm_model.pkl', 'wb') as model_file:
    pickle.dump(bst_svm, model_file)