In [1]:
from sklearn.datasets import make_blobs
# Synthetic classification data: 2000 samples with 3 features, grouped around
# 3 blob centres. The fixed random_state makes the draw repeatable.
X, y = make_blobs(
    n_samples=2000,
    n_features=3,
    centers=3,
    random_state=0,
)

In [2]:
# Sanity check: the feature matrix should be (n_samples, n_features).
print(X.shape)

(2000, 3)


In [3]:
# Peek at the integer class labels generated alongside X.
print(y)

[1 2 1 ... 1 2 1]


In [5]:
from sklearn.model_selection import train_test_split

In [6]:
# Hold out 30% of the samples for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=123,
)

In [7]:
from sklearn.metrics import accuracy_score

In [8]:
from sklearn.tree import DecisionTreeClassifier

# Baseline: a decision tree with no depth limit, fitted on the training split.
clf = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Mean accuracy on the data the tree saw versus the held-out data.
train_accuracy = clf.score(X_train, y_train)
test_accuracy = clf.score(X_test, y_test)
print("Accuracy of train:", train_accuracy)
print("Accuracy of test:", test_accuracy)

Accuracy of train: 1.0
Accuracy of test: 0.9983333333333333


In [9]:
# Depth-1 tree (a single split). With only two leaves it can predict at most
# two of the three classes, so it is expected to underfit.
clf = DecisionTreeClassifier(random_state=42, max_depth=1).fit(X_train, y_train)

stump_train_accuracy = clf.score(X_train, y_train)
stump_test_accuracy = clf.score(X_test, y_test)
print("Accuracy of train:", stump_train_accuracy)
print("Accuracy of test:", stump_test_accuracy)

Accuracy of train: 0.6685714285714286
Accuracy of test: 0.66


In [10]:
from sklearn.tree import DecisionTreeClassifier

# Depth-2 tree: up to four leaves, enough to give each of the three classes
# its own region.
clf = DecisionTreeClassifier(random_state=42, max_depth=2).fit(X_train, y_train)

shallow_train_accuracy = clf.score(X_train, y_train)
shallow_test_accuracy = clf.score(X_test, y_test)
print("Accuracy of train:", shallow_train_accuracy)
print("Accuracy of test:", shallow_test_accuracy)

Accuracy of train: 0.9992857142857143
Accuracy of test: 0.9983333333333333


In [11]:
import xgboost as xgb

In [12]:
# Wrap each split in an XGBoost DMatrix, pairing the features with their labels.
dmatrix_train, dmatrix_test = (
    xgb.DMatrix(data=features, label=labels)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

In [13]:
# Number of boosting rounds to run.
num_round = 5

# Booster settings for the native XGBoost API:
#   max_depth  - depth limit of each tree
#   eta        - learning rate (step-size shrinkage)
#   objective  - multi-class objective that outputs per-class probabilities
#   num_class  - number of classes; matches the 3 blob centres generated earlier
param = {
    'max_depth': 3,
    'eta': 1,
    'objective': 'multi:softprob',
    'num_class': 3,
}

model = xgb.train(params=param, dtrain=dmatrix_train, num_boost_round=num_round)

In [14]:
# The booster was trained with 'multi:softprob', so each row of `preds` holds
# one probability per class for the corresponding test sample.
preds = model.predict(data=dmatrix_test)

# Preview only the first ten rows rather than dumping the whole array.
preds[:10]

array([[0.9963404 , 0.00182813, 0.00183146],
       [0.9963404 , 0.00182813, 0.00183146],
       [0.00199098, 0.9961791 , 0.00182992],
       [0.00199098, 0.9961791 , 0.00182992],
       [0.00199869, 0.00184728, 0.996154  ],
       [0.9963404 , 0.00182813, 0.00183146],
       [0.00199098, 0.9961791 , 0.00182992],
       [0.00199869, 0.00184728, 0.996154  ],
       [0.9963404 , 0.00182813, 0.00183146],
       [0.9963404 , 0.00182813, 0.00183146]], dtype=float32)

In [15]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

In [16]:

# Convert per-class probabilities into hard labels by taking, for every row,
# the index of the largest probability. A single vectorized argmax along the
# class axis replaces the per-row Python loop and produces the same label array.
best_preds = np.argmax(preds, axis=1)

In [17]:
# Display the predicted class index for each test sample.
best_preds

array([0, 0, 1, 1, 2, 0, 1, 2, 0, 0, 0, 1, 2, 1, 1, 1, 0, 0, 1, 2, 0, 2,
       0, 0, 0, 1, 0, 0, 2, 0, 2, 1, 2, 0, 0, 1, 2, 1, 2, 1, 0, 0, 1, 0,
       2, 2, 1, 2, 0, 0, 2, 2, 2, 0, 2, 1, 1, 1, 0, 2, 2, 0, 1, 1, 1, 2,
       2, 1, 0, 2, 2, 0, 1, 1, 0, 2, 1, 2, 1, 2, 2, 1, 1, 1, 2, 0, 2, 1,
       1, 1, 1, 0, 2, 2, 2, 1, 2, 2, 0, 2, 0, 1, 0, 1, 2, 1, 2, 0, 2, 1,
       1, 1, 0, 1, 0, 1, 1, 2, 0, 2, 1, 2, 0, 1, 0, 0, 0, 2, 0, 0, 1, 2,
       0, 1, 2, 2, 0, 0, 1, 2, 2, 1, 2, 2, 1, 2, 1, 0, 2, 0, 0, 0, 0, 2,
       2, 1, 0, 0, 1, 1, 2, 0, 1, 2, 2, 0, 2, 2, 0, 0, 1, 1, 0, 0, 2, 2,
       2, 1, 2, 2, 0, 2, 0, 1, 0, 2, 2, 1, 1, 0, 1, 0, 1, 1, 2, 1, 0, 0,
       1, 1, 1, 2, 1, 0, 2, 2, 0, 2, 0, 2, 2, 2, 2, 2, 0, 0, 1, 1, 0, 0,
       0, 1, 1, 2, 2, 1, 2, 1, 2, 1, 0, 2, 1, 1, 1, 0, 2, 2, 0, 1, 2, 2,
       0, 1, 0, 0, 2, 1, 0, 1, 2, 1, 2, 2, 2, 1, 0, 0, 0, 1, 1, 2, 2, 1,
       0, 0, 1, 0, 0, 0, 0, 1, 2, 2, 0, 1, 0, 2, 2, 0, 2, 1, 2, 0, 1, 1,
       2, 0, 2, 2, 2, 2, 0, 1, 1, 1, 0, 1, 2, 0, 0,

In [20]:
# Fraction of test samples whose arg-max class equals the true label.
xgb_test_accuracy = accuracy_score(y_test, best_preds)
print("Accuracy = {}".format(xgb_test_accuracy))

Accuracy = 0.9983333333333333


In [21]:
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV  

# Hyper-parameter grid searched exhaustively:
# 4 depths x 3 child weights x 4 learning rates x 3 ensemble sizes = 144 candidates.
param_dict = {
    'max_depth': range(3, 10, 2),
    'min_child_weight': range(1, 6, 2),
    'learning_rate': [0.001, 0.01, 0.1, 1],
    'n_estimators': [200, 500, 1000],
}

# Base estimator handed to the search. The four parameters listed in
# `param_dict` are overridden for every candidate; the remaining ones
# (row/column subsampling, gamma, objective, seed) stay fixed.
# `n_jobs` and `random_state` are the scikit-learn-style parameter names and
# replace the legacy `nthread` / `seed` aliases with the same values.
# NOTE(review): the search below also parallelises across candidates
# (n_jobs=-1) while each model requests 4 threads; consider n_jobs=1 here if
# CPU oversubscription is observed.
xgc = XGBClassifier(
    booster='gbtree',
    learning_rate=0.01,
    n_estimators=200,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='multi:softprob',
    n_jobs=4,
    scale_pos_weight=1,
    random_state=27,
)

# 2-fold cross-validated grid search on the training split. The best
# configuration is refitted on the whole training split, which is what
# `clf.score` and `clf.refit_time_` below rely on.
clf = GridSearchCV(xgc, param_dict, cv=2, n_jobs=-1).fit(X_train, y_train)

# Report the winning configuration and its scores.
tuned_train_score = clf.score(X_train, y_train)
tuned_test_score = clf.score(X_test, y_test)
print("Tuned: {}".format(clf.best_params_))
print("Mean of the cv scores is {:.6f}".format(clf.best_score_))
print("Train Score {:.6f}".format(tuned_train_score))
print("Test Score {:.6f}".format(tuned_test_score))
print("Seconds used for refitting the best model on the train dataset: {:.6f}".format(clf.refit_time_))

Tuned: {'learning_rate': 0.001, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 500}
Mean of the cv scores is 0.998571
Train Score 0.999286
Test Score 0.998333
Seconds used for refitting the best model on the train dataset: 1.055834
