In [50]:
#Importing libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from scipy.stats import uniform
from scipy.stats import randint

In [2]:
#Importing dataset
data = pd.read_csv('C:/Users/bhavi/OneDrive/Desktop/Data/XGboost_Data.csv')

In [3]:
#View the shape of the data
data.shape

(683, 11)

In [4]:
#View the top five rows
data.head()

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [22]:
#Data splitting into X & y
X = data.drop(columns=['Class','Sample code number'], axis = 1)
y = data['Class']
print('X Shape = ', X.shape)
print('y Shape = ', y.shape)

X Shape =  (683, 9)
y Shape =  (683,)


In [23]:
#X & y data futher splitting into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size =0.2, random_state = 42)
print('X_train Shape', X_train.shape)
print('X_test Shape', X_test.shape)
print('y_train Shape', y_train.shape)
print('y_test Shape', y_test.shape)

X_train Shape (546, 9)
X_test Shape (137, 9)
y_train Shape (546,)
y_test Shape (137,)


In [59]:
#Creating function
def classifier(model, x_train, x_test, y_train, y_test):
    model.fit(x_train, y_train)
    pred_y = model.predict(x_test)
    accuracy = accuracy_score(y_test, pred_y)
    cv = cross_val_score(estimator=model, X=x_train, y=y_train, scoring='roc_auc', cv=5,n_jobs=-1)
    cv_score = cv.mean()
    return {'Accuracy':accuracy, 'CV Score':cv_score}

# Creating XGBoost on Default Parameters

In [60]:
#Implementing XG Boosting
xgb = XGBClassifier()
classifier(xgb, X_train, X_test, y_train, y_test)

{'Accuracy': 0.9562043795620438, 'CV Score': 0.990682463285203}

In [38]:
# xgb1 = XGBClassifier(base_score=0.5, 
#                      colsample_bylevel=1, 
#                      colsample_bytree=1,
#                      gamma=0, 
#                      learning_rate=0.1, 
#                      max_delta_step=0,
#                      max_depth=10, 
#                      min_child_weight=1,
#                      missing=None,
#                      n_estimators=100, 
#                      nthread=-1, 
#                      objective='binary:logistic',
#                      reg_alpha=0, 
#                      reg_lambda=1, 
#                      scale_pos_weight=1, 
#                      seed=0,
#                      subsample=1)

In [62]:
param_dist = {
    'max_depth': range(3, 10, 2),
    'n_estimators': range(60, 220, 40),
    'learning_rate': [0.1, 0.01, 0.05],
    'min_child_weight':range(1,6,2)
}

gscv = GridSearchCV(estimator=xgb, param_grid=param_dist, scoring='roc_auc', n_jobs=-1, cv=5, verbose=0)
gscv.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0, gpu_id=-1,
                                     importance_type='gain',
                                     interaction_constraints='',
                                     learning_rate=0.300000012,
                                     max_delta_step=0, max_depth=6,
                                     min_child_weight=1, missing=nan,
                                     monotone_constraints='()',
                                     n_estimators=100, n_jobs=0,
                                     num_parallel_tree=1, random_state=0,
                                     reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, subsample=1,
                                     tree_method='exact', validate_parameters=1,
         

In [63]:
gscv.best_score_

0.9937101485046691

In [64]:
gscv.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=60, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [65]:
#Implementing XG Boosting
xgb_tuned = gscv.best_estimator_
classifier(xgb_tuned, X_train, X_test, y_train, y_test)

{'Accuracy': 0.948905109489051, 'CV Score': 0.9937101485046691}

In [15]:
#Creating confusion matrix
confusion_matrix(y_test, y_pred)

array([[85,  2],
       [ 1, 49]], dtype=int64)

In [16]:
#Accuracy
accuracy_score(y_test, y_pred)

0.9781021897810219

In [17]:
#Importing K fold cross validation
from sklearn.model_selection import cross_val_score

In [19]:
#Implementing K fold cross validation
accuracies = cross_val_score(estimator=classifier, X = X, y = y, cv = 10)
accuracies

array([0.97101449, 0.95652174, 0.94202899, 0.92647059, 0.98529412,
       0.95588235, 0.95588235, 1.        , 0.97058824, 0.98529412])

In [29]:
#Checking Mean and standard deviation of accuracies
print('Accuracy : {:.2f}%'.format(accuracies.mean()*100))
print('Standard Deviation : {:.2f}%'.format(accuracies.std()*100))

Accuracy : 96.49%
Standard Deviation : 2.09%
