In [112]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
import pandas as pd
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn import metrics

In [113]:
# Load the dataset
df = pd.read_csv("diabetes.csv")

In [114]:
# Define predictors and target
predictors = ['Age', 'Insulin']
target = 'Outcome'

In [115]:
# Preprocess the data
for f in df.columns:
    if df[f].dtype == 'object':
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(df[f].values))
        df[f] = lbl.transform(list(df[f].values))

df.fillna(-999, inplace=True)

In [116]:
# Select independent and dependent variables
X = df[predictors]
y = df[target].values

In [117]:
# Normalize the features
X = StandardScaler().fit_transform(X)

In [118]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2017)

In [119]:
# Define model parameters
num_rounds = 100
kfold = StratifiedKFold(n_splits=5, random_state=2017, shuffle=True)

In [125]:
# Initialize the XGBoost classifier
clf_XGB = XGBClassifier(
    max_depth=3,
    n_estimators=num_rounds,
    objective='binary:logistic',
    learning_rate=0.1,
    verbosity=1,
    random_state=2017,
    early_stopping_rounds=20
)

In [126]:
# Print the XGBoost version
print(xgb.__version__)

2.1.1


In [127]:
# Perform manual cross-validation
cv_results = []
for train_idx, val_idx in kfold.split(X_train, y_train):
    X_train_fold, X_val_fold = X_train[train_idx], X_train[val_idx]
    y_train_fold, y_val_fold = y_train[train_idx], y_train[val_idx]
    
    clf_XGB.fit(X_train_fold, y_train_fold, eval_set=[(X_val_fold, y_val_fold)], verbose=False)
    
    val_pred = clf_XGB.predict(X_val_fold)
    accuracy = metrics.accuracy_score(y_val_fold, val_pred)
    cv_results.append(accuracy)

print("\nxgBoost - CV Train: %.2f" % (sum(cv_results) / len(cv_results)))


xgBoost - CV Train: 0.69


In [128]:
# Fit the model on the entire training set and evaluate on the test set
clf_XGB.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
train_accuracy = metrics.accuracy_score(y_train, clf_XGB.predict(X_train))
test_accuracy = metrics.accuracy_score(y_test, clf_XGB.predict(X_test))

In [129]:
print("xgBoost - Train: %.2f" % train_accuracy)
print("xgBoost - Test: %.2f" % test_accuracy)

xgBoost - Train: 0.73
xgBoost - Test: 0.74
