In [10]:
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import (
    LabelEncoder, OneHotEncoder, StandardScaler
)
from xgboost import XGBClassifier

In [11]:
# Read the churn data set from disk.
dataset = pd.read_csv('datasets/churn.csv')

# Feature matrix: the ten columns at positions 3..12 (selected by position,
# not by name). Target vector: the last column of the file.
feature_positions = list(range(3, 13))
X = dataset.iloc[:, feature_positions].values
y = dataset.iloc[:, -1].values

In [12]:
# Encode the categorical feature columns so the model receives a purely
# numeric matrix. This cell expects the raw `X` produced by the loading cell;
# re-run that cell first if this one has already been executed.
#
# Column 1 (country) and column 2 (gender) hold categorical values.
country_label_encoder = LabelEncoder()
country_codes = country_label_encoder.fit_transform(X[:, 1])

gender_label_encoder = LabelEncoder()
gender_codes = gender_label_encoder.fit_transform(X[:, 2])

# Only the country has more than two categories, so only it is one-hot
# encoded. `OneHotEncoder(categorical_features=...)` was removed from
# scikit-learn, so the encoder is fitted on the country column alone and the
# feature matrix is reassembled by hand below.
# NOTE: `one_hot_encoder` now expects a single (n_samples, 1) column of
# country codes, not the whole feature matrix.
one_hot_encoder = OneHotEncoder()
country_dummies = one_hot_encoder.fit_transform(
    country_codes.reshape(-1, 1)
).toarray()

# To avoid the dummy variable trap, drop the first dummy column.
# The layout matches what the old API produced: encoded columns first,
# followed by the remaining features in their original order.
X = np.column_stack([
    country_dummies[:, 1:],
    X[:, 0],
    gender_codes,
    X[:, 3:],
]).astype(float)

In [13]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible between runs.
split = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split

In [14]:
# Train a gradient-boosted tree classifier with the library defaults.
# `fit` returns the fitted estimator, so construction and training chain.
classifier = XGBClassifier().fit(X_train, y_train)

In [15]:
# Score the held-out set. In the resulting matrix, rows are the true classes
# and columns are the predicted classes.
y_pred = classifier.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[1521,   74],
       [ 197,  208]])

In [17]:
# A single train/test split yields only one, possibly noisy, accuracy figure.
# Run 10-fold cross-validation on the training set instead and report the
# mean and the standard deviation of the per-fold accuracies.
accuracies = cross_val_score(classifier, X_train, y_train, cv=10)
f'Accuracy; {accuracies.mean() * 100:.4f}% - Standard Deviation; {accuracies.std() * 100:.2f}%'

'Accuracy; 86.2999% - Standard Deviation; 1.07%'