# Тестирование XGBoost на датасете из DataCamp

In [1]:
import os
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings(module='sklearn*', action='ignore', category=DeprecationWarning)

## Load and preview dataset

In [2]:
# Location of the churn CSV. Set the CHURN_DATA_PATH environment variable to run
# the notebook on another machine; when it is unset, the original local path is used.
churn_data_path = os.environ.get(
    'CHURN_DATA_PATH', 'C:/DATA/DataCamp/XGBoost/datasets/churn_data.csv'
)
churn_data = pd.read_csv(churn_data_path)

In [3]:
# Print a structural summary of the frame: row count, per-column non-null counts,
# dtypes and memory usage. Columns whose non-null count is below the row count
# contain missing values.
churn_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50000 entries, 0 to 49999
Data columns (total 13 columns):
avg_dist                       50000 non-null float64
avg_rating_by_driver           49799 non-null float64
avg_rating_of_driver           41878 non-null float64
avg_inc_price                  50000 non-null float64
inc_pct                        50000 non-null float64
weekday_pct                    50000 non-null float64
fancy_car_user                 50000 non-null bool
city_Carthag                   50000 non-null int64
city_Harko                     50000 non-null int64
phone_iPhone                   50000 non-null int64
first_month_cat_more_1_trip    50000 non-null int64
first_month_cat_no_trips       50000 non-null int64
month_5_still_here             50000 non-null int64
dtypes: bool(1), float64(6), int64(6)
memory usage: 5.0 MB


In [4]:
# Preview the first 20 rows, transposed so that each feature becomes a row and
# each sample a column (easier to scan for a wide frame).
churn_data.head(20).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
avg_dist,3.67,8.26,0.77,2.36,3.13,10.56,3.95,2.04,4.36,2.37,4.28,3.81,20.29,3.04,26.01,13.2,10.86,2.38,6.83,12.08
avg_rating_by_driver,5,5,5,4.9,4.9,5,4,5,5,5,4.9,5,5,5,5,5,3,4.8,4.8,5
avg_rating_of_driver,4.7,5,4.3,4.6,4.4,3.5,,5,4.5,,5,4,,4,,,5,3.9,4.6,5
avg_inc_price,1.1,1,1,1.14,1.19,1,1,1,1,1,1,1,1,1.38,1,1,1,1,1.21,1.17
inc_pct,15.4,0,0,20,11.8,0,0,0,0,0,0,0,0,50,0,0,0,0,30.8,33.3
weekday_pct,46.2,50,100,80,82.4,100,100,100,100,0,100,100,100,50,100,100,50,95.2,80.8,66.7
fancy_car_user,True,False,False,True,False,True,False,False,False,False,True,False,False,False,False,True,True,True,True,False
city_Carthag,0,1,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
city_Harko,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,1,1,1
phone_iPhone,1,0,1,1,0,1,0,1,0,0,1,1,1,1,0,0,0,1,1,1


## Preparing data for XGBoost

In [5]:
# Split the frame into the feature matrix X and the target vector y.
# The target is selected by name rather than by position, so a change in the
# CSV column order cannot silently turn a feature into the label.
TARGET_COLUMN = "month_5_still_here"
X = churn_data.drop(TARGET_COLUMN, axis=1)
y = churn_data[TARGET_COLUMN]

In [6]:
# Wrap the features and labels in an xgb.DMatrix, which is what xgb.cv takes
# as its `dtrain` argument.
churn_dmatrix = xgb.DMatrix(X, label=y)

In [7]:
# Booster parameters shared by the cross-validation runs.
#   objective: "binary:logistic" - logistic loss for binary classification;
#              the runs are scored with classification error and AUC, so the
#              classification objective is used rather than "reg:logistic".
#   max_depth: maximum depth of each tree.
params = {"objective": "binary:logistic", "max_depth": 3}

## Cross-validation

In [8]:
# 3-fold cross-validation over 10 boosting rounds, tracking classification error.
# With as_pandas=True the result is a DataFrame with one row per boosting round.
cv_results = xgb.cv(
    params=params,
    dtrain=churn_dmatrix,
    num_boost_round=10,
    nfold=3,
    metrics="error",
    as_pandas=True,
    seed=123,
)
cv_results

Unnamed: 0,train-error-mean,train-error-std,test-error-mean,test-error-std
0,0.28232,0.002366,0.28378,0.001932
1,0.26951,0.001855,0.2719,0.001932
2,0.25605,0.003213,0.25798,0.003963
3,0.2509,0.001845,0.25434,0.003827
4,0.24654,0.001981,0.24852,0.000934
5,0.24652,0.001983,0.2467,0.001329
6,0.24495,0.000767,0.2474,0.003219
7,0.24384,0.000875,0.24524,0.002079
8,0.24373,0.000605,0.24462,0.00244
9,0.24256,0.000181,0.244,0.002309


## Metric: accuracy

In [9]:
# Accuracy is the complement of the mean test error reported for the last
# boosting round of the cross-validation run.
final_test_error = cv_results["test-error-mean"].iloc[-1]
accuracy = 1 - final_test_error
print("Accuracy: {:.02f}%".format(accuracy * 100))

Accuracy: 75.60%


## Metric: AUC

In [10]:
# Repeat the 3-fold, 10-round cross-validation, this time tracking both
# classification error and AUC. Rebinds `cv_results` to the new table.
cv_results = xgb.cv(
    params=params,
    dtrain=churn_dmatrix,
    num_boost_round=10,
    nfold=3,
    metrics=["error", "auc"],
    as_pandas=True,
    seed=123,
)
cv_results

Unnamed: 0,train-auc-mean,train-auc-std,train-error-mean,train-error-std,test-auc-mean,test-auc-std,test-error-mean,test-error-std
0,0.768893,0.001544,0.28232,0.002366,0.767863,0.00282,0.28378,0.001932
1,0.790864,0.006758,0.26951,0.001855,0.789157,0.006846,0.2719,0.001932
2,0.815872,0.0039,0.25605,0.003213,0.814476,0.005997,0.25798,0.003963
3,0.822959,0.002018,0.2509,0.001845,0.821682,0.003912,0.25434,0.003827
4,0.827528,0.000769,0.24654,0.001981,0.826191,0.001937,0.24852,0.000934
5,0.830345,0.000888,0.24652,0.001983,0.82912,0.002825,0.2467,0.001329
6,0.83311,0.000389,0.24495,0.000767,0.831687,0.002505,0.2474,0.003219
7,0.835571,0.001134,0.24384,0.000875,0.833566,0.002533,0.24524,0.002079
8,0.836445,0.000571,0.24373,0.000605,0.834163,0.001907,0.24462,0.00244
9,0.837772,0.000498,0.24256,0.000181,0.835314,0.001891,0.244,0.002309


In [11]:
# Report the mean test AUC from the last boosting round.
last_round = cv_results.iloc[-1]
auc = last_round["test-auc-mean"]
print("AUC: {:.02f}".format(auc))

AUC: 0.84
