# XGBoost

In [169]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

In [170]:
# Load the raw churn dataset; the path is resolved relative to the
# notebook's working directory.
df = pd.read_csv(filepath_or_buffer='Churn_Modelling.csv')

In [171]:
# Quick look at the raw data: first rows, summary statistics, schema, and
# column labels, each separated by a dashed rule.
print(df.head())
print('\n-------------------------------------------------------------------\n')
print(df.describe())
print('\n-------------------------------------------------------------------\n')
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() emitted a stray "None" line after the report.
df.info()
print('\n-------------------------------------------------------------------\n')
print(df.columns)

   RowNumber  CustomerId   Surname  CreditScore Geography  Gender  Age  \
0          1    15634602  Hargrave          619    France  Female   42   
1          2    15647311      Hill          608     Spain  Female   41   
2          3    15619304      Onio          502    France  Female   42   
3          4    15701354      Boni          699    France  Female   39   
4          5    15737888  Mitchell          850     Spain  Female   43   

   Tenure    Balance  NumOfProducts  HasCrCard  IsActiveMember  \
0       2       0.00              1          1               1   
1       1   83807.86              1          0               1   
2       8  159660.80              3          1               0   
3       1       0.00              2          0               0   
4       2  125510.82              1          1               1   

   EstimatedSalary  Exited  
0        101348.88       1  
1        112542.58       0  
2        113931.57       1  
3         93826.63       0  
4         790

In [182]:
# Feature matrix: every predictor column. The identifier-like columns
# (RowNumber, CustomerId, Surname) are deliberately left out.
X = df[[
    'CreditScore', 'Geography', 'Gender', 'Age', 'Tenure',
    'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember',
    'EstimatedSalary',
]].values
# Target vector: the 'Exited' column.
y = df['Exited'].values

### Encoding categorical data

In [183]:
labelencoder_X_1 = LabelEncoder()
labelencoder_X_2 = LabelEncoder()
onehotencoder = OneHotEncoder(categorical_features=[1])

In [184]:
X[:, 1] = labelencoder_X_1.fit_transform(X[:, 1])
X[:, 2] = labelencoder_X_2.fit_transform(X[:, 2])

In [185]:
X = onehotencoder.fit_transform(X).toarray()
X = X[:, 1:]

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


### Splitting the dataset into the Training set and Test set

In [186]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=12,
    test_size=0.2,
)

### Fitting XGBoost to the Training set

In [187]:
# Gradient-boosted tree classifier with the library's default
# hyper-parameters, trained on the training split only.
classifier = XGBClassifier()
# Kept as the cell's last expression so the fitted estimator's repr is shown.
classifier.fit(X=X_train, y=y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

### Predicting the Test set results

In [188]:
# Predict class labels for the held-out test rows with the fitted classifier.
y_pred = classifier.predict(X_test)
# Print the predicted labels (NumPy abbreviates long arrays with "...").
print(y_pred)

[0 0 0 ... 0 0 0]


### Making the Confusion Matrix

In [189]:
# Confusion matrix on the test split: rows are the true classes, columns are
# the predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

[[1517   47]
 [ 229  207]]


### Applying k-Fold Cross Validation

In [190]:
# 10-fold cross-validation of the classifier on the training split only.
# n_jobs=-1 runs the folds in parallel on all available workers; verbose=2
# makes joblib log its progress.
accuracies = cross_val_score(
    estimator=classifier,
    X=X_train,
    y=y_train,
    cv=10,
    n_jobs=-1,
    verbose=2,
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    0.5s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.7s finished


In [191]:
# Report the per-fold cross-validation accuracies and their summary stats.
print('Accuracies:\n')
# A plain loop instead of a list comprehension: the comprehension was used
# only for its print side effect and built a throwaway list of None values.
for num in accuracies:
    print(num*100)
print('\n---------------------------------------------------\n')
print(f'Accuracies mean: {accuracies.mean()*100}%')
print('\n---------------------------------------------------\n')
# Note: the standard deviation is printed on the raw 0-1 scale, whereas the
# per-fold values and the mean above are scaled to percentages.
print(f'Accuracies standard deviation: {accuracies.std()}')

Accuracies:

86.3920099875156
87.5
86.875
85.875
85.625
85.25
87.125
88.375
85.875
86.10763454317897

---------------------------------------------------

Accuracies mean: 86.49996445306944%

---------------------------------------------------

Accuracies standard deviation: 0.009122424284074334
