In [14]:
from pycaret.classification import *

import pandas as pd

In [2]:
# Load the bank customer churn dataset from the working directory into a DataFrame.
data = pd.read_csv(filepath_or_buffer="Churn_Modelling.csv")

In [3]:
# Summarize the columns: names, non-null counts and dtypes (quick check for missing values).
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


## Pre-processing 

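PyCaret provides a single `setup` function that initializes the experiment and builds the pre-processing pipeline; it supports 25+ pre-processing techniques. Identifier-like columns (`CustomerId`, `Surname`, `RowNumber`) are excluded from the features.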

In [4]:
# Initialize the PyCaret experiment: 'Exited' is the label to predict, and the
# identifier-like columns are excluded from the feature set.
# session_id pins the random seed. Without it PyCaret draws a new seed on every
# run, so the train/test split and all cross-validation scores change between
# runs. 8750 is the seed recorded in the setup summary of the original run.
s = setup(
    data,
    target='Exited',
    ignore_features=['CustomerId', 'Surname', 'RowNumber'],
    session_id=8750,
)

Unnamed: 0,Description,Value
0,session_id,8750
1,Target,Exited
2,Target Type,Binary
3,Label Encoded,"0: 0, 1: 1"
4,Original Data,"(10000, 14)"
5,Missing Values,False
6,Numeric Features,4
7,Categorical Features,6
8,Ordinal Features,False
9,High Cardinality Features,False


## Modelling

In [5]:
# Train the available classifiers with cross-validation and rank the
# resulting score grid by recall; the top-ranked model is returned.
best_model = compare_models(
    sort="Recall",
)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
dt,Decision Tree Classifier,0.7911,0.6883,0.514,0.4902,0.5016,0.3696,0.3699,0.007
lightgbm,Light Gradient Boosting Machine,0.8594,0.8585,0.4986,0.7306,0.5914,0.5104,0.5246,0.107
ada,Ada Boost Classifier,0.853,0.8466,0.4715,0.7145,0.5674,0.4834,0.4989,0.043
gbc,Gradient Boosting Classifier,0.8623,0.8665,0.4687,0.7708,0.5809,0.5046,0.5279,0.121
rf,Random Forest Classifier,0.8616,0.8535,0.463,0.7709,0.578,0.5013,0.5249,0.111
et,Extra Trees Classifier,0.8441,0.8303,0.4352,0.6894,0.5331,0.4454,0.4626,0.11
lda,Linear Discriminant Analysis,0.8367,0.8305,0.3529,0.6999,0.4682,0.3844,0.4162,0.007
ridge,Ridge Classifier,0.832,0.0,0.2594,0.7642,0.3863,0.3158,0.3784,0.005
svm,SVM - Linear Kernel,0.7458,0.0,0.103,0.0927,0.0731,0.0112,0.0149,0.022
nb,Naive Bayes,0.7841,0.7489,0.0767,0.3699,0.1268,0.061,0.086,0.004


Create a single model (here a gradient boosting classifier) and evaluate it with 10-fold cross-validation

In [6]:
# Fit a gradient boosting classifier; the per-fold cross-validation metrics
# are displayed as the cell output.
GBC = create_model(estimator="gbc")

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8686,0.8744,0.4615,0.8148,0.5893,0.5181,0.5478
1,0.8586,0.8679,0.5035,0.72,0.5926,0.5102,0.5222
2,0.8586,0.8707,0.4615,0.75,0.5714,0.4924,0.5133
3,0.8614,0.8578,0.3846,0.8594,0.5314,0.4636,0.5154
4,0.8657,0.872,0.4965,0.7634,0.6017,0.5253,0.5429
5,0.8543,0.8481,0.4653,0.7283,0.5678,0.4852,0.5029
6,0.8371,0.8248,0.3819,0.6875,0.4911,0.4034,0.4281
7,0.8857,0.8974,0.5486,0.8404,0.6639,0.5986,0.6184
8,0.85,0.869,0.4306,0.7294,0.5415,0.4588,0.4817
9,0.8827,0.8832,0.5524,0.8144,0.6583,0.5906,0.6069


In [7]:
# Print the fitted estimator to inspect the hyperparameters it was built with.
print(GBC)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=8750, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)


In [8]:
# Raw importance scores of the fitted model, one value per input column.
# NOTE(review): the order presumably follows the pre-processed training
# columns produced by setup() — confirm before mapping scores to feature names.
GBC.feature_importances_

array([2.41554756e-02, 3.53769389e-01, 8.43374038e-02, 1.84710985e-02,
       7.83434406e-04, 6.54149946e-02, 1.50088690e-03, 1.30579125e-02,
       4.79089499e-04, 4.49944142e-04, 2.34992325e-04, 4.10447539e-04,
       6.25664341e-04, 9.02744951e-04, 4.94936476e-04, 2.21589574e-05,
       1.00102160e-03, 0.00000000e+00, 7.66704292e-04, 4.26287709e-02,
       1.89832237e-01, 7.26428877e-02, 1.95798508e-02, 0.00000000e+00,
       1.08437954e-01])

PyCaret also supports hyperparameter tuning: `tune_model` runs a randomized search over the hyperparameter grid and scores each candidate with cross-validation

In [9]:
# Search for better hyperparameters for the gradient boosting model and keep
# the tuned estimator under a separate name so the untuned one stays available.
GBC2 = tune_model(estimator=GBC)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8671,0.8699,0.4476,0.8205,0.5792,0.5083,0.5412
1,0.8629,0.8666,0.5175,0.7327,0.6066,0.5265,0.5381
2,0.8629,0.8651,0.4685,0.7701,0.5826,0.5063,0.5287
3,0.8629,0.8486,0.4056,0.8406,0.5472,0.4777,0.5219
4,0.8757,0.8619,0.5105,0.8111,0.6266,0.5566,0.5781
5,0.85,0.8396,0.4583,0.7097,0.557,0.4717,0.488
6,0.8286,0.8152,0.3819,0.6395,0.4783,0.3834,0.4017
7,0.8914,0.8958,0.5556,0.8696,0.678,0.6164,0.6388
8,0.8471,0.8634,0.4306,0.7126,0.5368,0.4519,0.4725
9,0.8798,0.8783,0.5175,0.8315,0.6379,0.5705,0.5936


### Ensemble

In [10]:
# Re-run the model comparison with the default sort order and keep the three
# top-ranked models (returned as a list) for the ensemble.
best3 = compare_models(
    n_select=3,
)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.8623,0.8665,0.4687,0.7708,0.5809,0.5046,0.5279,0.124
rf,Random Forest Classifier,0.8616,0.8535,0.463,0.7709,0.578,0.5013,0.5249,0.119
lightgbm,Light Gradient Boosting Machine,0.8594,0.8585,0.4986,0.7306,0.5914,0.5104,0.5246,0.066
ada,Ada Boost Classifier,0.853,0.8466,0.4715,0.7145,0.5674,0.4834,0.4989,0.044
et,Extra Trees Classifier,0.8441,0.8303,0.4352,0.6894,0.5331,0.4454,0.4626,0.11
lda,Linear Discriminant Analysis,0.8367,0.8305,0.3529,0.6999,0.4682,0.3844,0.4162,0.007
ridge,Ridge Classifier,0.832,0.0,0.2594,0.7642,0.3863,0.3158,0.3784,0.004
qda,Quadratic Discriminant Analysis,0.7951,0.0,0.0,0.0,0.0,0.0,0.0,0.015
dt,Decision Tree Classifier,0.7911,0.6883,0.514,0.4902,0.5016,0.3696,0.3699,0.008
lr,Logistic Regression,0.7891,0.6652,0.0523,0.3843,0.0918,0.0459,0.0758,0.013


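Soft voting method: blend the three selected models into a voting ensemble that averages their predicted class probabilities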

In [11]:
# Combine the three selected models into a voting ensemble.
# method='soft' is requested explicitly so the ensemble averages predicted
# probabilities, as the variable name and narration state, instead of relying
# on the library default for `method`, which differs between PyCaret releases.
soft_voting = blend_models(best3, method='soft')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8743,0.8794,0.4755,0.8395,0.6071,0.539,0.5699
1,0.8557,0.8692,0.4895,0.7143,0.5809,0.4974,0.5104
2,0.8643,0.8742,0.4965,0.7553,0.5992,0.5216,0.5383
3,0.86,0.8517,0.4126,0.8082,0.5463,0.4736,0.5111
4,0.8671,0.8687,0.4755,0.7907,0.5939,0.5203,0.5443
5,0.8557,0.8421,0.4653,0.7363,0.5702,0.4888,0.5074
6,0.84,0.8329,0.4097,0.686,0.513,0.4245,0.4447
7,0.8886,0.8951,0.5417,0.8667,0.6667,0.604,0.6281
8,0.8529,0.8689,0.4375,0.7412,0.5502,0.4692,0.4925
9,0.887,0.8758,0.5804,0.8137,0.6776,0.6113,0.6242


## Results & Accuracy

In [12]:
# Open the interactive evaluation widget (plot-type selector) for the blended ensemble.
evaluate_model(estimator=soft_voting)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [13]:
# Open the same interactive evaluation widget for the single gradient boosting
# model, for comparison with the ensemble.
evaluate_model(estimator=GBC)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

The library supports further features that are not covered here, such as predicting on new data, connecting to a cloud server, saving and loading models, and optimizing models.