In [1]:
import os

import pandas as pd

In [2]:
import numpy as np

# Churn modeling with a decision tree

In [3]:
# Location of the churn data set. Set the CHURN_DATA_PATH environment variable
# to point at your own copy of the CSV; the original local path stays the
# default so existing runs keep working unchanged.
DATA_PATH = os.environ.get('CHURN_DATA_PATH', '/Users/manu/Downloads/Churn_Modelling.csv')
df = pd.read_csv(DATA_PATH)

In [4]:
# Preview the first three rows of the loaded data.
df.head(3)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1


### Observation 1

1.1 Columns such as RowNumber, CustomerId and Surname are not needed for model building: they only identify a customer and carry no statistical signal.<br/> 
1.2 The column Exited is our target (dependent) feature.

In [6]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(10000, 14)

In [7]:
# Per-column dtype and non-null count, to spot missing values and non-numeric columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


### Observation 2

2.1 The columns 'Geography' and 'Gender' have the object data type and need to be converted to a numerical data type.

In [8]:
# Collect the column labels of the loaded frame as a plain Python list.
feature_names = list(df.columns)

In [9]:
# Show the collected column names.
feature_names

['RowNumber',
 'CustomerId',
 'Surname',
 'CreditScore',
 'Geography',
 'Gender',
 'Age',
 'Tenure',
 'Balance',
 'NumOfProducts',
 'HasCrCard',
 'IsActiveMember',
 'EstimatedSalary',
 'Exited']

In [15]:
# One-hot encode the two categorical columns directly; no loop over the column
# list is needed because only these two columns are encoded.
# drop_first=True omits the indicator for the first category of each column
# (rows of that category are encoded as all zeros), which avoids a redundant
# indicator column.
Gender_dummies = pd.get_dummies(df['Gender'], drop_first=True)
Geography_dummies = pd.get_dummies(df['Geography'], drop_first=True)

In [16]:
# Inspect the indicator frame produced for the Gender column.
Gender_dummies

Unnamed: 0,Male
0,0
1,0
2,0
3,0
4,0
...,...
9995,1
9996,1
9997,0
9998,1


In [17]:
# Inspect the indicator frame produced for the Geography column.
Geography_dummies

Unnamed: 0,Germany,Spain
0,0,0
1,0,1
2,0,0
3,0,0
4,0,1
...,...,...
9995,0,0
9996,0,0
9997,0,0
9998,1,0


In [18]:
# Work on an independent (deep) copy so the loaded frame `df` stays untouched.
df1 = df.copy(deep=True)

In [19]:
# Append the indicator columns side by side (axis=1) to the loaded data.
# The result is built from `df` rather than from `df1` itself so that this cell
# is idempotent: re-running it cannot stack a second set of indicator columns
# onto an already-extended `df1`.
df1 = pd.concat([df, Gender_dummies, Geography_dummies], axis=1)

In [20]:
# Confirm that the indicator columns were appended on the right.
df1.head(3)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Male,Germany,Spain
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1,0,0,0
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0,0,0,1
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1,0,0,0


In [22]:
# Remove the identifier columns and the two original text columns, which are
# now represented by their indicator columns.
df1 = df1.drop(columns=['RowNumber', 'CustomerId', 'Surname', 'Geography', 'Gender'])

### Observation 3

3.1 Drop all columns that have a string (object) data type or are irrelevant from a modeling point of view.

In [23]:
# Preview the frame after the drop; only numeric columns should remain.
df1.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Male,Germany,Spain
0,619,42,2,0.0,1,1,1,101348.88,1,0,0,0
1,608,41,1,83807.86,1,0,1,112542.58,0,0,0,1
2,502,42,8,159660.8,3,1,0,113931.57,1,0,0,0
3,699,39,1,0.0,2,0,0,93826.63,0,0,0,0
4,850,43,2,125510.82,1,1,1,79084.1,0,0,0,1


In [25]:
# Re-check dtypes and non-null counts of the reduced frame.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CreditScore      10000 non-null  int64  
 1   Age              10000 non-null  int64  
 2   Tenure           10000 non-null  int64  
 3   Balance          10000 non-null  float64
 4   NumOfProducts    10000 non-null  int64  
 5   HasCrCard        10000 non-null  int64  
 6   IsActiveMember   10000 non-null  int64  
 7   EstimatedSalary  10000 non-null  float64
 8   Exited           10000 non-null  int64  
 9   Male             10000 non-null  uint8  
 10  Germany          10000 non-null  uint8  
 11  Spain            10000 non-null  uint8  
dtypes: float64(2), int64(7), uint8(3)
memory usage: 732.5 KB


In [26]:
# Split the data set into dependent and independent features.
# Independent features: every column except the target 'Exited'.
x = df1.drop(columns=['Exited'])

In [27]:
# Dependent feature (target): the 'Exited' column.
y = df1['Exited']

In [28]:
# Inspect the target series.
y

0       1
1       0
2       1
3       0
4       0
       ..
9995    0
9996    0
9997    1
9998    1
9999    0
Name: Exited, Length: 10000, dtype: int64

In [29]:
from sklearn.preprocessing import StandardScaler

In [30]:
# Scaler instance with default settings; it is fitted in a later cell.
std_scaler = StandardScaler()

In [31]:
# Standardise every feature column; `x` is rebound to the array the scaler returns.
# NOTE(review): the scaler is fitted on the full data set before the train/test
# split is made, so statistics of the test rows leak into the scaling. Fitting
# on the training split only and calling transform() on the test split would
# avoid this.
x = std_scaler.fit_transform(x)

In [32]:
# Inspect the scaled feature array.
x

array([[-0.32622142,  0.29351742, -1.04175968, ..., -1.09598752,
        -0.57873591, -0.57380915],
       [-0.44003595,  0.19816383, -1.38753759, ..., -1.09598752,
        -0.57873591,  1.74273971],
       [-1.53679418,  0.29351742,  1.03290776, ..., -1.09598752,
        -0.57873591, -0.57380915],
       ...,
       [ 0.60498839, -0.27860412,  0.68712986, ..., -1.09598752,
        -0.57873591, -0.57380915],
       [ 1.25683526,  0.29351742, -0.69598177, ...,  0.91241915,
         1.72790383, -0.57380915],
       [ 1.46377078, -1.04143285, -0.35020386, ..., -1.09598752,
        -0.57873591, -0.57380915]])

In [33]:
from sklearn.model_selection import train_test_split

In [34]:
# Hold out 25% of the rows as the test split; the fixed seed makes the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=1,
)

In [35]:
from sklearn.tree import DecisionTreeClassifier

In [36]:
# Baseline decision tree with default hyper-parameters.
# random_state is fixed because scikit-learn randomly permutes the candidate
# features at each split, so an unseeded tree (and every score derived from it)
# can differ from run to run.
dtc = DecisionTreeClassifier(random_state=1)

In [37]:
# Train the baseline tree on the training split.
dtc.fit(x_train, y_train)

In [38]:
from sklearn.metrics import accuracy_score

In [39]:
# Predict labels for the held-out test split.
y_predicted = dtc.predict(x_test)

In [40]:
# Fraction of test rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_predicted)

0.7968

In [41]:
# Hyper-parameter grid for the first search: both split criteria and tree
# depths 1 through 7, plus None (no depth limit).
para_dict = dict(
    criterion=['gini', 'entropy'],
    max_depth=[*range(1, 8), None],
)

In [42]:
from sklearn.model_selection import GridSearchCV

In [45]:
# 10-fold cross-validated grid search over `para_dict`; n_jobs=-1 runs the
# candidate fits in parallel on all available processors.
gsCV = GridSearchCV(
    estimator=dtc,
    param_grid=para_dict,
    cv=10,
    n_jobs=-1,
)

In [46]:
# Run the search on the training split only; the test split is not used here.
gsCV.fit(x_train, y_train)

In [47]:
# Estimator configured with the best parameter combination found by the search.
gsCV.best_estimator_

In [48]:
# Mean cross-validated score of the best parameter combination. It is computed
# on folds of the training split, so it is not a test-set accuracy.
gsCV.best_score_

0.8566666666666667

In [50]:
# Wider grid for a second search: both split criteria, depths 1 through 9 and
# 11, plus None (no depth limit).
# NOTE(review): depth 10 is absent from the list - confirm whether that is intentional.
para_dict_2 = dict(
    criterion=['gini', 'entropy'],
    max_depth=[*range(1, 10), 11, None],
)

In [51]:
# Second 10-fold grid search, this time over the wider `para_dict_2` grid.
# Rebinding `gsCV` replaces the search object created for the first grid.
gsCV = GridSearchCV(
    estimator=dtc,
    param_grid=para_dict_2,
    cv=10,
    n_jobs=-1,
)

In [52]:
# Run the second search on the training split only.
gsCV.fit(x_train, y_train)

In [53]:
# Estimator configured with the best parameter combination of the second search.
gsCV.best_estimator_

In [54]:
# Mean cross-validated score of the best combination in the second search
# (computed on folds of the training split, not on the test split).
gsCV.best_score_

0.8569333333333333

In [55]:
# Parameter combination that achieved the best mean cross-validated score.
gsCV.best_params_

{'criterion': 'entropy', 'max_depth': 7}