In [4]:
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from imblearn.combine import SMOTEENN

In [5]:
df = pd.read_csv('Churn_Modelling.csv')

In [6]:
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [7]:
df.tail()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
9995,9996,15606229,Obijiaku,771,France,Male,39,5,0.0,2,1,0,96270.64,0
9996,9997,15569892,Johnstone,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,9998,15584532,Liu,709,France,Female,36,7,0.0,1,0,1,42085.58,1
9998,9999,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1
9999,10000,15628319,Walker,792,France,Female,28,4,130142.79,1,1,0,38190.78,0


In [19]:
# creating x and y variables

x = df.select_dtypes(exclude='object').drop('Exited', axis=1)
print(x)

      RowNumber  CustomerId  CreditScore  Age  Tenure    Balance  \
0             1    15634602          619   42       2       0.00   
1             2    15647311          608   41       1   83807.86   
2             3    15619304          502   42       8  159660.80   
3             4    15701354          699   39       1       0.00   
4             5    15737888          850   43       2  125510.82   
...         ...         ...          ...  ...     ...        ...   
9995       9996    15606229          771   39       5       0.00   
9996       9997    15569892          516   35      10   57369.61   
9997       9998    15584532          709   36       7       0.00   
9998       9999    15682355          772   42       3   75075.31   
9999      10000    15628319          792   28       4  130142.79   

      NumOfProducts  HasCrCard  IsActiveMember  EstimatedSalary  
0                 1          1               1        101348.88  
1                 1          0               1     

In [20]:
y = df['Exited']
print(y)

0       1
1       0
2       1
3       0
4       0
       ..
9995    0
9996    0
9997    1
9998    1
9999    0
Name: Exited, Length: 10000, dtype: int64


In [21]:
df['Exited'].value_counts()

0    7963
1    2037
Name: Exited, dtype: int64

In [22]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2)

In [23]:
# decision tree classifier
model_dt = DecisionTreeClassifier(criterion='gini', random_state=100, max_depth=6, min_samples_leaf=8)

In [24]:
print(f"x_train data type: {type(x_train)}")
print(f"y_train data type: {type(y_train)}")

x_train data type: <class 'pandas.core.frame.DataFrame'>
y_train data type: <class 'pandas.core.series.Series'>


In [25]:
model_dt.fit(x_train, y_train)

In [26]:
y_pred = model_dt.predict(x_test)

In [27]:
y_pred

array([0, 1, 0, ..., 0, 0, 0])

In [28]:
model_dt.score(x_test, y_pred)

1.0

In [30]:
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.87      0.96      0.91      1596
           1       0.72      0.43      0.53       404

    accuracy                           0.85      2000
   macro avg       0.79      0.69      0.72      2000
weighted avg       0.84      0.85      0.83      2000



In [29]:
print(confusion_matrix(y_test, y_pred))

[[1529   67]
 [ 232  172]]


In [36]:
sm = SMOTEENN()
x_resample, y_resample = sm.fit_resample(x,y)

In [37]:
xr_train, xr_test, yr_train, yr_test = train_test_split(x_resample,y_resample, test_size = 0.2)

In [38]:
model = DecisionTreeClassifier(criterion='gini', random_state=100,max_depth=6, min_samples_leaf=8)

In [39]:
model.fit(xr_train, yr_train)

In [46]:
yr_pred = model.predict(xr_test)

In [47]:
print(classification_report(yr_test, yr_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.80      0.69      0.74       644
           1       0.82      0.89      0.85      1000

    accuracy                           0.81      1644
   macro avg       0.81      0.79      0.80      1644
weighted avg       0.81      0.81      0.81      1644

