# Problem description

**Context**
---
Churn prediction means detecting which customers are likely to cancel a subscription to a service based on how they use the service. It is a critical prediction for many businesses because acquiring new clients often costs more than retaining existing ones.

**Objective**
---

To build a model which would predict this user have exited or not

**Features**
---
  
 0.   RowNumber
 1.   CustomerId
 2.   Surname
 3.   CreditScore
 4.   Geography
 5.   Gender
 6.   Age
 7.   Tenure
 8.   Balance
 9.   NumOfProducts
 10.  HasCrCard
 11.  IsActiveMember
 12.  EstimatedSalary


**Label**
---

 1.  Exited  


# **Reading dataset**

In [3]:
import pandas as pd

In [4]:
df = pd.read_csv('Churn_Modelling.csv')

# **Feature engineering**

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


In [6]:
df.head(5)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [7]:
# remove identifier features
df.drop(labels=['RowNumber',	'CustomerId',	'Surname'], axis=1, inplace=True)

In [8]:
df.head(5)

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [9]:
# label encoding of Gender
# One hot encoding of Geography
df = pd.concat([df,
           pd.get_dummies(df['Gender'], prefix='Gender', drop_first=True),
           pd.get_dummies(df['Geography'], prefix='Geography')], axis=1)
df = df.drop(columns=['Gender', 'Geography'])

In [10]:
y = df['Exited']

In [11]:
X = df.drop(columns='Exited')

In [14]:
# imbalance
y.value_counts(normalize=True)

0    0.7963
1    0.2037
Name: Exited, dtype: float64

# **Splitting dataset**

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    stratify=y, random_state=42,)

# **Over sampling**

In [17]:
from imblearn.over_sampling import RandomOverSampler


In [18]:
oversample = RandomOverSampler()

In [19]:
X_train, y_train = oversample.fit_resample(X_train, y_train)

# **Feature Standardization**

In [20]:
from sklearn.preprocessing import StandardScaler

In [21]:
scaler = StandardScaler()

In [22]:
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Evaluating

In [23]:
from sklearn.metrics import classification_report

In [24]:
def compute_classification_report(y_true, y_pred):
  target_names = ['No', 'Yes']
  print(classification_report(y_true, y_pred, target_names=target_names, digits=4))

# **KNN**

In [25]:
from sklearn.neighbors import KNeighborsClassifier

In [26]:
clf = KNeighborsClassifier(n_neighbors=13, p=2)

In [27]:
clf.fit(X_train, y_train)

In [28]:
y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)

In [29]:
compute_classification_report(y_train_pred, y_train)

              precision    recall  f1-score   support

          No     0.7912    0.8354    0.8127      6033
         Yes     0.8441    0.8017    0.8224      6707

    accuracy                         0.8177     12740
   macro avg     0.8177    0.8186    0.8175     12740
weighted avg     0.8191    0.8177    0.8178     12740



In [30]:
compute_classification_report(y_test_pred, y_test)

              precision    recall  f1-score   support

          No     0.7558    0.9005    0.8218      1337
         Yes     0.6732    0.4133    0.5121       663

    accuracy                         0.7390      2000
   macro avg     0.7145    0.6569    0.6670      2000
weighted avg     0.7284    0.7390    0.7192      2000



# **Naive bayes**

In [31]:
from sklearn.naive_bayes import GaussianNB

In [32]:
clf = GaussianNB()

In [33]:
clf.fit(X_train, y_train)

In [34]:
y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)

In [35]:
compute_classification_report(y_train_pred, y_train)

              precision    recall  f1-score   support

          No     0.7435    0.6917    0.7167      6847
         Yes     0.6686    0.7227    0.6946      5893

    accuracy                         0.7060     12740
   macro avg     0.7060    0.7072    0.7056     12740
weighted avg     0.7088    0.7060    0.7065     12740



In [36]:
compute_classification_report(y_test_pred, y_test)

              precision    recall  f1-score   support

          No     0.7238    0.8924    0.7993      1292
         Yes     0.6585    0.3785    0.4807       708

    accuracy                         0.7105      2000
   macro avg     0.6911    0.6355    0.6400      2000
weighted avg     0.7007    0.7105    0.6865      2000



# **SVM**

In [37]:
from sklearn.svm import SVC

In [38]:
clf = SVC()

In [39]:
clf.fit(X_train, y_train)

In [40]:
y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)

In [41]:
compute_classification_report(y_train_pred, y_train)

              precision    recall  f1-score   support

          No     0.8176    0.8191    0.8184      6358
         Yes     0.8195    0.8179    0.8187      6382

    accuracy                         0.8185     12740
   macro avg     0.8185    0.8185    0.8185     12740
weighted avg     0.8185    0.8185    0.8185     12740



In [42]:
compute_classification_report(y_test_pred, y_test)

              precision    recall  f1-score   support

          No     0.7972    0.9263    0.8570      1371
         Yes     0.7518    0.4865    0.5907       629

    accuracy                         0.7880      2000
   macro avg     0.7745    0.7064    0.7238      2000
weighted avg     0.7830    0.7880    0.7732      2000



# **DT**

In [43]:
from sklearn.tree import DecisionTreeClassifier

In [49]:
clf = DecisionTreeClassifier(criterion='gini',
                             splitter='best',
                             max_depth=3,
                             min_samples_split=2,
                             min_samples_leaf=1,
                             random_state=0)

In [50]:
clf.fit(X_train, y_train)

In [51]:
y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)

In [52]:
compute_classification_report(y_train_pred, y_train)

              precision    recall  f1-score   support

          No     0.8209    0.6887    0.7490      7593
         Yes     0.6289    0.7783    0.6957      5147

    accuracy                         0.7249     12740
   macro avg     0.7249    0.7335    0.7223     12740
weighted avg     0.7433    0.7249    0.7274     12740



In [53]:
compute_classification_report(y_test_pred, y_test)

              precision    recall  f1-score   support

          No     0.8016    0.8993    0.8477      1420
         Yes     0.6486    0.4552    0.5350       580

    accuracy                         0.7705      2000
   macro avg     0.7251    0.6772    0.6913      2000
weighted avg     0.7573    0.7705    0.7570      2000



### Ensemble Techniques

# **RF**

In [55]:
from sklearn.ensemble import RandomForestClassifier

In [66]:
clf = RandomForestClassifier(n_estimators=30, 
                             criterion='gini',
                             max_depth=6,
                             min_samples_split=8,
                             min_samples_leaf=2,
                             random_state=0)

In [67]:
clf.fit(X_train, y_train)

In [68]:
y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)

In [69]:
compute_classification_report(y_train_pred, y_train)

              precision    recall  f1-score   support

          No     0.8276    0.7830    0.8047      6733
         Yes     0.7706    0.8172    0.7932      6007

    accuracy                         0.7991     12740
   macro avg     0.7991    0.8001    0.7990     12740
weighted avg     0.8008    0.7991    0.7993     12740



In [70]:
compute_classification_report(y_test_pred, y_test)

              precision    recall  f1-score   support

          No     0.8073    0.9212    0.8605      1396
         Yes     0.7297    0.4917    0.5875       604

    accuracy                         0.7915      2000
   macro avg     0.7685    0.7065    0.7240      2000
weighted avg     0.7839    0.7915    0.7781      2000



# **AdaBoostClassifier**

In [71]:
from sklearn.ensemble import AdaBoostClassifier

In [77]:
clf = AdaBoostClassifier(n_estimators=70, learning_rate=0.5, random_state=0)

In [78]:
clf.fit(X_train, y_train)

In [79]:
y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)

In [80]:
compute_classification_report(y_train_pred, y_train)

              precision    recall  f1-score   support

          No     0.7915    0.7595    0.7752      6639
         Yes     0.7493    0.7823    0.7655      6101

    accuracy                         0.7704     12740
   macro avg     0.7704    0.7709    0.7703     12740
weighted avg     0.7713    0.7704    0.7705     12740



In [81]:
compute_classification_report(y_test_pred, y_test)

              precision    recall  f1-score   support

          No     0.7884    0.9256    0.8515      1357
         Yes     0.7518    0.4759    0.5829       643

    accuracy                         0.7810      2000
   macro avg     0.7701    0.7007    0.7172      2000
weighted avg     0.7767    0.7810    0.7651      2000



# **Gradient boosting**

In [82]:
from sklearn.ensemble import GradientBoostingClassifier

In [93]:
clf = GradientBoostingClassifier(
    learning_rate=0.3,
    n_estimators=100,
    max_depth=4,
    random_state=0
)

In [94]:
clf.fit(X_train, y_train)

In [95]:
y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)

In [96]:
compute_classification_report(y_train_pred, y_train)

              precision    recall  f1-score   support

          No     0.8970    0.9135    0.9052      6255
         Yes     0.9151    0.8988    0.9069      6485

    accuracy                         0.9060     12740
   macro avg     0.9060    0.9062    0.9060     12740
weighted avg     0.9062    0.9060    0.9061     12740



In [97]:
compute_classification_report(y_test_pred, y_test)

              precision    recall  f1-score   support

          No     0.8481    0.9166    0.8810      1474
         Yes     0.6978    0.5399    0.6088       526

    accuracy                         0.8175      2000
   macro avg     0.7729    0.7282    0.7449      2000
weighted avg     0.8086    0.8175    0.8094      2000



# **XGBoost**

In [98]:
import xgboost as xgb

In [99]:
clf = xgb.XGBClassifier(learning_rate=0.1,
                        max_depth=3,
                        n_estimators=100, 
                        random_state=0)


In [100]:
clf.fit(X_train, y_train)

In [101]:
y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)

In [102]:
compute_classification_report(y_train_pred, y_train)

              precision    recall  f1-score   support

          No     0.8177    0.7885    0.8029      6606
         Yes     0.7807    0.8107    0.7954      6134

    accuracy                         0.7992     12740
   macro avg     0.7992    0.7996    0.7991     12740
weighted avg     0.7999    0.7992    0.7993     12740



In [103]:
compute_classification_report(y_test_pred, y_test)

              precision    recall  f1-score   support

          No     0.8016    0.9314    0.8617      1371
         Yes     0.7690    0.4976    0.6042       629

    accuracy                         0.7950      2000
   macro avg     0.7853    0.7145    0.7330      2000
weighted avg     0.7914    0.7950    0.7807      2000

