## Model Training

#### 1.1 Import Data and Required Packages
##### Importing Pandas, Numpy, Matplotlib, Seaborn and Warings Library.

In [1]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Modelling
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from xgboost import XGBClassifier
# Evaluation
from sklearn.metrics import precision_score, recall_score, f1_score


import warnings

**Connect to kaggle**

In [2]:
!mkdir ~/.kaggle

In [3]:
!cp kaggle.json ~/.kaggle/

In [4]:
!chmod 600 ~/.kaggle/kaggle.json

In [5]:
!kaggle datasets download "gauravtopre/bank-customer-churn-dataset"

Downloading bank-customer-churn-dataset.zip to /content
  0% 0.00/187k [00:00<?, ?B/s]
100% 187k/187k [00:00<00:00, 126MB/s]


In [6]:
!unzip /content/bank-customer-churn-dataset.zip

Archive:  /content/bank-customer-churn-dataset.zip
  inflating: Bank Customer Churn Prediction.csv  


**Read Data**

In [7]:
data = pd.read_csv("/content/Bank Customer Churn Prediction.csv")

#### Show Top 5 Records

In [8]:
data.head()

Unnamed: 0,customer_id,credit_score,country,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn
0,15634602,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,15647311,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,15619304,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,15701354,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,15737888,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


### Dataset preparation

In [9]:
data.drop("customer_id", axis=1, inplace=True)
data.head()

Unnamed: 0,credit_score,country,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


#### Preparing X and Y variables

In [10]:
X = data.drop(columns=["churn"],axis=1)

In [11]:
X.head()

Unnamed: 0,credit_score,country,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary
0,619,France,Female,42,2,0.0,1,1,1,101348.88
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58
2,502,France,Female,42,8,159660.8,3,1,0,113931.57
3,699,France,Female,39,1,0.0,2,0,0,93826.63
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1


In [12]:
# define numerical & categorical columns
numeric_features = [feature for feature in data.columns if data[feature].dtype != 'O']
categorical_features = [feature for feature in data.columns if data[feature].dtype == 'O']

# print columns
print('We have {} numerical features : {}'.format(len(numeric_features), numeric_features))
print('\nWe have {} categorical features : {}'.format(len(categorical_features), categorical_features))

We have 9 numerical features : ['credit_score', 'age', 'tenure', 'balance', 'products_number', 'credit_card', 'active_member', 'estimated_salary', 'churn']

We have 2 categorical features : ['country', 'gender']


In [13]:
print(f"Categories in 'country' variable: {data['country'].unique()}")

print(f"Categories in 'gender' variable:  {data['gender'].unique()}")

Categories in 'country' variable: ['France' 'Spain' 'Germany']
Categories in 'gender' variable:  ['Female' 'Male']


In [14]:
y = data['churn']

In [15]:
# Create Column Transformer with 3 types of transformers
num_features = X.select_dtypes(exclude="object").columns
cat_features = X.select_dtypes(include="object").columns

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, cat_features),
         ("StandardScaler", numeric_transformer, num_features),
    ]
)

In [16]:
X = preprocessor.fit_transform(X)

In [17]:
X.shape

(10000, 13)

### Separate dataset into train and test

In [18]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape, X_test.shape

((8000, 13), (2000, 13))

#### Create an Evaluate Function to give all metrics after model Training

**Metrics selected :**
- Precision : It is a measure of how many of the predicted positive instances are actually positive.
- Recall (sensitivity) : It is a measure of how many of the actual positive instances were captured by the model.
- F1 score : Is a metric that combines both precision and recall into a single value, providing a balance between the two.

**In our study, False Negatives are Costly (not predicting customer churn), so I'll focus on recall.**

In [19]:
def evaluate_model(true, predicted):
    precision = precision_score(true, predicted)
    recall = recall_score(true, predicted)
    F1_score = f1_score(true, predicted)
    return precision, recall, F1_score

In [20]:
models = {
    "SVC": SVC(),
    "K-Neighbors Classifier": KNeighborsClassifier(),
    "AdaBoost Classifier": AdaBoostClassifier(),
    "Random Forest Classifier": RandomForestClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "XGBClassifier": XGBClassifier(),
}

model_list = []
recall_list =[]

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Evaluate Train and Test dataset
    model_train_precision, model_train_recall , model_train_F1_score = evaluate_model(y_train, y_train_pred)

    model_test_precision, model_test_recall , model_test_F1_score = evaluate_model(y_test, y_test_pred)


    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model performance for Training set')
    print("- Precision score: {:.4f}".format(model_train_precision))
    print("- Recall Score: {:.4f}".format(model_train_recall))
    print("- Mean Absolute Error: {:.4f}".format(model_train_F1_score))

    print('----------------------------------')

    print('Model performance for Test set')
    print("- Precision score: {:.4f}".format(model_test_precision))
    print("- Recall Score: {:.4f}".format(model_test_recall))
    print("- F1_score: {:.4f}".format(model_test_F1_score))

    recall_list.append(model_test_recall)

    print('='*35)
    print('\n')

SVC
Model performance for Training set
- Precision score: 0.8513
- Recall Score: 0.4039
- Mean Absolute Error: 0.5479
----------------------------------
Model performance for Test set
- Precision score: 0.7892
- Recall Score: 0.3715
- F1_score: 0.5052


K-Neighbors Classifier
Model performance for Training set
- Precision score: 0.8159
- Recall Score: 0.5310
- Mean Absolute Error: 0.6433
----------------------------------
Model performance for Test set
- Precision score: 0.6320
- Recall Score: 0.4020
- F1_score: 0.4914


AdaBoost Classifier
Model performance for Training set
- Precision score: 0.7354
- Recall Score: 0.4836
- Mean Absolute Error: 0.5835
----------------------------------
Model performance for Test set
- Precision score: 0.6957
- Recall Score: 0.4885
- F1_score: 0.5740


Random Forest Classifier
Model performance for Training set
- Precision score: 1.0000
- Recall Score: 0.9988
- Mean Absolute Error: 0.9994
----------------------------------
Model performance for Test se

### Results

In [21]:
pd.DataFrame(list(zip(model_list, recall_list)), columns=['Model Name', 'Recall']).sort_values(by=["Recall"],ascending=False)

Unnamed: 0,Model Name,Recall
5,XGBClassifier,0.513995
4,Decision Tree,0.491094
2,AdaBoost Classifier,0.48855
3,Random Forest Classifier,0.473282
1,K-Neighbors Classifier,0.402036
0,SVC,0.371501
