# Use Case - Churn Prediction

> Data Mining for Business Analytics ADS-505
>
> Fatemeh Khosravi - Stephen Kuc - Ruddy Simonpour

In [43]:
# import dependencies
import pandas as pd
import numpy as np
import csv
import os
import matplotlib.pylab as plt
import statsmodels.tools.tools as stattools

In [44]:
# Directory that holds the project's CSV files. Set the CHURN_DATA_DIR
# environment variable to run the notebook on another machine; when it is
# unset, the original author-local path below is used unchanged.
DATA_DIR = os.environ.get(
    "CHURN_DATA_DIR",
    "/Users/ruddysimonpour/Desktop/ADS505-Applied Data Sci for Business/Project-ADS505/ADS505-Finall-project/Dataset",
)
# the dataset is read by bare file name, so the working directory must be the data directory
os.chdir(DATA_DIR)

In [45]:
# load the bank churn records from the CSV in the current working directory
churn_df = pd.read_csv(
    filepath_or_buffer="Bank Customer Churn Prediction.csv",
    low_memory=False,
)

In [46]:
# print data
# Preview ten randomly chosen rows. random_state pins the draw so that a
# Restart & Run All displays the same rows every time.
churn_df.sample(10, random_state=42)

Unnamed: 0,customer_id,credit_score,country,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn
5428,15623082,507,France,Female,35,2,0.0,2,1,0,97633.93,0
9307,15680405,685,France,Male,40,2,168001.34,2,1,1,167400.29,0
6726,15633213,628,Spain,Male,50,8,0.0,1,0,0,144366.83,1
8855,15689953,697,Spain,Male,43,10,128226.37,1,0,0,188486.94,0
1513,15729040,440,France,Male,42,2,0.0,2,1,0,49826.68,0
2384,15758531,732,France,Female,40,10,0.0,2,1,0,154189.08,0
2973,15651983,591,France,Female,56,9,128882.49,1,1,1,196241.94,1
7389,15676909,667,Spain,Female,34,5,0.0,2,1,0,163830.64,0
6647,15691627,713,France,Female,37,8,0.0,1,1,1,16403.41,0
3494,15568120,681,France,Female,37,7,69609.85,1,1,1,72127.83,0


In [47]:
# checking the data types: list the dtype pandas assigned to every column
churn_df.dtypes

customer_id           int64
credit_score          int64
country              object
gender               object
age                   int64
tenure                int64
balance             float64
products_number       int64
credit_card           int64
active_member         int64
estimated_salary    float64
churn                 int64
dtype: object

In [48]:
# report the dataset's dimensions: row count first, then column count
print(f'Number of rows in the dataframe {len(churn_df)}')
print(f'Number of columns in the dataframe {len(churn_df.columns)}')

Number of rows in the dataframe 10000
Number of columns in the dataframe 12


In [49]:
# summary statistics, transposed so that each column of the data becomes one row
churn_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
customer_id,10000.0,15690940.0,71936.186123,15565701.0,15628528.25,15690740.0,15753230.0,15815690.0
credit_score,10000.0,650.5288,96.653299,350.0,584.0,652.0,718.0,850.0
age,10000.0,38.9218,10.487806,18.0,32.0,37.0,44.0,92.0
tenure,10000.0,5.0128,2.892174,0.0,3.0,5.0,7.0,10.0
balance,10000.0,76485.89,62397.405202,0.0,0.0,97198.54,127644.2,250898.09
products_number,10000.0,1.5302,0.581654,1.0,1.0,1.0,2.0,4.0
credit_card,10000.0,0.7055,0.45584,0.0,0.0,1.0,1.0,1.0
active_member,10000.0,0.5151,0.499797,0.0,0.0,1.0,1.0,1.0
estimated_salary,10000.0,100090.2,57510.492818,11.58,51002.11,100193.9,149388.2,199992.48
churn,10000.0,0.2037,0.402769,0.0,0.0,0.0,0.0,1.0


In [50]:
# count the missing entries in every column
churn_df.isna().sum()

customer_id         0
credit_score        0
country             0
gender              0
age                 0
tenure              0
balance             0
products_number     0
credit_card         0
active_member       0
estimated_salary    0
churn               0
dtype: int64

There are no missing values in the dataset.

In [51]:
# count rows that repeat an earlier row across all columns
# (the first occurrence of each row is not counted as a duplicate)
duplicate_values = churn_df.duplicated(subset=None, keep="first").sum()
print(f'The number of duplicated values are : {duplicate_values}')

The number of duplicated values are : 0


In [52]:
# number of distinct non-null values found in each column
churn_df.nunique(axis=0, dropna=True)

customer_id         10000
credit_score          460
country                 3
gender                  2
age                    70
tenure                 11
balance              6382
products_number         4
credit_card             2
active_member           2
estimated_salary     9999
churn                   2
dtype: int64

### Data Cleaning

There are no missing values and no duplicated rows in the dataset, so we can now drop unnecessary columns. The `customer_id` values are assigned at random and are not based on any customer attribute; therefore, we can remove this column.

In [53]:
# dropping unnecessary variables
# customer_id is only an identifier, so it is removed from the frame.
# errors="ignore" keeps this cell re-runnable: once the column is gone,
# running the cell again is a no-op instead of raising KeyError.
churn_df = churn_df.drop(columns=["customer_id"], errors="ignore")

In [60]:
# Encoding categorical variables using get_dummies
# One-hot encode the string columns. The indicator dtype is pinned to uint8
# because the library default differs between pandas versions (bool from
# pandas 2.0 on); pinning it keeps the dtype listing the same everywhere.
churn_df = pd.get_dummies(churn_df, dtype=np.uint8)
churn_df.dtypes

credit_score          int64
age                   int64
tenure                int64
balance             float64
products_number       int64
credit_card           int64
active_member         int64
estimated_salary    float64
churn                 int64
country_France        uint8
country_Germany       uint8
country_Spain         uint8
gender_Female         uint8
gender_Male           uint8
dtype: object

In [61]:
# change the data type of the dummy variables (uint8 --> int64)
def data_type_change(df, cols):
    """Return a copy of ``df`` with the given columns cast to int64.

    The input frame is left untouched, so the cell that calls this function
    can be re-run safely and earlier displays of ``df`` stay accurate.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns to convert.
    cols : str or iterable of str
        Name(s) of the columns to cast.

    Returns
    -------
    pandas.DataFrame
        New frame in which every column in ``cols`` has dtype int64; all
        other columns are carried over unchanged.

    Raises
    ------
    KeyError
        If a name in ``cols`` is not a column of ``df``.
    """
    # accept a single column name as well as a list of names
    if isinstance(cols, str):
        cols = [cols]
    # "int64" is spelled out because plain ``int`` can resolve to a 32-bit
    # integer on some platforms, which would contradict the stated target dtype
    return df.astype({col: "int64" for col in cols})

In [63]:
# cast every one-hot indicator column (country_* and gender_*) to an integer dtype
churn_df = data_type_change(
    churn_df,
    [
        "country_France",
        "country_Germany",
        "country_Spain",
        "gender_Female",
        "gender_Male",
    ],
)

Unnamed: 0,credit_score,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn,country_France,country_Germany,country_Spain,gender_Female,gender_Male
0,619,42,2,0.00,1,1,1,101348.88,1,1,0,0,1,0
1,608,41,1,83807.86,1,0,1,112542.58,0,0,0,1,1,0
2,502,42,8,159660.80,3,1,0,113931.57,1,1,0,0,1,0
3,699,39,1,0.00,2,0,0,93826.63,0,1,0,0,1,0
4,850,43,2,125510.82,1,1,1,79084.10,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,771,39,5,0.00,2,1,0,96270.64,0,1,0,0,0,1
9996,516,35,10,57369.61,1,1,1,101699.77,0,1,0,0,0,1
9997,709,36,7,0.00,1,0,1,42085.58,1,1,0,0,1,0
9998,772,42,3,75075.31,2,1,0,92888.52,1,0,1,0,0,1
