In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the raw churn table from the CSV that sits next to the notebook.
churn_csv_path = "Churn_Modelling.csv"
df = pd.read_csv(churn_csv_path)

In [3]:
# Preview the first rows of the raw frame to check the columns loaded as expected.
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [4]:
# There are no missing values in the data.

In [5]:
# Work on an independent copy so the raw frame `df` stays untouched.
data = df.copy(deep=True)

In [6]:
# RowNumber, CustomerId, and Surname play no role in churn prediction, so we drop these columns.

In [7]:
# RowNumber is just a running counter; remove it from the working frame.
data = data.drop(columns="RowNumber")

In [8]:
# CustomerId is an identifier column; remove it from the working frame.
data = data.drop(columns="CustomerId")

In [9]:
# Surname is free text that is not used as a feature; remove it.
data = data.drop(columns="Surname")


In [10]:
# Frequency of each distinct age, most common first; used to pick the age bins below.
data["Age"].value_counts()

Age
37    478
38    477
35    474
36    456
34    447
     ... 
92      2
82      1
88      1
85      1
83      1
Name: count, Length: 70, dtype: int64

In [11]:
# Bucket Age into eight 10-year bands: (15, 25], (25, 35], ..., (85, 95].
# The bands are labelled 1..8 and are used to stratify the train/test split.
age_bin_edges = list(range(15, 96, 10))  # 15, 25, ..., 95
age_bin_labels = list(range(1, 9))       # 1, 2, ..., 8
data["Age_cat"] = pd.cut(data["Age"], bins=age_bin_edges, labels=age_bin_labels)

In [12]:
#pd.set_option('display.max_rows', None)

In [13]:
# Confirm the identifier columns are gone and Age_cat has been added.
data.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Age_cat
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1,3
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0,3
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1,3
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0,3
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0,3


In [14]:
from sklearn.model_selection import StratifiedShuffleSplit

In [15]:
# Hold out 20% of the rows as a test set, stratified on the age band so both
# subsets keep the same age distribution. random_state pins the shuffle.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# split() yields *positional* row indices, so select with .iloc. Label-based
# .loc only gives the same rows while `data` keeps its default 0..n-1 index.
# n_splits=1, so there is exactly one (train, test) pair to take.
train_index, test_index = next(split.split(data, data["Age_cat"]))

# .copy() makes each subset an independent frame, so later edits to it do not
# trigger pandas' SettingWithCopyWarning.
train_set = data.iloc[train_index].copy()
test_set = data.iloc[test_index].copy()

In [16]:
# Age_cat only existed to stratify the split; remove it from both subsets.
# Each name is rebound to a new frame instead of calling
# drop(..., inplace=True) inside a loop, so nothing is mutated in place.
train_set = train_set.drop(columns="Age_cat")
test_set = test_set.drop(columns="Age_cat")

In [17]:
# (rows, columns) of the training subset after removing Age_cat.
train_set.shape

(8000, 11)

In [18]:
# (rows, columns) of the held-out test subset after removing Age_cat.
test_set.shape

(2000, 11)

In [19]:
# Explore and preprocess on a copy so `train_set` itself is left as split.
churn = train_set.copy(deep=True)

In [20]:
# Column names available in the training copy (features plus the Exited target).
churn.columns

Index(['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance',
       'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary',
       'Exited'],
      dtype='object')

In [21]:
# Separate the target from the predictors.
# The label is an independent copy of the Exited column of the training subset.
churn_label = train_set["Exited"].copy()
# The features are every column of the training copy except the target.
churn_feat = churn.drop(columns="Exited")

In [22]:
# Inspect the feature frame; it still carries the original row labels from `data`.
churn_feat

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
6374,503,France,Male,38,1,0.00,2,1,1,95153.24
7248,605,Spain,Female,57,2,0.00,3,1,0,66652.75
7848,433,France,Female,49,10,0.00,1,1,1,87711.61
5780,604,Germany,Female,42,10,166031.45,1,1,0,98293.14
4561,562,Germany,Male,31,4,127237.25,2,0,1,143317.42
...,...,...,...,...,...,...,...,...,...,...
2022,501,Germany,Male,24,4,130806.42,2,1,0,80241.14
62,555,Spain,Male,33,1,56084.69,2,0,0,178798.13
9216,724,France,Male,30,10,0.00,2,1,1,54265.55
4488,716,France,Female,44,6,155114.90,1,0,0,133871.83


In [23]:
# Inspect the target series (Exited), indexed by the original row labels.
churn_label

6374    0
7248    1
7848    0
5780    0
4561    0
       ..
2022    0
62      0
9216    0
4488    0
5242    1
Name: Exited, Length: 8000, dtype: int64

In [24]:
from sklearn.preprocessing import OneHotEncoder

In [25]:
# Distinct Geography values present in the training features.
set(churn_feat["Geography"])

{'France', 'Germany', 'Spain'}

In [26]:
# Pull out the Geography column (a Series) for one-hot encoding.
geo_cat = churn_feat.loc[:, "Geography"]

In [27]:
# OneHotEncoder expects 2-D input, so wrap the Series in a one-column frame.
geo_frame = geo_cat.to_frame()
# Learn the Geography categories and encode them in one step.
# With default settings the result is a sparse matrix.
onehot_encoder = OneHotEncoder()
geo_cat = onehot_encoder.fit_transform(geo_frame)

In [28]:
# Show the encoded result's repr (a sparse matrix with one column per category).
geo_cat

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 8000 stored elements and shape (8000, 3)>

In [29]:
# Dense view of the one-hot matrix for inspection only; geo_cat itself is not reassigned.
geo_cat.toarray()

array([[1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       ...,
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.]], shape=(8000, 3))

In [30]:
# Pull out the Gender column (a Series) for one-hot encoding.
gender_cat = churn_feat.loc[:, "Gender"]

In [31]:
# Inspect the raw Gender values before encoding.
gender_cat

6374      Male
7248    Female
7848    Female
5780    Female
4561      Male
         ...  
2022      Male
62        Male
9216      Male
4488    Female
5242    Female
Name: Gender, Length: 8000, dtype: object

In [32]:
# Fit a dedicated encoder for Gender. Re-fitting the shared `onehot_encoder`
# here would overwrite the Geography categories it learned earlier, leaving
# no fitted Geography encoder to apply to the test set later.
# NOTE(review): `onehot_encoder` now stays fitted on Geography; any later cell
# that expected it to hold the Gender fit should use `gender_encoder` instead.
gender_encoder = OneHotEncoder()
# OneHotEncoder expects 2-D input, hence to_frame().
gender_cat = gender_encoder.fit_transform(gender_cat.to_frame())

In [33]:
# Show the encoded Gender result's repr (a sparse matrix with one column per category).
gender_cat

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 8000 stored elements and shape (8000, 2)>

In [34]:
# Dense view of the Gender one-hot matrix for inspection only; gender_cat is not reassigned.
gender_cat.toarray()

array([[0., 1.],
       [1., 0.],
       [1., 0.],
       ...,
       [0., 1.],
       [1., 0.],
       [1., 0.]], shape=(8000, 2))

In [35]:
# Turn the sparse one-hot matrix into a labelled DataFrame.
# Column names are derived from the data rather than hard-coded.
# OneHotEncoder orders its output columns by sorted category value, so the
# sorted unique Geography values line up with the encoded columns.
# A new or missing category then raises a length mismatch here instead of
# silently mislabelling columns.
geo_labels = sorted(churn_feat["Geography"].unique())
# The new frame gets a fresh 0..n-1 index, not the row labels of churn_feat.
geo_cat = pd.DataFrame(geo_cat.toarray(), columns=geo_labels)

In [36]:
# Inspect the labelled Geography indicator columns (note the fresh 0..n-1 index).
geo_cat

Unnamed: 0,France,Germany,Spain
0,1.0,0.0,0.0
1,0.0,0.0,1.0
2,1.0,0.0,0.0
3,0.0,1.0,0.0
4,0.0,1.0,0.0
...,...,...,...
7995,0.0,1.0,0.0
7996,0.0,0.0,1.0
7997,1.0,0.0,0.0
7998,1.0,0.0,0.0


In [37]:
# Distinct Gender values present in the training features.
set(churn_feat["Gender"])

{'Female', 'Male'}

In [38]:
# Turn the sparse one-hot matrix into a labelled DataFrame.
# Column names are derived from the data rather than hard-coded.
# OneHotEncoder orders its output columns by sorted category value, so the
# sorted unique Gender values line up with the encoded columns.
gender_labels = sorted(churn_feat["Gender"].unique())
# The new frame gets a fresh 0..n-1 index, not the row labels of churn_feat.
gender_cat = pd.DataFrame(gender_cat.toarray(), columns=gender_labels)

In [39]:
# Inspect the labelled Gender indicator columns (note the fresh 0..n-1 index).
gender_cat

Unnamed: 0,Female,Male
0,0.0,1.0
1,1.0,0.0
2,1.0,0.0
3,1.0,0.0
4,0.0,1.0
...,...,...
7995,0.0,1.0
7996,0.0,1.0
7997,0.0,1.0
7998,1.0,0.0


In [40]:
# Sanity check before concatenating: all three frames must have the same row count.
for frame_name, frame in (
    ("churn_feat", churn_feat),
    ("geo_cat", geo_cat),
    ("gender_cat", gender_cat),
):
    print(f"{frame_name} shape: {frame.shape}")

churn_feat shape: (8000, 10)
geo_cat shape: (8000, 3)
gender_cat shape: (8000, 2)


In [41]:
# churn_feat still carries the original row labels here, unlike geo_cat / gender_cat.
churn_feat

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
6374,503,France,Male,38,1,0.00,2,1,1,95153.24
7248,605,Spain,Female,57,2,0.00,3,1,0,66652.75
7848,433,France,Female,49,10,0.00,1,1,1,87711.61
5780,604,Germany,Female,42,10,166031.45,1,1,0,98293.14
4561,562,Germany,Male,31,4,127237.25,2,0,1,143317.42
...,...,...,...,...,...,...,...,...,...,...
2022,501,Germany,Male,24,4,130806.42,2,1,0,80241.14
62,555,Spain,Male,33,1,56084.69,2,0,0,178798.13
9216,724,France,Male,30,10,0.00,2,1,1,54265.55
4488,716,France,Female,44,6,155114.90,1,0,0,133871.83


In [42]:
# Replace the original row labels with 0..n-1 so churn_feat lines up with the
# freshly indexed geo_cat / gender_cat frames when concatenated column-wise.
# NOTE(review): churn_label is not reset in the cells shown, so its index no
# longer matches churn_feat by label — confirm downstream code pairs them by position.
churn_feat = churn_feat.reset_index(drop=True)

In [43]:
# Append the one-hot indicator columns to the feature frame.
# concat aligns rows by index label, so all three frames must share the same index.
encoded_parts = [churn_feat, geo_cat, gender_cat]
churn_feat = pd.concat(encoded_parts, axis="columns")

In [46]:
# The raw Geography text column is now redundant with its indicator columns.
churn_feat = churn_feat.drop(columns="Geography")

In [47]:
# The raw Gender text column is now redundant with its indicator columns.
churn_feat = churn_feat.drop(columns="Gender")

In [48]:
# Final all-numeric feature frame: original numeric columns plus the one-hot indicators.
churn_feat

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,France,Germany,Spain,Female,Male
0,503,38,1,0.00,2,1,1,95153.24,1.0,0.0,0.0,0.0,1.0
1,605,57,2,0.00,3,1,0,66652.75,0.0,0.0,1.0,1.0,0.0
2,433,49,10,0.00,1,1,1,87711.61,1.0,0.0,0.0,1.0,0.0
3,604,42,10,166031.45,1,1,0,98293.14,0.0,1.0,0.0,1.0,0.0
4,562,31,4,127237.25,2,0,1,143317.42,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,501,24,4,130806.42,2,1,0,80241.14,0.0,1.0,0.0,0.0,1.0
7996,555,33,1,56084.69,2,0,0,178798.13,0.0,0.0,1.0,0.0,1.0
7997,724,30,10,0.00,2,1,1,54265.55,1.0,0.0,0.0,0.0,1.0
7998,716,44,6,155114.90,1,0,0,133871.83,1.0,0.0,0.0,1.0,0.0
