In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import warnings

from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

warnings.filterwarnings('ignore')

In [4]:
#load dataset
df = pd.read_csv(r"C:\Users\aniketh\Downloads\Churn\Churn_Modelling.csv")

In [5]:
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [6]:
df.shape

(10000, 14)

In [7]:
df.tail()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
9995,9996,15606229,Obijiaku,771,France,Male,39,5,0.0,2,1,0,96270.64,0
9996,9997,15569892,Johnstone,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,9998,15584532,Liu,709,France,Female,36,7,0.0,1,0,1,42085.58,1
9998,9999,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1
9999,10000,15628319,Walker,792,France,Female,28,4,130142.79,1,1,0,38190.78,0


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


In [10]:
#Frequency distribution of values in variables
#Now,we check the frequency counts of categorical variables.
col_names = ['CustomerId', 'CreditScore', 'Age', 'Balance', 'NumOfProducts', 'HasCrCard', 'EstimatedSalary']
for col in col_names:
    print(df[col].value_counts()) 

CustomerId
15634602    1
15667932    1
15766185    1
15667632    1
15599024    1
           ..
15599078    1
15702300    1
15660735    1
15671390    1
15628319    1
Name: count, Length: 10000, dtype: int64
CreditScore
850    233
678     63
655     54
705     53
667     53
      ... 
404      1
351      1
365      1
417      1
419      1
Name: count, Length: 460, dtype: int64
Age
37    478
38    477
35    474
36    456
34    447
     ... 
92      2
82      1
88      1
85      1
83      1
Name: count, Length: 70, dtype: int64
Balance
0.00         3617
130170.82       2
105473.74       2
85304.27        1
159397.75       1
             ... 
81556.89        1
112687.69       1
108698.96       1
238387.56       1
130142.79       1
Name: count, Length: 6382, dtype: int64
NumOfProducts
1    5084
2    4590
3     266
4      60
Name: count, dtype: int64
HasCrCard
1    7055
0    2945
Name: count, dtype: int64
EstimatedSalary
24924.92     2
101348.88    1
55313.44     1
72500.68     1
182692.80   

In [12]:
df['IsActiveMember'].value_counts()

IsActiveMember
1    5151
0    4849
Name: count, dtype: int64

In [13]:
df.isnull().sum()

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [14]:
X = df.drop(['Exited'], axis=1)
y = df['Exited']

In [15]:
# split data into training and testing sets

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 42)

In [16]:
X_train.shape, X_test.shape

((6700, 13), (3300, 13))

In [17]:
X_train.dtypes

RowNumber            int64
CustomerId           int64
Surname             object
CreditScore          int64
Geography           object
Gender              object
Age                  int64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard            int64
IsActiveMember       int64
EstimatedSalary    float64
dtype: object

In [18]:
X_train.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
8371,8372,15661034,Ngozichukwuka,813,Germany,Female,29,5,106059.4,1,0,0,187976.88
5027,5028,15807989,Wall,681,Germany,Male,37,8,73179.34,2,1,1,25292.53
9234,9235,15766044,Cameron,642,Germany,Male,49,4,120688.61,1,1,0,24770.22
3944,3945,15794792,Golubev,612,France,Female,31,8,117989.76,1,1,1,54129.86
6862,6863,15736287,Piccio,586,France,Male,33,9,0.0,1,1,0,6975.02


In [22]:
!pip install --upgrade category_encoders

Collecting category_encoders
  Downloading category_encoders-2.6.4-py2.py3-none-any.whl.metadata (8.0 kB)
Downloading category_encoders-2.6.4-py2.py3-none-any.whl (82 kB)
   ---------------------------------------- 0.0/82.0 kB ? eta -:--:--
   ---- ----------------------------------- 10.2/82.0 kB ? eta -:--:--
   ---------------------------------------  81.9/82.0 kB 1.5 MB/s eta 0:00:01
   ---------------------------------------  81.9/82.0 kB 1.5 MB/s eta 0:00:01
   ---------------------------------------- 82.0/82.0 kB 574.1 kB/s eta 0:00:00
Installing collected packages: category_encoders
Successfully installed category_encoders-2.6.4


In [23]:
import category_encoders as ce

In [24]:
# encode categorical variables with ordinal encoding
encoder = ce.OrdinalEncoder(cols=['CustomerId', 'CreditScore', 'Age', 'Balance', 'NumOfProducts', 'HasCrCard', 'EstimatedSalary'])
X_train = encoder.fit_transform(X_train)
X_test = encoder.transform(X_test)

In [25]:
X_train.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
8371,8372,1,Ngozichukwuka,1,Germany,Female,1,5,1,1,1,0,1
5027,5028,2,Wall,2,Germany,Male,2,8,2,2,2,1,2
9234,9235,3,Cameron,3,Germany,Male,3,4,3,1,2,0,3
3944,3945,4,Golubev,4,France,Female,4,8,4,1,2,1,4
6862,6863,5,Piccio,5,France,Male,5,9,5,1,2,0,5


In [26]:
X_test.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
6252,6253,-1.0,Anderson,382.0,Germany,Male,39.0,3,-1.0,2,1,0,-1.0
4684,4685,-1.0,Herring,366.0,France,Male,15.0,1,5.0,2,2,1,-1.0
1731,1732,-1.0,Amechi,85.0,Spain,Female,29.0,4,5.0,2,2,0,-1.0
4742,4743,-1.0,Liang,395.0,Germany,Male,26.0,8,-1.0,2,2,1,-1.0
4521,4522,-1.0,Chuang,238.0,Spain,Female,9.0,7,-1.0,1,2,1,-1.0


In [29]:
# Check which columns have non-numeric data types
print(X_train.select_dtypes(include=['object']).columns)


Index(['Surname', 'Geography', 'Gender'], dtype='object')


In [30]:
# Apply one-hot encoding to categorical features
X_train = pd.get_dummies(X_train, drop_first=True)
X_test = pd.get_dummies(X_test, drop_first=True)

# Ensure that both X_train and X_test have the same columns after encoding
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)


In [31]:
# import Random Forest classifier
from sklearn.ensemble import RandomForestClassifier
# instantiate the classifier 
rfc = RandomForestClassifier(random_state=60)
# fit the model
rfc.fit(X_train, y_train)

In [32]:
y_pred = rfc.predict(X_test)

In [33]:
# Check accuracy score 

from sklearn.metrics import accuracy_score

print('Model accuracy score with 10 decision-trees : {0:0.4f}'. format(accuracy_score(y_test, y_pred)))

Model accuracy score with 10 decision-trees : 0.8048
