In [47]:
import pandas as pd
import io
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [48]:
df = pd.read_csv("Churn_Modelling.csv")
print(df)

      RowNumber  CustomerId    Surname  CreditScore Geography  Gender  Age  \
0             1    15634602   Hargrave          619    France  Female   42   
1             2    15647311       Hill          608     Spain  Female   41   
2             3    15619304       Onio          502    France  Female   42   
3             4    15701354       Boni          699    France  Female   39   
4             5    15737888   Mitchell          850     Spain  Female   43   
...         ...         ...        ...          ...       ...     ...  ...   
9995       9996    15606229   Obijiaku          771    France    Male   39   
9996       9997    15569892  Johnstone          516    France    Male   35   
9997       9998    15584532        Liu          709    France  Female   36   
9998       9999    15682355  Sabbatini          772   Germany    Male   42   
9999      10000    15628319     Walker          792    France  Female   28   

      Tenure    Balance  NumOfProducts  HasCrCard  IsActiveMemb

In [49]:
# Finding Missing Values
print(df.isnull().sum())

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64


In [50]:
#Check for Duplicates
df.duplicated().sum

<bound method Series.sum of 0       False
1       False
2       False
3       False
4       False
        ...  
9995    False
9996    False
9997    False
9998    False
9999    False
Length: 10000, dtype: bool>

In [51]:
#Detect Outliers
df.describe()

Unnamed: 0,RowNumber,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5000.5,15690940.0,650.5288,38.9218,5.0128,76485.889288,1.5302,0.7055,0.5151,100090.239881,0.2037
std,2886.89568,71936.19,96.653299,10.487806,2.892174,62397.405202,0.581654,0.45584,0.499797,57510.492818,0.402769
min,1.0,15565700.0,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0
25%,2500.75,15628530.0,584.0,32.0,3.0,0.0,1.0,0.0,0.0,51002.11,0.0
50%,5000.5,15690740.0,652.0,37.0,5.0,97198.54,1.0,1.0,1.0,100193.915,0.0
75%,7500.25,15753230.0,718.0,44.0,7.0,127644.24,2.0,1.0,1.0,149388.2475,0.0
max,10000.0,15815690.0,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0


In [52]:
df= df.drop(['Surname', 'Geography','Gender'], axis=1)
print(df)

      RowNumber  CustomerId  CreditScore  Age  Tenure    Balance  \
0             1    15634602          619   42       2       0.00   
1             2    15647311          608   41       1   83807.86   
2             3    15619304          502   42       8  159660.80   
3             4    15701354          699   39       1       0.00   
4             5    15737888          850   43       2  125510.82   
...         ...         ...          ...  ...     ...        ...   
9995       9996    15606229          771   39       5       0.00   
9996       9997    15569892          516   35      10   57369.61   
9997       9998    15584532          709   36       7       0.00   
9998       9999    15682355          772   42       3   75075.31   
9999      10000    15628319          792   28       4  130142.79   

      NumOfProducts  HasCrCard  IsActiveMember  EstimatedSalary  Exited  
0                 1          1               1        101348.88       1  
1                 1          0     

In [53]:
#Normalize the dataset
scaler = MinMaxScaler()
df1 = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
print(df1)

      RowNumber  CustomerId  CreditScore       Age  Tenure   Balance  \
0        0.0000    0.275616        0.538  0.324324     0.2  0.000000   
1        0.0001    0.326454        0.516  0.310811     0.1  0.334031   
2        0.0002    0.214421        0.304  0.324324     0.8  0.636357   
3        0.0003    0.542636        0.698  0.283784     0.1  0.000000   
4        0.0004    0.688778        1.000  0.337838     0.2  0.500246   
...         ...         ...          ...       ...     ...       ...   
9995     0.9996    0.162119        0.842  0.283784     0.5  0.000000   
9996     0.9997    0.016765        0.332  0.229730     1.0  0.228657   
9997     0.9998    0.075327        0.718  0.243243     0.7  0.000000   
9998     0.9999    0.466637        0.844  0.324324     0.3  0.299226   
9999     1.0000    0.250483        0.884  0.135135     0.4  0.518708   

      NumOfProducts  HasCrCard  IsActiveMember  EstimatedSalary  Exited  
0          0.000000        1.0             1.0         0.5067

In [54]:
#split the dataset into input and output
X = df1.drop('Exited', axis=1)
y = df1['Exited']
print(X,y)

      RowNumber  CustomerId  CreditScore       Age  Tenure   Balance  \
0        0.0000    0.275616        0.538  0.324324     0.2  0.000000   
1        0.0001    0.326454        0.516  0.310811     0.1  0.334031   
2        0.0002    0.214421        0.304  0.324324     0.8  0.636357   
3        0.0003    0.542636        0.698  0.283784     0.1  0.000000   
4        0.0004    0.688778        1.000  0.337838     0.2  0.500246   
...         ...         ...          ...       ...     ...       ...   
9995     0.9996    0.162119        0.842  0.283784     0.5  0.000000   
9996     0.9997    0.016765        0.332  0.229730     1.0  0.228657   
9997     0.9998    0.075327        0.718  0.243243     0.7  0.000000   
9998     0.9999    0.466637        0.844  0.324324     0.3  0.299226   
9999     1.0000    0.250483        0.884  0.135135     0.4  0.518708   

      NumOfProducts  HasCrCard  IsActiveMember  EstimatedSalary  
0          0.000000        1.0             1.0         0.506735  
1  

In [55]:
#splitting the data for training & Testing
X_train ,X_test ,y_train,y_test=train_test_split(X,y,test_size=0.2)

In [56]:
#Print the training data and testing data
print("X_train:\n", X_train)
print("X_test:\n", X_test)
print("y_train:\n", y_train)
print("y_test:\n", y_test)

X_train:
       RowNumber  CustomerId  CreditScore       Age  Tenure   Balance  \
6433   0.643364    0.034674        0.130  0.472973     0.5  0.666643   
815    0.081508    0.216038        0.790  0.094595     0.5  0.629710   
8692   0.869287    0.757781        1.000  0.189189     0.8  0.000000   
1752   0.175218    0.635088        0.788  0.310811     0.2  0.335249   
1730   0.173017    0.310962        1.000  0.027027     0.7  0.000000   
...         ...         ...          ...       ...     ...       ...   
911    0.091109    0.723188        0.596  0.472973     0.6  0.443213   
7319   0.731973    0.785411        0.248  0.256757     0.5  0.568711   
3729   0.372937    0.528027        0.760  0.148649     0.2  0.000000   
2157   0.215722    0.739825        0.788  0.351351     0.3  0.000000   
980    0.098010    0.236122        0.488  0.121622     0.0  0.785346   

      NumOfProducts  HasCrCard  IsActiveMember  EstimatedSalary  
6433       0.000000        1.0             1.0         0.11