In [1]:
import pandas as pd
import numpy as np

In [2]:
import sklearn
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler


In [3]:
from sklearn.model_selection import train_test_split


In [4]:
df=pd.read_csv('Churn_Modelling.csv')

In [5]:
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [6]:
#they are of no use in predicting churn
df.drop(columns=['RowNumber','CustomerId','Surname'],inplace=True)

In [7]:
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [8]:
#X
X=df.iloc[:,:-1].values

In [9]:
X

array([[619, 'France', 'Female', ..., 1, 1, 101348.88],
       [608, 'Spain', 'Female', ..., 0, 1, 112542.58],
       [502, 'France', 'Female', ..., 1, 0, 113931.57],
       ...,
       [709, 'France', 'Female', ..., 0, 1, 42085.58],
       [772, 'Germany', 'Male', ..., 1, 0, 92888.52],
       [792, 'France', 'Female', ..., 1, 0, 38190.78]], dtype=object)

In [10]:
#Output
Y=df.iloc[:,-1].values

In [11]:
Y

array([1, 0, 1, ..., 1, 1, 0], dtype=int64)

In [12]:
le=LabelEncoder()
X[:,2]=le.fit_transform(X[:,2])

In [13]:
X

array([[619, 'France', 0, ..., 1, 1, 101348.88],
       [608, 'Spain', 0, ..., 0, 1, 112542.58],
       [502, 'France', 0, ..., 1, 0, 113931.57],
       ...,
       [709, 'France', 0, ..., 0, 1, 42085.58],
       [772, 'Germany', 1, ..., 1, 0, 92888.52],
       [792, 'France', 0, ..., 1, 0, 38190.78]], dtype=object)

In [14]:
#Converting Geography
from sklearn.compose import ColumnTransformer
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1])], remainder='passthrough')
X = np.array(ct.fit_transform(X))

In [15]:
X

array([[1.0, 0.0, 0.0, ..., 1, 1, 101348.88],
       [0.0, 0.0, 1.0, ..., 0, 1, 112542.58],
       [1.0, 0.0, 0.0, ..., 1, 0, 113931.57],
       ...,
       [1.0, 0.0, 0.0, ..., 0, 1, 42085.58],
       [0.0, 1.0, 0.0, ..., 1, 0, 92888.52],
       [1.0, 0.0, 0.0, ..., 1, 0, 38190.78]], dtype=object)

In [16]:
#Splitting the dataset into train and test
x_train,x_test,y_train,y_test=train_test_split(X,Y,test_size=0.25,random_state=0)

In [17]:
#Scaling the data
sc=StandardScaler()

In [18]:
x_train=sc.fit_transform(x_train)
x_test=sc.transform(x_test)

In [19]:
from sklearn.metrics import accuracy_score

def acc_score(test, pred):
    
    acc_ = accuracy_score(test, pred)
    return acc_


# Print the scores
def print_score(test, pred, model):

    print(f"**** Classifier: {model} ****")
    print(f"ACCURACY: {accuracy_score(test, pred)}")
   

# LOGISTIC_REGRESSION

In [20]:
from sklearn.linear_model import LogisticRegression
lr=LogisticRegression()

In [21]:
lr.fit(x_train,y_train)

In [22]:
y_pred=lr.predict(x_test)

In [23]:
print_score(y_test, y_pred, "Logistic")

**** Classifier: Logistic ****
ACCURACY: 0.8068


In [24]:
#putting the model in the list
model_list = []
acc_list = []

model_list.append(lr.__class__.__name__)
acc_list.append(round(acc_score(y_test, y_pred), 4))

# RANDOM_FOREST

In [25]:
from sklearn.ensemble import RandomForestClassifier

In [26]:
rfc=RandomForestClassifier()

In [27]:
rfc.fit(x_train,y_train)

In [28]:
Y_pred=rfc.predict(x_test)

In [29]:
print_score(y_test, Y_pred, "RandomForest")

**** Classifier: RandomForest ****
ACCURACY: 0.8716


In [30]:
model_list.append(rfc.__class__.__name__)
acc_list.append(round(acc_score(y_test, Y_pred), 4))

# GRADIENT_BOOSTING

In [31]:
from sklearn.ensemble import GradientBoostingClassifier

In [32]:
grd=GradientBoostingClassifier()


In [33]:
grd.fit(x_train,y_train)

In [34]:
Y_PRED=grd.predict(x_test)

In [35]:
print_score(y_test, Y_PRED, "Gradient_Boosting")

**** Classifier: Gradient_Boosting ****
ACCURACY: 0.8676


In [36]:
model_list.append(grd.__class__.__name__)
acc_list.append(round(acc_score(y_test,Y_PRED), 4))

In [37]:
model_results = pd.DataFrame({"Model": model_list,
                              "Accuracy_Score": acc_list,
                              })

In [38]:
model_results

Unnamed: 0,Model,Accuracy_Score
0,LogisticRegression,0.8068
1,RandomForestClassifier,0.8716
2,GradientBoostingClassifier,0.8676


RANDOM FOREST performs the best here in predicting CHURN