<a href="https://colab.research.google.com/github/Darius2527/ChurnModelling-Classification/blob/main/ChurnModelling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#LOADING THE DATASET
#read the churn csv (expected in the working directory) into a DataFrame and preview the first rows
import pandas as pd
df=pd.read_csv(filepath_or_buffer='Churn_Modelling.csv')
df.head(5)

In [None]:
#(rows, columns) of the loaded DataFrame
df.shape

In [None]:
#count missing values per column
df.isna().sum()
#no null values present

In [None]:
#remove the columns that are not used as model inputs
df=df.drop(['RowNumber','CustomerId','Surname'],axis='columns')
df.head(5)

In [None]:
#one-hot encode the categorical columns (Geography and Gender);
#drop_first=True drops the first level of each so the dummies are not redundant
df=pd.get_dummies(df,drop_first=True)
df.head(5)

In [None]:
#cast the dummy columns to integers in a single astype call
df=df.astype({"Geography_Germany":int,"Geography_Spain":int,"Gender_Male":int})
df.head(5)

In [None]:
#number of rows per value of the target column
df["Exited"].value_counts()
#we can see that the data is imbalanced

In [None]:
import seaborn as sns
#pairwise correlation of all columns, drawn as a heatmap
corr_matrix=df.corr()
sns.heatmap(data=corr_matrix)

In [None]:
#separate the target column from the feature columns
Y=df["Exited"]
X=df.drop("Exited",axis='columns')

In [None]:
#display the feature matrix (every column of df except "Exited")
X

In [None]:
#display the target series (the "Exited" column of df)
Y

In [None]:
#splitting dataset into training and test data (80% / 20%)
#the target was noted as imbalanced above, so the split is stratified on Y:
#both splits then keep the same class ratio and the test metrics are not skewed by an unlucky draw
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,random_state=0,stratify=Y)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

In [None]:
#feature scaling
#the scaler is fitted on the training split only and the same transform is applied to the test split
#the scaled values are assigned back to X_train / X_test - without the assignment the transformed
#arrays are thrown away and every model below is trained on the unscaled data
#the results are wrapped in DataFrames so the column names and row index are preserved
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
X_train=pd.DataFrame(sc.fit_transform(X_train),columns=X_train.columns,index=X_train.index)
X_test=pd.DataFrame(sc.transform(X_test),columns=X_test.columns,index=X_test.index)
X_train.head()

In [None]:
#containers for the per-model scores, keyed by model name and filled in by the evaluation cells
model_accuracy=dict()
roc_score=dict()


In [None]:
#Logistic Regression model
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
#the grid is split per penalty so that every candidate is a combination the solver can fit:
# - 'elastic' is not a valid penalty name, the correct value is 'elasticnet' and it needs l1_ratio
# - the default lbfgs solver cannot fit l1 / elasticnet, so 'saga' (which supports all four options) is used
# - C is not searched when there is no penalty, because it has no effect there
C_values=[0.01,0.05,0.1,0.5,1]
tol_values=[1e-5,1e-4,1e-3,1e-2]
parameters=[{'penalty':['l1','l2'],'C':C_values,'tol':tol_values},
            {'penalty':['elasticnet'],'l1_ratio':[0.5],'C':C_values,'tol':tol_values},
            {'penalty':[None],'tol':tol_values}]
#saga is a stochastic solver: random_state makes the search repeatable and
#max_iter is raised because it can need more than the default number of iterations to converge
clf=GridSearchCV(LogisticRegression(solver='saga',max_iter=1000,random_state=0),parameters,cv=5)
clf.fit(X_train,Y_train)

In [None]:
#cross-validation results recorded by the logistic regression grid search
clf.cv_results_

In [None]:
#checking best parameter
#best_params_ is the parameter combination selected by the grid search
best_params=clf.best_params_
best_params

In [None]:
#creating model with best parameters
#best_estimator_ is the estimator selected by the grid search, so no separate re-training is done here
model_LR=clf.best_estimator_
#display the full parameter set of the selected model
model_LR.get_params()

In [None]:
#predicting output for test data
#Y_pred holds the predicted class labels and is reused by the metrics cell
Y_pred=model_LR.predict(X_test)

In [None]:
#checking performance metrics
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,roc_auc_score
accuracy_LR=accuracy_score(Y_test,Y_pred)
#ROC AUC is a ranking metric: it has to be computed from a continuous score, not from the
#thresholded labels in Y_pred. The second column of predict_proba is the probability of the
#class with the greater label, which is the score roc_auc_score expects for a binary target
rocscore_LR=roc_auc_score(Y_test,model_LR.predict_proba(X_test)[:,1])
model_accuracy["Logistic Regression"]=accuracy_LR
roc_score["Logistic Regression"]=rocscore_LR
print(accuracy_LR)
print(rocscore_LR)
print(confusion_matrix(Y_test,Y_pred))
print(classification_report(Y_test,Y_pred))


In [None]:
#SVM Classifier
#grid search over the regularisation strength C and the kernel type, using 5-fold cross validation
from sklearn.svm import SVC
parameters=dict(C=[0.1,0.5,1],kernel=['linear','poly','rbf'])
clf=GridSearchCV(estimator=SVC(),param_grid=parameters,cv=5).fit(X_train,Y_train)
clf


In [None]:
#cross-validation results recorded by the SVC grid search
clf.cv_results_

In [None]:
#parameter combination selected by the SVC grid search
best_params=clf.best_params_
best_params

In [None]:
#keep the SVC selected by the grid search as the final SVM model
model_SVC=clf.best_estimator_
#display the full parameter set of the selected model
model_SVC.get_params()

In [None]:
#predicted class labels for the test data (overwrites Y_pred from the previous model)
Y_pred=model_SVC.predict(X_test)

In [None]:
#performance metrics of the SVM classifier on the test data
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,roc_auc_score
accuracy_SVM=accuracy_score(Y_test,Y_pred)
#ROC AUC needs a continuous score instead of the thresholded labels in Y_pred.
#SVC() is created without probability=True, so predict_proba is not available;
#the signed distance to the decision boundary from decision_function is used as the score
rocscore_SVM=roc_auc_score(Y_test,model_SVC.decision_function(X_test))
model_accuracy["SVM classifier"]=accuracy_SVM
roc_score["SVM classifier"]=rocscore_SVM
print(accuracy_SVM)
print(rocscore_SVM)
print(confusion_matrix(Y_test,Y_pred))
print(classification_report(Y_test,Y_pred))


In [None]:
#KNN
#grid search over the number of neighbours, the Minkowski power p (1=manhattan, 2=euclidean),
#the tree leaf size and the neighbour weighting, using 5-fold cross validation
from sklearn.neighbors import KNeighborsClassifier
#'auto' is not a valid value for weights (it belongs to the algorithm parameter);
#the built-in weighting schemes are 'uniform' and 'distance'
parameters={'n_neighbors':[3,5,7,9,11],
            'p':[1,2],
            'leaf_size':[10,20,30],
            'weights':['uniform','distance'],}
clf=GridSearchCV(KNeighborsClassifier(),parameters,cv=5)
clf.fit(X_train,Y_train)

In [None]:
#cross-validation results recorded by the KNN grid search
clf.cv_results_

In [None]:
#parameter combination selected by the KNN grid search
best_params=clf.best_params_
best_params

In [None]:
#keep the KNeighborsClassifier selected by the grid search as the final KNN model
model_KNN=clf.best_estimator_
#display the full parameter set of the selected model
model_KNN.get_params()

In [None]:
#predicted class labels for the test data (overwrites Y_pred from the previous model)
Y_pred=model_KNN.predict(X_test)

In [None]:
#performance metrics of the KNN model on the test data
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,roc_auc_score
accuracy_KNN=accuracy_score(Y_test,Y_pred)
#ROC AUC needs a continuous score instead of the thresholded labels in Y_pred:
#use the predicted probability of the class with the greater label (second column of predict_proba)
rocscore_KNN=roc_auc_score(Y_test,model_KNN.predict_proba(X_test)[:,1])
model_accuracy["KNN"]=accuracy_KNN
roc_score["KNN"]=rocscore_KNN
print(accuracy_KNN)
print(rocscore_KNN)
print(confusion_matrix(Y_test,Y_pred))
print(classification_report(Y_test,Y_pred))


In [None]:
#Naive Bayes Classifier
#GaussianNB is used with its default settings, so no grid search is run for this model
#K fold cross validation is applied on the training split only, as a sanity check;
#the scores stored for the model comparison come from the held-out test split,
#so they are measured the same way as for every other model in this notebook
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score,roc_auc_score
NB=GaussianNB()
cv_accuracy_NB=cross_val_score(NB,X_train,Y_train,cv=5,scoring='accuracy').mean()
cv_rocscore_NB=cross_val_score(NB,X_train,Y_train,cv=5,scoring='roc_auc').mean()
print(cv_accuracy_NB)
print(cv_rocscore_NB)
#cross_val_score works on clones, so NB itself still has to be fitted before predicting
NB.fit(X_train,Y_train)
accuracy_NB=accuracy_score(Y_test,NB.predict(X_test))
rocscore_NB=roc_auc_score(Y_test,NB.predict_proba(X_test)[:,1])
#the same key is used in both dictionaries so the two bar charts label this model identically
model_accuracy["Naive Bayes Classifier"]=accuracy_NB
roc_score["Naive Bayes Classifier"]=rocscore_NB
print(accuracy_NB)
print(rocscore_NB)

In [None]:
#Random Forest
#grid search over the number of trees, the split criterion and the maximum tree depth, using 5-fold cross validation
from sklearn.ensemble import RandomForestClassifier
parameters={'n_estimators':[50,100,250,500],
            'criterion':['gini','entropy'],
            'max_depth':[2,3,4,None]}
#random_state fixes the bootstrap / feature sampling so the selected parameters and scores are
#repeatable between runs; n_jobs=-1 runs the cross-validation fits on all available cores
clf=GridSearchCV(RandomForestClassifier(random_state=0),parameters,cv=5,n_jobs=-1)
clf.fit(X_train,Y_train)

In [None]:
#cross-validation results recorded by the random forest grid search
clf.cv_results_

In [None]:
#parameter combination selected by the random forest grid search
best_params=clf.best_params_
best_params

In [None]:
#keep the RandomForestClassifier selected by the grid search as the final random forest model
model_RF=clf.best_estimator_
#display the full parameter set of the selected model
model_RF.get_params()

In [None]:
#predicted class labels for the test data (overwrites Y_pred from the previous model)
Y_pred=model_RF.predict(X_test)

In [None]:
#performance metrics of the random forest on the test data
#(the metric functions are the ones imported in the earlier evaluation cells)
accuracy_RF=accuracy_score(Y_test,Y_pred)
#ROC AUC needs a continuous score instead of the thresholded labels in Y_pred:
#use the predicted probability of the class with the greater label (second column of predict_proba)
rocscore_RF=roc_auc_score(Y_test,model_RF.predict_proba(X_test)[:,1])
model_accuracy["Random Forest"]=accuracy_RF
roc_score["Random Forest"]=rocscore_RF
print(accuracy_RF)
print(rocscore_RF)
print(confusion_matrix(Y_test,Y_pred))
print(classification_report(Y_test,Y_pred))

In [None]:
#AdaBoost Classifier
#grid search over the number of boosting rounds and the learning rate, using 5-fold cross validation
from sklearn.ensemble import AdaBoostClassifier
parameters={'n_estimators':[50,100,250,500],
            'learning_rate':[0.1,0.25,0.5,0.75,1]}
#random_state makes the boosting run repeatable, so the selected parameters and scores do not
#change between runs; n_jobs=-1 runs the cross-validation fits on all available cores
clf=GridSearchCV(AdaBoostClassifier(random_state=0),parameters,cv=5,n_jobs=-1)
clf.fit(X_train,Y_train)

In [None]:
#cross-validation results recorded by the AdaBoost grid search
clf.cv_results_

In [None]:
#parameter combination selected by the AdaBoost grid search
best_params=clf.best_params_
best_params

In [None]:
#keep the AdaBoostClassifier selected by the grid search as the final AdaBoost model
model_adaboost=clf.best_estimator_
#display the full parameter set of the selected model
model_adaboost.get_params()

In [None]:
#predicted class labels for the test data (overwrites Y_pred from the previous model)
Y_pred=model_adaboost.predict(X_test)

In [None]:
#performance metrics of the AdaBoost model on the test data
#(the metric functions are the ones imported in the earlier evaluation cells)
accuracy_adaboost=accuracy_score(Y_test,Y_pred)
#ROC AUC needs a continuous score instead of the thresholded labels in Y_pred:
#use the predicted probability of the class with the greater label (second column of predict_proba)
rocscore_adaboost=roc_auc_score(Y_test,model_adaboost.predict_proba(X_test)[:,1])
model_accuracy["AdaBoost"]=accuracy_adaboost
roc_score["AdaBoost"]=rocscore_adaboost
print(accuracy_adaboost)
print(rocscore_adaboost)
print(confusion_matrix(Y_test,Y_pred))
print(classification_report(Y_test,Y_pred))

In [None]:
import matplotlib.pyplot as plt
#bar chart of the accuracy stored for each model in model_accuracy
fig,ax=plt.subplots(figsize=(10,5))
ax.bar(list(model_accuracy.keys()),list(model_accuracy.values()),label="accuracy",color='orange')
ax.set_xlabel("Models")
ax.set_ylabel("Accuracy")
plt.show()

In [None]:
import matplotlib.pyplot as plt
#bar chart of the ROC score stored for each model in roc_score
fig,ax=plt.subplots(figsize=(10,5))
ax.bar(list(roc_score.keys()),list(roc_score.values()),label="ROC score",color='orange')
ax.set_xlabel("Models")
ax.set_ylabel("Score")
plt.show()

In [None]:
#AdaBoost has highest accuracy
#Naive Bayes has highest ROC
#accuracy can be improved if we obtain a balanced dataset using sampling techniques
#accuracy can be improved by applying hyperparameter tuning across more values for the hyperparameters