# **1. Import Drive**

In [None]:
from google.colab import drive

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive/
# (uses the `drive` module imported in the previous cell).
drive.mount('/content/drive/')

# **2. Import Libraries**

In [None]:
#importing Basic libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report, accuracy_score, f1_score

import warnings
warnings.filterwarnings("ignore")


Read the dataset using pandas.

In [None]:
# Load the raw customer table into `data`.
# NOTE(review): this relative path is resolved against the current working
# directory, not the Drive mount point above — confirm where the CSV lives.
data = pd.read_csv("customer_data.csv")

# **3. Exploratory Data Analysis**

In [None]:
# Preview the first 5 rows of the dataset.
data.head(5)

In [None]:
# Preview the last 10 rows of the dataset.
data.tail(10)

In [None]:
# Report the (rows, columns) shape and the total number of cells.
for label, value in (("Shape:", data.shape), ("Size: ", data.size)):
    print(label, value)

In [None]:
# Total number of cells (rows x columns) in the dataset.
data.size

In [None]:
# Column labels of the dataset.
data.columns

In [None]:
# Number of columns in the dataset.
len(data.columns)

In [None]:
# Per-column dtype and non-null count, plus memory usage.
data.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

In [None]:
# Profile each listed column: how many distinct values it holds and how often each occurs.
objects_cols = [
    'year', 'customer_id', 'gender', 'age',
    'no_of_days_subscribed', 'multi_screen', 'mail_subscribed',
    'weekly_mins_watched', 'minimum_daily_mins', 'maximum_daily_mins',
    'weekly_max_night_mins', 'videos_watched', 'maximum_days_inactive',
    'customer_support_calls', 'churn',
]
for column_name in objects_cols:
    column = data[column_name]
    distinct_count = column.nunique()
    frequencies = column.value_counts()
    print("column name:{}".format(column_name))
    print("Number of unique columns of", column_name, ":{}".format(distinct_count))
    print("Values of Unique columns of ", column_name, "is below:\n{}".format(frequencies))
    print("------------------------------------------------------------------------")

In [None]:
  # Boolean mask of the whole table: True wherever a value is missing.
  data.isnull()

In [None]:
# Number of missing values in each column.
data.isnull().sum()

# **4. Data Pre-processing**

**4.1. Missing Values**

In [None]:
# Missing gender: keep the rows and label them with an explicit placeholder category.
# The result is assigned back instead of calling fillna(inplace=True) on the column
# selection: that form is chained assignment and may not write through to `data`
# under pandas copy-on-write (and warnings are silenced in this notebook).
data["gender"] = data["gender"].fillna("No Gender")
# Missing maximum_days_inactive: impute with the most frequent value (the mode).
mode = data["maximum_days_inactive"].mode().values[0]
data["maximum_days_inactive"] = data["maximum_days_inactive"].fillna(mode)
# Any row that still has a null in some other column is dropped.
data = data.dropna(axis=0, how="any")


In [None]:
# Verify that no missing values remain, then show the shape after cleaning.
print(data.isnull().sum())
data.shape

**4.2. Label Encoding**

4.2.1 Import Libraries

In [None]:
from sklearn import preprocessing

In [None]:
# Inspect the distinct values of the three columns that are label-encoded in the next cell.
print(data['gender'].unique())
print(data['mail_subscribed'].unique())
data['multi_screen'].unique()

In [None]:
# Replace each category with an integer code. One encoder instance is refit for
# every column in turn, so afterwards `label_encoder` only remembers the classes
# of the last column encoded ('mail_subscribed').
label_encoder = preprocessing.LabelEncoder()
for categorical_column in ('gender', 'multi_screen', 'mail_subscribed'):
    data[categorical_column] = label_encoder.fit_transform(data[categorical_column])

# Show the integer codes now stored in each encoded column.
for encoded_column in ('gender', 'mail_subscribed'):
    print(data[encoded_column].unique())
data['multi_screen'].unique()

In [None]:
# Preview the first 10 rows of the encoded dataset.
data.head(10)

The column 'phone_no' is of type object and has a different value in every row, so it carries no predictive information and should be dropped before feature selection.


# **5. Feature Selection**

**5.1 ANOVA F-score method**

In [None]:
# Wrap the preprocessed table in a DataFrame named `df`, which all later cells use.
# NOTE(review): this is not an explicit .copy() — confirm whether `df` is meant to be independent of `data`.
df=pd.DataFrame(data)

In [None]:
# Candidate features: every column except the target. 'phone_no' is removed as
# well because it is an all-unique text identifier that the numeric F-test in the
# next cell cannot consume; errors="ignore" keeps this cell working when that
# column has already been dropped.
X_temp = df.drop(columns=["churn", "phone_no"], errors="ignore")
# Target: the churn label.
y_temp = df["churn"]

In [None]:
from sklearn.feature_selection import f_classif, SelectKBest
# `churn` is a class label, so each feature is scored with the ANOVA F-test for
# classification (f_classif), matching the section heading, rather than with the
# regression F-test.
# k=13 keeps the 13 highest-scoring features.
fs = SelectKBest(score_func=f_classif, k=13)
# Compute the per-feature F-scores (available afterwards as fit.scores_).
fit = fs.fit(X_temp, y_temp)

In [None]:
# Pair every candidate feature with its F-score in one two-column table.
feature_score = pd.DataFrame({
    "Input_Features": X_temp.columns,
    "F_Score": fit.scores_,
})
# Show the 12 highest-scoring features.
print(feature_score.nlargest(12, columns="F_Score"))

**Features Selected:**  
'multi_screen',
'customer_support_calls',   'weekly_mins_watched',
'maximum_daily_mins',
'mail_subscribed',
'minimum_daily_mins',
'customer_id',
'maximum_days_inactive'

## PCA

In [None]:
# Columns kept after feature selection.
# NOTE(review): 'customer_id' looks like an identifier — confirm it should be used as a predictor.
selected_columns = [
    'multi_screen',
    'customer_support_calls',
    'weekly_mins_watched',
    'maximum_daily_mins',
    'mail_subscribed',
    'minimum_daily_mins',
    'customer_id',
    'maximum_days_inactive',
]
df_p = df[selected_columns]

In [None]:
# Display the selected-feature table.
df_p

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise every selected feature to zero mean and unit variance.
# NOTE(review): the scaler is fitted on all rows before the train/test split, so
# test-set statistics leak into training — consider fitting on the training split only.
scalar = StandardScaler()
# fit_transform returns a bare array; pass df_p's index back in so the row labels
# survive. Rows may have been dropped during cleaning, so a fresh 0..n-1 index
# would no longer match the labels of df['churn'].
df_scaled = pd.DataFrame(
    scalar.fit_transform(df_p),
    columns=df_p.columns,
    index=df_p.index,
)
df_scaled

In [None]:
from sklearn.decomposition import PCA
# No n_components is given, so every principal component is kept: the scaled
# features are re-expressed on orthogonal axes without dropping any dimension.
pca = PCA()
# PCA preserves row order and df_scaled rows are in df_p's order, so df_p's index
# is reattached here. That keeps df_pca label-aligned with df['churn'], which it
# is paired with in the train/test split.
df_pca = pd.DataFrame(pca.fit_transform(df_scaled), index=df_p.index)
df_pca

# **6. Train-Test Split**

**6.1 Import Libraries**

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Model inputs are the PCA components; the target is the churn label.
# (To train on the raw selected columns instead, set X to those columns of df.)
X = df_pca
y = df['churn']
# Hold out 20% of the rows for testing, with a fixed seed for repeatability.
# stratify=y keeps the churn / non-churn ratio the same in both splits, which
# matters because the target is imbalanced (hence the SMOTE step that follows).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True, stratify=y
)

In [None]:
# Shape of the full preprocessed table, for comparison with the split sizes below.
df.shape

In [None]:
# Shape of the training feature matrix.
X_train.shape

In [None]:
# Shape of the training target vector.
y_train.shape

In [None]:
# First two rows of the training features.
X_train.head(2)

In [None]:
# First two training targets.
y_train.head(2)

# **7. Balance Data**

**SMOTE**

In [None]:

from imblearn.over_sampling import SMOTE
from collections import Counter

In [None]:
# Class counts in the training split before resampling.
counter = Counter(y_train)
print('Before', counter)
# Oversample the training split only, so the test split keeps its original class
# distribution. A fixed random_state makes the synthetic samples reproducible,
# consistent with the seeded train/test split.
smt = SMOTE(random_state=42)
X_train_sm, y_train_sm = smt.fit_resample(X_train, y_train)

# Class counts after resampling.
counter = Counter(y_train_sm)
print('After', counter)

# **8. Model Implementation and Evaluation**

**8.1 RANDOM FOREST CLASSIFIER**

In [None]:
from sklearn.ensemble import RandomForestClassifier
# n_estimators is the number of trees whose votes are combined. More trees make
# the ensemble more stable at the cost of training time, with diminishing returns;
# they do not guarantee a better score.
# random_state fixes the bootstrap and feature sampling so results are repeatable.
model_RFC = RandomForestClassifier(n_estimators=70, random_state=42)
# Train on the SMOTE-balanced training split.
model_RFC.fit(X_train_sm , y_train_sm)

In [None]:
# Mean accuracy of the random forest on the held-out test split
# (the test split is not resampled by SMOTE).
model_RFC.score(X_test,y_test)

In [None]:
# Hard class predictions on the test split; reused by the evaluation cells below.
y_predicted_RFC = model_RFC.predict(X_test)

**8.1.1. Confusion Matrix**

In [None]:
from sklearn.metrics import confusion_matrix
# Rows are the actual classes, columns are the predicted classes.
cm_RFC = confusion_matrix(y_test, y_predicted_RFC)

# Draw the matrix as a lightly shaded grid and write each count into its cell.
fig, ax = plt.subplots(figsize=(2, 2))
ax.matshow(cm_RFC, cmap=plt.cm.Blues, alpha=0.3)
for (row, col), count in np.ndenumerate(cm_RFC):
    ax.text(x=col, y=row, s=count, va='center', ha='center', size='xx-large')

ax.set_xlabel('Predictions', fontsize=12)
ax.set_ylabel('Actuals', fontsize=12)
ax.set_title('Confusion Matrix for RandomForest Classifier', fontsize=16)
plt.show()

**8.1.2. Classification Metrics**

In [None]:
# Overall accuracy plus support-weighted precision, recall and F1 on the test split.
weighted_scorers = (
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1-score", f1_score),
)
classification_metrics_rfc = {"Accuracy": accuracy_score(y_test, y_predicted_RFC)}
for metric_name, scorer in weighted_scorers:
    classification_metrics_rfc[metric_name] = scorer(y_test, y_predicted_RFC, average='weighted')

classification_metrics_rfc

In [None]:
# Classification report as a table: one row per class and per average.
print('clasification report:\n')
report_rfc = classification_report(y_test, y_predicted_RFC, output_dict=True)
df_classification_report_RFC = pd.DataFrame(report_rfc).T
df_classification_report_RFC

**8.1.3. ROC-CURVE**

In [None]:
from sklearn.metrics import roc_curve,auc
# A ROC curve is traced by sweeping a decision threshold over a continuous score.
# Hard 0/1 predictions collapse it to a single operating point, so the predicted
# probability of the positive class (column 1 of predict_proba) is used instead.
rfc_scores = model_RFC.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, rfc_scores)
roc_auc = auc(fpr, tpr)
plt.figure()
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
# Diagonal = performance of a random classifier.
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for RandomForest Classifier')
plt.legend(loc="lower right")
plt.show()

**8.1.4. Precision Recall Curve**

In [None]:
#P-R curve
from sklearn.metrics import precision_recall_curve
from matplotlib import pyplot
# Probability of the positive class = second column of predict_proba.
rfc_probs = model_RFC.predict_proba(X_test)[:, 1]
rfc_precision, rfc_recall, _ = precision_recall_curve(y_test, rfc_probs)
# No-skill baseline: the share of positive samples in the test split.
no_skill = (y_test == 1).sum() / len(y_test)
pyplot.plot([0, 1], [no_skill, no_skill], linestyle='--', label='Baseline')
pyplot.plot(rfc_recall, rfc_precision, marker='.', label='Random Forest')
pyplot.xlabel('Recall')
pyplot.ylabel('Precision')
pyplot.legend()
pyplot.show()

**8.2 XGBOOST CLASSIFIER**

In [None]:



#import libraries
from xgboost import XGBClassifier
# Hyper-parameters of the boosted-tree model.
# NOTE(review): learning_rate=1 applies no shrinkage between boosting rounds — confirm this is intended.
xgb_params = dict(n_estimators=70, learning_rate=1, objective='binary:logistic')
bst = XGBClassifier(**xgb_params)
# Train on the SMOTE-balanced training split.
bst.fit(X_train_sm, y_train_sm)

In [None]:
# Mean accuracy of the XGBoost model on the held-out test split.
bst.score(X_test,y_test)

In [None]:
# Hard class predictions on the test split; reused by the evaluation cells below.
y_predicted_XGB=bst.predict(X_test)

**8.2.1. Confusion Matrix**

In [None]:
# Rows are the actual classes, columns are the predicted classes.
cm_XGB = confusion_matrix(y_test, y_predicted_XGB)

# Draw the matrix as a lightly shaded grid and write each count into its cell.
fig, ax = plt.subplots(figsize=(2, 2))
ax.matshow(cm_XGB, cmap=plt.cm.Blues, alpha=0.3)
for (row, col), count in np.ndenumerate(cm_XGB):
    ax.text(x=col, y=row, s=count, va='center', ha='center', size='xx-large')

ax.set_xlabel('Predictions', fontsize=12)
ax.set_ylabel('Actuals', fontsize=12)
ax.set_title('Confusion Matrix For XGBoost Classifier', fontsize=16)
plt.show()

**8.2.2. Classification Metrics**

In [None]:
# Overall accuracy plus support-weighted precision, recall and F1 on the test split.
weighted_scorers = (
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1-score", f1_score),
)
classification_metrics_XGB = {"Accuracy": accuracy_score(y_test, y_predicted_XGB)}
for metric_name, scorer in weighted_scorers:
    classification_metrics_XGB[metric_name] = scorer(y_test, y_predicted_XGB, average='weighted')

classification_metrics_XGB

In [None]:
# Classification report as a table: one row per class and per average.
print('clasification report:\n')
report_xgb = classification_report(y_test, y_predicted_XGB, output_dict=True)
df_classification_report_XGB = pd.DataFrame(report_xgb).T
df_classification_report_XGB

**8.2.3. ROC-CURVE**

In [None]:
# A ROC curve is traced by sweeping a decision threshold over a continuous score.
# Hard 0/1 predictions collapse it to a single operating point, so the predicted
# probability of the positive class (column 1 of predict_proba) is used instead.
xgb_scores = bst.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, xgb_scores)
roc_auc = auc(fpr, tpr)
plt.figure()
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
# Diagonal = performance of a random classifier.
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for XGBoost Classifier')
plt.legend(loc="lower right")
plt.show()

**8.2.4. Precision Recall Curve**

In [None]:
#P-R curve
# Probability of the positive class = second column of predict_proba.
XGB_probs = bst.predict_proba(X_test)[:, 1]
XGB_precision, XGB_recall, _ = precision_recall_curve(y_test, XGB_probs)
# No-skill baseline: the share of positive samples in the test split.
no_skill = (y_test == 1).sum() / len(y_test)
pyplot.plot([0, 1], [no_skill, no_skill], linestyle='--', label='Baseline')
pyplot.plot(XGB_recall, XGB_precision, marker='.', label='XGBOOST CLASSIFIER')
pyplot.xlabel('Recall')
pyplot.ylabel('Precision')
pyplot.legend()
pyplot.show()

**8.3 K-NEAREST NEIGHBOR (KNN) CLASSIFIER**

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# Each prediction is the majority vote of this many nearest training points.
N_NEIGHBORS = 3
knn = KNeighborsClassifier(n_neighbors=N_NEIGHBORS)
# Train on the SMOTE-balanced training split.
knn.fit(X_train_sm, y_train_sm)

In [None]:
# Mean accuracy of the KNN model on the held-out test split.
knn.score(X_test,y_test)

In [None]:
# Hard class predictions on the test split; reused by the evaluation cells below.
y_predicted_knn = knn.predict(X_test)

**8.3.1. Confusion Matrix**

In [None]:
from sklearn.metrics import confusion_matrix
# Rows are the actual classes, columns are the predicted classes.
cm_knn = confusion_matrix(y_test, y_predicted_knn)

# Draw the matrix as a lightly shaded grid and write each count into its cell.
fig, ax = plt.subplots(figsize=(2, 2))
ax.matshow(cm_knn, cmap=plt.cm.Blues, alpha=0.3)
for (row, col), count in np.ndenumerate(cm_knn):
    ax.text(x=col, y=row, s=count, va='center', ha='center', size='xx-large')

ax.set_xlabel('Predictions', fontsize=12)
ax.set_ylabel('Actuals', fontsize=12)
ax.set_title('Confusion Matrix for KNN Classifier', fontsize=16)
plt.show()

**8.3.2. Classification Metrics**

In [None]:
# Overall accuracy plus support-weighted precision, recall and F1 on the test split.
weighted_scorers = (
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1-score", f1_score),
)
classification_metrics_knn = {"Accuracy": accuracy_score(y_test, y_predicted_knn)}
for metric_name, scorer in weighted_scorers:
    classification_metrics_knn[metric_name] = scorer(y_test, y_predicted_knn, average='weighted')
classification_metrics_knn

In [None]:
# Classification report as a table: one row per class and per average.
print('clasification report:\n')
report_knn = classification_report(y_test, y_predicted_knn, output_dict=True)
df_classification_report_knn = pd.DataFrame(report_knn).T
df_classification_report_knn

**8.3.3. ROC-CURVE**

In [None]:
# A ROC curve is traced by sweeping a decision threshold over a continuous score.
# Hard 0/1 predictions collapse it to a single operating point, so the predicted
# probability of the positive class (column 1 of predict_proba) is used instead.
knn_scores = knn.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, knn_scores)
roc_auc = auc(fpr, tpr)
plt.figure()
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
# Diagonal = performance of a random classifier.
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for KNN Classifier')
plt.legend(loc="lower right")
plt.show()

**8.3.4. Precision Recall Curve**

In [None]:
#P-R curve
# Probability of the positive class = second column of predict_proba.
knn_probs = knn.predict_proba(X_test)[:, 1]
knn_precision, knn_recall, _ = precision_recall_curve(y_test, knn_probs)
# No-skill baseline: the share of positive samples in the test split.
no_skill = (y_test == 1).sum() / len(y_test)
pyplot.plot([0, 1], [no_skill, no_skill], linestyle='--', label='Baseline')
pyplot.plot(knn_recall, knn_precision, marker='.', label='KNEIGHBORS CLASSIFIER')
pyplot.xlabel('Recall')
pyplot.ylabel('Precision')
pyplot.legend()
pyplot.show()

**8.4 SUPPORT VECTOR MACHINE (SVM) CLASSIFIER**

In [None]:
from sklearn import svm
# Linear-kernel SVM. probability=True enables predict_proba (needed for the ROC and
# PR curves); those probabilities come from an internal randomized
# cross-validation, so random_state is fixed to make them repeatable.
clf = svm.SVC(kernel='linear', probability=True, random_state=42)
# Train on the SMOTE-balanced training split.
clf.fit(X_train_sm, y_train_sm)

In [None]:
# Mean accuracy of the SVM on the held-out test split.
clf.score(X_test,y_test)

In [None]:
# Hard class predictions on the test split; reused by the evaluation cells below.
y_predicted_svm = clf.predict(X_test)

**8.4.1. Confusion Matrix**

In [None]:
from sklearn.metrics import confusion_matrix
# Rows are the actual classes, columns are the predicted classes.
cm_svm = confusion_matrix(y_test, y_predicted_svm)

# Draw the matrix as a lightly shaded grid and write each count into its cell.
fig, ax = plt.subplots(figsize=(2, 2))
ax.matshow(cm_svm, cmap=plt.cm.Blues, alpha=0.3)
for (row, col), count in np.ndenumerate(cm_svm):
    ax.text(x=col, y=row, s=count, va='center', ha='center', size='xx-large')

ax.set_xlabel('Predictions', fontsize=12)
ax.set_ylabel('Actuals', fontsize=12)
ax.set_title('Confusion Matrix for SVM Classifier', fontsize=16)
plt.show()

**8.4.2. Classification Metrics**

In [None]:
# Overall accuracy plus support-weighted precision, recall and F1 on the test split.
weighted_scorers = (
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1-score", f1_score),
)
classification_metrics_svm = {"Accuracy": accuracy_score(y_test, y_predicted_svm)}
for metric_name, scorer in weighted_scorers:
    classification_metrics_svm[metric_name] = scorer(y_test, y_predicted_svm, average='weighted')
classification_metrics_svm

In [None]:
# Classification report as a table: one row per class and per average.
print('clasification report:\n')
report_svm = classification_report(y_test, y_predicted_svm, output_dict=True)
df_classification_report_svm = pd.DataFrame(report_svm).T
df_classification_report_svm

**8.4.3. ROC-CURVE**

In [None]:
# A ROC curve is traced by sweeping a decision threshold over a continuous score.
# Hard 0/1 predictions collapse it to a single operating point, so the predicted
# probability of the positive class (column 1 of predict_proba) is used instead.
svm_scores = clf.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, svm_scores)
roc_auc = auc(fpr, tpr)
plt.figure()
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
# Diagonal = performance of a random classifier.
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for SVM Classifier')
plt.legend(loc="lower right")
plt.show()

**8.4.4. Precision Recall Curve**

In [None]:
#P-R curve
# Probability of the positive class = second column of predict_proba.
clf_probs = clf.predict_proba(X_test)[:, 1]
clf_precision, clf_recall, _ = precision_recall_curve(y_test, clf_probs)
# No-skill baseline: the share of positive samples in the test split.
no_skill = (y_test == 1).sum() / len(y_test)
pyplot.plot([0, 1], [no_skill, no_skill], linestyle='--', label='Baseline')
pyplot.plot(clf_recall, clf_precision, marker='.', label='SVM')
pyplot.xlabel('Recall')
pyplot.ylabel('Precision')
pyplot.legend()
pyplot.show()

## **9. Comparison Of Models**

**Creating dictionary of classification metrics of each Model**

In [None]:
# Collect each model's metric dictionary under a display name, in presentation order.
classifiers_m = {}
classifiers_m["RandomForest Classifier"] = classification_metrics_rfc
classifiers_m["XGBoost Classifier"] = classification_metrics_XGB
classifiers_m["K-Nearest Neighbors"] = classification_metrics_knn
classifiers_m["Support Vector Machine"] = classification_metrics_svm


In [None]:
# Comparison table: one row per model (outer dict key), one column per metric (inner dict key).
classm_df = pd.DataFrame.from_dict(classifiers_m, orient='index')
classm_df

In [None]:

import plotly.express as px
# Grouped bar chart of the comparison table: one group per model, one bar per metric.
fig = px.bar(classm_df,title="Comparison of Algorithms",barmode='group')
fig.show()

**Calibration Plot RFC vs XGB**

In [None]:
import scikitplot as skplt
# Class-probability estimates on the test split.
# y_rfc_proba is reused by the Random Forest evaluation cells further down.
y_rfc_proba = model_RFC.predict_proba(X_test)
y_XGB_proba = bst.predict_proba(X_test)
# Reliability curves for both models on one axis: the closer a curve follows the
# diagonal, the better the predicted probabilities match observed frequencies.
skplt.metrics.plot_calibration_curve(
    y_test,
    [y_rfc_proba, y_XGB_proba],
    clf_names=['RandomForest Classifier', 'XGBoost Classifier'],
    title='Calibration Plot: RFC vs XGB',
)
plt.show()

## The Random Forest classifier is the best-performing model.

## **10. Random Forest Classifier Result Evaluation**

**Normalized Confusion Matrix**

In [None]:
# Confusion matrix for the random forest, normalized (normalize=True) so cells show proportions instead of raw counts.
skplt.metrics.plot_confusion_matrix(y_test, y_predicted_RFC, normalize=True, title = 'Confusion Matrix for RFC')

**ROC-CURVE**

In [None]:
# ROC curves for the random forest, drawn from its predicted class probabilities.
skplt.metrics.plot_roc(y_test, y_rfc_proba, title = 'ROC Plot for RFC')

**Precision Recall Curve (PR Curve)**

In [None]:
# Precision-recall curves for the random forest, drawn from its predicted class probabilities.
skplt.metrics.plot_precision_recall(y_test, y_rfc_proba, title = 'PR Curve for RFC')

**Cumulative Gains Curve And Lift Curve**

In [None]:
# Two side-by-side panels: cumulative gains on the left, lift curve on the right.
fig, (gains_ax, lift_ax) = plt.subplots(1, 2)
skplt.metrics.plot_cumulative_gain(y_test, y_rfc_proba, ax=gains_ax, title='Cumulative Gains Chart for RFC')
skplt.metrics.plot_lift_curve(y_test, y_rfc_proba, ax=lift_ax, title='Lift Curve for RFC')
plt.show()