In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed


import numpy as np
import pandas as pd



import os

# Print the full path of every file found anywhere under the Kaggle input directory.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the Telco churn CSV once; `main_df` keeps the data as loaded and `df` is the working copy.
csv_path = "/kaggle/input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv"
main_df = pd.read_csv(csv_path)
df = main_df.copy()
df.head()

* Importing the basic libraries required in this notebook

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from plotly.offline import plot, iplot, init_notebook_mode
init_notebook_mode(connected=True)
import plotly.express as px
import plotly.graph_objects as go
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings('ignore')

The following algorithms are used in this notebook.

| Algorithms      |
| ----------- |
| Logistic Regression      |
| Decision Tree   |
| Random Forest     |
| XGBoost    |
| KNeighbours      |
| SVM   |
| AdaBoost      |

### EDA - Exploratory Data Analysis

In [None]:
# Shape of dataframe

df.shape

In [None]:
# List of all columns

df.columns

In [None]:
# Basic Information about the dataframe

df.info()

In [None]:
# List of all features with number of unique values present in them

df.nunique()

In [None]:
# Statistical measure of dataset

df.describe()

In [None]:
# customerID is dropped because it is not used as a predictive feature.

df = df.drop(columns=['customerID'])
df.head()

In [None]:
# TotalCharges is stored with object dtype, so it needs to be converted to a numeric dtype
df['TotalCharges'].dtype

In [None]:
# Convert TotalCharges to a numeric dtype; entries that cannot be parsed become NaN (errors="coerce").

total_charges_numeric = pd.to_numeric(df['TotalCharges'], errors="coerce")
df['TotalCharges'] = total_charges_numeric

In [None]:
# Checking for null values

df.isnull().sum()

In [None]:
# Remove the rows whose TotalCharges is missing, then renumber the index from 0.

df = df.dropna(subset=['TotalCharges']).reset_index(drop=True)

In [None]:
df.head()

In [None]:
df.shape

## Visualization

In [None]:
# Visualization of null values

sns.heatmap(df.isnull())

In [None]:
# Checking correlation between the numeric features.
# Only numeric columns are passed to corr(): the frame still contains text
# columns at this point (e.g. 'Churn', 'Contract'), and recent pandas versions
# raise on them instead of silently skipping them.

df.select_dtypes(include='number').corr()

In [None]:
# Visualizing correlation.
# corr() is computed on the numeric columns only, because the frame still holds
# text columns here and recent pandas versions refuse to correlate them.

sns.heatmap(df.select_dtypes(include='number').corr(), annot=True)

In [None]:
# Bar plot of TotalCharges per Churn class, drawn on an explicit 5x5 inch figure.
fig, ax = plt.subplots(figsize=(5, 5))
sns.barplot(data=df, x="Churn", y="TotalCharges", ax=ax)

In [None]:
sns.barplot(data = df, x="tenure", y="Churn")

In [None]:
px.scatter(df, y="TotalCharges", x="tenure")

In [None]:
# Histogram of Churn, with the bars coloured by SeniorCitizen.
diag = px.histogram(df, x="Churn", color="SeniorCitizen").update_layout(width=750, height=550)
diag.show()

In [None]:
# Donut chart (hole=0.5) with one sector per Churn value, sized by TotalCharges.
diag = px.pie(df, names='Churn', values='TotalCharges', hole=0.5)
diag.show()

In [None]:
# Labels and values are taken from the SAME value_counts() result so every
# slice is paired with its own count. unique() returns categories in order of
# first appearance while value_counts() sorts by frequency, so mixing the two
# can attach a count to the wrong label.
multiple_lines_counts = df['MultipleLines'].value_counts()
labels = multiple_lines_counts.index
values = multiple_lines_counts.values

# pull is given as a fraction of the pie radius
diag = go.Figure(data=[go.Pie(labels=labels, values=values, pull=[0, 0.1, 0.2])])
diag.show()

In [None]:
# Labels and values both come from one value_counts() call so they stay aligned;
# unique() (order of appearance) and value_counts() (sorted by frequency) do not
# share an ordering, which could mislabel the slices.
internet_service_counts = df['InternetService'].value_counts()
labels = internet_service_counts.index
values = internet_service_counts.values

# pull is given as a fraction of the pie radius
diag = go.Figure(data=[go.Pie(labels=labels, values=values, pull=[0, 0.2, 0.3])])
diag.show()

In [None]:
# Labels and values both come from one value_counts() call so they stay aligned;
# unique() (order of appearance) and value_counts() (sorted by frequency) do not
# share an ordering, which could mislabel the slices.
payment_method_counts = df['PaymentMethod'].value_counts()
labels = payment_method_counts.index
values = payment_method_counts.values

# pull is given as a fraction of the pie radius
diag = go.Figure(data=[go.Pie(labels=labels, values=values, pull=[0, 0, 0.2, 0])])
diag.show()

In [None]:
# Labels and values both come from one value_counts() call so they stay aligned;
# unique() (order of appearance) and value_counts() (sorted by frequency) do not
# share an ordering, which could mislabel the slices.
contract_counts = df['Contract'].value_counts()
labels = contract_counts.index
values = contract_counts.values

# pull is given as a fraction of the pie radius
diag = go.Figure(data=[go.Pie(labels=labels, values=values, pull=[0, 0.2, 0.3])])
diag.show()

### Preprocessing Data

In [None]:
print (df['Partner'].value_counts(ascending=True))

In [None]:
# For every object-dtype column, print its name and the distinct values it holds.
for col in df.columns:
    if df[col].dtypes != "object":
        continue
    print(f'{col} : {df[col].unique()}')
    print("****************************************************")

* The cell above lists every categorical feature together with its unique values

In [None]:
# Collapse the two long "no service" values into a plain 'No' across the whole frame.
df = df.replace({'No internet service': 'No', 'No phone service': 'No'})

* Replace the long negative values ("No internet service", "No phone service") with "No" so these features become plain Yes/No columns

In [None]:
# Re-list the distinct values of each object-dtype column after the replacement above.
object_columns = [name for name in df.columns if df[name].dtypes == "object"]
for name in object_columns:
    print(f'{name} : {df[name].unique()}')
    print("****************************************************")

* After replacing the long values, the categorical features hold the values shown above

In [None]:
print(df['gender'].value_counts(ascending=True))

In [None]:
# Replacing Male by 0 and Female by 1.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on df['gender']: that form mutates an intermediate
# Series and is not guaranteed to update `df` itself (it does not under pandas
# copy-on-write).
# astype('int64') makes the column explicitly numeric; it raises if any value
# other than 'Female'/'Male' is left in the column.

df['gender'] = df['gender'].replace({'Female':1,'Male':0}).astype('int64')
df.head()

In [None]:
print(df['InternetService'].value_counts(ascending=True))

* A few features have more than two categories and are not simple "Yes"/"No" columns

In [None]:
# Print every non-numeric column (neither int64 nor float64) that has more than two distinct values.
for col in df.columns:
    is_numeric = df[col].dtypes == "int64" or df[col].dtypes == "float64"
    if not is_numeric and len(df[col].unique()) > 2:
        print(col)

* The 3 feature columns listed above each have more than 2 categories

In [None]:
print(df['InternetService'].value_counts(ascending=True))

In [None]:
print(df['Contract'].value_counts(ascending=True))

In [None]:
print(df['PaymentMethod'].value_counts(ascending=True))

In [None]:
len(df['PaymentMethod'].unique())

* To deal with such kind of feature column we are required to use one-hot encoding.

In [None]:
# One-hot encode the three multi-category columns, then inspect the resulting dtypes.
more_than_2 = ['InternetService', 'Contract', 'PaymentMethod']
df = pd.get_dummies(df, columns=more_than_2)
df.dtypes

In [None]:
df.shape

In [None]:
df.columns

In [None]:
# Print the name of every column whose dtype is int64 or float64.
numeric_kinds = ("int64", "float64")
for col in df.columns:
    if any(df[col].dtypes == kind for kind in numeric_kinds):
        print(col)

* The feature columns listed above hold numerical data, so they need to be brought into a common range if their scales differ a lot

In [None]:
# Using MinMaxScaler of Feature Scaling

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

* Here we will not consider gender and SeniorCitizen because their values are already on a small scale of 0 or 1
* So we will scale only "tenure", "MonthlyCharges", "TotalCharges"

In [None]:
# Rescale the three wide-range columns with the MinMaxScaler created above.
# NOTE(review): the scaler is fitted on every row here, before the train/test
# split made further down, so test rows influence the min/max used for scaling.
large_cols = ["tenure", "MonthlyCharges", "TotalCharges"]
scaled_values = scaler.fit_transform(df[large_cols])
df[large_cols] = scaled_values
df[large_cols].head()

In [None]:
# After feature scaling we have following dataset

df.head()

In [None]:
# Print the name of every column that is still of object dtype.
object_cols = [col for col in df.columns if df[col].dtypes == "object"]
for col in object_cols:
    print(col)

* All the columns listed above are of object type, i.e. here they hold only "Yes" or "No"

In [None]:
# Encode the Yes/No columns (including the Churn target) as 1/0.
two_cate = ['Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'PaperlessBilling', 'Churn']
yes_no_codes = {"No":0, "Yes":1}
for i in two_cate:
    # Assign the result back to the column: calling replace(..., inplace=True)
    # on df[i] mutates an intermediate Series and is not guaranteed to update
    # `df` itself (it does not under pandas copy-on-write).
    # astype('int64') makes the column explicitly numeric and raises if a value
    # other than "Yes"/"No" is still present.
    df[i] = df[i].replace(yes_no_codes).astype('int64')
df.head()

In [None]:
# Separate the target column (Churn) from the feature columns

y = df['Churn']
X = df.drop(columns='Churn')

In [None]:
X.shape, y.shape

In [None]:
# Performing Train Test Split: 33% of the rows are held out as the test set

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

## Model Building

### Using Logistic Regression

In [None]:
# Importing Logistic Regression

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [None]:
# creating object for model
model_lg = LogisticRegression(max_iter=120,random_state=0, n_jobs=20)

In [None]:
# Model Training

model_lg.fit(X_train, y_train)

In [None]:
# Making Predictions
pred_lg = model_lg.predict(X_test)

In [None]:
# Calculating Accuracy of the model

lg = round(accuracy_score(y_test, pred_lg)*100,2)
print(lg)

In [None]:
# Classification Report

print(classification_report(y_test, pred_lg))

In [None]:
# Confusion matrix, with each cell shown as a share of all test samples

cm1 = confusion_matrix(y_test, pred_lg)
cm1_share = cm1 / cm1.sum()
sns.heatmap(cm1_share, annot=True, fmt='0.2%', cmap="Reds")
plt.title("Logistic Regression Confusion Matrix", fontsize=12)
plt.show()

### Using Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Creating object of the model
model_dt = DecisionTreeClassifier(max_depth=4, random_state=42)

# The tree has to be trained and scored like the other models in this notebook:
# the final comparison table reads its accuracy from `dt`, so `dt` must be
# defined here or that cell fails with a NameError.
model_dt.fit(X_train, y_train)
pred_dt = model_dt.predict(X_test)

# Accuracy as a percentage rounded to 2 decimals (same convention as lg, rf, xgb, ...)
dt = round(accuracy_score(y_test, pred_dt)*100, 2)
print(dt)

print(classification_report(y_test, pred_dt))

# Confusion matrix, with each cell shown as a share of all test samples
cm2 = confusion_matrix(y_test, pred_dt)
sns.heatmap(cm2/np.sum(cm2), annot=True, fmt='0.2%', cmap='Reds')
plt.title("Decision Tree Classifier Confusion Matrix", fontsize=12)
plt.show()

### Using Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Creating model object
model_rf = RandomForestClassifier(n_estimators=300,min_samples_leaf=0.16, random_state=42)

In [None]:
# Training Model
model_rf.fit(X_train, y_train)

In [None]:
# Making Prediction
pred_rf = model_rf.predict(X_test)

In [None]:
# Calculating Accuracy Score
rf = round(accuracy_score(y_test, pred_rf)*100, 2)
print(rf)

In [None]:
print(classification_report(y_test,pred_rf))

In [None]:
# Confusion matrix, with each cell shown as a share of all test samples
cm3 = confusion_matrix(y_test, pred_rf)
cm3_share = cm3 / cm3.sum()
sns.heatmap(cm3_share, annot=True, fmt='0.2%', cmap='Reds')
plt.title("RandomForest Classifier Confusion Matrix", fontsize=12)
plt.show()

### Using XGBoost Classifier

In [None]:
from xgboost import XGBClassifier

In [None]:
# Creating model object
model_xgb = XGBClassifier(max_depth= 8, n_estimators= 125, random_state= 0,  learning_rate= 0.03, n_jobs=5)

In [None]:
# Training Model
model_xgb.fit(X_train, y_train)

In [None]:
# Making Prediction
pred_xgb = model_xgb.predict(X_test)

In [None]:
# Calculating Accuracy Score
xgb = round(accuracy_score(y_test, pred_xgb)*100, 2)
print(xgb)

In [None]:
print(classification_report(y_test,pred_xgb))

In [None]:
# Confusion matrix, with each cell shown as a share of all test samples
cm4 = confusion_matrix(y_test, pred_xgb)
cm4_share = cm4 / cm4.sum()
sns.heatmap(cm4_share, annot=True, fmt='0.2%', cmap='Reds')
plt.title("XGBoost Classifier Confusion Matrix", fontsize=12)
plt.show()

### Using KNeighbours

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Creating model object
model_kn = KNeighborsClassifier(n_neighbors=9, leaf_size=20)

In [None]:
# Training Model
model_kn.fit(X_train, y_train)

In [None]:
# Making Prediction
pred_kn = model_kn.predict(X_test)

In [None]:
# Calculating Accuracy Score
kn = round(accuracy_score(y_test, pred_kn)*100, 2)
print(kn)

In [None]:
print(classification_report(y_test,pred_kn))

In [None]:
# Confusion matrix, with each cell shown as a share of all test samples
cm5 = confusion_matrix(y_test, pred_kn)
cm5_share = cm5 / cm5.sum()
sns.heatmap(cm5_share, annot=True, fmt='0.2%', cmap='Reds')
plt.title("KN Classifier Confusion Matrix", fontsize=12)
plt.show()

###  Using SVM

In [None]:
from sklearn.svm import SVC, LinearSVC

In [None]:
model_svm = SVC(kernel='rbf', random_state = 42)

In [None]:
model_svm.fit(X_train, y_train)

In [None]:
# Making Prediction
pred_svm = model_svm.predict(X_test)

In [None]:
# Calculating Accuracy Score
sv = round(accuracy_score(y_test, pred_svm)*100, 2)
print(sv)

In [None]:
# Report the SVM predictions (pred_svm); this cell previously printed the
# KNeighbours predictions (pred_kn) by mistake.
print(classification_report(y_test,pred_svm))

In [None]:
# Confusion matrix, with each cell shown as a share of all test samples
cm6 = confusion_matrix(y_test, pred_svm)
cm6_share = cm6 / cm6.sum()
sns.heatmap(cm6_share, annot=True, fmt='0.2%', cmap='Reds')
plt.title("SVM Classifier Confusion Matrix", fontsize=12)
plt.show()

### Using AdaBoost Classifier

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [None]:
model_ada = AdaBoostClassifier(learning_rate= 0.002,n_estimators= 205,random_state=42)

In [None]:
model_ada.fit(X_train, y_train)

In [None]:
# Making Prediction
pred_ada = model_ada.predict(X_test)

In [None]:
# Calculating Accuracy Score
ada = round(accuracy_score(y_test, pred_ada)*100, 2)
print(ada)

In [None]:
print(classification_report(y_test,pred_ada))

In [None]:
# Confusion matrix, with each cell shown as a share of all test samples
cm7 = confusion_matrix(y_test, pred_ada)
cm7_share = cm7 / cm7.sum()
sns.heatmap(cm7_share, annot=True, fmt='0.2%', cmap='Reds')
plt.title("Adaboost Classifier Confusion Matrix", fontsize=12)
plt.show()

### Handling the imbalanced dataset using SMOTE and applying the Random Forest algorithm

In [None]:
df.head()

In [None]:
# Fix the seed so the synthetic samples (and every metric derived from them)
# are the same on each run, like the seeded models elsewhere in this notebook.
oversample = SMOTE(random_state=42)

In [None]:
X1, y1 = oversample.fit_resample(X,y)

In [None]:
X1.shape, y1.shape

In [None]:
X_train, X_test, y_train, y_test = train_test_split( X1, y1, test_size=0.33, random_state=0)

In [None]:
model_rf_smt=RandomForestClassifier(criterion = "gini",random_state = 10,max_depth=10, min_samples_leaf=5)

In [None]:
model_rf_smt.fit(X_train,y_train)

In [None]:
pred_rf_smt = model_rf_smt.predict(X_test)

In [None]:
rf_smt  = round(accuracy_score(y_test, pred_rf_smt)*100, 2)
print(rf_smt)

In [None]:
print(classification_report(y_test, pred_rf_smt))

In [None]:
# Confusion matrix, with each cell shown as a share of all test samples
cm8 = confusion_matrix(y_test, pred_rf_smt)
cm8_share = cm8 / cm8.sum()
sns.heatmap(cm8_share, annot=True, fmt='0.2%', cmap='Reds')
plt.title("Random Forest (using SMOTE) Confusion Matrix ", fontsize=12)
plt.show()

In [None]:
# Collect the accuracy of every model into one table, plot it, and show the table ranked best-first.
model_names = ['Logistic Regression', 'Decision Tree', 'Random Forest', 'XGBoost', 'KNeighbours', 'SVM', 'AdaBoost', 'Random Forest \'SMOTE\'']
accuracy_scores = [lg, dt, rf, xgb, kn, sv, ada, rf_smt]
models = pd.DataFrame({'Model': model_names, 'Accuracy_score': accuracy_scores})
sns.barplot(x='Accuracy_score', y='Model', data=models)

models.sort_values(by='Accuracy_score', ascending=False)

In [None]:
# Mount Google Drive only when running on Colab. The rest of this notebook reads
# from /kaggle/input, and outside Colab the google.colab package is unavailable,
# so an unconditional import makes this last cell fail.
try:
    from google.colab import drive
except ImportError:
    drive = None

if drive is not None:
    drive.mount('/content/drive')