In [None]:
# Importing the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split,RandomizedSearchCV,GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [None]:
# Load the Telco customer-churn CSV from the working directory
data = pd.read_csv(filepath_or_buffer="WA_Fn-UseC_-Telco-Customer-Churn.csv")

# Preview the first five records
data.head(n=5)

In [None]:
# Number of missing entries in each column
data.isna().sum()

In [None]:
# Summarise each column's dtype and non-null count, plus memory usage
data.info()

In [None]:
# Dimensions of the dataset as (rows, columns)
data.shape

In [None]:
# Remove the customerID column, then preview the remaining columns
data = data.drop(columns=["customerID"])
data.head()

In [None]:
# Rows whose TotalCharges entry is a single space instead of a number
blank_total_charges = data["TotalCharges"] == " "
data[blank_total_charges]

In [None]:
# Cast TotalCharges to a numeric dtype; entries that cannot be parsed become NaN
data["TotalCharges"] = pd.to_numeric(data["TotalCharges"], errors="coerce")

# Re-count missing values now that unparseable entries are NaN
data.isnull().sum()

In [None]:
# Replace the missing values in TotalCharges with the column mean.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the selected Series: that chained in-place form is
# deprecated in pandas 2.x and leaves `data` unchanged under copy-on-write.
data["TotalCharges"] = data["TotalCharges"].fillna(data["TotalCharges"].mean())

In [None]:
# Customers whose tenure is zero
data.loc[data["tenure"] == 0]

In [None]:
# Drop every row with tenure == 0 (in place, by index label)
zero_tenure_index = data.index[data["tenure"] == 0]
data.drop(index=zero_tenure_index, inplace=True)

EDA

In [None]:
# Share of churned vs. retained customers, as whole percentages
churn_counts = data["Churn"].value_counts()
churn_counts.plot.pie(autopct='%1.0f%%')

In [None]:
# Contingency table: gender (rows) against churn (columns)
x = pd.crosstab(index=data["gender"], columns=data["Churn"])
x

In [None]:
# Grouped bar chart of the gender / churn contingency table
x.plot.bar()

In [None]:
# Churn proportions within each contract type (each row sums to 1)
pd.crosstab(index=data["Contract"], columns=data["Churn"], normalize="index")

Customers on month-to-month contracts are more likely to churn.

In [None]:
# Number of customers per payment method
payment_counts = data['PaymentMethod'].value_counts()
payment_counts.plot.bar()

In [None]:
# Internet service vs. churn, restricted to male customers
male_mask = data["gender"] == "Male"
pd.crosstab(data.loc[male_mask, "InternetService"], data.loc[male_mask, "Churn"])

In [None]:
# Internet service vs. churn, restricted to female customers
female_mask = data["gender"] == "Female"
pd.crosstab(data.loc[female_mask, "InternetService"], data.loc[female_mask, "Churn"])

In [None]:
# Churn counts split by whether the customer has dependents
dependents_vs_churn = pd.crosstab(data["Dependents"], data["Churn"])
dependents_vs_churn.plot.bar()

Customers without dependents are more likely to churn

In [None]:
# Churn counts split by whether the customer has a partner
partner_vs_churn = pd.crosstab(data["Partner"], data["Churn"])
partner_vs_churn.plot.bar()

In [None]:
# Churn counts split by the SeniorCitizen flag
senior_vs_churn = pd.crosstab(data["SeniorCitizen"], data["Churn"])
senior_vs_churn.plot.bar()

In [None]:
# Relation between OnlineSecurity and churn.
# The column is spelled "OnlineSecurity" in the Telco churn CSV; the previous
# "Onlinesecurity" spelling raised a KeyError.
pd.crosstab(data["OnlineSecurity"], data["Churn"]).plot(kind="bar")

In [None]:
# Churn counts split by paperless-billing status
paperless_vs_churn = pd.crosstab(data["PaperlessBilling"], data["Churn"])
paperless_vs_churn.plot.bar()

Customers with paperless billing are more likely to churn.

In [None]:
# Churn counts split by tech-support status
tech_support_vs_churn = pd.crosstab(data["TechSupport"], data["Churn"])
tech_support_vs_churn.plot.bar()

Customers with no TechSupport are most likely to migrate to another service provider.

In [None]:
# Churn counts split by phone-service status
phone_vs_churn = pd.crosstab(data["PhoneService"], data["Churn"])
phone_vs_churn.plot.bar()

In [None]:
# Relation between tenure and churn.
# The column name is lowercase "tenure" (as used when filtering tenure == 0
# earlier in the notebook); "Tenure" raised a KeyError.
pd.crosstab(data["tenure"], data["Churn"]).plot(kind="bar")

New customers are more likely to churn

In [None]:
data.isnull().sum()

Feature Engineering

In [None]:
# Encode the two-valued categorical columns as 0/1 integers
data["gender"] = data["gender"].map({"Female": 0, "Male": 1})

# Every remaining binary column shares the same No/Yes encoding
yes_no_codes = {"No": 0, "Yes": 1}
for binary_col in ["Partner", "Dependents", "PhoneService", "PaperlessBilling", "Churn"]:
    data[binary_col] = data[binary_col].map(yes_no_codes)

In [None]:
# One-hot encode the remaining categorical columns; drop_first removes the
# first level of each so the dummy columns are not perfectly collinear
data = pd.get_dummies(data=data, drop_first=True)

Model Development

In [None]:
# Target vector (Churn) and feature matrix (every other column)
y = data["Churn"]
X = data.drop(columns=["Churn"])

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=12,
)

Decision Trees

In [None]:
# Hyperparameter tuning for the decision tree with RandomizedSearchCV
params = {
    "criterion": ["gini", "entropy"],
    "max_depth": [2, 3, 4, 5, 7, 9],
    "min_samples_split": [5, 10, 15, 20, 50, 100],
    "min_samples_leaf": [5, 10, 15, 20, 50, 80, 100],
}

dtg = DecisionTreeClassifier(random_state=0)

# random_state fixes which candidate combinations are sampled; without it the
# reported best estimator changes from one run of the notebook to the next.
r_search = RandomizedSearchCV(
    estimator=dtg,
    param_distributions=params,
    cv=10,
    n_jobs=2,
    random_state=0,
)
r_search.fit(X_train, y_train)

In [None]:
# Estimator refit with the best hyperparameter combination found by the search
r_search.best_estimator_

In [None]:
# Train the model with the best hyperparameters found by the search.
# They are read from r_search.best_params_ instead of being typed in by hand,
# so this cell always matches the search result (including the chosen
# criterion, which the hand-copied values omitted).
dt = DecisionTreeClassifier(**r_search.best_params_, random_state=0)
dt.fit(X_train, y_train)

In [None]:
# Train accuracy: fraction of training rows predicted correctly
metrics.accuracy_score(y_train, dt.predict(X_train))

In [None]:
# Test accuracy: fraction of held-out rows predicted correctly
metrics.accuracy_score(y_test, dt.predict(X_test))

In [None]:
# Hard class predictions from the fitted decision tree, reused by the
# evaluation cells that follow
pred_test1=dt.predict(X_test) # prediction on test data

In [None]:
# Per-class precision, recall and F1 for the test-set predictions
test_report = metrics.classification_report(y_test, pred_test1)
print(test_report)

In [None]:
def classification_eva(act, pred, probs):
    """Compute classification metrics and plot the ROC curve.

    Parameters
    ----------
    act : array-like
        True binary labels.
    pred : array-like
        Predicted class labels; used for accuracy, recall, precision and F1.
    probs : array-like
        Scores for the positive class; used for the ROC curve and the AUC.

    Returns
    -------
    dict
        Metric values keyed by "Accuracy", "Recall", "Precision",
        "F1 score" and "AUC".
    """
    ac1 = metrics.accuracy_score(act, pred)
    rc1 = metrics.recall_score(act, pred)
    pc1 = metrics.precision_score(act, pred)
    f1 = metrics.f1_score(act, pred)
    # AUC must be computed from the continuous scores. Using the hard labels
    # collapses the ROC curve to a single operating point, so the number would
    # not describe the curve plotted below.
    auc1 = metrics.roc_auc_score(act, probs)
    result = {"Accuracy": ac1, "Recall": rc1, "Precision": pc1, "F1 score": f1, "AUC": auc1}

    fpr, tpr, _ = metrics.roc_curve(act, probs)
    # Dashed diagonal is the chance-level reference line
    plt.plot([0, 1], [0, 1], 'k--', label="Chance")
    # The AUC label belongs to the model's ROC curve, not to the diagonal
    plt.plot(fpr, tpr, label="AUC: " + str(auc1))
    plt.xlabel('fpr')
    plt.ylabel('tpr')
    plt.legend()
    plt.show()
    return result

In [None]:
# Probability of the positive class (second column of predict_proba)
prob_test_2 = dt.predict_proba(X_test)[:, 1]

# Model evaluation: metric summary plus ROC curve for the decision tree
classification_eva(act=y_test, pred=pred_test1, probs=prob_test_2)

XGBoost

In [None]:
from xgboost import XGBClassifier
import optuna
def objective_xg(trial):
    """Optuna objective: fit an XGBoost classifier and return its ROC AUC.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        ROC AUC of the predicted positive-class probabilities, computed on
        X_test / y_test.
    """
    # NOTE(review): every trial is scored on X_test, so the hyperparameters end
    # up tuned on the test split and the final test score will be optimistic.
    # Consider scoring on a validation split carved out of X_train instead.
    params = {
        'booster': trial.suggest_categorical('booster', ['gbtree']),
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform
        # and samples from the same log-uniform range.
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.05, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 300, 1000),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.3, 0.9, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.01, 1.0),
        "seed" : trial.suggest_categorical('seed', [42]),
        'eval_metric': trial.suggest_categorical('eval_metric', ['auc']),
    }
    model_xgb = XGBClassifier(**params)
    model_xgb.fit(X_train, y_train)
    y_pred = model_xgb.predict_proba(X_test)[:,1]
    # roc_auc_score is only reachable through the imported `metrics` module;
    # the bare name was never imported and raised a NameError.
    return metrics.roc_auc_score(y_test, y_pred)