<a href="https://www.kaggle.com/code/edifonjimmy/machine-learning-explainability?scriptVersionId=198210680" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import pandas as pd
import numpy as np
import missingno as msno
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use("ggplot")
import warnings
warnings.filterwarnings('ignore')

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn import metrics
from sklearn.metrics import roc_curve
from sklearn.metrics import recall_score, confusion_matrix, precision_score, f1_score, accuracy_score, classification_report

In [None]:
# Read the Telco customer-churn CSV from the Kaggle input directory
df = pd.read_csv(
    '../input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv'
)

In [None]:
# Preview the first rows to sanity-check the load
df.head()

**The data set includes information about:**
* **Customers who left within the last month** – the column is called Churn

* **Services that each customer has signed up for** – phone, multiple lines, internet, online security, online backup, device protection, tech support, and streaming TV and movies

* **Customer account information** - how long they’ve been a customer, contract, payment method, paperless billing, monthly charges, and total charges

* **Demographic info about customers** – gender, age range, and if they have partners and dependents

In [None]:
# (number of rows, number of columns) of the raw dataset
df.shape

In [None]:
# Print the per-column summary of the frame
df.info()

In [None]:
# Column names as a plain array
df.columns.values

In [None]:
# Data type of every column
df.dtypes

<a id = "7" ></a>
# <span style="font-family:serif; font-size:28px;">Visualize missing values </span>
<a id = "missingvalue" ></a>

In [None]:
# Visualize missing values as a matrix (one column per feature).
# The trailing semicolon keeps the notebook from echoing the return value.
msno.matrix(df);

> Using this matrix we can very quickly find the pattern of missingness in the dataset. 
* From the above visualisation we can observe that it has no peculiar pattern that stands out. In fact there is no missing data.

***

<a id = "8" ></a>
# <span style="font-family:serif; font-size:28px;">Data Manipulation </span>
<a id = "8" ></a>

In [None]:
# Remove the customerID column before analysis, then preview the result
df = df.drop(columns=['customerID'])
df.head()

* On closer analysis, we can find some indirect missingness in our data (for example, values stored as blank spaces). Let's check for that!

In [None]:
# Values in TotalCharges that cannot be parsed as numbers become NaN;
# afterwards count the nulls in every column
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df.isna().sum()

* Here we see that the TotalCharges has 11 missing values. Let's check this data.

In [None]:
# Rows whose TotalCharges could not be parsed
df[df['TotalCharges'].isna()]

* It can also be noted that the Tenure column is 0 for these entries even though the MonthlyCharges column is not empty.

Let's see if there are any other 0 values in the tenure column.

In [None]:
# Index labels of the rows with zero tenure
df.index[df['tenure'] == 0]

* There are no additional missing values in the Tenure column. 

Let's delete the rows where tenure is 0: there are only 11 of them (the same rows that are missing TotalCharges), so dropping them will not materially affect the data.

In [None]:
# Drop the zero-tenure rows in place, then confirm none are left
zero_tenure_rows = df.index[df['tenure'] == 0]
df.drop(index=zero_tenure_rows, inplace=True)
df.index[df['tenure'] == 0]

> To solve the problem of missing values in TotalCharges column, I decided to fill it with the mean of TotalCharges values.

In [None]:
# fillna returns a new object rather than modifying df, so the result has to
# be assigned back. Only TotalCharges is filled: its mean is not a meaningful
# replacement value for any other column.
df["TotalCharges"] = df["TotalCharges"].fillna(df["TotalCharges"].mean())

In [None]:
# Re-count nulls per column after the clean-up steps
df.isnull().sum()

In [None]:
# Replace the 0/1 SeniorCitizen flag with "No"/"Yes" labels
senior_labels = {0: "No", 1: "Yes"}
df["SeniorCitizen"] = df["SeniorCitizen"].map(senior_labels)
df.head()

In [None]:
# Summary of the InternetService column (`include` has no effect on a
# Series, so it is not passed)
df["InternetService"].describe()

In [None]:
# Summary statistics for the three numeric features
numerical_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
df.loc[:, numerical_cols].describe()

___

<a id = "9" ></a>
# <span style="font-family:serif; font-size:28px;">Data Visualization </span>
<a id = "datavisualization" ></a>

In [None]:
# Category counts behind the two donut charts
gender_counts = df['gender'].value_counts()
churn_counts = df['Churn'].value_counts()

# (title, counts, wedge colours) for each chart, drawn left to right
donut_specs = [
    ('Gender', gender_counts, ['#FF9999', '#66B3FF']),
    ('Churn', churn_counts, ['#AB63FA', '#FF97FF']),
]

fig, axes = plt.subplots(1, 2, figsize=(12, 6))
for donut_ax, (donut_title, counts, wedge_colors) in zip(axes, donut_specs):
    # wedgeprops width < 1 leaves the hole that turns the pie into a donut
    donut_ax.pie(counts, labels=counts.index, autopct='%1.1f%%', startangle=90,
                 colors=wedge_colors, wedgeprops=dict(width=0.4))
    donut_ax.set_title(donut_title)

plt.suptitle('Gender and Churn Distributions', fontsize=16)
plt.tight_layout()
plt.show()

* 26.6 % of customers switched to another firm.
* Customers are 49.5 % female and 50.5 % male.

In [None]:
# Customers who stayed (Churn == "No"), counted per gender
stayed = df.loc[df["Churn"] == "No", "Churn"]
stayed.groupby(by=df["gender"]).count()

In [None]:
# Customers who left (Churn == "Yes"), counted per gender
churned = df.loc[df["Churn"] == "Yes", "Churn"]
churned.groupby(by=df["gender"]).count()

In [None]:
# Derive the ring sizes from the data instead of hard-coding them, so the
# chart cannot drift out of sync with the rows kept in df.
# Rows are ordered Yes/No and columns Female/Male to match the labels below.
# NOTE: this must run while Churn and gender still hold their string labels.
churn_by_gender = pd.crosstab(df["Churn"], df["gender"]).reindex(
    index=["Yes", "No"], columns=["Female", "Male"], fill_value=0)

plt.figure(figsize=(15, 6))
labels = ["Churn: Yes", "Churn:No"]
values = churn_by_gender.sum(axis=1).tolist()
labels_gender = ["F", "M", "F", "M"]
# Row-major flattening gives Yes-F, Yes-M, No-F, No-M
sizes_gender = churn_by_gender.to_numpy().ravel().tolist()
colors = ['#ff6666', '#66b3ff']
colors_gender = ['#c2c2f0', '#ffb3e6', '#c2c2f0', '#ffb3e6']
explode = (0.3, 0.3)
explode_gender = (0.1, 0.1, 0.1, 0.1)
textprops = {"fontsize": 15}

# Outer ring: churn outcome
plt.pie(values,
        labels=labels,
        autopct='%1.1f%%',
        pctdistance=1.08,
        labeldistance=0.8,
        colors=colors,
        startangle=90,
        frame=True,
        explode=explode,
        radius=10,
        textprops=textprops,
        counterclock=True)

# Inner ring: gender split within each churn outcome
plt.pie(sizes_gender,
        labels=labels_gender,
        colors=colors_gender,
        startangle=90,
        explode=explode_gender,
        radius=7,
        textprops=textprops,
        counterclock=True)

# White disc in the centre turns the nested pies into rings
centre_circle = plt.Circle((0, 0), 5, color='black', fc='white', linewidth=0)
fig = plt.gcf()
fig.gca().add_artist(centre_circle)

plt.title('Churn Distribution w.r.t Gender: Male(M), Female(F)', fontsize=15, y=1.1)

plt.axis('equal')
plt.tight_layout()
plt.show()

* There is a negligible difference in the percentage/count of customers who changed service provider. Both genders behaved similarly when it comes to migrating to another service provider/firm.

In [None]:
# Split the churn counts by contract type: the title and the commentary that
# follows are about contracts, which a plain Churn histogram does not show.
plt.figure(figsize=(15, 6))
sns.histplot(data=df, x="Churn", hue="Contract", multiple="dodge")
plt.title("Customer contract distribution")
plt.show()

* About 75% of customers with a Month-to-Month contract opted to move out, compared to 13% of customers with a One Year contract and 3% with a Two Year contract.

In [None]:
# Number of customers per payment method
payment_counts = df['PaymentMethod'].value_counts()

fig, payment_ax = plt.subplots(figsize=(15, 6))
sns.barplot(x=payment_counts.index, y=payment_counts.values, ax=payment_ax)
payment_ax.set_title('Payment Method Distribution')
payment_ax.set_xlabel('Payment Method')
payment_ax.set_ylabel('Count')
plt.show()

In [None]:
# Stacked count bars: churn outcome on the x-axis, one segment per payment method
fig, ax = plt.subplots(figsize=(15, 6))
sns.histplot(data=df, x="Churn", hue="PaymentMethod", multiple="stack", ax=ax)

ax.set_title('Customer Payment Method distribution w.r.t. Churn', fontsize=14)
ax.set_xlabel('Churn')
ax.set_ylabel('Count')

fig.tight_layout()
plt.show()

* Most of the customers who moved out were using Electronic Check as their payment method.
* Customers who opted for Credit-Card automatic transfer or Bank Automatic Transfer and Mailed Check as Payment Method were less likely to move out.  

In [None]:
# Distinct values present in the InternetService column
df["InternetService"].unique()

In [None]:
 df[df["gender"]=="Male"][["InternetService", "Churn"]].value_counts()

In [None]:
# (InternetService, Churn) combination counts for female customers
df.loc[df["gender"] == "Female", ["InternetService", "Churn"]].value_counts()

In [None]:
# Hand-entered counts, one record per (churn, gender, internet service)
churn_records = [
    ('No', 'Female', 965, 'DSL'),
    ('No', 'Male', 992, 'DSL'),
    ('Yes', 'Female', 219, 'DSL'),
    ('Yes', 'Male', 240, 'DSL'),
    ('No', 'Female', 889, 'Fiber optic'),
    ('No', 'Male', 910, 'Fiber optic'),
    ('Yes', 'Female', 664, 'Fiber optic'),
    ('Yes', 'Male', 633, 'Fiber optic'),
    ('No', 'Female', 690, 'No Internet'),
    ('No', 'Male', 717, 'No Internet'),
    ('Yes', 'Female', 56, 'No Internet'),
    ('Yes', 'Male', 57, 'No Internet'),
]
vis_df = pd.DataFrame(churn_records,
                      columns=['Churn', 'Gender', 'Count', 'Internet Service'])

# First figure: bars grouped by internet service, not split by gender
plt.figure(figsize=(10, 6))
sns.barplot(data=vis_df, x='Churn', y='Count', hue='Internet Service', ci=None,
            hue_order=['DSL', 'Fiber optic', 'No Internet'],
            palette='Set2', dodge=True)

# Second figure: the same bars, one panel per gender
g = sns.catplot(data=vis_df, x='Churn', y='Count', hue='Internet Service', col='Gender',
                kind='bar', height=5, aspect=1.2, palette='Set2')

# Leave headroom above the panels for the overall title
plt.subplots_adjust(top=0.85)
g.fig.suptitle('Churn Distribution w.r.t. Internet Service and Gender')
plt.show()

* A lot of customers choose the Fiber optic service and it's also evident that the customers who use Fiber optic have high churn rate, this might suggest a dissatisfaction with this type of internet service.
* Customers having DSL service are majority in number and have less churn rate compared to Fibre optic service.

In [None]:
# Display the current state of the frame
df

In [None]:
# Colours for the two Dependents categories
palette = {"Yes": "#FF97FF", "No": "#AB63FA"}

# Dodged count bars: churn outcome on the x-axis, split by Dependents
fig, ax = plt.subplots(figsize=(15, 6))
sns.histplot(data=df, x="Churn", hue="Dependents", multiple="dodge",
             palette=palette, ax=ax)

ax.set_title('Dependents distribution', fontsize=14)
ax.set_xlabel('Churn')
ax.set_ylabel('Count')

fig.tight_layout()
plt.show()

* Customers without dependents are more likely to churn

In [None]:
# Colours for the two Partner categories
palette = {"Yes": '#FFA15A', "No": '#00CC96'}

# Dodged count bars: churn outcome on the x-axis, split by Partner
fig, ax = plt.subplots(figsize=(15, 6))
sns.histplot(data=df, x="Churn", hue="Partner", multiple="dodge",
             palette=palette, ax=ax)

ax.set_title('Churn distribution w.r.t. Partners', fontsize=14)
ax.set_xlabel('Churn')
ax.set_ylabel('Count')

fig.tight_layout()
plt.show()

* Customers who don't have partners are more likely to churn.

In [None]:
# Colours for the two SeniorCitizen categories
palette = {"Yes": '#00CC96', "No": '#B6E880'}

# Dodged count bars: churn outcome on the x-axis, split by SeniorCitizen
fig, ax = plt.subplots(figsize=(15, 6))
sns.histplot(data=df, x="Churn", hue="SeniorCitizen", multiple="dodge",
             palette=palette, ax=ax)

ax.set_title('Churn distribution w.r.t. Senior Citizen', fontsize=14)
ax.set_xlabel('Churn')
ax.set_ylabel('Count')

fig.tight_layout()
plt.show()

* It can be observed that the fraction of senior citizens is very small.
* Most of the senior citizens churn.

In [None]:
# Display the current state of the frame
df

In [None]:
# Colours for the three OnlineSecurity categories
palette = {"Yes": "#FF97FF", "No": "#AB63FA", "No internet service": "#FFA07A"}

# Dodged count bars: churn outcome on the x-axis, split by OnlineSecurity
fig, ax = plt.subplots(figsize=(15, 6))
sns.histplot(data=df, x="Churn", hue="OnlineSecurity", multiple="dodge",
             palette=palette, ax=ax)

ax.set_title('Churn w.r.t Online Security', fontsize=14)
ax.set_xlabel('Churn')
ax.set_ylabel('Count')

fig.tight_layout()
plt.show()

* Most customers churn in the absence of online security.

In [None]:
# Colours for the two PaperlessBilling categories
palette = {"Yes": '#FFA15A', "No": '#00CC96'}

# Dodged count bars: churn outcome on the x-axis, split by PaperlessBilling
fig, ax = plt.subplots(figsize=(15, 6))
sns.histplot(data=df, x="Churn", hue="PaperlessBilling", multiple="dodge",
             palette=palette, ax=ax)

ax.set_title('Churn distribution w.r.t. Paperless Billing', fontsize=14)
ax.set_xlabel('Churn')
ax.set_ylabel('Count')

fig.tight_layout()
plt.show()

* Customers with Paperless Billing are most likely to churn.

In [None]:
# Dodged count bars: churn outcome on the x-axis, split by TechSupport
# (default seaborn colours)
fig, ax = plt.subplots(figsize=(15, 6))
sns.histplot(data=df, x="Churn", hue="TechSupport", multiple="dodge", ax=ax)

ax.set_title('Churn distribution w.r.t. TechSupport', fontsize=14)
ax.set_xlabel('Churn')
ax.set_ylabel('Count')

fig.tight_layout()
plt.show()

* Customers with no TechSupport are most likely to migrate to another service provider.

In [None]:
# Colours for the two PhoneService categories
palette = {"Yes": '#00CC96', "No": '#B6E880'}

# Dodged count bars: churn outcome on the x-axis, split by PhoneService
fig, ax = plt.subplots(figsize=(15, 6))
sns.histplot(data=df, x="Churn", hue="PhoneService", multiple="dodge",
             palette=palette, ax=ax)

ax.set_title('Churn distribution w.r.t. Phone Service', fontsize=14)
ax.set_xlabel('Churn')
ax.set_ylabel('Count')

fig.tight_layout()
plt.show()

* Very small fraction of customers don't have a phone service and out of that, 1/3rd Customers are more likely to churn.

In [None]:
# Overlaid density curves of MonthlyCharges for retained vs. churned customers.
# `fill=True` replaces the deprecated `shade=True` keyword.
plt.figure(figsize=(15, 6))
sns.set_context("paper", font_scale=1.1)
ax = sns.kdeplot(df.MonthlyCharges[df["Churn"] == 'No'],
                 color="Red", fill=True)
ax = sns.kdeplot(df.MonthlyCharges[df["Churn"] == 'Yes'],
                 ax=ax, color="Blue", fill=True)
# Legend entries follow the drawing order: retained first, churned second
ax.legend(["Not Churn", "Churn"], loc='upper right')
ax.set_ylabel('Density')
ax.set_xlabel('Monthly Charges')
ax.set_title('Distribution of monthly charges by churn');


* Customers with higher Monthly Charges are also more likely to churn

In [None]:
# Overlaid density curves of TotalCharges for retained vs. churned customers.
# `fill=True` replaces the deprecated `shade=True` keyword.
plt.figure(figsize=(15, 6))
ax = sns.kdeplot(df.TotalCharges[df["Churn"] == 'No'],
                 color="Gold", fill=True)
ax = sns.kdeplot(df.TotalCharges[df["Churn"] == 'Yes'],
                 ax=ax, color="Green", fill=True)
# Legend entries follow the drawing order; the label previously read "Not Chu0rn"
ax.legend(["Not Churn", "Churn"], loc='upper right')
ax.set_ylabel('Density')
ax.set_xlabel('Total Charges')
ax.set_title('Distribution of total charges by churn');

In [None]:
# Box plot of tenure for each churn outcome
fig, ax = plt.subplots(figsize=(15, 6))
sns.boxplot(data=df, x='Churn', y='tenure', ax=ax)

ax.set_xlabel('Churn')
ax.set_ylabel('Tenure (Months)')
ax.set_title('Tenure vs Churn', fontsize=25, fontfamily='Courier')

fig.tight_layout()
plt.show()

* New customers are more likely to churn

In [None]:
# Integer-encode every column so that categorical features can take part in
# the pairwise correlation matrix
encoded = df.apply(lambda col: pd.factorize(col)[0])
corr = encoded.corr()

# Mask the upper triangle (diagonal included); it mirrors the lower one
mask = np.triu(np.ones_like(corr, dtype=bool))

plt.figure(figsize=(25, 10))
ax = sns.heatmap(corr, mask=mask,
                 xticklabels=corr.columns, yticklabels=corr.columns,
                 annot=True, linewidths=.2, cmap='coolwarm', vmin=-1, vmax=1)
plt.title("Feature Correlation")
plt.show()

___

<a id = "10" ></a>
# <span style="font-family:serif; font-size:28px;">Data Preprocessing</span>
<a id = "datapreprocessing" ></a>

<a id = "1111" ></a>
#### **Splitting the data into train and test sets**
<a id = "Split" ></a>

In [None]:
def object_to_int(dataframe_series):
    """Label-encode a column when it has the ``object`` dtype.

    Columns of any other dtype are returned unchanged. Object columns are
    replaced by the output of a freshly fitted ``LabelEncoder``.
    """
    if dataframe_series.dtype != 'object':
        return dataframe_series
    encoder = LabelEncoder()
    return encoder.fit_transform(dataframe_series)

In [None]:
# Encode every object-typed column column-by-column, then preview the result
df = df.apply(object_to_int)
df.head()

In [None]:
# Correlation of every column with Churn, largest first
churn_corr = df.corr()['Churn'].sort_values(ascending=False)

plt.figure(figsize=(15,10))
churn_corr.plot(kind="barh",
                title="Feature Correlation with target ('Churn')",
                ylabel="correlation",
                xlabel="features")
plt.show()

In [None]:
# Features: everything except the target. Target: Churn as a NumPy array.
X = df.drop('Churn', axis=1)
y = df['Churn'].to_numpy()

In [None]:
# 70/30 split, stratified on the target so both parts keep the class balance
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.30,
    random_state=40,
    stratify=y,
)

In [None]:
def distplot(feature, frame, color='r'):
    """Plot the distribution of one column as a density histogram with a KDE curve.

    Args:
        feature: Name of the column in ``frame`` to plot.
        frame: DataFrame that holds the column.
        color: Matplotlib colour used for the plot (default ``'r'``).
    """
    plt.figure(figsize=(15, 6))
    plt.title("Distribution for {}".format(feature))
    # sns.distplot is deprecated; histplot on a density scale with a KDE
    # overlay is seaborn's documented replacement.
    sns.histplot(frame[feature], kde=True, stat="density",
                 kde_kws=dict(cut=3), color=color)
    plt.show()

In [None]:
# Numeric features whose raw distributions are plotted one by one
num_cols = ["tenure", 'MonthlyCharges', 'TotalCharges']

for feat in num_cols:
    distplot(feat, df)

Since the numerical features are distributed over different value ranges, I will use `StandardScaler` to bring them onto the same scale.

<a id = "111" ></a>
#### **Standardizing numeric attributes**
<a id = "Standardizing" ></a>

In [None]:
# Standardise a copy of the numeric columns purely to visualise the effect
# of scaling on their distributions.
df_std = pd.DataFrame(
    StandardScaler().fit_transform(df[num_cols].astype('float64')),
    columns=num_cols,
)
# Iterate over num_cols, the list df_std was built from, rather than the
# separately defined numerical_cols.
for feat in num_cols:
    distplot(feat, df_std, color='c')

In [None]:
# Divide the columns into 3 categories: one for standardisation, one for label encoding and one for one-hot encoding

cat_cols_ohe =['PaymentMethod', 'Contract', 'InternetService'] # those that need one-hot encoding
# Remaining columns; built from a set difference, so the list order is not guaranteed
cat_cols_le = list(set(X_train.columns)- set(num_cols) - set(cat_cols_ohe)) #those that need label encoding

In [None]:
# Standardise the numeric columns of both splits with a single scaler
scaler= StandardScaler()

# Fit on the training split only, then reuse those statistics on the test split
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

<a id = "11" ></a>
# <span style="font-family:serif; font-size:28px;">Machine Learning Model Evaluations and Predictions</span>
<a id = "modelprediction" ></a>

<a id = "101" ></a>
#### <b> KNN</b>
<a id = "knn" ></a>

In [None]:
# Fit an 11-nearest-neighbour classifier and score it on the test split
knn_model = KNeighborsClassifier(n_neighbors=11).fit(X_train, y_train)
predicted_y = knn_model.predict(X_test)
accuracy_knn = knn_model.score(X_test, y_test)
print("KNN accuracy:", accuracy_knn)

In [None]:
# Per-class precision/recall/F1 report for the KNN predictions
print(classification_report(y_test, predicted_y))

In [None]:
def PlotRoc(model, title):
    """Plot the ROC curve of a fitted classifier on the test split.

    Reads the module-level ``X_test`` and ``y_test``. ``model`` must expose
    ``predict_proba``; column 1 of its output is used as the score.
    ``title`` is used in the legend entry and the plot title.
    """
    from sklearn.metrics import roc_auc_score

    positive_scores = model.predict_proba(X_test)[:, 1]
    false_pos_rate, true_pos_rate, _ = roc_curve(y_test, positive_scores)
    area = roc_auc_score(y_test, positive_scores)

    plt.figure(figsize=(15, 6))
    # Dashed diagonal as the chance-level reference
    plt.plot([0, 1], [0, 1], 'k--')
    plt.plot(false_pos_rate, true_pos_rate,
             label=f'{title} (AUC = {area:.4f})', color="r")
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'{title} ROC Curve', fontsize=16)
    plt.legend(loc='best')
    plt.show()
    
def PlotConfusionMatrix(y_test, prediction_test, title):
    """Draw the confusion matrix of ``prediction_test`` against ``y_test`` as an annotated heatmap.

    ``title`` is prepended to " CONFUSION MATRIX" in the plot title.
    """
    matrix = confusion_matrix(y_test, prediction_test)

    plt.figure(figsize=(15, 7))
    sns.heatmap(matrix, annot=True, fmt="d", linecolor="k", linewidths=3)
    plt.title(f"{title} CONFUSION MATRIX", fontsize=14)
    plt.show()

In [None]:
# Confusion matrix for the KNN predictions
PlotConfusionMatrix(y_test, predicted_y, "KNN")

In [None]:
# ROC curve for the KNN model
PlotRoc(knn_model, "KNN")

<a id = "102" ></a>
#### <b>SVC</b>
<a id = "svc" ></a>

In [None]:
# probability=True makes predict_proba available, which the ROC plots need
svc_model = SVC(random_state=1, probability=True).fit(X_train, y_train)
predict_y = svc_model.predict(X_test)
accuracy_svc = svc_model.score(X_test, y_test)
print("SVM accuracy is :", accuracy_svc)

In [None]:
# Per-class precision/recall/F1 report for the SVC predictions
print(classification_report(y_test, predict_y))

In [None]:
# Confusion matrix for the SVC predictions
PlotConfusionMatrix(y_test, predict_y, "SVC")

In [None]:
# ROC curve for the SVC model (the title previously read "KNN")
PlotRoc(svc_model, "SVC")

<a id = "103" ></a>
#### <b> Random Forest</b>
<a id = "rf" ></a>

In [None]:
# max_features="auto" was deprecated and then removed from scikit-learn;
# "sqrt" is the value it stood for on classifiers, so the model is unchanged.
model_rf = RandomForestClassifier(n_estimators=500, oob_score=True, n_jobs=-1,
                                  random_state=50, max_features="sqrt",
                                  max_leaf_nodes=30)
model_rf.fit(X_train, y_train)

# Make predictions
prediction_test = model_rf.predict(X_test)
print (metrics.accuracy_score(y_test, prediction_test))

In [None]:
# Per-class precision/recall/F1 report for the random-forest predictions
print(classification_report(y_test, prediction_test))

In [None]:
# Confusion matrix for the random-forest predictions
PlotConfusionMatrix(y_test, prediction_test, "RandomForest")

In [None]:
# ROC curve for the random-forest model
PlotRoc(model_rf, "RandomForest")

<a id = "104" ></a>
#### <b>Logistic Regression</b>
<a id = "lr" ></a>

In [None]:
# Logistic regression with default settings, scored on the test split
lr_model = LogisticRegression().fit(X_train, y_train)
accuracy_lr = lr_model.score(X_test, y_test)
print("Logistic Regression accuracy is :", accuracy_lr)

In [None]:
# Predict on the test split and print the per-class report
lr_pred = lr_model.predict(X_test)
report = classification_report(y_test, lr_pred)
print(report)

In [None]:
# Confusion matrix for the logistic-regression predictions
PlotConfusionMatrix(y_test, lr_pred, "Logistic Regression")

In [None]:
# ROC curve for the logistic-regression model
PlotRoc(lr_model, "Logistic Regression")

<a id = "105" ></a>
#### **Decision Tree Classifier**
<a id = "dtc" ></a>

In [None]:
# Decision tree with default settings, scored on the test split
dt_model = DecisionTreeClassifier().fit(X_train, y_train)
predictdt_y = dt_model.predict(X_test)
accuracy_dt = dt_model.score(X_test, y_test)
print("Decision Tree accuracy is :", accuracy_dt)

Decision tree gives very low score.

In [None]:
# Per-class precision/recall/F1 report for the decision-tree predictions
print(classification_report(y_test, predictdt_y))

In [None]:
# Use the decision tree's own predictions (predictdt_y); predicted_y holds
# the KNN predictions and would plot the wrong model's confusion matrix.
PlotConfusionMatrix(y_test, predictdt_y, "Decision Tree")

In [None]:
# ROC curve for the decision tree (the title previously read "Logistic Regression")
PlotRoc(dt_model, "Decision Tree")

<a id = "106" ></a>
#### **AdaBoost Classifier**
<a id = "ada" ></a>

In [None]:
# AdaBoost with default settings; the final expression shows the accuracy
# as the cell output
a_model = AdaBoostClassifier().fit(X_train, y_train)
a_preds = a_model.predict(X_test)
print("AdaBoost Classifier accuracy")
metrics.accuracy_score(y_test, a_preds)

In [None]:
# Per-class precision/recall/F1 report for the AdaBoost predictions
print(classification_report(y_test, a_preds))

In [None]:
# Confusion matrix for the AdaBoost predictions
PlotConfusionMatrix(y_test, a_preds, "AdaBoost Classifier")

In [None]:
# ROC curve for the AdaBoost model
PlotRoc(a_model, "AdaBoost Classifier")

<a id = "107" ></a>
#### **Gradient Boosting Classifier**
<a id = "gb" ></a>

In [None]:
# Gradient boosting with default settings, scored on the test split
gb = GradientBoostingClassifier().fit(X_train, y_train)
gb_pred = gb.predict(X_test)
print("Gradient Boosting Classifier", accuracy_score(y_test, gb_pred))

In [None]:
# Per-class precision/recall/F1 report for the gradient-boosting predictions
print(classification_report(y_test, gb_pred))

In [None]:
# Confusion matrix for the gradient-boosting predictions
PlotConfusionMatrix(y_test, gb_pred, "Gradient Boosting Classifier")

<a id = "108" ></a>
#### **Voting Classifier**
<a id = "vc" ></a>
Let's now build the final model from a vote across the classifiers and check its score.

In [None]:
from sklearn.ensemble import VotingClassifier

# Soft-voting ensemble of three base learners with default settings
clf1 = GradientBoostingClassifier()
clf2 = LogisticRegression()
clf3 = AdaBoostClassifier()
base_learners = [('gbc', clf1), ('lr', clf2), ('abc', clf3)]

eclf1 = VotingClassifier(estimators=base_learners, voting='soft').fit(X_train, y_train)
predictions = eclf1.predict(X_test)
print("Final Accuracy Score ")
print(accuracy_score(y_test, predictions))

In [None]:
# Per-class precision/recall/F1 report for the voting-classifier predictions
print(classification_report(y_test, predictions))

In [None]:
# Plot the voting classifier's own predictions; gb_pred holds the
# gradient-boosting predictions and would show the wrong model.
PlotConfusionMatrix(y_test, predictions, "VOTING CLASSIFIER")

In [None]:
# ROC curve for the voting classifier (the title previously read "AdaBoost Classifier")
PlotRoc(eclf1, "Voting Classifier")

From the confusion matrix we can see that: 
There are total 1400+149=1549 actual non-churn values and the algorithm predicts 1400 of them as non churn and 149 of them as churn.
While there are 237+324=561 actual churn values and the algorithm predicts 237 of them as non churn values and 324 of them as churn values.

Customer churn is definitely bad for a firm's profitability. Various strategies can be implemented to reduce customer churn. The best way to avoid customer churn is for a company to truly know its customers. This includes identifying customers who are at risk of churning and working to improve their satisfaction. Improving customer service is, of course, at the top of the priority list for tackling this issue. Building customer loyalty through relevant experiences and specialized service is another strategy to reduce customer churn. Some firms survey customers who have already churned to understand their reasons for leaving, in order to adopt a proactive approach to avoiding future customer churn.

### Hyperparameter Optimization

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Build a fresh ensemble under its own name. Rebinding `eclf1` here would
# replace the already fitted voting classifier with an unfitted one
# (RandomizedSearchCV only fits clones of its estimator), which breaks every
# later cell that calls eclf1.predict / eclf1.predict_proba.
search_estimator = VotingClassifier(
    estimators=[
        ('gbc', GradientBoostingClassifier()),
        ('lr', LogisticRegression(max_iter=1000)),
        ('abc', AdaBoostClassifier()),
    ],
    voting='soft',
)

# Candidate values, addressed as <estimator name>__<parameter>
param_grid = {
    'gbc__n_estimators': [50, 100],
    'gbc__learning_rate': [0.01, 0.1, 0.5],
    'gbc__max_depth': [3, 5, 7],
    'lr__C': [0.1, 1, 10],
    'lr__solver': ['lbfgs', 'liblinear'],
    'abc__n_estimators': [50, 100],
    'abc__learning_rate': [0.5, 1.0, 1.5],
}

# random_state fixes which parameter combinations are sampled, so the search
# result is reproducible between runs
grid = RandomizedSearchCV(estimator=search_estimator, param_distributions=param_grid,
                          cv=5, scoring='accuracy', random_state=42)

grid.fit(X_train, y_train)

print("Best parameters found: ", grid.best_params_)
print("Best cross-validation score: ", grid.best_score_)

best_model = grid.best_estimator_
predictions = best_model.predict(X_test)
print("Final Accuracy Score: ", accuracy_score(y_test, predictions))

In [None]:
# Fit the selected model on the training split again and score it on the test split
best_model_pred = best_model.fit(X_train, y_train).predict(X_test)
print("Fine-tuned model accuracy:", accuracy_score(y_test, best_model_pred))

In [None]:
# Per-class precision/recall/F1 report for the fine-tuned model
print(classification_report(y_test, best_model_pred))

In [None]:
# Confusion matrix for the fine-tuned model's predictions
PlotConfusionMatrix(y_test, best_model_pred, "Fine-tuned model")

In [None]:
# ROC curve for the fine-tuned model
PlotRoc(best_model, "Fine-tuned model")

## ML EXPLAINABILITY WITH SHAP

In [None]:
import shap

# Load SHAP's JavaScript so interactive plots can render in the notebook
shap.initjs()

In [None]:
# Explain the output of eclf1.predict, passing the training features to
# shap.Explainer as its second argument
explainer = shap.Explainer(eclf1.predict, X_train)
# Compute SHAP values for the first 100 test rows only
shap_values = explainer(X_test.iloc[:100, :])

In [None]:
# Display the computed SHAP values object
shap_values

In [None]:
# SHAP values for the first explained test row
shap_values[0]

In [None]:
# Waterfall plot of the per-feature contributions for the first explained row
shap.plots.waterfall(shap_values[0])

* From the above plot, the base value (E[f(x)]) = 0.23, which is the average prediction made by the model across all instances in the dataset. It is the expected value before any specific instance's features are taken into account. In this case, I am taking the first datapoint in the dataset as the reference. The prediction is f(x) = 0. The features that pushed the prediction down are Contract (-0.21), tenure (-0.09), TechSupport (-0.04), DeviceProtection (-0.02) and OnlineBackup (-0.01), while the features that pushed it up are MonthlyCharges (+0.07), TotalCharges (+0.03), InternetService (+0.01), PaperlessBilling (+0.01) and the other features combined (+0.03).

f(x) = 0.23 - 0.21 - 0.09 + 0.07 -0.04 + 0.03 - 0.02 + 0.01 - 0.01 + 0.01 + 0.03 = 0

In [None]:
# Beeswarm plot of the SHAP values across all explained rows
shap.plots.beeswarm(shap_values, plot_size=(15, 7))

* From the plot above, longer tenure increases the prediction, while shorter tenure decreases it; high monthly charges positively impact the prediction, while lower charges have a negative effect; and features like Contract, TechSupport, and OnlineSecurity show mixed effects depending on their presence.

In [None]:
# Force plot for the first explained test row
shap.force_plot(shap_values.base_values[0], shap_values[0].values, X_test.iloc[0])

* The model predicts a relatively low outcome (0.02), and the strongest contributors appear to be "tenure" and "TotalCharges."

### Top model Comparison

In [None]:
from sklearn.metrics import roc_curve, auc

# (legend label, fitted model, line colour) for every classifier compared.
# Descriptive labels replace the anonymous "Model N" legend entries.
roc_models = [
    ('Voting Classifier', eclf1, 'blue'),
    ('AdaBoost', a_model, 'red'),
    ('KNN', knn_model, 'green'),
    ('SVC', svc_model, 'orange'),
    ('Fine-tuned Voting Classifier', best_model, 'k'),
]

# Plot ROC curves
plt.figure(figsize=(15, 6))
for model_label, fitted_model, line_color in roc_models:
    # Column 1 of predict_proba is used as the score for the ROC curve
    y_scores = fitted_model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_scores)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, color=line_color, lw=2,
             label='{} (AUC = {:.2f})'.format(model_label, roc_auc))

plt.plot([0, 1], [0, 1], color='gray', linestyle='--')  # Diagonal line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.grid()
plt.show()