In [1]:
import pandas as pd
import numpy as np

# Load the dataset
df = pd.read_csv('churn.csv')

# Quick check
print("Dataset Shape:", df.shape)
display(df.head())

Dataset Shape: (7043, 21)


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [2]:
df.info

<bound method DataFrame.info of       customerID  gender  SeniorCitizen Partner Dependents  tenure  \
0     7590-VHVEG  Female              0     Yes         No       1   
1     5575-GNVDE    Male              0      No         No      34   
2     3668-QPYBK    Male              0      No         No       2   
3     7795-CFOCW    Male              0      No         No      45   
4     9237-HQITU  Female              0      No         No       2   
...          ...     ...            ...     ...        ...     ...   
7038  6840-RESVB    Male              0     Yes        Yes      24   
7039  2234-XADUH  Female              0     Yes        Yes      72   
7040  4801-JZAZL  Female              0     Yes        Yes      11   
7041  8361-LTMKD    Male              1     Yes         No       4   
7042  3186-AJIEK    Male              0      No         No      66   

     PhoneService     MultipleLines InternetService OnlineSecurity  ...  \
0              No  No phone service             DSL 

In [3]:
# Force TotalCharges to be numeric, turning errors into NaN
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Check for missing values (NaN)
print(df.isnull().sum())

customerID           0
gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64


In [4]:
#Fill the missing values with the median
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].median())
print(df.isnull().sum())


customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64


In [5]:
# List of columns that have only Yes/No answers
binary_cols = ['Partner', 'Dependents', 'PhoneService', 'PaperlessBilling', 'Churn']

# Convert them to 0 and 1
for col in binary_cols:
    df[col] = df[col].apply(lambda x: 1 if x == 'Yes' else 0)

# Check the change
print(df[['Churn', 'Partner']].head())

   Churn  Partner
0      0        1
1      0        0
2      1        0
3      0        0
4      1        0


In [6]:
# 2. Drop the useless customerID column
df.drop('customerID', axis=1, inplace=True)

# Convert all remaining categorical variables into dummy/indicator variables

df = pd.get_dummies(df, dtype=int)

# Check the final shape of the data
print(df.shape)

(7043, 42)


In [7]:
# $X$ (Features): Needs to be the entire dataframe minus the 'Churn' column.$y$ (Target): Needs to be only the 'Churn' column.
X = df.drop('Churn', axis=1)
y = df['Churn']

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# Logistic regression Model
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(max_iter=3000)
model.fit(X_train, y_train)

In [10]:
# We need to use the model to guess the answers for the test data (X_test).
y_pred = model.predict(X_test)
#ow to import accuracy_score from sklearn.metrics and use it to compare y_test and y_pred to get the accuracy of the model.
from sklearn.metrics import accuracy_score  
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8211497515968772


In [11]:
#calculate AUC score for Logistic Regression
from sklearn.metrics import roc_auc_score       
auc = roc_auc_score(y_test, y_pred)
print("AUC Score:", auc)
#calculate precision score for Logistic Regression
from sklearn.metrics import precision_score 
precision = precision_score(y_test, y_pred)
print("Precision Score:", precision)
#calculate recall score for Logistic Regression
from sklearn.metrics import recall_score
recall = recall_score(y_test, y_pred)
print("Recall Score:", recall)
#calculate F1 score for Logistic Regression
from sklearn.metrics import f1_score
f1 = f1_score(y_test, y_pred)
print("F1 Score:", f1)
#Calculate Matthews Correlation Coefficient (MCC Score) for Logistic Regression
from sklearn.metrics import matthews_corrcoef
mcc = matthews_corrcoef(y_test, y_pred)
print("MCC Score:", mcc)

AUC Score: 0.7496998147132194
Precision Score: 0.6861538461538461
Recall Score: 0.5978552278820375
F1 Score: 0.6389684813753582
MCC Score: 0.5230295036857151


In [12]:
#saved model files for logistic regression in model directory
import joblib
joblib.dump(model, 'model/logistic_regression_model.pkl')   

['model/logistic_regression_model.pkl']

In [13]:
#Implement decision tree model and save the model file in model directory
from sklearn.tree import DecisionTreeClassifier
dt_model = DecisionTreeClassifier(criterion='gini',#or 'entropy'
                                   max_depth=5,# To prevent overfitting
                                   random_state=42
                                   )
dt_model.fit(X_train, y_train)

In [14]:
#make predictions with decision tree model
y_dt_pred = dt_model.predict(X_test)
#Optional probability predictions       
y_dt_pred_proba = dt_model.predict_proba(X_test)[:, 1]  # Probability of the positive class

In [15]:
#Accuracy for Decision Tree
dt_accuracy = accuracy_score(y_test, y_dt_pred)
print("Decision Tree Accuracy:", dt_accuracy)

Decision Tree Accuracy: 0.7998580553584103


In [16]:
#confusion matrix for Decision Tree
from sklearn.metrics import confusion_matrix
dt_cm = confusion_matrix(y_test, y_dt_pred)
print("Decision Tree Confusion Matrix:")
print(dt_cm)

Decision Tree Confusion Matrix:
[[890 146]
 [136 237]]


In [17]:
#calculate accuracy

dt_accuracy = accuracy_score(y_test, y_dt_pred)
print("Decision Tree Accuracy:", dt_accuracy)

#calculate and print the accuracy of the model on the test data.
from sklearn.metrics import roc_auc_score
dt_roc_auc = roc_auc_score(y_test, y_dt_pred_proba)
print("Decision Tree ROC AUC Score:", dt_roc_auc)

Decision Tree Accuracy: 0.7998580553584103
Decision Tree ROC AUC Score: 0.839999948243916


In [39]:
#caclulate precision, recall and f1 score, Matthews Correlation Coefficient (MCC Score) for decision tree
dt_precision = precision_score(y_test, y_dt_pred)
dt_recall = recall_score(y_test, y_dt_pred)
dt_f1 = f1_score(y_test, y_dt_pred)
dt_mcc = matthews_corrcoef(y_test, y_dt_pred)
print("Decision Tree Precision:", dt_precision)
print("Decision Tree Recall:", dt_recall)
print("Decision Tree F1 Score:", dt_f1)
print("Decision Tree MCC Score:", dt_mcc)

Decision Tree Precision: 0.618798955613577
Decision Tree Recall: 0.6353887399463807
Decision Tree F1 Score: 0.626984126984127
Decision Tree MCC Score: 0.49033651311209236


In [42]:
##saved model files for decision tree
joblib.dump(dt_model, 'model/decision_tree_model.pkl')

['model/decision_tree_model.pkl']

In [20]:
#K-Nearest Neighbor Classifier Model
from sklearn.neighbors import KNeighborsClassifier
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)
y_knn_pred = knn_model.predict(X_test)
y_knn_pred_proba = knn_model.predict_proba(X_test)[:, 1]

In [21]:
#Evalaue K-Nearest Neighbor Classifier Model
knn_accuracy = accuracy_score(y_test, y_knn_pred)
print("K-Nearest Neighbor Accuracy:", knn_accuracy)

K-Nearest Neighbor Accuracy: 0.7764371894960965


In [22]:
#evaluate K-Nearest Neighbor Classifier Model AUC
knn_auc = roc_auc_score(y_test, y_knn_pred_proba)
print("K-Nearest Neighbor ROC AUC Score:", knn_auc)

K-Nearest Neighbor ROC AUC Score: 0.775258262858799


In [40]:
#evaluate K-Nearest Neighbor Classifier Model precision, recall and f1 score,Matthews Correlation Coefficient (MCC Score)
knn_precision = precision_score(y_test, y_knn_pred)
knn_recall = recall_score(y_test, y_knn_pred)
knn_f1 = f1_score(y_test, y_knn_pred)
knn_mcc = matthews_corrcoef(y_test, y_knn_pred)
print("K-Nearest Neighbor Precision:", knn_precision)
print("K-Nearest Neighbor Recall:", knn_recall)
print("K-Nearest Neighbor F1 Score:", knn_f1)
print("K-Nearest Neighbor MCC Score:", knn_mcc)

K-Nearest Neighbor Precision: 0.5941558441558441
K-Nearest Neighbor Recall: 0.4906166219839142
K-Nearest Neighbor F1 Score: 0.5374449339207048
K-Nearest Neighbor MCC Score: 0.394930196833009


In [41]:
#pickle the KNN model
joblib.dump(knn_model, 'model/knn_model.pkl')

['model/knn_model.pkl']

In [25]:
#Naive Bayes Classifier - Gaussian or Multinomial model
from sklearn.naive_bayes import GaussianNB
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
y_nb_pred = nb_model.predict(X_test)
y_nb_pred_proba = nb_model.predict_proba(X_test)[:, 1]

In [26]:
#evaluate Naive Bayes Classifier - Gaussian or Multinomial model
nb_accuracy = accuracy_score(y_test, y_nb_pred)
print("Naive Bayes Accuracy:", nb_accuracy)

Naive Bayes Accuracy: 0.6976579134137686


In [27]:
#evaluate AUC for Naive Bayes Classifier
nb_auc = roc_auc_score(y_test, y_nb_pred_proba)
print("Naive Bayes ROC AUC Score:", nb_auc)

Naive Bayes ROC AUC Score: 0.8375803513202977


In [43]:
#Evaluation metrics Naive Bayes Classifier precision, recall and f1 score,Matthews Correlation Coefficient (MCC Score)
nb_precision = precision_score(y_test, y_nb_pred)
nb_recall = recall_score(y_test, y_nb_pred)
nb_f1 = f1_score(y_test, y_nb_pred)
nb_mcc = matthews_corrcoef(y_test, y_nb_pred)
print("Naive Bayes Precision:", nb_precision)
print("Naive Bayes Recall:", nb_recall)
print("Naive Bayes F1 Score:", nb_f1)
print("Naive Bayes MCC Score:", nb_mcc)

Naive Bayes Precision: 0.4623044096728307
Naive Bayes Recall: 0.871313672922252
Naive Bayes F1 Score: 0.604089219330855
Naive Bayes MCC Score: 0.4468788841367073


In [44]:
# pickle the Naive Bayes model
joblib.dump(nb_model, 'model/nb_model.pkl')

['model/nb_model.pkl']

In [30]:
#Ensemble Model - Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_rf_pred = rf_model.predict(X_test)
y_rf_pred_proba = rf_model.predict_proba(X_test)[:, 1]

In [31]:
#Evaluation metrics Ensemble Model - Random Forest Classifier 
rf_accuracy = accuracy_score(y_test, y_rf_pred)
print("Random Forest Accuracy:", rf_accuracy)

Random Forest Accuracy: 0.7913413768630234


In [32]:
#Evaluation metrics AUC for Random Forest Classifier
rf_auc = roc_auc_score(y_test, y_rf_pred_proba)
print("Random Forest ROC AUC Score:", rf_auc)

Random Forest ROC AUC Score: 0.8364054882151397


In [45]:
#Evaluation metrics Ensemble Model - Random Forest Classifier precision, recall and f1 score,Matthews Correlation Coefficient (MCC Score)
rf_precision = precision_score(y_test, y_rf_pred)
rf_recall = recall_score(y_test, y_rf_pred)
rf_f1 = f1_score(y_test, y_rf_pred)
rf_mcc = matthews_corrcoef(y_test, y_rf_pred)
print("Random Forest Precision:", rf_precision)
print("Random Forest Recall:", rf_recall)
print("Random Forest F1 Score:", rf_f1)
print("Random Forest MCC Score:", rf_mcc)

Random Forest Precision: 0.6513409961685823
Random Forest Recall: 0.45576407506702415
Random Forest F1 Score: 0.5362776025236593
Random Forest MCC Score: 0.4178340753917747


In [46]:
#pickle the Random Forest model
joblib.dump(rf_model, 'model/random_forest_model.pkl')

['model/random_forest_model.pkl']

In [35]:
import xgboost as xgb
#Ensemble Model - XGBoost Classifier
xgb_model = xgb.XGBClassifier(n_estimators=100, random_state=42)
xgb_model.fit(X_train, y_train)
y_xgb_pred = xgb_model.predict(X_test)
y_xgb_pred_proba = xgb_model.predict_proba(X_test)[:, 1]

In [36]:
#Evaluation metrics for KNN XGBoost accuracy
xgb_accuracy = accuracy_score(y_test, y_xgb_pred)
print("XGBoost Accuracy:", xgb_accuracy)

XGBoost Accuracy: 0.7927608232789212


In [37]:
#Evalaue KNN XGBoost AUC
xgb_auc = roc_auc_score(y_test, y_xgb_pred_proba)
print("XGBoost AUC:", xgb_auc)

XGBoost AUC: 0.8400167689711925


In [47]:
#Evaluation metrics XGBoost precision, recall and f1 score,Matthews Correlation Coefficient (MCC Score)
xgb_precision = precision_score(y_test, y_xgb_pred)
xgb_recall = recall_score(y_test, y_xgb_pred)
xgb_f1 = f1_score(y_test, y_xgb_pred)
xgb_mcc = matthews_corrcoef(y_test, y_xgb_pred)
print("XGBoost Precision:", xgb_precision)
print("XGBoost Recall:", xgb_recall)
print("XGBoost F1 Score:", xgb_f1)
print("XGBoost MCC Score:", xgb_mcc)

XGBoost Precision: 0.6293929712460063
XGBoost Recall: 0.5281501340482574
XGBoost F1 Score: 0.5743440233236151
XGBoost MCC Score: 0.4417119209570612


In [50]:
#pickle the XGBoost model
joblib.dump(xgb_model, 'model/xgboost_model.pkl')

['model/xgboost_model.pkl']