In [12]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from scipy.stats import randint

In [13]:
# Load the dataset
telecom_cust = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')
telecom_cust

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


In [14]:
#checking for null values 
telecom_cust.isnull().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [15]:
# Displaying descriptive statistics for all columns in the DataFrame
telecom_cust.describe(include='all')

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
count,7043,7043,7043.0,7043,7043,7043.0,7043,7043,7043,7043,...,7043,7043,7043,7043,7043,7043,7043,7043.0,7043.0,7043
unique,7043,2,,2,2,,2,3,3,3,...,3,3,3,3,3,2,4,,6531.0,2
top,7590-VHVEG,Male,,No,No,,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,,,No
freq,1,3555,,3641,4933,,6361,3390,3096,3498,...,3095,3473,2810,2785,3875,4171,2365,,11.0,5174
mean,,,0.162147,,,32.371149,,,,,...,,,,,,,,64.761692,,
std,,,0.368612,,,24.559481,,,,,...,,,,,,,,30.090047,,
min,,,0.0,,,0.0,,,,,...,,,,,,,,18.25,,
25%,,,0.0,,,9.0,,,,,...,,,,,,,,35.5,,
50%,,,0.0,,,29.0,,,,,...,,,,,,,,70.35,,
75%,,,0.0,,,55.0,,,,,...,,,,,,,,89.85,,


In [16]:
# Displaying information about the DataFrame
telecom_cust.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [17]:
#convert TotalCharges to numeric
telecom_cust['TotalCharges'] = pd.to_numeric(telecom_cust['TotalCharges'], errors='coerce')

In [3]:
# Data preprocessing
def preprocess_data(df):
    # Handle missing values
    df.dropna(inplace=True)
    # Encode categorical variables
    df_encoded = pd.get_dummies(df, drop_first=True, dtype='int')
    # Split features and target variable
    X = df_encoded.drop(columns=['Churn_Yes'])
    y = df_encoded['Churn_Yes']
    # Split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    # Scale features
    scaler = MinMaxScaler()
    scaler.fit(X_train)
    scaled_X_train = scaler.transform(X_train)
    scaled_X_test = scaler.transform(X_test)
    return scaled_X_train, scaled_X_test, y_train, y_test


In [4]:
# Model training and evaluation
def train_and_evaluate_model(model, X_train, X_test, y_train, y_test):
    # Fit the model
    model.fit(X_train, y_train)
    # Predict
    y_pred = model.predict(X_test)
    # Evaluate
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    auc_roc = roc_auc_score(y_test, y_pred)
    return accuracy, report, auc_roc

In [5]:
# Decision Tree hyperparameter tuning
def tune_decision_tree(X_train, y_train):
    param_dist = {
        'criterion': ['gini', 'entropy'],
        'max_depth': randint(1, 20),
        'min_samples_split': randint(2, 20),
        'min_samples_leaf': randint(1, 20),
        'max_features': ['auto', 'sqrt', 'log2', None]
    }
    dt_classifier = DecisionTreeClassifier(random_state=42)
    random_search = RandomizedSearchCV(
        dt_classifier,
        param_distributions=param_dist,
        n_iter=100,
        cv=5,
        scoring='accuracy',
        random_state=42
    )
    random_search.fit(X_train, y_train)
    best_params = random_search.best_params_
    best_model = random_search.best_estimator_
    return best_params, best_model

In [6]:
# Main script
def main():
    # Data preprocessing
    scaled_X_train, scaled_X_test, y_train, y_test = preprocess_data(telecom_cust)
    
    # Model initialization
    knn_model = KNeighborsClassifier(n_neighbors=7)
    logreg_model = LogisticRegression(max_iter=10000, C=0.1, random_state=42)
    
    # Model training and evaluation
    models = {'kNN': knn_model, 'Logistic Regression': logreg_model}
    results = {}
    for name, model in models.items():
        accuracy, report, auc_roc = train_and_evaluate_model(model, scaled_X_train, scaled_X_test, y_train, y_test)
        results[name] = {'Accuracy': accuracy, 'Classification Report': report, 'AUC-ROC': auc_roc}
    
    # Decision Tree hyperparameter tuning
    best_params, best_model = tune_decision_tree(scaled_X_train, y_train)
    dt_accuracy, dt_report, dt_auc_roc = train_and_evaluate_model(best_model, scaled_X_train, scaled_X_test, y_train, y_test)
    results['Decision Tree'] = {'Best Parameters': best_params, 'Accuracy': dt_accuracy, 'Classification Report': dt_report, 'AUC-ROC': dt_auc_roc}
    
    # Print results
    for name, metrics in results.items():
        print(f"=== {name} ===")
        print(f"Best Parameters: {metrics.get('Best Parameters', 'N/A')}")
        print(f"Accuracy: {metrics['Accuracy']:.4f}")
        print("Classification Report:")
        print(metrics['Classification Report'])
        print(f"AUC-ROC: {metrics['AUC-ROC']:.4f}")
        print("=" * 50)

if __name__ == "__main__":
    main()

95 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
95 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\a3dullahi\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\a3dullahi\anaconda3\lib\site-packages\sklearn\base.py", line 1344, in wrapper
    estimator._validate_params()
  File "C:\Users\a3dullahi\anaconda3\lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\a3dullahi\anaconda3\lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise

=== kNN ===
Best Parameters: N/A
Accuracy: 0.7695
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.86      0.84      1539
           1       0.58      0.53      0.56       574

    accuracy                           0.77      2113
   macro avg       0.71      0.70      0.70      2113
weighted avg       0.76      0.77      0.77      2113

AUC-ROC: 0.6959
=== Logistic Regression ===
Best Parameters: N/A
Accuracy: 0.8093
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.91      0.87      1539
           1       0.69      0.54      0.61       574

    accuracy                           0.81      2113
   macro avg       0.77      0.73      0.74      2113
weighted avg       0.80      0.81      0.80      2113

AUC-ROC: 0.7260
=== Decision Tree ===
Best Parameters: {'criterion': 'entropy', 'max_depth': 4, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 3}
Acc