In [188]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns


from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

In [164]:
df = pd.read_csv("Data/telco.csv")
df

Unnamed: 0,Customer ID,Gender,Age,Under 30,Senior Citizen,Married,Dependents,Number of Dependents,Country,State,...,Total Extra Data Charges,Total Long Distance Charges,Total Revenue,Satisfaction Score,Customer Status,Churn Label,Churn Score,CLTV,Churn Category,Churn Reason
0,8779-QRDMV,Male,78,No,Yes,No,No,0,United States,California,...,20,0.00,59.65,3,Churned,Yes,91,5433,Competitor,Competitor offered more data
1,7495-OOKFY,Female,74,No,Yes,Yes,Yes,1,United States,California,...,0,390.80,1024.10,3,Churned,Yes,69,5302,Competitor,Competitor made better offer
2,1658-BYGOY,Male,71,No,Yes,No,Yes,3,United States,California,...,0,203.94,1910.88,2,Churned,Yes,81,3179,Competitor,Competitor made better offer
3,4598-XLKNJ,Female,78,No,Yes,Yes,Yes,1,United States,California,...,0,494.00,2995.07,2,Churned,Yes,88,5337,Dissatisfaction,Limited range of services
4,4846-WHAFZ,Female,80,No,Yes,Yes,Yes,1,United States,California,...,0,234.21,3102.36,2,Churned,Yes,67,2793,Price,Extra data charges
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,2569-WGERO,Female,30,No,No,No,No,0,United States,California,...,0,1639.44,3039.53,5,Stayed,No,45,5306,,
7039,6840-RESVB,Male,38,No,No,Yes,Yes,2,United States,California,...,0,865.20,2807.47,3,Stayed,No,59,2140,,
7040,2234-XADUH,Female,30,No,No,Yes,Yes,2,United States,California,...,0,2135.52,9453.04,4,Stayed,No,71,5560,,
7041,4801-JZAZL,Female,32,No,No,Yes,Yes,2,United States,California,...,0,0.00,319.21,4,Stayed,No,59,2793,,


In [165]:
df.columns

Index(['Customer ID', 'Gender', 'Age', 'Under 30', 'Senior Citizen', 'Married',
       'Dependents', 'Number of Dependents', 'Country', 'State', 'City',
       'Zip Code', 'Latitude', 'Longitude', 'Population', 'Quarter',
       'Referred a Friend', 'Number of Referrals', 'Tenure in Months', 'Offer',
       'Phone Service', 'Avg Monthly Long Distance Charges', 'Multiple Lines',
       'Internet Service', 'Internet Type', 'Avg Monthly GB Download',
       'Online Security', 'Online Backup', 'Device Protection Plan',
       'Premium Tech Support', 'Streaming TV', 'Streaming Movies',
       'Streaming Music', 'Unlimited Data', 'Contract', 'Paperless Billing',
       'Payment Method', 'Monthly Charge', 'Total Charges', 'Total Refunds',
       'Total Extra Data Charges', 'Total Long Distance Charges',
       'Total Revenue', 'Satisfaction Score', 'Customer Status', 'Churn Label',
       'Churn Score', 'CLTV', 'Churn Category', 'Churn Reason'],
      dtype='object')

# Feature Engineering

In this section, we perform feature engineering to simplify the dataset, reduce redundancy, and create meaningful new features.

---

### 1. Drop ID & Data Leakage Columns
We remove columns that are either identifiers (e.g., `Customer ID`) or contain information that would **leak the target** (e.g., `Churn Score`, `CLTV`, `Total Revenue`).

In [166]:
drop_cols = [
    "Customer ID", "Customer Status", "Churn Score", "CLTV", 
    "Churn Category", "Churn Reason", 
    "Total Charges", "Total Refunds", "Total Extra Data Charges",
    "Total Long Distance Charges", "Total Revenue"
]

df = df.drop(columns=drop_cols, errors="ignore")


### 2. Combine Family-Related Features
- Convert `Married` and `Dependents` into binary flags.  
- Create a new feature **`Family_Size`** = `Married_flag` + `Number of Dependents`.  
- Drop the original redundant columns.

In [167]:
df["Married_flag"] = df["Married"].map({"Yes": 1, "No": 0})
df["Dependents_flag"] = df["Dependents"].map({"Yes": 1, "No": 0})

df["Family_Size"] = 1 + df["Married_flag"] + df["Number of Dependents"].fillna(0)

df = df.drop(columns=["Married", "Dependents", "Number of Dependents"], errors="ignore")
df

Unnamed: 0,Gender,Age,Under 30,Senior Citizen,Country,State,City,Zip Code,Latitude,Longitude,...,Unlimited Data,Contract,Paperless Billing,Payment Method,Monthly Charge,Satisfaction Score,Churn Label,Married_flag,Dependents_flag,Family_Size
0,Male,78,No,Yes,United States,California,Los Angeles,90022,34.023810,-118.156582,...,No,Month-to-Month,Yes,Bank Withdrawal,39.65,3,Yes,0,0,1
1,Female,74,No,Yes,United States,California,Los Angeles,90063,34.044271,-118.185237,...,Yes,Month-to-Month,Yes,Credit Card,80.65,3,Yes,1,1,3
2,Male,71,No,Yes,United States,California,Los Angeles,90065,34.108833,-118.229715,...,Yes,Month-to-Month,Yes,Bank Withdrawal,95.45,2,Yes,0,1,4
3,Female,78,No,Yes,United States,California,Inglewood,90303,33.936291,-118.332639,...,Yes,Month-to-Month,Yes,Bank Withdrawal,98.50,2,Yes,1,1,3
4,Female,80,No,Yes,United States,California,Whittier,90602,33.972119,-118.020188,...,Yes,Month-to-Month,Yes,Bank Withdrawal,76.50,2,Yes,1,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Female,30,No,No,United States,California,Landers,92285,34.341737,-116.539416,...,No,Two Year,Yes,Bank Withdrawal,21.15,5,No,0,0,1
7039,Male,38,No,No,United States,California,Adelanto,92301,34.667815,-117.536183,...,Yes,One Year,Yes,Mailed Check,84.80,3,No,1,1,4
7040,Female,30,No,No,United States,California,Amboy,92304,34.559882,-115.637164,...,Yes,One Year,Yes,Credit Card,103.20,4,No,1,1,4
7041,Female,32,No,No,United States,California,Angelus Oaks,92305,34.167800,-116.864330,...,Yes,Month-to-Month,Yes,Bank Withdrawal,29.60,4,No,1,1,4


### 3. Simplify Location Information
- Keep only `State` for geographic grouping.  
- Create a new categorical feature **`Urban_Rural`** from `Population`:
  - **Rural**: Population ≤ 50k  
  - **Town**: 50k < Population ≤ 200k  
  - **City**: Population > 200k  
- Drop detailed location columns (`Country`, `City`, `Zip Code`, `Latitude`, `Longitude`, `Population`).

In [168]:
df["Urban_Rural"] = pd.cut(
    df["Population"],
    bins=[-1, 50000, 200000, float("inf")],
    labels=["Rural", "Town", "City"]
)

df = df.drop(columns=["Country", "City", "Zip Code", "Latitude", "Longitude", "Population"], errors="ignore")
df

Unnamed: 0,Gender,Age,Under 30,Senior Citizen,State,Quarter,Referred a Friend,Number of Referrals,Tenure in Months,Offer,...,Contract,Paperless Billing,Payment Method,Monthly Charge,Satisfaction Score,Churn Label,Married_flag,Dependents_flag,Family_Size,Urban_Rural
0,Male,78,No,Yes,California,Q3,No,0,1,,...,Month-to-Month,Yes,Bank Withdrawal,39.65,3,Yes,0,0,1,Town
1,Female,74,No,Yes,California,Q3,Yes,1,8,Offer E,...,Month-to-Month,Yes,Credit Card,80.65,3,Yes,1,1,3,Town
2,Male,71,No,Yes,California,Q3,No,0,18,Offer D,...,Month-to-Month,Yes,Bank Withdrawal,95.45,2,Yes,0,1,4,Rural
3,Female,78,No,Yes,California,Q3,Yes,1,25,Offer C,...,Month-to-Month,Yes,Bank Withdrawal,98.50,2,Yes,1,1,3,Rural
4,Female,80,No,Yes,California,Q3,Yes,1,37,Offer C,...,Month-to-Month,Yes,Bank Withdrawal,76.50,2,Yes,1,1,3,Rural
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Female,30,No,No,California,Q3,No,0,72,,...,Two Year,Yes,Bank Withdrawal,21.15,5,No,0,0,1,Rural
7039,Male,38,No,No,California,Q3,Yes,1,24,Offer C,...,One Year,Yes,Mailed Check,84.80,3,No,1,1,4,Rural
7040,Female,30,No,No,California,Q3,Yes,4,72,,...,One Year,Yes,Credit Card,103.20,4,No,1,1,4,Rural
7041,Female,32,No,No,California,Q3,Yes,1,11,,...,Month-to-Month,Yes,Bank Withdrawal,29.60,4,No,1,1,4,Rural


### 4. Group Digital Services
- Binary encode service-related features (`Yes` = 1, `No` = 0).  
- Create aggregate features:
  - **`Security_Services_Sum`** = Online Security + Online Backup + Device Protection Plan + Premium Tech Support  
  - **`Entertainment_Services_Sum`** = Streaming TV + Streaming Movies + Streaming Music + Unlimited Data  
  - **`Digital_Services_Score`** = Sum of all above services  
- Drop the original service columns after grouping.


In [169]:
security_cols = ["Online Security", "Online Backup", "Device Protection Plan", "Premium Tech Support"]
entertainment_cols = ["Streaming TV", "Streaming Movies", "Streaming Music", "Unlimited Data"]


for col in security_cols + entertainment_cols:
    df[col] = df[col].map({"Yes": 1, "No": 0})

df["Security_Services_Sum"] = df[security_cols].sum(axis=1)
df["Entertainment_Services_Sum"] = df[entertainment_cols].sum(axis=1)
df["Digital_Services_Score"] = df["Security_Services_Sum"] + df["Entertainment_Services_Sum"]

df = df.drop(columns=security_cols + entertainment_cols, errors="ignore")
df

Unnamed: 0,Gender,Age,Under 30,Senior Citizen,State,Quarter,Referred a Friend,Number of Referrals,Tenure in Months,Offer,...,Monthly Charge,Satisfaction Score,Churn Label,Married_flag,Dependents_flag,Family_Size,Urban_Rural,Security_Services_Sum,Entertainment_Services_Sum,Digital_Services_Score
0,Male,78,No,Yes,California,Q3,No,0,1,,...,39.65,3,Yes,0,0,1,Town,1,1,2
1,Female,74,No,Yes,California,Q3,Yes,1,8,Offer E,...,80.65,3,Yes,1,1,3,Town,1,1,2
2,Male,71,No,Yes,California,Q3,No,0,18,Offer D,...,95.45,2,Yes,0,1,4,Rural,0,4,4
3,Female,78,No,Yes,California,Q3,Yes,1,25,Offer C,...,98.50,2,Yes,1,1,3,Rural,2,3,5
4,Female,80,No,Yes,California,Q3,Yes,1,37,Offer C,...,76.50,2,Yes,1,1,3,Rural,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Female,30,No,No,California,Q3,No,0,72,,...,21.15,5,No,0,0,1,Rural,0,0,0
7039,Male,38,No,No,California,Q3,Yes,1,24,Offer C,...,84.80,3,No,1,1,4,Rural,3,4,7
7040,Female,30,No,No,California,Q3,Yes,4,72,,...,103.20,4,No,1,1,4,Rural,2,4,6
7041,Female,32,No,No,California,Q3,Yes,1,11,,...,29.60,4,No,1,1,4,Rural,1,1,2



### 5. Simplify Charges
- Drop detailed usage charge columns (`Avg Monthly Long Distance Charges`, `Avg Monthly GB Download`).  
- Keep only **`Monthly Charge`** as the main billing variable.

---

### 6. Cleanup Age-Related Redundancy
- Drop `Under 30` and `Senior Citizen` because **`Age` already captures this information**.



After these steps, we reduce the dataset to a **cleaner and more interpretable feature set** while keeping important customer information for churn prediction.

In [170]:
df = df.drop(columns=["Avg Monthly Long Distance Charges", "Avg Monthly GB Download"], errors="ignore")

df = df.drop(columns=["Under 30", "Senior Citizen"], errors="ignore")

df.head()

Unnamed: 0,Gender,Age,State,Quarter,Referred a Friend,Number of Referrals,Tenure in Months,Offer,Phone Service,Multiple Lines,...,Monthly Charge,Satisfaction Score,Churn Label,Married_flag,Dependents_flag,Family_Size,Urban_Rural,Security_Services_Sum,Entertainment_Services_Sum,Digital_Services_Score
0,Male,78,California,Q3,No,0,1,,No,No,...,39.65,3,Yes,0,0,1,Town,1,1,2
1,Female,74,California,Q3,Yes,1,8,Offer E,Yes,Yes,...,80.65,3,Yes,1,1,3,Town,1,1,2
2,Male,71,California,Q3,No,0,18,Offer D,Yes,Yes,...,95.45,2,Yes,0,1,4,Rural,0,4,4
3,Female,78,California,Q3,Yes,1,25,Offer C,Yes,No,...,98.5,2,Yes,1,1,3,Rural,2,3,5
4,Female,80,California,Q3,Yes,1,37,Offer C,Yes,Yes,...,76.5,2,Yes,1,1,3,Rural,0,1,1


In [171]:
X = df.drop(columns=['Churn Label'])
y = df['Churn Label'].map({'No': 0, 'Yes': 1}) 
X.shape

(7043, 24)

## Feature Selection and Preprocessing Pipeline

We start by separating the features into **numerical** and **categorical** types. This allows us to apply the appropriate preprocessing steps for each type of feature.



In [172]:
num_features = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
cat_features = X.select_dtypes(include=["object", "category"]).columns.tolist()


numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, cat_features),
         ("StandardScaler", numeric_transformer, num_features),        
    ]
)

In [173]:
preprocessor

0,1,2
,transformers,"[('OneHotEncoder', ...), ('StandardScaler', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [174]:
X = preprocessor.fit_transform(X)

In [175]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train.shape, X_test.shape

((4930, 43), (2113, 43))

### Model Evaluation Function

We define a function to **evaluate the performance of a classification model**. The function takes two inputs:

- `true`: The true labels from the dataset  
- `predicted`: The labels predicted by the model  

The function computes and returns three important metrics:

1. **Confusion Matrix**  
   - Shows the counts of **true positives, true negatives, false positives, and false negatives**.  
   - Helps us understand where the model is making mistakes.

2. **Accuracy Score**  
   - Measures the **overall proportion of correct predictions**.  
   - Gives a quick sense of how well the model is performing.

3. **Classification Report**  
   - Provides detailed metrics for each class, including:  
     - **Precision**: How many predicted positives are actually correct  
     - **Recall**: How many actual positives were correctly predicted  
     - **F1-score**: Harmonic mean of precision and recall  
     - **Support**: Number of samples in each class  

This function is reusable for both **training and testing datasets**, allowing consistent evaluation across different models and datasets.


In [191]:
def evaluate_model(true, predicted):
    confusion_mat = confusion_matrix(true, predicted)
    accuracy_sco = accuracy_score(true, predicted)
    classification_rep = classification_report(true, predicted)


    return confusion_mat, accuracy_sco, classification_rep
    
    

### Model Training and Evaluation

We define a dictionary of machine learning models to train and evaluate. For each model:

1. **Fit the model on the training data**  
   - The model learns patterns from the features (`X_train`) and labels (`y_train`).

2. **Make predictions**  
   - Predict labels for both the **training set** and the **test set**.

3. **Evaluate model performance**  
   - Use a custom evaluation function to compute:
     - **Confusion matrix**: Shows correct and incorrect predictions for each class  
     - **Accuracy score**: Overall proportion of correct predictions  
     - **Classification report**: Precision, recall, F1-score, and support for each class

4. **Store results**  
   - Save **training and testing accuracies** for each model for comparison.

5. **Print summary**  
   - Display model name and key metrics for quick comparison.

Finally, we **create a DataFrame** to summarize and compare the performance of all models across training and test sets.


In [186]:
models = {
    "LogisticRegression": LogisticRegression(class_weight="balanced", random_state=42),
    "RandomForestClassifier": RandomForestClassifier(class_weight="balanced", random_state=42)
}

results = []
model_list = []

for name, model in models.items():
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    
    Confusion_matix, train_accuracy, Class_report = evaluate_model(y_train, y_train_pred)
    Confusion_matix, test_accuracy, Class_report = evaluate_model(y_test, y_test_pred)

    
    results.append([name, train_accuracy, test_accuracy])
    model_list.append(name) 
    
    print(name) 
    print('Model performance for Training set') 
    print(model_train_confusion) 
    print(model_train_report) 
    print('----------------------------------'*2) 
    
    print('Model performance for Test set') 
    print(model_test_confusion) 
    print(model_test_report) 
    print('='*35)
    print('\n')


results_df = pd.DataFrame(results, columns=['Model Name', 'Train Accuracy', 'Test Accuracy'])

print(results_df)

LogisticRegression
Model performance for Training set
[[3650    0]
 [   0 1280]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3650
           1       1.00      1.00      1.00      1280

    accuracy                           1.00      4930
   macro avg       1.00      1.00      1.00      4930
weighted avg       1.00      1.00      1.00      4930

--------------------------------------------------------------------
Model performance for Test set
[[1503   21]
 [  75  514]]
              precision    recall  f1-score   support

           0       0.95      0.99      0.97      1524
           1       0.96      0.87      0.91       589

    accuracy                           0.95      2113
   macro avg       0.96      0.93      0.94      2113
weighted avg       0.95      0.95      0.95      2113



RandomForestClassifier
Model performance for Training set
[[3650    0]
 [   0 1280]]
              precision    recall  f1-score   suppo

---
## Cross-Validation of Models

We use **cross-validation** to evaluate the stability and generalization of our models.  

1. **Logistic Regression**
   - Perform 5-fold cross-validation on the entire dataset (`X`, `y`)  
   - `scores_log` stores the accuracy for each fold  
   - `scores_log.mean()` gives the **average accuracy** across folds  

2. **Random Forest Classifier**
   - Similarly, perform 5-fold cross-validation  
   - `scores_ran` stores the accuracy for each fold  
   - `scores_ran.mean()` gives the **average accuracy** across folds  

3. **Output**
   - Prints the individual fold scores and the mean accuracy for both models, allowing comparison of their overall performance.


In [178]:
scores_log = cross_val_score(LogisticRegression(), X, y, cv=5)
scores_ran = cross_val_score(RandomForestClassifier(), X, y, cv=5)
print(scores_log, scores_log.mean())
print(scores_ran, scores_ran.mean())

[0.9602555  0.95031938 0.9602555  0.95454545 0.95170455] 0.9554160752306601
[0.78424414 0.92902768 0.92334989 0.9375     0.93252841] 0.9013300253242145
[0.93612491 0.95386799 0.95244855 0.96022727 0.95241477] 0.9510166986579778


In [189]:
param_grid = [
    {'penalty': ['l1'], 'C': [0.01, 0.1, 1, 10], 'solver': ['liblinear']},
    {'penalty': ['l2'], 'C': [0.01, 0.1, 1, 10], 'solver': ['lbfgs']}
]

grid = GridSearchCV(LogisticRegression(), param_grid, cv=5)
grid.fit(X_train, y_train)

0,1,2
,estimator,LogisticRegression()
,param_grid,"[{'C': [0.01, 0.1, ...], 'penalty': ['l1'], 'solver': ['liblinear']}, {'C': [0.01, 0.1, ...], 'penalty': ['l2'], 'solver': ['lbfgs']}]"
,scoring,
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,penalty,'l1'
,dual,False
,tol,0.0001
,C,0.1
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'liblinear'
,max_iter,100


### Hyperparameter Tuning

- **Logistic Regression:** GridSearchCV used for a small, discrete hyperparameter grid (`penalty`, `C`, `solver`).  
- **Random Forest:** RandomizedSearchCV used to efficiently sample from a large hyperparameter space (`n_estimators`, `max_depth`).  
- Cross-validation (`cv=5`) ensures robust evaluation and selects the best parameters for each model.


In [190]:
param_dist = {'n_estimators': randint(50, 200), 'max_depth': randint(5, 50)}
rand_search = RandomizedSearchCV(RandomForestClassifier(), param_distributions=param_dist, n_iter=50, cv=5)
rand_search.fit(X_train, y_train)


0,1,2
,estimator,RandomForestClassifier()
,param_distributions,"{'max_depth': <scipy.stats....0027E40D3CE90>, 'n_estimators': <scipy.stats....0027E40D3E540>}"
,n_iter,50
,scoring,
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,random_state,

0,1,2
,n_estimators,60
,criterion,'gini'
,max_depth,35
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [None]:
|