# Classification
### Predicting whether a customer will churn
Retaining loyal customers and increasing baseline revenue are ideal for a company's growth.
Churn is not.


I first came across this dataset through a YouTuber and later discovered that it is also a popular dataset on Kaggle:
https://www.kaggle.com/datasets/blastchar/telco-customer-churn

In [4]:
# Dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.model_selection import train_test_split


## ETL

In [5]:
# Loading & Cleaning Data
# `churn_data` keeps the raw CSV contents; `churn_clean` is the cleaned copy
# that the rest of the notebook works on.
churn_data = pd.read_csv('data/Churn.csv')

churn_clean = churn_data.drop(columns=['Customer ID'])  # drop irrelevant column
# Single-space cells are turned into NaN so the numeric conversion below does
# not fail on them (presumably these occur in 'Total Charges' -- verify).
churn_clean = churn_clean.replace(' ', np.nan)

# Convert to appropriate data types
churn_clean['Total Charges'] = pd.to_numeric(churn_clean['Total Charges'])
churn_clean['Monthly Charges'] = pd.to_numeric(churn_clean['Monthly Charges'])

# Convert binary categorical columns to 1 and 0
yes_no_cols = ['Partner', 'Dependents', 'Phone Service', 'Paperless Billing', 'Churn']
for col in yes_no_cols:
    churn_clean[col] = churn_clean[col].map({'Yes': 1, 'No': 0})

churn_clean['Gender'] = churn_clean['Gender'].map({'Male': 1, 'Female': 0})

# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
churn_clean.info()
churn_clean.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7044 entries, 0 to 7043
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             7044 non-null   int64  
 1   Senior Citizen     7044 non-null   int64  
 2   Partner            7044 non-null   int64  
 3   Dependents         7044 non-null   int64  
 4   tenure             7044 non-null   int64  
 5   Phone Service      7044 non-null   int64  
 6   Multiple Lines     7044 non-null   object 
 7   Internet Service   7044 non-null   object 
 8   Online Security    7044 non-null   object 
 9   Online Backup      7044 non-null   object 
 10  Device Protection  7044 non-null   object 
 11  Tech Support       7044 non-null   object 
 12  Streaming TV       7044 non-null   object 
 13  Streaming Movies   7044 non-null   object 
 14  Contract           7044 non-null   object 
 15  Paperless Billing  7044 non-null   int64  
 16  Payment Method     7044 

Unnamed: 0,Gender,Senior Citizen,Partner,Dependents,tenure,Phone Service,Multiple Lines,Internet Service,Online Security,Online Backup,Device Protection,Tech Support,Streaming TV,Streaming Movies,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,Churn
0,0,0,1,0,1,0,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,1,Electronic check,29.85,29.85,0
1,0,0,1,0,1,0,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,1,Electronic check,29.85,29.85,0
2,1,0,0,0,34,1,No,DSL,Yes,No,Yes,No,No,No,One year,0,Mailed check,56.95,1889.5,0
3,1,0,0,0,2,1,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,1,Mailed check,53.85,108.15,1
4,1,0,0,0,45,0,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,0,Bank transfer (automatic),42.3,1840.75,0


## Exploratory Data Analysis

In [6]:
# Overview of the cleaned frame: how many columns of each dtype there are,
# and which columns fall into the categorical vs. numerical/binary groups.
section_break = "\n" + "="*40 + "\n"

print("--- Data Type Counts ---")
dtype_counts = churn_clean.dtypes.value_counts()
print(dtype_counts)
print(section_break)

# Object-typed columns are treated as categorical attributes.
categorical_cols = list(churn_clean.select_dtypes(include=['object']).columns)
print(f"--- Categorical Attributes ({len(categorical_cols)}) ---")
print(*categorical_cols, sep='\n')
print(section_break)

# float64/int64 columns cover both continuous values and 0/1-encoded flags.
numerical_cols = list(churn_clean.select_dtypes(include=['float64', 'int64']).columns)
print(f"--- Numerical or Binary Attributes({len(numerical_cols)}) ---")
print(*numerical_cols, sep='\n')

--- Data Type Counts ---
object     10
int64       8
float64     2
Name: count, dtype: int64


--- Categorical Attributes (10) ---
Multiple Lines
Internet Service
Online Security
Online Backup
Device Protection
Tech Support
Streaming TV
Streaming Movies
Contract
Payment Method


--- Numerical or Binary Attributes(10) ---
Gender
Senior Citizen
Partner
Dependents
tenure
Phone Service
Paperless Billing
Monthly Charges
Total Charges
Churn


In [7]:
# TODO: % Distribution of each value in categorical columns
# Works on the raw frame: 'Customer ID' and 'Total Charges' are left out before
# the object-typed columns are picked.
raw_object_cols = (
    churn_data
    .drop(columns=['Customer ID', 'Total Charges'])
    .select_dtypes(include=['object'])
    .columns
)
for col in raw_object_cols:
    print(f"\nValue counts for column: {col}")
    share_pct = churn_data[col].value_counts(normalize=True) * 100
    print(share_pct)


Value counts for column: Gender
Gender
Male      50.468484
Female    49.531516
Name: proportion, dtype: float64

Value counts for column: Partner
Partner
No     51.689381
Yes    48.310619
Name: proportion, dtype: float64

Value counts for column: Dependents
Dependents
No     70.045429
Yes    29.954571
Name: proportion, dtype: float64

Value counts for column: Phone Service
Phone Service
Yes    90.303805
No      9.696195
Name: proportion, dtype: float64

Value counts for column: Multiple Lines
Multiple Lines
No                  48.126065
Yes                 42.177740
No phone service     9.696195
Name: proportion, dtype: float64

Value counts for column: Internet Service
Internet Service
Fiber optic    43.952300
DSL            34.383873
No             21.663827
Name: proportion, dtype: float64

Value counts for column: Online Security
Online Security
No                     49.673481
Yes                    28.662692
No internet service    21.663827
Name: proportion, dtype: float64

Valu

In [None]:
# Bivariate Analysis
# Correlation Matrix
# The correlation is computed on the numeric columns of the cleaned frame.
# The raw `churn_data` still contains string columns, and DataFrame.corr()
# raises on non-numeric data in pandas >= 2.0 (numeric_only defaults to False).
corr_matrix = churn_clean.select_dtypes(include='number').corr()
plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

# Pairplot
# Drawn from the raw frame, coloured by the 'Churn' column.
sns.pairplot(churn_data, hue='Churn')
plt.show()

In [21]:
# TODO: pipeline for data cleaning (extremity elim), feature engineering, modeling
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline # Fucking pipelines are awesome
from sklearn.impute import SimpleImputer # For handling missing values
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer # For handling mixed types of features for each column
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_auc_score, confusion_matrix




# Split features/target, build one preprocessing + classifier pipeline, and
# grid-search over three model families with 5-fold cross-validation.
X = churn_clean.drop('Churn', axis=1)
y = churn_clean['Churn']
# Stratified split keeps the churn ratio the same in the train and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Feature groups are derived from dtypes; the 0/1-encoded flags therefore land
# in the numerical group together with the continuous columns.
numerical_feats = X_train.select_dtypes(include=['float64', 'int64']).columns.tolist()
categorical_feats = X_train.select_dtypes(include=['object']).columns.tolist()


print (f"Numerical Columns: {numerical_feats}")
print (f"Categorical Columns: {categorical_feats}")
print("\n" + "="*40 + "\n")

# Magical Pipelines
# Numeric features: fill NaN with the column mean, then standardise.
numeric_preprocessor = Pipeline(
    steps=[
        ("imputation_mean", SimpleImputer(missing_values=np.nan, strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical features: fill missing values with the constant string "missing",
# then one-hot encode. `fill_value` is only honoured with strategy="constant";
# with "most_frequent" it is silently ignored.
categorical_preprocessor = Pipeline(
    steps=[
        (
            "imputation_constant",
            SimpleImputer(strategy="constant", fill_value="missing"),
        ),
        # Categories unseen during fit are encoded as all zeros instead of raising.
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

pipe = Pipeline(steps=[
    ('preprocessor', ColumnTransformer(
        transformers=[
            ('numerical', numeric_preprocessor, numerical_feats),
            ('categorical', categorical_preprocessor, categorical_feats),
        ],
    )),
    ('classifier', None),  # Placeholder; each param_grid entry supplies the model
])


# Define the parameter grids for different models
# Note: <step>__<parameter_name> is used to specify parameters for the classifier step in the pipeline
# TODO: Select models and hyperparameters to tune
param_grid = [
    {
        'classifier': [LogisticRegression(random_state=42, max_iter=1000)],
        'classifier__C': [0.1, 1.0, 10],
        'classifier__solver': ['liblinear']
    },
    {
        'classifier': [RandomForestClassifier(random_state=42)],
        'classifier__n_estimators': [100, 200],
        'classifier__max_depth': [5, 10, None]
    },
    {
        'classifier': [SVC(random_state=42, probability=True)],
        'classifier__C': [0.1, 1.0, 10],
        'classifier__gamma': ['scale', 'auto']
    }
]

# Run GridSearchCV
# NOTE(review): candidates are ranked by accuracy; if the churn classes are
# imbalanced, a metric such as F1 or ROC AUC may be a better selector -- confirm.
grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)


# Evaluate on test set
# best_estimator_ is the winning pipeline refit on the whole training set.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
print(f"Best Model: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_:.4f}")
print("\n" + "="*40 + "\n")

Numerical Columns: ['Gender', 'Senior Citizen', 'Partner', 'Dependents', 'tenure', 'Phone Service', 'Paperless Billing', 'Monthly Charges', 'Total Charges']
Categorical Columns: ['Multiple Lines', 'Internet Service', 'Online Security', 'Online Backup', 'Device Protection', 'Tech Support', 'Streaming TV', 'Streaming Movies', 'Contract', 'Payment Method']


Fitting 5 folds for each of 15 candidates, totalling 75 fits
Best Model:, {'classifier': SVC(probability=True, random_state=42), 'classifier__C': 1.0, 'classifier__gamma': 'auto'}
Best score:, 0.8039




In [1]:
# Evaluation Metrics
# Requires the grid-search cell to have been run in this kernel session:
# it reads grid_search, y_test and y_pred.
# TODO: prettify this output, maybe use a nicer table or visualization + runner ups
print("Test Set Classification Report:")
print ("The winner model is: ", grid_search.best_params_)
winning_classifier = grid_search.best_estimator_.named_steps['classifier']
print (type(winning_classifier).__name__) # just the model name
test_report = classification_report(y_test, y_pred)
print(test_report)

Test Set Classification Report:


NameError: name 'grid_search' is not defined

In [None]:
# Confusion Matrix
# Counts of actual vs. predicted labels on the test set, drawn as a heatmap.
conf_matrix = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots()
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

In [26]:
# Runner ups
# All grid-search candidates as a table, best mean CV score first.
# TODO: prettify this
results = (
    pd.DataFrame(grid_search.cv_results_)
    .sort_values(by='mean_test_score', ascending=False)
)
print("Top 5 Models:")
results.head(5)


Top 5 Models:


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier,param_classifier__C,param_classifier__solver,param_classifier__max_depth,param_classifier__n_estimators,param_classifier__gamma,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
12,6.213382,0.122839,0.324023,0.045965,"SVC(probability=True, random_state=42)",1.0,,,,auto,"{'classifier': SVC(probability=True, random_st...",0.810115,0.800355,0.805679,0.803904,0.799468,0.803904,0.003847,1
2,0.040983,0.00763,0.0135,0.00678,"LogisticRegression(max_iter=1000, random_state...",10.0,liblinear,,,,{'classifier': LogisticRegression(max_iter=100...,0.807453,0.797693,0.810115,0.791482,0.81189,0.803727,0.007845,2
11,6.424695,0.189446,0.321411,0.058839,"SVC(probability=True, random_state=42)",1.0,,,,scale,"{'classifier': SVC(probability=True, random_st...",0.816327,0.797693,0.805679,0.80213,0.795918,0.803549,0.007243,3
1,0.031345,0.002632,0.008976,0.001991,"LogisticRegression(max_iter=1000, random_state...",1.0,liblinear,,,,{'classifier': LogisticRegression(max_iter=100...,0.811003,0.794144,0.807453,0.791482,0.808341,0.802484,0.008027,4
0,0.03329,0.003098,0.01155,0.003818,"LogisticRegression(max_iter=1000, random_state...",0.1,liblinear,,,,{'classifier': LogisticRegression(max_iter=100...,0.809228,0.793256,0.810115,0.790594,0.806566,0.801952,0.008312,5


In [None]:
# TODO: Model evaluation, interpretation, and visualization:
# correlation heatmap, feature importance, confusion matrix (as a heatmap), ROC curve, etc.