In [9]:

# 1. Data Loading and Exploration
# ● Load the dataset (CSV format)
# ● Explore the number of rows, column types, and any missing values
# ● Understand the balance of the target variable (Churn)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import streamlit as st
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [10]:
customer=pd.read_csv('customerChurn.csv')
print("Shape of dataset:", customer.shape)
customer.head()
print(customer.info())
print("\nMissing Values:")
print(customer.isnull().sum())

Shape of dataset: (7043, 21)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBill

In [11]:
# Value counts of the target column
print("\nChurn Value Counts:")
print(customer['Churn'].value_counts())

# Percentage distribution
print("\nChurn Distribution (%):")
print(customer['Churn'].value_counts(normalize=True) * 100)



Churn Value Counts:
Churn
No     5174
Yes    1869
Name: count, dtype: int64

Churn Distribution (%):
Churn
No     73.463013
Yes    26.536987
Name: proportion, dtype: float64


In [12]:
# 2. Preprocessing
# ● Convert Churn to binary: Yes → 1, No → 0
# ● Handle missing or blank values in TotalCharges
# ● Encode categorical columns using Label Encoding or One-Hot Encoding
# ● Normalize or scale MonthlyCharges and TotalCharges if needed
# ● Split the data into train and test sets (80:20 or 70:30)


# Step 1: Drop customerID column (not useful for prediction)
customer.drop("customerID", axis=1, inplace=True)

# Step 2: Convert 'TotalCharges' to numeric, setting errors='coerce' to handle blanks
customer['TotalCharges'] = pd.to_numeric(customer['TotalCharges'], errors='coerce')

# Step 3: Drop rows with missing values (due to TotalCharges conversion)
customer.dropna(inplace=True)

# Step 4: Convert 'Churn' column to binary (Yes → 1, No → 0)
customer['Churn'] = customer['Churn'].map({'Yes': 1, 'No': 0})

# Step 5: Identify binary and multi-class categorical columns
cat_cols = customer.select_dtypes(include='object').columns
binary_cols = [col for col in cat_cols if customer[col].nunique() == 2]
multi_class_cols = [col for col in cat_cols if customer[col].nunique() > 2]

# Step 6: Label Encode binary categorical columns
le = LabelEncoder()
for col in binary_cols:
    customer[col] = le.fit_transform(customer[col])

# Step 7: One-Hot Encode multi-class columns
customer = pd.get_dummies(customer, columns=multi_class_cols, drop_first=True)

# Step 8: Normalize MonthlyCharges and TotalCharges
scaler = StandardScaler()
customer[['MonthlyCharges', 'TotalCharges']] = scaler.fit_transform(customer[['MonthlyCharges', 'TotalCharges']])

# Step 9: Split into train and test sets (80:20)
X = customer.drop('Churn', axis=1)
y = customer['Churn']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Training Set Shape:", X_train.shape)
print("Test Set Shape:", X_test.shape)


Training Set Shape: (5625, 30)
Test Set Shape: (1407, 30)


In [13]:
# 3. Model Training
# ● Use Logistic Regression from scikit-learn
# ● Train the model on the training set
# ● Evaluate it on the test set using:
# ○ Accuracy
# ○ Precision, Recall, F1 Score
# ○ Confusion Matrix

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Step 1: Initialize the Logistic Regression model
model = LogisticRegression(max_iter=1000)

# Step 2: Train the model
model.fit(X_train, y_train)

# Step 3: Make predictions on the test set
y_pred = model.predict(X_test)

# Step 4: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Step 5: Print the results
print("Model Evaluation Metrics:")
print(f"Accuracy : {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall   : {recall:.4f}")
print(f"F1 Score : {f1:.4f}")
print("\nConfusion Matrix:")
print(conf_matrix)

joblib.dump(model, "logistic_model.pkl")

Model Evaluation Metrics:
Accuracy : 0.8045
Precision: 0.6495
Recall   : 0.5749
F1 Score : 0.6099

Confusion Matrix:
[[917 116]
 [159 215]]


['logistic_model.pkl']