# Importing Libraries

In [21]:
import numpy as np 
import pandas as pd
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Load Dataset

In [22]:
df=pd.read_csv("Telco Churn Dataset.csv")
df.head(10)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes
5,9305-CDSKC,Female,0,No,No,8,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,99.65,820.5,Yes
6,1452-KIOVK,Male,0,No,Yes,22,Yes,Yes,Fiber optic,No,...,No,No,Yes,No,Month-to-month,Yes,Credit card (automatic),89.1,1949.4,No
7,6713-OKOMC,Female,0,No,No,10,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,No,Mailed check,29.75,301.9,No
8,7892-POOKP,Female,0,Yes,No,28,Yes,Yes,Fiber optic,No,...,Yes,Yes,Yes,Yes,Month-to-month,Yes,Electronic check,104.8,3046.05,Yes
9,6388-TABGU,Male,0,No,Yes,62,Yes,No,DSL,Yes,...,No,No,No,No,One year,No,Bank transfer (automatic),56.15,3487.95,No


# Performing EDA

In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [24]:
df.shape

(7043, 21)

In [25]:
df.describe()   #Statistical Description of Dataset

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692
std,0.368612,24.559481,30.090047
min,0.0,0.0,18.25
25%,0.0,9.0,35.5
50%,0.0,29.0,70.35
75%,0.0,55.0,89.85
max,1.0,72.0,118.75


In [26]:
df.isnull().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

**No Missing Values**

In [27]:
df.duplicated().sum()

0

**No Duplicated Values**

# Dropping Irrelevent Columns

In [28]:
df.drop(columns=["customerID"],inplace=True)
df.columns

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [29]:
# Clean column
df["TotalCharges"]=pd.to_numeric(df["TotalCharges"], errors="coerce")
df.dropna(inplace=True)
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [30]:
# Target encoding
df["Churn"].map({"Yes": 1, "No": 0})
X=df.drop(["Churn"], axis=1)
y=df["Churn"]

In [31]:
# Feature Types
numerical_features=["tenure", "MonthlyCharges", "TotalCharges"]
categorical_features=list(set(X.columns)-set(numerical_features))

# Preprocessing Pipeline

In [32]:
# Numeric Transformer
numeric_transformer=Pipeline(steps=[("scaler", StandardScaler())])

# Categorical Transformer
categorical_transformer=Pipeline(steps=[("onehot", OneHotEncoder(handle_unknown="ignore"))])
# Preprocessing
preprocessor=ColumnTransformer(transformers=[("num", numeric_transformer, numerical_features),("cat", categorical_transformer, categorical_features)])

# Model Pipeline

In [33]:
# Logistic Regression Model
logistic_pipeline=Pipeline(steps=[("preprocessor", preprocessor),("classifier", LogisticRegression(max_iter=1000))])

In [34]:
# Random Forest Model
rf_pipeline=Pipeline(steps=[("preprocessor", preprocessor),("classifier", RandomForestClassifier(random_state=42))])

# Train Test Split

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Hyperparameter Tuning

In [36]:
logistic_params={"classifier__C": [0.01, 0.1, 1, 10],"classifier__solver": ["lbfgs"]}
logistic_gs = GridSearchCV(logistic_pipeline,logistic_params,cv=5,scoring="accuracy",n_jobs=-1)

In [37]:
rf_params={"classifier__n_estimators": [100, 200],"classifier__max_depth": [None, 10, 20],"classifier__min_samples_split": [2, 5]}
rf_gs=GridSearchCV(rf_pipeline,rf_params,cv=5,scoring="accuracy",n_jobs=-1)

# Model Training

In [38]:
# Train models
logistic_gs.fit(X_train, y_train)
rf_gs.fit(X_train, y_train)

# Model Evaluation

In [39]:
print("\nBest Logistic Regression Model:")
print(logistic_gs.best_params_)
print(classification_report(y_test, logistic_gs.predict(X_test)))

print("\nBest Random Forest Model:")
print(rf_gs.best_params_)
print(classification_report(y_test, rf_gs.predict(X_test)))


Best Logistic Regression Model:
{'classifier__C': 0.01, 'classifier__solver': 'lbfgs'}
              precision    recall  f1-score   support

          No       0.84      0.90      0.87      1033
         Yes       0.65      0.51      0.57       374

    accuracy                           0.80      1407
   macro avg       0.74      0.71      0.72      1407
weighted avg       0.79      0.80      0.79      1407


Best Random Forest Model:
{'classifier__max_depth': 10, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 100}
              precision    recall  f1-score   support

          No       0.84      0.89      0.86      1033
         Yes       0.64      0.52      0.57       374

    accuracy                           0.79      1407
   macro avg       0.74      0.71      0.72      1407
weighted avg       0.78      0.79      0.79      1407



# Save Best Model

In [40]:
best_model=rf_gs.best_estimator_
joblib.dump(best_model, "saved_pipeline/churn_model.joblib")
print("\nPipeline saved successfully!")


Pipeline saved successfully!
