# Telco Customer Churn Prediction
This project aims to identify customers who are likely to churn
and rank them based on churn probability for retention strategies.


In [52]:
import os

import kagglehub
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report


## Dataset
- Telco Customer Churn dataset (Kaggle)
- 7,043 rows and 21 columns
- Target variable: Churn


In [53]:
# Download the latest version of the dataset. kagglehub is imported in the
# imports cell at the top of the notebook; the call returns the local
# directory that holds the downloaded files.
path = kagglehub.dataset_download("blastchar/telco-customer-churn")

print("Path to dataset files:", path)

Using Colab cache for faster access to the 'telco-customer-churn' dataset.
Path to dataset files: /kaggle/input/telco-customer-churn


In [54]:

# List the files in the downloaded dataset directory to find the CSV name.
os.listdir(path)

['WA_Fn-UseC_-Telco-Customer-Churn.csv']

In [55]:
# Load the churn CSV. os.path.join builds the path instead of manual "/"
# concatenation, so the separator is handled portably.
df = pd.read_csv(os.path.join(path, "WA_Fn-UseC_-Telco-Customer-Churn.csv"))

## Data Exploration
Initial exploration of the dataset structure, data types, and missing values.


In [56]:
# Preview the first rows. A bare last expression uses the notebook's rich
# DataFrame rendering instead of print()'s wrapped plain-text table.
df.head()

   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  TechSupport StreamingTV StreamingMovies        Contract Pape

In [57]:
# (rows, columns) of the raw dataset.
df.shape

(7043, 21)

In [58]:
# Per-column dtypes and non-null counts of the raw dataset.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


## Data Preprocessing
- Removed customerID, a unique per-customer identifier with no predictive value
- Converted TotalCharges to numeric and filled the 11 resulting missing values with the median
- One-hot encoded categorical variables


In [59]:
# Features: every column except the target and the customer identifier.
X = df.drop(columns=["Churn", "customerID"])
# Target: the Churn label column.
y = df["Churn"]

In [60]:
# Preview the feature columns before any encoding.
X.head()


Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65


In [61]:
# One-hot encode the non-numeric columns, dropping the first level of each.
# NOTE(review): TotalCharges has not been converted to numeric at this point,
# which presumably is why the printed shape is so wide; X is rebuilt after the
# conversion further down, so this result is not the one used for training.
X = pd.get_dummies(X, drop_first=True)
print(X.shape)

(7043, 6559)


In [62]:
# Parse TotalCharges as numbers; values that cannot be parsed become NaN.
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")

In [63]:
# Count the TotalCharges values that became NaN during numeric conversion.
df["TotalCharges"].isnull().sum()

np.int64(11)

In [64]:
# Impute the missing TotalCharges values with the column median.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on df['TotalCharges']: that chained form operates on an
# intermediate object, raises a FutureWarning today, and stops modifying df
# in pandas 3.0.
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].median())

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['TotalCharges'].fillna(df["TotalCharges"].median(),inplace=True)


In [65]:
# Rebuild the feature matrix from df now that TotalCharges is numeric:
# drop the target and the identifier, then one-hot encode what remains.
X = pd.get_dummies(
    df.drop(columns=["Churn", "customerID"]),
    drop_first=True,
)
# Target: the Churn label column.
y = df["Churn"]

In [66]:
# (rows, columns) of the encoded feature matrix.
X.shape

(7043, 30)

In [67]:
# Inspect the column names produced by the one-hot encoding.
X.columns

Index(['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges',
       'gender_Male', 'Partner_Yes', 'Dependents_Yes', 'PhoneService_Yes',
       'MultipleLines_No phone service', 'MultipleLines_Yes',
       'InternetService_Fiber optic', 'InternetService_No',
       'OnlineSecurity_No internet service', 'OnlineSecurity_Yes',
       'OnlineBackup_No internet service', 'OnlineBackup_Yes',
       'DeviceProtection_No internet service', 'DeviceProtection_Yes',
       'TechSupport_No internet service', 'TechSupport_Yes',
       'StreamingTV_No internet service', 'StreamingTV_Yes',
       'StreamingMovies_No internet service', 'StreamingMovies_Yes',
       'Contract_One year', 'Contract_Two year', 'PaperlessBilling_Yes',
       'PaymentMethod_Credit card (automatic)',
       'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check'],
      dtype='object')

In [68]:
# Preview the encoded feature matrix.
X.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,MultipleLines_Yes,...,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,1,29.85,29.85,False,True,False,False,True,False,...,False,False,False,False,False,False,True,False,True,False
1,0,34,56.95,1889.5,True,False,False,True,False,False,...,False,False,False,False,True,False,False,False,False,True
2,0,2,53.85,108.15,True,False,False,True,False,False,...,False,False,False,False,False,False,True,False,False,True
3,0,45,42.3,1840.75,True,False,False,False,True,False,...,False,False,False,False,True,False,False,False,False,False
4,0,2,70.7,151.65,False,False,False,True,False,False,...,False,False,False,False,False,False,True,False,True,False


## Model Training
A Random Forest Classifier was trained with class weighting
to handle class imbalance in churn prediction.


In [69]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
# NOTE(review): the split is not stratified on y — consider stratify=y given
# the class imbalance mentioned in this section.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [71]:
# Depth-limited random forest with 100 trees. class_weight="balanced" is used
# to compensate for the churn class imbalance; the fixed seed makes training
# reproducible.
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    class_weight="balanced",
    random_state=42,
)
model.fit(X_train, y_train)

## Model Evaluation
The model was evaluated on unseen test data using accuracy,
confusion matrix, and classification report.


In [72]:
# Predicted class labels for the held-out test rows.
y_pred = model.predict(X_test)


In [73]:
# Report accuracy, the confusion matrix and the per-class
# precision/recall/F1 for the test-set predictions.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
# Label spelling fixed (was "Cllasification Report").
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.7892122072391767
Confusion Matrix:
 [[825 211]
 [ 86 287]]

Cllasification Report:
               precision    recall  f1-score   support

          No       0.91      0.80      0.85      1036
         Yes       0.58      0.77      0.66       373

    accuracy                           0.79      1409
   macro avg       0.74      0.78      0.75      1409
weighted avg       0.82      0.79      0.80      1409



## Feature Importance Analysis


In [75]:
# Pair each fitted importance with its feature column name and rank them
# from most to least important.
feature_importance = (
    pd.Series(model.feature_importances_, index=X.columns)
    .sort_values(ascending=False)
)
print(feature_importance)


tenure                                   0.173967
TotalCharges                             0.144912
MonthlyCharges                           0.112135
Contract_Two year                        0.100654
InternetService_Fiber optic              0.062307
PaymentMethod_Electronic check           0.046695
Contract_One year                        0.038839
OnlineSecurity_Yes                       0.037896
TechSupport_Yes                          0.028761
DeviceProtection_No internet service     0.018504
PaperlessBilling_Yes                     0.018118
OnlineSecurity_No internet service       0.017994
OnlineBackup_Yes                         0.015959
Dependents_Yes                           0.015682
gender_Male                              0.013878
Partner_Yes                              0.013686
StreamingMovies_Yes                      0.012613
MultipleLines_Yes                        0.012459
DeviceProtection_Yes                     0.012340
PaymentMethod_Credit card (automatic)    0.011553


## Churn Probability and Risk Ranking


In [76]:
# Predicted probability of the second class in model.classes_ for each test
# row (presumably the "Yes"/churn class — verify against model.classes_).
y_proba = model.predict_proba(X_test)[:, 1]

In [79]:
# Pair each test row's churn probability with its true label, keeping the
# original row index so customers can be traced back to df.
risk_df = pd.DataFrame(
    {"churn_probability": y_proba, "true": y_test.values},
    index=X_test.index,
)
# Rank customers from highest to lowest churn probability.
risk_df = risk_df.sort_values("churn_probability", ascending=False)
risk_df.head(50)

Unnamed: 0,churn_probability,true
4800,0.947841,Yes
534,0.940498,Yes
1410,0.939497,Yes
970,0.938323,Yes
2191,0.936032,Yes
2577,0.934945,Yes
2208,0.933868,Yes
1371,0.932439,Yes
3346,0.930548,No
585,0.926407,Yes


## Business Action Mapping


In [82]:
def action(p, high_threshold=0.75, medium_threshold=0.5):
    """Map a churn probability to a retention action label.

    Parameters
    ----------
    p : float
        Predicted churn probability.
    high_threshold : float, default 0.75
        Probabilities at or above this value are labelled high risk.
    medium_threshold : float, default 0.5
        Probabilities at or above this value (and below ``high_threshold``)
        are labelled medium risk.

    Returns
    -------
    str
        "High risk immediate retention", "Medium risk discount offer"
        or "Low risk".
    """
    if p >= high_threshold:
        # The label has no leading space, so it is consistent with the other
        # two labels and can be matched exactly when filtering or grouping.
        return "High risk immediate retention"
    if p >= medium_threshold:
        return "Medium risk discount offer"
    return "Low risk"
# Label every ranked customer with the retention action that matches their
# churn probability, then show the 50 highest-risk rows.
risk_df["action"] = risk_df["churn_probability"].apply(action)
risk_df.head(50)

Unnamed: 0,churn_probability,true,action
4800,0.947841,Yes,High risk immediate retention
534,0.940498,Yes,High risk immediate retention
1410,0.939497,Yes,High risk immediate retention
970,0.938323,Yes,High risk immediate retention
2191,0.936032,Yes,High risk immediate retention
2577,0.934945,Yes,High risk immediate retention
2208,0.933868,Yes,High risk immediate retention
1371,0.932439,Yes,High risk immediate retention
3346,0.930548,No,High risk immediate retention
585,0.926407,Yes,High risk immediate retention
