In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
df = pd.read_csv("customer_churn_synthetic.csv")

df.head()

Unnamed: 0,CustomerID,Gender,SeniorCitizen,Partner,Dependents,Region,InternetService,OnlineSecurity,TechSupport,OnlineBackup,...,PaperlessBilling,PaymentMethod,TenureMonths,MonthlyCharges,TotalCharges,AccountStartDate,AccountEndDate,Churn,ChurnProbability,ChurnReason
0,100000,Male,0,No,Yes,Chennai,DSL,Yes,Yes,Yes,...,Yes,Electronic check,23,74.2,1705.97,2023-05-13 17:59:02,,No,0.378,
1,100001,Female,0,No,Yes,Bengaluru North,Fiber optic,Yes,Yes,No,...,No,Credit card (automatic),52,90.27,4644.31,2022-03-29 03:40:24,2026-01-13 00:00:00,Yes,0.378,Competitor offer
2,100002,Male,1,No,No,Chennai,,No,No,No,...,Yes,Electronic check,27,16.62,491.96,2025-10-25 15:11:53,2026-01-13 00:00:00,Yes,0.891,Service issues
3,100003,Male,0,Yes,No,Bengaluru North,Fiber optic,Yes,Yes,No,...,No,Mailed check,10,77.3,814.32,2023-06-03 21:51:03,2025-12-27 21:51:03,Yes,0.55,Service issues
4,100004,Male,1,Yes,No,Hyderabad,DSL,No,No,No,...,Yes,Mailed check,61,46.59,2845.61,2022-08-10 05:42:51,2026-01-13 00:00:00,Yes,0.818,Payment problems


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 23 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   CustomerID        10000 non-null  int64  
 1   Gender            10000 non-null  object 
 2   SeniorCitizen     10000 non-null  int64  
 3   Partner           10000 non-null  object 
 4   Dependents        10000 non-null  object 
 5   Region            10000 non-null  object 
 6   InternetService   8505 non-null   object 
 7   OnlineSecurity    10000 non-null  object 
 8   TechSupport       10000 non-null  object 
 9   OnlineBackup      10000 non-null  object 
 10  StreamingTV       10000 non-null  object 
 11  StreamingMovies   10000 non-null  object 
 12  Contract          10000 non-null  object 
 13  PaperlessBilling  10000 non-null  object 
 14  PaymentMethod     10000 non-null  object 
 15  TenureMonths      10000 non-null  int64  
 16  MonthlyCharges    10000 non-null  float64

In [5]:
df.describe()

Unnamed: 0,CustomerID,SeniorCitizen,TenureMonths,MonthlyCharges,TotalCharges,ChurnProbability
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,104999.5,0.1511,35.3294,64.510621,2273.870095,0.659587
std,2886.89568,0.358164,20.618958,22.288727,1604.03775,0.191746
min,100000.0,0.0,0.0,15.0,0.0,0.13
25%,102499.75,0.0,17.0,55.5075,892.4175,0.525
50%,104999.5,0.0,35.0,69.175,2033.56,0.69
75%,107499.25,0.0,53.0,80.68,3507.955,0.802
max,109999.0,1.0,71.0,108.36,7072.82,0.968


In [6]:
for cols in df.columns:
    if df[cols].dtype == object:
        print(cols)
        print(df[cols].unique())

Gender
['Male' 'Female']
Partner
['No' 'Yes']
Dependents
['Yes' 'No']
Region
['Chennai' 'Bengaluru North' 'Hyderabad' 'Delhi' 'Kolkata' 'Jaipur'
 'Bengaluru South' 'Pune' 'Mumbai' 'Ahmedabad']
InternetService
['DSL' 'Fiber optic' nan]
OnlineSecurity
['Yes' 'No']
TechSupport
['Yes' 'No']
OnlineBackup
['Yes' 'No']
StreamingTV
['Yes' 'No']
StreamingMovies
['No' 'Yes']
Contract
['One year' 'Month-to-month' 'Two year']
PaperlessBilling
['Yes' 'No']
PaymentMethod
['Electronic check' 'Credit card (automatic)' 'Mailed check'
 'Bank transfer (automatic)']
AccountStartDate
['2023-05-13 17:59:02' '2022-03-29 03:40:24' '2025-10-25 15:11:53' ...
 '2019-10-23 01:09:21' '2023-03-16 06:10:41' '2022-11-25 15:29:16']
AccountEndDate
[nan '2026-01-13 00:00:00' '2025-12-27 21:51:03' ... '2024-02-25 01:09:21'
 '2025-01-17 06:10:41' '2024-12-27 15:29:16']
Churn
['No' 'Yes']
ChurnReason
[nan 'Competitor offer' 'Service issues' 'Payment problems'
 'Price too high' 'Relocation']


In [7]:
df.isna().sum()

CustomerID             0
Gender                 0
SeniorCitizen          0
Partner                0
Dependents             0
Region                 0
InternetService     1495
OnlineSecurity         0
TechSupport            0
OnlineBackup           0
StreamingTV            0
StreamingMovies        0
Contract               0
PaperlessBilling       0
PaymentMethod          0
TenureMonths           0
MonthlyCharges         0
TotalCharges           0
AccountStartDate       0
AccountEndDate      3357
Churn                  0
ChurnProbability       0
ChurnReason         3357
dtype: int64

In [8]:
# imputing values as the values are not known in this case.
features = ["ChurnReason", "InternetService"]
for cols in features:
    df[cols] = df[cols].fillna("Unknown")

In [9]:
df.isnull().sum()

CustomerID             0
Gender                 0
SeniorCitizen          0
Partner                0
Dependents             0
Region                 0
InternetService        0
OnlineSecurity         0
TechSupport            0
OnlineBackup           0
StreamingTV            0
StreamingMovies        0
Contract               0
PaperlessBilling       0
PaymentMethod          0
TenureMonths           0
MonthlyCharges         0
TotalCharges           0
AccountStartDate       0
AccountEndDate      3357
Churn                  0
ChurnProbability       0
ChurnReason            0
dtype: int64

In [10]:
df.dropna()

Unnamed: 0,CustomerID,Gender,SeniorCitizen,Partner,Dependents,Region,InternetService,OnlineSecurity,TechSupport,OnlineBackup,...,PaperlessBilling,PaymentMethod,TenureMonths,MonthlyCharges,TotalCharges,AccountStartDate,AccountEndDate,Churn,ChurnProbability,ChurnReason
1,100001,Female,0,No,Yes,Bengaluru North,Fiber optic,Yes,Yes,No,...,No,Credit card (automatic),52,90.27,4644.31,2022-03-29 03:40:24,2026-01-13 00:00:00,Yes,0.378,Competitor offer
2,100002,Male,1,No,No,Chennai,Unknown,No,No,No,...,Yes,Electronic check,27,16.62,491.96,2025-10-25 15:11:53,2026-01-13 00:00:00,Yes,0.891,Service issues
3,100003,Male,0,Yes,No,Bengaluru North,Fiber optic,Yes,Yes,No,...,No,Mailed check,10,77.30,814.32,2023-06-03 21:51:03,2025-12-27 21:51:03,Yes,0.550,Service issues
4,100004,Male,1,Yes,No,Hyderabad,DSL,No,No,No,...,Yes,Mailed check,61,46.59,2845.61,2022-08-10 05:42:51,2026-01-13 00:00:00,Yes,0.818,Payment problems
5,100005,Female,0,Yes,Yes,Delhi,Fiber optic,No,Yes,No,...,No,Bank transfer (automatic),31,80.10,2489.35,2022-03-23 01:40:46,2023-09-17 01:40:46,Yes,0.769,Competitor offer
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,109995,Female,0,Yes,No,Bengaluru South,Fiber optic,Yes,Yes,No,...,Yes,Credit card (automatic),43,86.90,3759.47,2021-10-18 01:25:05,2025-02-22 01:25:05,Yes,0.198,Payment problems
9996,109996,Male,0,No,No,Mumbai,Fiber optic,Yes,No,No,...,Yes,Credit card (automatic),17,87.13,1485.47,2019-10-19 23:32:21,2020-08-26 23:32:21,Yes,0.786,Relocation
9997,109997,Female,0,No,No,Mumbai,Fiber optic,No,No,No,...,Yes,Mailed check,1,66.13,71.51,2019-10-23 01:09:21,2024-02-25 01:09:21,Yes,0.891,Service issues
9998,109998,Female,0,No,No,Mumbai,Fiber optic,No,No,Yes,...,No,Credit card (automatic),15,81.47,1199.01,2023-03-16 06:10:41,2025-01-17 06:10:41,Yes,0.870,Price too high


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 23 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   CustomerID        10000 non-null  int64  
 1   Gender            10000 non-null  object 
 2   SeniorCitizen     10000 non-null  int64  
 3   Partner           10000 non-null  object 
 4   Dependents        10000 non-null  object 
 5   Region            10000 non-null  object 
 6   InternetService   10000 non-null  object 
 7   OnlineSecurity    10000 non-null  object 
 8   TechSupport       10000 non-null  object 
 9   OnlineBackup      10000 non-null  object 
 10  StreamingTV       10000 non-null  object 
 11  StreamingMovies   10000 non-null  object 
 12  Contract          10000 non-null  object 
 13  PaperlessBilling  10000 non-null  object 
 14  PaymentMethod     10000 non-null  object 
 15  TenureMonths      10000 non-null  int64  
 16  MonthlyCharges    10000 non-null  float64

In [12]:
# Starting Label encoding for the Object Variable 
from sklearn.preprocessing import LabelEncoder
cat_cols = df.select_dtypes(include='object').columns
label_encode = LabelEncoder()
for cols in cat_cols:
        df[cols] = label_encode.fit_transform(df[cols])

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 23 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   CustomerID        10000 non-null  int64  
 1   Gender            10000 non-null  int64  
 2   SeniorCitizen     10000 non-null  int64  
 3   Partner           10000 non-null  int64  
 4   Dependents        10000 non-null  int64  
 5   Region            10000 non-null  int64  
 6   InternetService   10000 non-null  int64  
 7   OnlineSecurity    10000 non-null  int64  
 8   TechSupport       10000 non-null  int64  
 9   OnlineBackup      10000 non-null  int64  
 10  StreamingTV       10000 non-null  int64  
 11  StreamingMovies   10000 non-null  int64  
 12  Contract          10000 non-null  int64  
 13  PaperlessBilling  10000 non-null  int64  
 14  PaymentMethod     10000 non-null  int64  
 15  TenureMonths      10000 non-null  int64  
 16  MonthlyCharges    10000 non-null  float64

In [14]:
# Starting the Modelling for The Problem 
# Intial phase Choosing LinearRegression

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

LR = LinearRegression()

X = df.drop(["Churn","ChurnProbability", "ChurnReason"], axis= 1)
y = df["ChurnProbability"]

X_train , X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

LR.fit(X_train,y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [17]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, root_mean_squared_error
y_preds = LR.predict(X_test)
print("Mean Absolute Error:", mean_absolute_error(y_test, y_preds))
print("Mean Squared Error:", mean_squared_error(y_test, y_preds))
print("Root Mean Squared Error:", root_mean_squared_error(y_test, y_preds))
print("R^2 Score:", r2_score(y_test, y_preds))

Mean Absolute Error: 0.057056755359813895
Mean Squared Error: 0.0049071483017894026
Root Mean Squared Error: 0.07005104069026671
R^2 Score: 0.8651198442742675


In [None]:
df.to_csv("customer_churn.csv")