In [39]:
import pandas as pd
import numpy as np
from pathlib import Path

In [40]:
# The project root is taken to be the parent of the current working directory.
PROJECT_PATH = Path.cwd().parent
# Raw input and cleaned output both live in the project's "Dataset" folder.
DATA_PATH = PROJECT_PATH.joinpath("Dataset", "churn_dataset.csv")
OUTPUT_PATH = PROJECT_PATH.joinpath("Dataset", "cleaned_dataset.csv")

print("Reading dataset from:", DATA_PATH)

Reading dataset from: c:\Users\venut\OneDrive\Desktop\Telecom_Churn\Dataset\churn_dataset.csv


In [41]:
# Load the CSV at DATA_PATH into the working frame used by the later cells.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

print("\nFirst 5 rows:")
preview_rows = df.head(5)
display(preview_rows)


First 5 rows:


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [42]:
# df.shape is a (rows, columns) tuple; unpack it so each part is named.
row_count, column_count = df.shape
print("Shape of dataset:", (row_count, column_count))

Shape of dataset: (7043, 21)


In [43]:
# Show every column label of the loaded frame as a plain Python list.
print("Column names:")
column_names = list(df.columns)
print(column_names)

Column names:
['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn']


In [44]:
# Flag rows that exactly repeat an earlier row (all columns compared), then
# count the flags. `duplicate_count` is read by the following cell.
repeated_row_mask = df.duplicated()
duplicate_count = repeated_row_mask.sum()
print("Duplicate rows:", duplicate_count)

Duplicate rows: 0


In [45]:
# Rebuild the frame only when the duplicate check found at least one repeat;
# the first occurrence of each repeated row is the one that is kept.
if duplicate_count >= 1:
    df = df.drop_duplicates(keep='first')
    print("Duplicates removed.")

In [46]:
# Print a concise structural summary of the frame: index range, and for each
# column its non-null count and dtype. df.info() writes its report directly
# to stdout, so it is called as a statement rather than wrapped in print().
print("Dataset Info:")
df.info()


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-n

In [47]:
# List the dtype pandas inferred for each column when the CSV was read.
print("Data Types:")
column_dtypes = df.dtypes
print(column_dtypes)

Data Types:
customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object


In [48]:
# Fix the two columns whose inferred dtype does not match their meaning:
#   - TotalCharges was read as object; parse it as numbers, turning any value
#     that cannot be parsed into NaN (errors='coerce').
#   - SeniorCitizen was read as int64; store it as a categorical instead.
df = df.assign(
    TotalCharges=pd.to_numeric(df['TotalCharges'], errors='coerce'),
    SeniorCitizen=df['SeniorCitizen'].astype('category'),
)

In [49]:
# Remove the customerID column (an identifier string such as '7590-VHVEG' in
# the preview) from the working frame.
# The frame is rebound instead of mutated with inplace=True, and
# errors='ignore' makes the cell safe to re-run: the original raised KeyError
# on a second execution because the column was already gone.
df = df.drop(columns='customerID', errors='ignore')

In [50]:
# Count the missing (NaN) entries in each column.
print("Missing Values:")
missing_per_column = df.isna().sum()
print(missing_per_column)

Missing Values:
gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64


In [51]:
# Inspect the rows whose TotalCharges is missing, next to the two columns
# that help explain why (tenure and the monthly rate).
print("Rows where TotalCharges is NaN:")
charge_is_missing = df['TotalCharges'].isna()
display(df.loc[charge_is_missing, ['tenure', 'MonthlyCharges', 'TotalCharges']])

Rows where TotalCharges is NaN:


Unnamed: 0,tenure,MonthlyCharges,TotalCharges
488,0,52.55,
753,0,20.25,
936,0,80.85,
1082,0,25.75,
1340,0,56.05,
3331,0,19.85,
3826,0,25.35,
4380,0,20.0,
5218,0,19.7,
6670,0,73.35,


In [52]:
# Impute TotalCharges with 0 only where the justification holds: the value is
# missing AND tenure == 0. A blanket fillna(0) would also zero out any other
# value that pd.to_numeric(errors='coerce') could not parse, silently turning
# bad input into a plausible-looking charge.
new_customer_missing = df['TotalCharges'].isna() & df['tenure'].eq(0)
df.loc[new_customer_missing, 'TotalCharges'] = 0

# Fail loudly if a missing value remains that the tenure rule does not explain.
unexplained_missing = int(df['TotalCharges'].isna().sum())
assert unexplained_missing == 0, (
    f"{unexplained_missing} TotalCharges values are missing for customers with tenure != 0"
)

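The rows with a missing `TotalCharges` shown above all have `tenure = 0`, meaning these customers have only just joined.
Their total charges should therefore logically be 0.
Replacing `NaN` with 0 for these rows is a business-valid imputation rather than an arbitrary default.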

In [53]:
# Absolute number of customers in each Churn class.
print("\nChurn Distribution (Count):")
churn_counts = df['Churn'].value_counts()
print(churn_counts)


Churn Distribution (Count):
Churn
No     5174
Yes    1869
Name: count, dtype: int64


In [54]:
# Share of each Churn class, scaled from a 0-1 proportion to a percentage.
print("\nChurn Distribution (Percentage):")
churn_share = df['Churn'].value_counts(normalize=True)
print(100 * churn_share)


Churn Distribution (Percentage):
Churn
No     73.463013
Yes    26.536987
Name: proportion, dtype: float64


In [55]:
# For every object-dtype column, print its name followed by the distinct
# values it contains, to spot unexpected or inconsistent labels.
text_columns = df.select_dtypes('object').columns
for column_name in text_columns:
    distinct_values = df[column_name].unique()
    print(column_name, distinct_values)

gender ['Female' 'Male']
Partner ['Yes' 'No']
Dependents ['No' 'Yes']
PhoneService ['No' 'Yes']
MultipleLines ['No phone service' 'No' 'Yes']
InternetService ['DSL' 'Fiber optic' 'No']
OnlineSecurity ['No' 'Yes' 'No internet service']
OnlineBackup ['Yes' 'No' 'No internet service']
DeviceProtection ['No' 'Yes' 'No internet service']
TechSupport ['No' 'Yes' 'No internet service']
StreamingTV ['No' 'Yes' 'No internet service']
StreamingMovies ['No' 'Yes' 'No internet service']
Contract ['Month-to-month' 'One year' 'Two year']
PaperlessBilling ['Yes' 'No']
PaymentMethod ['Electronic check' 'Mailed check' 'Bank transfer (automatic)'
 'Credit card (automatic)']
Churn ['No' 'Yes']


In [56]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the
# numeric columns of the cleaned frame.
print("Statistical Summary:")
numeric_summary = df.describe()
display(numeric_summary)

Statistical Summary:


Unnamed: 0,tenure,MonthlyCharges,TotalCharges
count,7043.0,7043.0,7043.0
mean,32.371149,64.761692,2279.734304
std,24.559481,30.090047,2266.79447
min,0.0,18.25,0.0
25%,9.0,35.5,398.55
50%,29.0,70.35,1394.55
75%,55.0,89.85,3786.6
max,72.0,118.75,8684.8


In [57]:
# Persist the cleaned frame to OUTPUT_PATH; index=False keeps the row index
# out of the file so it is not written as an extra column.
df.to_csv(path_or_buf=OUTPUT_PATH, index=False)
print("Cleaned dataset saved to :", OUTPUT_PATH)

Cleaned dataset saved to : c:\Users\venut\OneDrive\Desktop\Telecom_Churn\Dataset\cleaned_dataset.csv
