In [6]:
# Import necessary libraries
import pandas as pd
import numpy as np


In [8]:
# Load the raw Telco churn export into a DataFrame.
# The path is relative, so the CSV must sit in the notebook's working directory.
file_path = "Telco-Customer-Churn.csv"
df = pd.read_csv(filepath_or_buffer=file_path)


In [9]:
# Preview the raw data: head() returns the first five rows by default.
original_preview = df.head()
print("Original dataset:")
print(original_preview)


Original dataset:
   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  TechSupport StreamingTV StreamingMovies   

In [10]:
# Drop the 'customerID' column from the frame.
# Note: this rebinds `df`, so re-running the cell on the already-trimmed frame
# raises KeyError because the column no longer exists.
df = df.drop(columns=['customerID'])


In [11]:
# Count NaN/None entries per column.
# Caveat: isnull() does not treat blank strings as missing, so a text column
# that encodes gaps as blanks (see the 'TotalCharges' handling further down)
# still reports 0 here.
print("\nMissing values:")
missing_per_column = df.isnull().sum()
print(missing_per_column)



Missing values:
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64


In [12]:
# Convert 'TotalCharges' from text to a numeric column.
# Missing entries in this column are stored as blank strings rather than NaN;
# errors='coerce' turns every value that cannot be parsed as a number into NaN
# so that it can be detected and imputed afterwards.
total_charges_numeric = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = total_charges_numeric


In [15]:
# Impute the NaNs in 'TotalCharges' with the column median
# (median() skips NaN values by default).
# The filled Series is assigned back to the column rather than calling
# fillna(..., inplace=True) on df['TotalCharges']: that form mutates an
# intermediate object, emits a chained-assignment FutureWarning on pandas 2.x,
# and leaves `df` unchanged when Copy-on-Write is enabled (default in pandas 3.0).
total_charges_median = df['TotalCharges'].median()
df['TotalCharges'] = df['TotalCharges'].fillna(total_charges_median)


In [14]:
# Re-count NaN/None per column after imputation; every column should report 0.
print("\nMissing values after handling:")
remaining_missing = df.isnull().sum()
print(remaining_missing)



Missing values after handling:
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64


In [16]:
# Show the first five rows of the cleaned frame (header line, then the table).
print("\nPreprocessed dataset:", df.head(), sep="\n")



Preprocessed dataset:
   gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  Female              0     Yes         No       1           No   
1    Male              0      No         No      34          Yes   
2    Male              0      No         No       2          Yes   
3    Male              0      No         No      45           No   
4  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity OnlineBackup  \
0  No phone service             DSL             No          Yes   
1                No             DSL            Yes           No   
2                No             DSL            Yes          Yes   
3  No phone service             DSL            Yes           No   
4                No     Fiber optic             No           No   

  DeviceProtection TechSupport StreamingTV StreamingMovies        Contract  \
0               No          No          No              No  Month-to-month   
1        

In [17]:
# Persist the cleaned frame to a new CSV in the working directory.
# index=False keeps the positional row index out of the file.
output_path = 'preprocessed_telco_customer_churn.csv'
df.to_csv(output_path, index=False)
