In [3]:
import pandas as pd

# Location of the raw Telco customer-churn CSV.
# NOTE(review): this is an absolute, machine-specific path — consider making
# the data directory configurable so the notebook runs elsewhere.
raw_csv_path = r"C:\Users\Neeraj Nair R\Downloads\WA_Fn-UseC_-Telco-Customer-Churn.csv"

# Load the raw data; every later cell operates on this `df`.
df = pd.read_csv(raw_csv_path)


In [None]:
### Data Inspection
#The dataset is inspected to understand its structure, data types, and initial data quality issues.

In [4]:
# Parse TotalCharges as numbers; entries that cannot be parsed become NaN.
total_charges = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Fill the resulting gaps with the median of the successfully parsed values.
df['TotalCharges'] = total_charges.fillna(total_charges.median())

In [None]:
### Handling Missing Values
#The `TotalCharges` column contained blank string values that were not detected as NaN.  
#These values were converted to NaN and imputed using the median because the median is robust to outliers.

In [5]:
num_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']

# The IQR method is used to cap extreme values while preserving all
# observations: anything outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] is pulled back
# to the nearest bound instead of being dropped.
for col in num_cols:
    lower_quartile, upper_quartile = df[col].quantile(0.25), df[col].quantile(0.75)
    spread = upper_quartile - lower_quartile
    df[col] = df[col].clip(
        lower=lower_quartile - 1.5*spread,
        upper=upper_quartile + 1.5*spread,
    )

In [None]:
## Outlier Treatment
#Outliers were detected using the Interquartile Range (IQR) method.  
#Extreme values were capped instead of removed to preserve valid customer observations.

In [6]:
# Remove the customerID column. The result is rebound to `df` instead of being
# dropped in place, and errors='ignore' makes the cell safe to re-run: a second
# execution is a no-op rather than raising KeyError for the missing column.
df = df.drop(columns='customerID', errors='ignore')

In [None]:
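## Dropping the Identifier Column
#The `customerID` column was dropped because it is a row identifier rather than a predictive feature.  
#Leaving it in would make one-hot encoding create a dummy column for nearly every customer.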

In [7]:
# Encode the binary target as integers: 'Yes' -> 1, 'No' -> 0.
churn_codes = {'Yes': 1, 'No': 0}
df['Churn'] = df['Churn'].map(churn_codes)

In [8]:
# One-hot encode the categorical columns; drop_first=True omits the first
# level of each encoded column.
df = pd.get_dummies(data=df, drop_first=True)

In [None]:
## Encoding Categorical Variables
#Label encoding was applied to the binary target variable (`Churn`).  
#One-hot encoding was used for categorical features to avoid introducing artificial ordinal relationships.

In [None]:
## Feature Scaling
#Feature scaling was not applied during data cleaning.  
#It should be performed only after train-test split to prevent data leakage.

### Data Leakage Considerations
#Features such as `TotalCharges` may indirectly leak information about churn, as customers who leave early tend to have lower total charges.  
#Such features must be handled carefully during model training.

In [9]:
# Summarise the cleaned frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 31 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   SeniorCitizen                          7043 non-null   int64  
 1   tenure                                 7043 non-null   int64  
 2   MonthlyCharges                         7043 non-null   float64
 3   TotalCharges                           7043 non-null   float64
 4   Churn                                  7043 non-null   int64  
 5   gender_Male                            7043 non-null   bool   
 6   Partner_Yes                            7043 non-null   bool   
 7   Dependents_Yes                         7043 non-null   bool   
 8   PhoneService_Yes                       7043 non-null   bool   
 9   MultipleLines_No phone service         7043 non-null   bool   
 10  MultipleLines_Yes                      7043 non-null   bool   
 11  Inte

In [11]:
# Destination for the cleaned dataset.
# NOTE(review): this is an absolute, machine-specific path — consider making
# the output directory configurable.
cleaned_csv_path = r"C:\Users\Neeraj Nair R\Downloads\cleaned_telco_churn.csv"

# Write the cleaned frame without the row index column.
df.to_csv(cleaned_csv_path, index=False)
