In [2]:
import pandas as pd
import numpy as np

# Read the raw Telco churn CSV into the working frame.
df = pd.read_csv('../data/WA_Fn-UseC_-Telco-Customer-Churn.csv')

# Force 'TotalCharges' to numeric: entries that fail to parse become NaN
# (errors='coerce') and are then filled with 0.
df = df.assign(
    TotalCharges=pd.to_numeric(df['TotalCharges'], errors='coerce').fillna(0)
)

# Remove the 'customerID' column when it is present.
if 'customerID' in df.columns:
    df = df.drop(columns='customerID')
    print("Dropped 'customerID' column.")

# Count rows that exactly repeat an earlier row. This runs after the ID column
# has been removed, so rows are compared on the remaining columns only.
# NOTE(review): rows that match here may still be distinct customers — confirm
# that discarding them is intended.
duplicates = df.duplicated().sum()
print(f"Duplicates found: {duplicates}")

# Keep only the first occurrence of each repeated row.
if duplicates:
    df = df.drop_duplicates()
    print("Duplicates removed.")

# Report the resulting (rows, columns).
print(f"Final Data Shape: {df.shape}")

Dropped 'customerID' column.
Duplicates found: 22
Duplicates removed.
Final Data Shape: (7021, 20)


## Feature Encoding

In [3]:
# 1. Label Encoding (Binary Variables)
def _encode_binary(column, positive, negative):
    """Map a two-valued column to integers: `positive` -> 1, `negative` -> 0.

    Parameters
    ----------
    column : pd.Series
        Column to encode.
    positive, negative : str
        Labels that become 1 and 0 respectively.

    Returns
    -------
    pd.Series
        int64 series of 0/1 values with the same index as `column`.

    Raises
    ------
    ValueError
        If `column` contains anything other than the two labels (for example
        NaN, a misspelt label, or values already encoded by an earlier run of
        this cell). Failing loudly avoids silently turning such values into 0.
    """
    encoded = column.map({positive: 1, negative: 0})
    unmapped = encoded.isna()
    if unmapped.any():
        unexpected = column[unmapped].unique().tolist()
        raise ValueError(
            f"Column '{column.name}' has values other than "
            f"{positive!r}/{negative!r}: {unexpected}. "
            "If this cell was already run, reload the data before re-running it."
        )
    # All values matched, so the result can safely be stored as integers.
    return encoded.astype('int64')


binary_cols = ['Partner', 'Dependents', 'PhoneService', 'PaperlessBilling', 'Churn']

# Explicit Yes/No mapping: 'Yes' -> 1, 'No' -> 0; anything else raises.
for col in binary_cols:
    df[col] = _encode_binary(df[col], positive='Yes', negative='No')

# Gender is Male/Female, so it gets its own mapping: 'Female' -> 1, 'Male' -> 0.
df['gender'] = _encode_binary(df['gender'], positive='Female', negative='Male')

print("Binary encoding complete.")

# 2. One-Hot Encoding (Multi-class Variables)
# With no `columns=` argument, get_dummies expands every remaining
# object/string/category column into indicator columns; drop_first=True omits
# the first level of each so the indicators are not redundant.
df = pd.get_dummies(df, drop_first=True)

print("One-Hot encoding complete.")
print(f"New Data Shape: {df.shape}")

display(df.head())

Binary encoding complete.
One-Hot encoding complete.
New Data Shape: (7021, 31)


Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,PaperlessBilling,MonthlyCharges,TotalCharges,Churn,...,TechSupport_Yes,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,1,0,1,0,1,0,1,29.85,29.85,0,...,False,False,False,False,False,False,False,False,True,False
1,0,0,0,0,34,1,0,56.95,1889.5,0,...,False,False,False,False,False,True,False,False,False,True
2,0,0,0,0,2,1,1,53.85,108.15,1,...,False,False,False,False,False,False,False,False,False,True
3,0,0,0,0,45,0,0,42.3,1840.75,0,...,True,False,False,False,False,True,False,False,False,False
4,1,0,0,0,2,1,1,70.7,151.65,1,...,False,False,False,False,False,False,False,False,True,False


## Feature Scaling
We will scale numerical columns (`tenure`, `MonthlyCharges`, `TotalCharges`) to a range of 0 to 1. This prevents variables with large magnitudes from dominating the Machine Learning model.

In [5]:
from sklearn.preprocessing import MinMaxScaler

# Numeric features that will be rescaled to the [0, 1] interval.
cols_to_scale = ['tenure', 'MonthlyCharges', 'TotalCharges']

# Learn the per-column minimum and maximum from the current frame ...
# NOTE(review): the scaler is fitted on every row of df; if a train/test split
# happens later, this lets test-set statistics influence the scaling — confirm.
scaler = MinMaxScaler().fit(df[cols_to_scale])

# ... then overwrite those columns with their rescaled values.
df[cols_to_scale] = scaler.transform(df[cols_to_scale])

# Preview the scaled columns.
print("Scaling complete. Checking first 5 rows:")
display(df[cols_to_scale].head())

# Write the processed frame to disk without the row index.
df.to_csv('../data/telco_churn_processed.csv', index=False)
print("\n✅ Processed dataset saved as 'telco_churn_processed.csv'")

Scaling complete. Checking first 5 rows:


Unnamed: 0,tenure,MonthlyCharges,TotalCharges
0,0.013889,0.115423,0.003437
1,0.472222,0.385075,0.217564
2,0.027778,0.354229,0.012453
3,0.625,0.239303,0.211951
4,0.027778,0.521891,0.017462



✅ Processed dataset saved as 'telco_churn_processed.csv'
