In [1]:
import pandas as pd
import numpy as np

# Read the raw Telco churn CSV and make TotalCharges numeric in one chain:
# entries that cannot be parsed become NaN (errors='coerce'), and rows where
# TotalCharges ended up NaN are then discarded.
df = (
    pd.read_csv('../data/raw/WA_Fn-UseC_-Telco-Customer-Churn.csv')
    .assign(TotalCharges=lambda frame: pd.to_numeric(frame['TotalCharges'], errors='coerce'))
    .dropna(subset=['TotalCharges'])
)
print("Data loaded and cleaned for engineering.")

Data loaded and cleaned for engineering.


In [2]:
# customerID is not used as a feature for prediction, so remove the column
df = df.drop(columns=['customerID'])
print("Dropped customerID.")

Dropped customerID.


In [3]:
# Convert Churn from 'Yes'/'No' labels to 1/0.
# Series.map turns every value that is missing from the mapping into NaN, so
# running this cell a second time (when the column already holds 0/1) would
# silently wipe the target. Encode only while the column is still non-numeric,
# and fail loudly on any label the mapping does not cover.
churn_codes = {'Yes': 1, 'No': 0}
if not pd.api.types.is_numeric_dtype(df['Churn']):
    unexpected_labels = set(df['Churn'].dropna().unique()) - set(churn_codes)
    if unexpected_labels:
        raise ValueError(
            f"Unexpected Churn labels: {sorted(map(str, unexpected_labels))}"
        )
    df['Churn'] = df['Churn'].map(churn_codes)
print("Churn encoded: 1 for Yes, 0 for No.")
df['Churn'].head()

Churn encoded: 1 for Yes, 0 for No.


0    0
1    0
2    1
3    0
4    1
Name: Churn, dtype: int64

In [5]:
# Identify the text (categorical) columns to one-hot encode.
# include=['str'] is rejected by pandas 2.x (select_dtypes raises TypeError for
# string dtypes there). ['object', 'string'] is the spelling the pandas
# string-dtype migration guide suggests for code that must run on both 2.x and 3.x.
cat_cols = df.select_dtypes(include=['object', 'string']).columns

# Churn is the target and is encoded separately; keep it out of the dummy
# columns explicitly rather than relying on it already being numeric.
cat_cols = cat_cols.drop('Churn', errors='ignore')

# Use pd.get_dummies to convert text columns into multiple 0/1 columns.
# drop_first=True omits the first level of each column.
df_encoded = pd.get_dummies(df, columns=cat_cols, drop_first=True)

print(f"Original columns: {df.shape[1]}")
print(f"Encoded columns: {df_encoded.shape[1]}")
df_encoded.head()

Original columns: 20
Encoded columns: 31


Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,...,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,1,29.85,29.85,0,False,True,False,False,True,...,False,False,False,False,False,False,True,False,True,False
1,0,34,56.95,1889.5,0,True,False,False,True,False,...,False,False,False,False,True,False,False,False,False,True
2,0,2,53.85,108.15,1,True,False,False,True,False,...,False,False,False,False,False,False,True,False,False,True
3,0,45,42.3,1840.75,0,True,False,False,False,True,...,False,False,False,False,True,False,False,False,False,False
4,0,2,70.7,151.65,1,False,False,False,True,False,...,False,False,False,False,False,False,True,False,True,False


In [6]:
# Save the processed data.
# The path is defined once and reused in the confirmation message so the
# message always names the file that was actually written.
# NOTE(review): the message previously named processed_churn_data.csv while the
# file was written as processed_data.csv; if the former is the intended
# filename, change output_path instead.
output_path = '../data/processed/processed_data.csv'
df_encoded.to_csv(output_path, index=False)

print(f"Processed data saved to {output_path}")

Processed data saved to data/processed/processed_churn_data.csv
