#  02 - Feature Engineering
This notebook handles data cleaning, transformation, and preparation for model training.

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Read the raw Telco customer-churn CSV from a relative path
file_path = '../data/Telco-Customer-Churn.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows as the cell output
df.head(n=5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


##  Handle Missing and Invalid Data

In [4]:
# Coerce TotalCharges to a numeric dtype (entries that cannot be parsed become
# NaN), then replace those NaNs with the median of the parsed values.
df['TotalCharges'] = (
    pd.to_numeric(df['TotalCharges'], errors='coerce')
    .pipe(lambda charges: charges.fillna(charges.median()))
)

##  Encode Binary Categorical Variables

In [5]:
# Columns encoded here from the strings 'Yes' / 'No' to the integers 1 / 0.
binary_cols = ['Partner', 'Dependents', 'PhoneService', 'PaperlessBilling', 'Churn']
for col in binary_cols:
    # Series.map turns every value that is missing from the mapping into NaN,
    # so running this cell a second time would wipe the already-encoded 1/0
    # values. Skip columns that are already numeric so the cell is safe to
    # re-run.
    if pd.api.types.is_numeric_dtype(df[col]):
        continue
    df[col] = df[col].map({'Yes': 1, 'No': 0})

##  One-Hot Encode Multi-Class Categorical Variables

In [6]:
# Drop the customerID column, then one-hot encode the remaining categorical
# columns.
# errors='ignore' keeps this cell re-runnable: the previous in-place drop
# raised KeyError once customerID had already been removed from df.
df = df.drop(columns='customerID', errors='ignore')
# drop_first=True omits the first level of every encoded column, leaving
# k-1 indicator columns for a k-level categorical.
df = pd.get_dummies(df, drop_first=True)

##  Feature Scaling

In [7]:
# Standardize the continuous columns with StandardScaler.
# NOTE(review): the scaler is fit on the full DataFrame here; if a train/test
# split is made downstream, consider fitting on the training rows only to
# avoid leaking test-set statistics — confirm against the modelling notebook.
numeric_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
scaler = StandardScaler().fit(df[numeric_cols])
df[numeric_cols] = scaler.transform(df[numeric_cols])

##  Split into Features and Target

In [8]:
# Separate the target column (Churn) from the predictor columns.
y = df['Churn']
X = df.drop(columns=['Churn'])

# Write both to CSV for later use (optional); the index is not written.
X.to_csv('../data/features.csv', index=False)
y.to_csv('../data/labels.csv', index=False)

The features (`../data/features.csv`) and labels (`../data/labels.csv`) are now ready for model training!