# Notebook 2: Data Preprocessing & Feature Engineering
Clean data and create features for modeling

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import joblib
import warnings
warnings.filterwarnings('ignore')



In [19]:
# %%
# Load raw data
df = pd.read_csv('../data/raw/WA_Fn-UseC_-Telco-Customer-Churn.csv')
print(f"Original data shape: {df.shape}")
df.head()


Original data shape: (7043, 21)


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [20]:
# Step 1: Handle missing values
print("\n1. HANDLING MISSING VALUES")
print("="*60)

# Convert TotalCharges to numeric
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Check missing values
print(f"Missing values in TotalCharges: {df['TotalCharges'].isna().sum()}")

# Fill missing values with median
df['TotalCharges'].fillna(df['TotalCharges'].median(), inplace=True)

print(f"After handling: {df['TotalCharges'].isna().sum()} missing values")


1. HANDLING MISSING VALUES
Missing values in TotalCharges: 11
After handling: 0 missing values


In [21]:
# Step 2: Convert target variable
print("\n2. CONVERTING TARGET VARIABLE")
print("="*60)

df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})
print("Churn distribution:")
print(df['Churn'].value_counts())


2. CONVERTING TARGET VARIABLE
Churn distribution:
Churn
0    5174
1    1869
Name: count, dtype: int64


In [22]:
# Step 3: Feature Engineering
print("\n3. FEATURE ENGINEERING")
print("="*60)

# Create tenure groups
df['TenureGroup'] = pd.cut(
    df['tenure'],
    bins=[0, 12, 24, 48, 72],
    labels=[0, 1, 2, 3],
    include_lowest=True
).astype(int)


df['TenureGroup'] = df['TenureGroup'].astype(int)

# Charges per month
df['ChargesPerMonth'] = df['TotalCharges'] / (df['tenure'] + 1)

# Total services count
service_cols = ['PhoneService', 'MultipleLines', 'InternetService', 
                'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 
                'TechSupport', 'StreamingTV', 'StreamingMovies']
df['TotalServices'] = (df[service_cols] != 'No').sum(axis=1)

print("New features created:")
print("  ✓ TenureGroup")
print("  ✓ ChargesPerMonth")
print("  ✓ TotalServices")



3. FEATURE ENGINEERING
New features created:
  ✓ TenureGroup
  ✓ ChargesPerMonth
  ✓ TotalServices


In [23]:
# Step 4: Remove unnecessary columns
print("\n4. REMOVING UNNECESSARY COLUMNS")
print("="*60)

df_processed = df.drop('customerID', axis=1)
print(f"Dropped: customerID")
print(f"New shape: {df_processed.shape}")


4. REMOVING UNNECESSARY COLUMNS
Dropped: customerID
New shape: (7043, 23)


In [24]:
# Step 5: Encode categorical variables
print("\n5. ENCODING CATEGORICAL VARIABLES")
print("="*60)

# Get categorical columns
categorical_cols = df_processed.select_dtypes(include=['object']).columns.tolist()
print(f"Categorical columns: {len(categorical_cols)}")


5. ENCODING CATEGORICAL VARIABLES
Categorical columns: 15


In [25]:
# Encode each column
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df_processed[col] = le.fit_transform(df_processed[col].astype(str))
    label_encoders[col] = le
    print(f"  ✓ Encoded: {col}")

  ✓ Encoded: gender
  ✓ Encoded: Partner
  ✓ Encoded: Dependents
  ✓ Encoded: PhoneService
  ✓ Encoded: MultipleLines
  ✓ Encoded: InternetService
  ✓ Encoded: OnlineSecurity
  ✓ Encoded: OnlineBackup
  ✓ Encoded: DeviceProtection
  ✓ Encoded: TechSupport
  ✓ Encoded: StreamingTV
  ✓ Encoded: StreamingMovies
  ✓ Encoded: Contract
  ✓ Encoded: PaperlessBilling
  ✓ Encoded: PaymentMethod


In [26]:
# Step 6: Prepare features and target
print("\n6. PREPARING FEATURES AND TARGET")
print("="*60)

X = df_processed.drop('Churn', axis=1)
y = df_processed['Churn']

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"\nFeature names:")
print(X.columns.tolist())



6. PREPARING FEATURES AND TARGET
Features shape: (7043, 22)
Target shape: (7043,)

Feature names:
['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'TenureGroup', 'ChargesPerMonth', 'TotalServices']


In [27]:
# Step 7: Train-test split
print("\n7. TRAIN-TEST SPLIT")
print("="*60)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")
print(f"Train churn rate: {y_train.mean()*100:.1f}%")
print(f"Test churn rate: {y_test.mean()*100:.1f}%")



7. TRAIN-TEST SPLIT
Training set: 5634 samples
Test set: 1409 samples
Train churn rate: 26.5%
Test churn rate: 26.5%


In [28]:
# Step 8: Feature scaling
print("\n8. FEATURE SCALING")
print("="*60)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("✓ Features scaled using StandardScaler")
print(f"Scaled training data shape: {X_train_scaled.shape}")



8. FEATURE SCALING
✓ Features scaled using StandardScaler
Scaled training data shape: (5634, 22)


In [30]:
# Step 9: Save processed data and artifacts
print("\n9. SAVING ARTIFACTS")
print("="*60)

# Save processed dataframe
df_processed.to_csv('../data/processed/processed_churn_data.csv', index=False)
print("✓ Processed data saved")


9. SAVING ARTIFACTS
✓ Processed data saved


In [31]:
# Save train-test split
np.save('../data/processed/X_train.npy', X_train_scaled)
np.save('../data/processed/X_test.npy', X_test_scaled)
np.save('../data/processed/y_train.npy', y_train)
np.save('../data/processed/y_test.npy', y_test)
print("✓ Train-test data saved")

✓ Train-test data saved


In [32]:
# Save scaler and encoders
joblib.dump(scaler, '../models/scaler.pkl')
joblib.dump(label_encoders, '../models/label_encoders.pkl')
joblib.dump(X.columns.tolist(), '../models/feature_names.pkl')
print("✓ Scaler and encoders saved")

✓ Scaler and encoders saved


In [33]:
# Summary
print("\n" + "="*60)
print("PREPROCESSING COMPLETE!")
print("="*60)

print("\n📊 Summary:")
print(f"  • Original features: {df.shape[1]}")
print(f"  • Processed features: {X.shape[1]}")
print(f"  • New features created: 3")
print(f"  • Training samples: {X_train.shape[0]}")
print(f"  • Test samples: {X_test.shape[0]}")

print("\n📁 Files saved:")
print("  ✓ ../data/processed/processed_churn_data.csv")
print("  ✓ ../data/processed/X_train.npy")
print("  ✓ ../data/processed/X_test.npy")
print("  ✓ ../data/processed/y_train.npy")
print("  ✓ ../data/processed/y_test.npy")
print("  ✓ ../models/scaler.pkl")
print("  ✓ ../models/label_encoders.pkl")
print("  ✓ ../models/feature_names.pkl")

print("\n✓ Ready for model training!")



PREPROCESSING COMPLETE!

📊 Summary:
  • Original features: 24
  • Processed features: 22
  • New features created: 3
  • Training samples: 5634
  • Test samples: 1409

📁 Files saved:
  ✓ ../data/processed/processed_churn_data.csv
  ✓ ../data/processed/X_train.npy
  ✓ ../data/processed/X_test.npy
  ✓ ../data/processed/y_train.npy
  ✓ ../data/processed/y_test.npy
  ✓ ../models/scaler.pkl
  ✓ ../models/label_encoders.pkl
  ✓ ../models/feature_names.pkl

✓ Ready for model training!
