In [2]:
# Cell 1: Setup and load cleaned data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings('ignore')

print("✅ Libraries imported successfully!")

# Load the cleaned dataset produced by Notebook 1.
# Only the read itself sits inside `try`, so the handler covers exactly the
# "file is missing" case and nothing else.
try:
    df = pd.read_csv('../data/processed/telco_churn_cleaned.csv')
except FileNotFoundError:
    print("❌ Error: Cleaned dataset not found. Please run Notebook 1 first.")
    # Re-raise: every later cell needs `df`, so stopping here with the real
    # cause is clearer than a NameError further down the notebook.
    raise
else:
    print("✅ Cleaned dataset loaded successfully!")
    print(f"📊 Dataset shape: {df.shape}")
    print(f"🎯 Churn distribution: {df['Churn'].value_counts().to_dict()}")

✅ Libraries imported successfully!
✅ Cleaned dataset loaded successfully!
📊 Dataset shape: (7043, 21)
🎯 Churn distribution: {'No': 5174, 'Yes': 1869}


In [3]:
# Cell 2: Feature engineering strategy
print("🛠️ FEATURE ENGINEERING STRATEGY")
print("=" * 50)

# Announce the plan, one line per step
print("📋 OUR FEATURE ENGINEERING PLAN:")
for plan_line in (
    "1. 🔤 Encode categorical variables (One-Hot & Label Encoding)",
    "2. 🆕 Create new engineered features",
    "3. ⚖️  Handle class imbalance with SMOTE",
    "4. 📊 Feature scaling and selection",
    "5. 🎯 Prepare train/test splits",
):
    print(plan_line)

# Split the frame: everything except the ID and the label becomes a feature,
# the 'Churn' column is the target
y = df['Churn']
X = df.drop(columns=['customerID', 'Churn'])

print(f"\n📊 Feature matrix shape: {X.shape}")
print(f"🎯 Target shape: {y.shape}")

# Group feature names by dtype: object columns vs. numeric columns
categorical_cols = list(X.select_dtypes(include=['object']).columns)
numerical_cols = list(X.select_dtypes(include=[np.number]).columns)

print(f"\n🔤 Categorical features ({len(categorical_cols)}): {categorical_cols}")
print(f"🔢 Numerical features ({len(numerical_cols)}): {numerical_cols}")

# Preview the distinct values of the first five categorical features only
print("\n📊 Categorical feature unique values:")
for feature_name in categorical_cols[:5]:
    distinct_values = X[feature_name].unique()
    print(f"  {feature_name}: {distinct_values}")
    

🛠️ FEATURE ENGINEERING STRATEGY
📋 OUR FEATURE ENGINEERING PLAN:
1. 🔤 Encode categorical variables (One-Hot & Label Encoding)
2. 🆕 Create new engineered features
3. ⚖️  Handle class imbalance with SMOTE
4. 📊 Feature scaling and selection
5. 🎯 Prepare train/test splits

📊 Feature matrix shape: (7043, 19)
🎯 Target shape: (7043,)

🔤 Categorical features (15): ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']
🔢 Numerical features (4): ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']

📊 Categorical feature unique values:
  gender: ['Female' 'Male']
  Partner: ['Yes' 'No']
  Dependents: ['No' 'Yes']
  PhoneService: ['No' 'Yes']
  MultipleLines: ['No phone service' 'No' 'Yes']


In [None]:
# Cell 3: Categorical feature encoding
print("🔤 CATEGORICAL FEATURE ENCODING")
print("=" * 50)

# Work on a copy so re-running this cell always starts from the untouched X
X_encoded = X.copy()

print("📊 Encoding strategy:")
print("• Binary features: Label Encoding (0/1)")
print("• Multi-class features: One-Hot Encoding")
print("• 'No internet service' treated as separate category")

# Split categorical features by cardinality: exactly two distinct values are
# label-encoded, everything else is one-hot encoded.
# nunique() does not count missing values.
distinct_counts = {col: X_encoded[col].nunique() for col in categorical_cols}
binary_cols = [col for col, count in distinct_counts.items() if count == 2]
multi_class_cols = [col for col, count in distinct_counts.items() if count != 2]

print(f"\n🔢 Binary features ({len(binary_cols)}): {binary_cols}")
print(f"🎭 Multi-class features ({len(multi_class_cols)}): {multi_class_cols}")

# Label encode binary features
print("\n🔄 Label Encoding binary features...")
# One fitted encoder per column is kept in `binary_encoders`, so the exact
# mapping can later be re-applied to new data or inverted. Refitting a single
# shared encoder would discard every mapping except the last one.
binary_encoders = {}
label_encoder = LabelEncoder()  # stays defined even when there are no binary columns

for col in binary_cols:
    label_encoder = LabelEncoder()
    X_encoded[col] = label_encoder.fit_transform(X_encoded[col])
    binary_encoders[col] = label_encoder
    # Each class is encoded as its index in `classes_`; building the mapping
    # with enumerate keeps the printed values as plain Python ints.
    class_to_code = {str(cls): code for code, cls in enumerate(label_encoder.classes_)}
    print(f"  {col}: {class_to_code}")

# One-hot encode multi-class features
print("\n🎭 One-Hot Encoding multi-class features...")
# NOTE(review): get_dummies is called without `dtype`, so the dummy-column dtype
# follows the installed pandas default (bool since pandas 2.0) — confirm the
# downstream SMOTE/scaling cells handle it as intended.
X_encoded = pd.get_dummies(X_encoded, columns=multi_class_cols, prefix=multi_class_cols)

print("\n✅ Encoding completed!")
print(f"📊 New feature matrix shape: {X_encoded.shape}")
print(f"🔤 Total features after encoding: {X_encoded.shape[1]}")

# Display new feature names
print("\n📋 First 10 new feature names:")
print(X_encoded.columns[:10].tolist())