In [1]:
# Essential imports for data science and machine learning
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# PyTorch for deep learning
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (classification_report, confusion_matrix,
                           roc_auc_score, roc_curve, precision_recall_curve,
                           f1_score, precision_score, recall_score)
from sklearn.utils.class_weight import compute_class_weight

# Set random seeds for reproducibility
np.random.seed(42)
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed(42)

# Configure visualization
plt.style.use('default')
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 12
sns.set_palette("husl")

# Check device availability
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"🚀 Environment Setup:")
print(f"   Device: {device}")
print(f"   PyTorch Version: {torch.__version__}")
print(f"   Pandas Version: {pd.__version__}")
print(f"   NumPy Version: {np.__version__}")

if torch.cuda.is_available():
    print(f"   GPU: {torch.cuda.get_device_name(0)}")
    print(f"   GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")

print("\n✅ All libraries imported successfully!")
print("📊 Ready to begin customer churn analysis...")

🚀 Environment Setup:
   Device: cpu
   PyTorch Version: 2.8.0+cpu
   Pandas Version: 2.3.3
   NumPy Version: 2.3.3

✅ All libraries imported successfully!
📊 Ready to begin customer churn analysis...


In [None]:
# Load the customer churn dataset
print("📥 Loading Cleaned Telco Customer Churn dataset...")

data_paths = [
    '../data/preprocessed/clean-processed-data.csv'
]

df = None
for path in data_paths:
    try:
        df = pd.read_csv(path)
        print(f"✅ Cleaned Data loaded successfully from: {path}")
        break
    except FileNotFoundError:
        continue

# Display basic information about the dataset
print(f"\n📊 Dataset Overview:")
print(f"   Shape: {df.shape[0]:,} rows × {df.shape[1]} columns")
print(f"   Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")

# Display first few rows
print(f"\n👀 First 5 rows:")
print(df.head())

📥 Loading Cleaned Telco Customer Churn dataset...
✅ Cleaned Data loaded successfully from: ../data/clean-processed-data.csv

📊 Dataset Overview:
   Shape: 7,043 rows × 22 columns
   Memory usage: 6.4 MB

👀 First 5 rows:
   Unnamed: 0  customerID  gender  SeniorCitizen Partner Dependents  tenure  \
0           0  7590-VHVEG  Female              0     Yes         No       1   
1           1  5575-GNVDE    Male              0      No         No      34   
2           2  3668-QPYBK    Male              0      No         No       2   
3           3  7795-CFOCW    Male              0      No         No      45   
4           4  9237-HQITU  Female              0      No         No       2   

  PhoneService MultipleLines InternetService  ... DeviceProtection  \
0           No            No             DSL  ...               No   
1          Yes            No             DSL  ...              Yes   
2          Yes            No             DSL  ...               No   
3           No           

In [3]:
# Feature Engineering and Preprocessing Pipeline
print("🔧 FEATURE ENGINEERING PIPELINE")
print("=" * 60)

# Create a copy for feature engineering
df_features = df.copy()

# 1. CREATE BUSINESS-DRIVEN FEATURES
print("1️⃣ Creating Business-Driven Features...")

# Customer Lifecycle Stage based on tenure
def get_lifecycle_stage(tenure):
    if tenure <= 12:
        return 'New'  # 0-12 months
    elif tenure <= 24:
        return 'Growing'  # 12-24 months
    elif tenure <= 48:
        return 'Mature'  # 24-48 months
    else:
        return 'Loyal'  # 48+ months

df_features['LifecycleStage'] = df_features['tenure'].apply(get_lifecycle_stage)

# Revenue per Month (Customer Value Score)
df_features['RevenuePerMonth'] = df_features['TotalCharges'] / (df_features['tenure'] + 1)

# Service Bundle Score (more services = higher engagement)
service_features = ['PhoneService', 'MultipleLines', 'OnlineSecurity', 'OnlineBackup',
                   'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']
df_features['ServiceBundleScore'] = 0
for feature in service_features:
    df_features['ServiceBundleScore'] += (df_features[feature] == 'Yes').astype(int)

# Risk Indicators based on EDA insights
df_features['HighRiskContract'] = (df_features['Contract'] == 'Month-to-month').astype(int)
df_features['FiberOpticUser'] = (df_features['InternetService'] == 'Fiber optic').astype(int)
df_features['PaperlessHighRisk'] = (df_features['PaperlessBilling'] == 'Yes').astype(int)
df_features['SingleCustomer'] = ((df_features['Partner'] == 'No') &
                                 (df_features['Dependents'] == 'No')).astype(int)

# Price Sensitivity Indicators
df_features['HighMonthlyCharges'] = (df_features['MonthlyCharges'] >
                                    df_features['MonthlyCharges'].quantile(0.75)).astype(int)

# Interaction Features
df_features['TenureChargesRatio'] = df_features['tenure'] / (df_features['MonthlyCharges'] + 1)
df_features['ChargesPerService'] = df_features['MonthlyCharges'] / (df_features['ServiceBundleScore'] + 1)

# Customer Stability Score
df_features['StabilityScore'] = (
    (df_features['Partner'] == 'Yes').astype(int) +
    (df_features['Dependents'] == 'Yes').astype(int) +
    (df_features['Contract'] != 'Month-to-month').astype(int) +
    (df_features['tenure'] > 24).astype(int)
)

print(f"   ✅ Created new features")

🔧 FEATURE ENGINEERING PIPELINE
1️⃣ Creating Business-Driven Features...
   ✅ Created new features


In [4]:
# 2. PREPARE TARGET VARIABLE
print("2️⃣ Preparing Target Variable...")
df_features['target'] = (df_features['Churn'] == 'Yes').astype(int)
print(f"   ✅ Target distribution: {df_features['target'].value_counts().to_dict()}")

2️⃣ Preparing Target Variable...
   ✅ Target distribution: {0: 5174, 1: 1869}


In [5]:
# 3. FEATURE SELECTION AND ENCODING
print("3️⃣ Feature Selection and Encoding...")

# Define feature categories
categorical_features = [
    'gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
    'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
    'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
    'PaperlessBilling', 'PaymentMethod', 'LifecycleStage'
]

numerical_features = [
    'SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges',
    'ServiceBundleScore', 'RevenuePerMonth', 'TenureChargesRatio',
    'ChargesPerService', 'StabilityScore'
]

binary_features = [
    'HighRiskContract', 'FiberOpticUser', 'PaperlessHighRisk',
    'SingleCustomer', 'HighMonthlyCharges'
]

print(f"   • Categorical features: {len(categorical_features)}")
print(f"   • Numerical features: {len(numerical_features)}")
print(f"   • Binary features: {len(binary_features)}")

# One-hot encode categorical features
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# Label encode categorical features (for neural networks, we'll use embedding or one-hot)
df_encoded = df_features.copy()

# For simplicity, we'll use Label Encoding for categorical features with few categories
# and One-Hot for those with more categories
label_encoders = {}

for feature in categorical_features:
    if feature in ['InternetService', 'Contract', 'PaymentMethod', 'LifecycleStage']:
        # One-hot encode features with multiple meaningful categories
        dummies = pd.get_dummies(df_encoded[feature], prefix=feature, drop_first=False)
        df_encoded = pd.concat([df_encoded, dummies], axis=1)
        df_encoded = df_encoded.drop(feature, axis=1)
    else:
        # Label encode binary-like categorical features
        le = LabelEncoder()
        df_encoded[feature] = le.fit_transform(df_encoded[feature].astype(str))
        label_encoders[feature] = le

print(f"   ✅ Encoded categorical features")

3️⃣ Feature Selection and Encoding...
   • Categorical features: 16
   • Numerical features: 9
   • Binary features: 5
   ✅ Encoded categorical features


In [6]:
# 4. CREATE FEATURE MATRIX
print("4️⃣ Creating Feature Matrix...")

# Drop non-feature columns
columns_to_drop = ['customerID', 'Churn']
feature_columns = [col for col in df_encoded.columns if col not in columns_to_drop + ['target']]

X = df_encoded[feature_columns]
y = df_encoded['target']

print(f"   ✅ Feature matrix shape: {X.shape}")
print(f"   ✅ Target distribution: {y.value_counts().to_dict()}")

4️⃣ Creating Feature Matrix...
   ✅ Feature matrix shape: (7043, 41)
   ✅ Target distribution: {0: 5174, 1: 1869}


In [16]:
# 5. TRAIN-TEST SPLIT WITH STRATIFICATION
print("5️⃣ Creating Train-Test Split...")

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y  # Maintain class distribution
)

# Save training and test features and targets to CSV
X_train.to_csv('../data/preprocessed/X_train.csv', index=False)
X_test.to_csv('../data/preprocessed/X_test.csv', index=False)
y_train.to_csv('../data/preprocessed/y_train.csv', index=False)
y_test.to_csv('../data/preprocessed/y_test.csv', index=False)

print("✅ Train/test splits saved as CSV files:")
print("   - X_train.csv, X_test.csv, y_train.csv, y_test.csv")


print(f"   ✅ Training set: {X_train.shape} | Test set: {X_test.shape}")
print(f"   ✅ Train target distribution: {y_train.value_counts().to_dict()}")
print(f"   ✅ Test target distribution: {y_test.value_counts().to_dict()}")

5️⃣ Creating Train-Test Split...
✅ Train/test splits saved as CSV files:
   - X_train.csv, X_test.csv, y_train.csv, y_test.csv
   ✅ Training set: (5634, 41) | Test set: (1409, 41)
   ✅ Train target distribution: {0: 4139, 1: 1495}
   ✅ Test target distribution: {0: 1035, 1: 374}


In [8]:
# 6. FEATURE SCALING
print("6️⃣ Scaling Features...")

# Scale numerical features
scaler = StandardScaler()
numerical_cols = [col for col in feature_columns if col in numerical_features or
                 any(num_feat in col for num_feat in numerical_features)]

# Fit scaler on training data only
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

if numerical_cols:
    X_train_scaled[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
    X_test_scaled[numerical_cols] = scaler.transform(X_test[numerical_cols])
    print(f"   ✅ Scaled {len(numerical_cols)} numerical features")

6️⃣ Scaling Features...
   ✅ Scaled 10 numerical features


In [9]:
# 7. FEATURE IMPORTANCE ANALYSIS
print("7️⃣ Analyzing Feature Importance...")

# Quick feature importance using correlation
feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'correlation_with_target': [abs(X_train[col].corr(y_train)) for col in X_train.columns]
}).sort_values('correlation_with_target', ascending=False)

print(f"\n🔝 TOP 10 MOST CORRELATED FEATURES:")
print(feature_importance.head(10).to_string(index=False))

7️⃣ Analyzing Feature Importance...

🔝 TOP 10 MOST CORRELATED FEATURES:
                       feature  correlation_with_target
              HighRiskContract                 0.406401
       Contract_Month-to-month                 0.406401
             ChargesPerService                 0.392478
                StabilityScore                 0.366827
                        tenure                 0.345593
            TenureChargesRatio                 0.335601
            LifecycleStage_New                 0.314332
   InternetService_Fiber optic                 0.312656
                FiberOpticUser                 0.312656
PaymentMethod_Electronic check                 0.309214


In [10]:
# 8. HANDLE CLASS IMBALANCE
print(f"\n8️⃣ Handling Class Imbalance...")

from sklearn.utils.class_weight import compute_class_weight

# Calculate class weights for imbalanced dataset
classes = np.unique(y_train)
class_weights = compute_class_weight('balanced', classes=classes, y=y_train)
class_weight_dict = dict(zip(classes, class_weights))

print(f"   ✅ Class weights: {class_weight_dict}")
print(f"   • Class 0 (No Churn): {class_weights[0]:.2f}")
print(f"   • Class 1 (Churn): {class_weights[1]:.2f}")

# Convert to PyTorch tensors for training
print(f"\n9️⃣ Converting to PyTorch Tensors...")

# Check data types before conversion
print(f"   • Data types in X_train_scaled: {X_train_scaled.dtypes.value_counts().to_dict()}")

# Convert all columns to float
X_train_numpy = X_train_scaled.astype(float).values
X_test_numpy = X_test_scaled.astype(float).values

# Convert to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train_numpy).to(device)
X_test_tensor = torch.FloatTensor(X_test_numpy).to(device)
y_train_tensor = torch.FloatTensor(y_train.values).to(device)
y_test_tensor = torch.FloatTensor(y_test.values).to(device)

print(f"   ✅ Training tensors: X{X_train_tensor.shape}, y{y_train_tensor.shape}")
print(f"   ✅ Test tensors: X{X_test_tensor.shape}, y{y_test_tensor.shape}")
print(f"   ✅ Device: {device}")

# Confirm tensor data types
print(f"   ✅ X_train_tensor dtype: {X_train_tensor.dtype}")
print(f"   ✅ y_train_tensor dtype: {y_train_tensor.dtype}")


8️⃣ Handling Class Imbalance...
   ✅ Class weights: {np.int64(0): np.float64(0.6805991785455424), np.int64(1): np.float64(1.8842809364548494)}
   • Class 0 (No Churn): 0.68
   • Class 1 (Churn): 1.88

9️⃣ Converting to PyTorch Tensors...
   • Data types in X_train_scaled: {dtype('int64'): 17, dtype('bool'): 14, dtype('float64'): 10}
   ✅ Training tensors: Xtorch.Size([5634, 41]), ytorch.Size([5634])
   ✅ Test tensors: Xtorch.Size([1409, 41]), ytorch.Size([1409])
   ✅ Device: cpu
   ✅ X_train_tensor dtype: torch.float32
   ✅ y_train_tensor dtype: torch.float32


In [12]:
# 10. FEATURE ENGINEERING SUMMARY
print(f"\n📊 FEATURE ENGINEERING SUMMARY:")
print(f"=" * 50)
print(f"🎯 Original features: {df.shape[1]}")
print(f"🔧 Engineered features: {X.shape[1]}")
print(f"📈 Feature increase: +{X.shape[1] - df.shape[1] + 2}")  # +2 for dropped ID and target
print(f"🏗️  Business features created: 11")
print(f"📊 Final dataset: {X.shape[0]:,} samples × {X.shape[1]} features")
print(f"⚖️  Class balance handled: Yes (weighted loss)")
print(f"🎯 Ready for model training!")

# Save preprocessing objects for later use
preprocessing_objects = {
    'scaler': scaler,
    'label_encoders': label_encoders,
    'feature_columns': feature_columns,
    'class_weights': class_weight_dict
}

print(f"\n✅ Feature engineering completed successfully!")
print(f"🚀 Ready to build and train the neural network model!")


📊 FEATURE ENGINEERING SUMMARY:
🎯 Original features: 22
🔧 Engineered features: 41
📈 Feature increase: +21
🏗️  Business features created: 11
📊 Final dataset: 7,043 samples × 41 features
⚖️  Class balance handled: Yes (weighted loss)
🎯 Ready for model training!

✅ Feature engineering completed successfully!
🚀 Ready to build and train the neural network model!


In [13]:
import pickle

# Save preprocessing pipeline and params
with open('preprocessing_pipeline.pkl', 'wb') as f:
    pickle.dump({
        'scaler': scaler,
        'label_encoders': label_encoders,
        'feature_columns': feature_columns,
        'categorical_features': categorical_features,
        'numerical_features': numerical_features,
        'binary_features': binary_features,
        'class_weight_dict': class_weight_dict
    }, f)

# Save best model weights (already done in your code)
# torch.save(model.state_dict(), 'best_churn_model.pth')

In [14]:
import pickle

# Save PyTorch tensors for reproducibility
with open('tensor_data.pkl', 'wb') as f:
    pickle.dump({
        'X_train_tensor': X_train_tensor,
        'X_test_tensor': X_test_tensor,
        'y_train_tensor': y_train_tensor,
        'y_test_tensor': y_test_tensor
    }, f)
