# Customer Churn Prediction with Neural Networks

This notebook implements a neural network approach to customer churn prediction. It includes data preprocessing, model training with cross-validation, evaluation, and prediction capabilities.

Author: Lucas Miyazawa (Improved by Claude)

## Setup and Imports

In [1]:
# Standard Libraries
import os
import numpy as np
import pandas as pd
import joblib
import pickle

# Machine Learning Libraries
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix, roc_curve
from sklearn.base import BaseEstimator, TransformerMixin

# Deep Learning Libraries
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks, regularizers, optimizers, metrics

# Set random seeds for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

Matplotlib is building the font cache; this may take a moment.


## Data Preprocessing Functions

In [2]:
def split_train_valid(df, target_col='Churn', test_size=0.25, random_state=42):
    """
    Stratified train/validation split of a dataframe, with a printed summary.

    Parameters:
    -----------
    df : pandas.DataFrame
        Full dataset, including the target column
    target_col : str, default='Churn'
        Column used both as the label and as the stratification key
    test_size : float, default=0.25
        Share of rows assigned to the validation part (0.25 = 25%)
    random_state : int, default=42
        Seed forwarded to train_test_split so the split is repeatable

    Returns:
    --------
    tuple:
        (train_df, valid_df) - the two partitions, in that order
    """
    # Stratifying on the target keeps the class ratio the same in both parts
    train_df, valid_df = train_test_split(
        df,
        stratify=df[target_col],
        test_size=test_size,
        random_state=random_state,
    )

    # Relative sizes of the two partitions, in percent of the input
    train_pct = len(train_df) / len(df) * 100
    valid_pct = len(valid_df) / len(df) * 100

    # Normalised class frequencies per partition
    train_balance = train_df[target_col].value_counts(normalize=True)
    valid_balance = valid_df[target_col].value_counts(normalize=True)

    # Report what the split produced
    print(f"Original dataset shape: {df.shape}")
    print(f"Training set shape: {train_df.shape} ({train_pct:.1f}%)")
    print(f"Validation set shape: {valid_df.shape} ({valid_pct:.1f}%)")
    print(f"\nClass distribution in training set:\n{train_balance}")
    print(f"\nClass distribution in validation set:\n{valid_balance}")

    return train_df, valid_df

In [3]:
def create_preprocessing_pipeline(df, target_col='Churn', id_cols=None):
    """
    Create and fit a preprocessing pipeline for customer data that handles both
    numerical and categorical features appropriately.

    Numeric columns (as judged by pandas' ``is_numeric_dtype``) are scaled
    together with a single StandardScaler; every other column gets its own
    OneHotEncoder (dense output, unknown categories ignored at transform time).
    The target column and ID columns are excluded.

    Parameters:
    -----------
    df : pandas.DataFrame
        The dataset the preprocessor is fitted on
    target_col : str, default='Churn'
        The name of the target column (falsy value = no target to exclude)
    id_cols : list or str, default=None
        Column(s) containing IDs to exclude from preprocessing.
        None means ['CustomerID'].

    Returns:
    --------
    tuple:
        - preprocessor: ColumnTransformer object fit to the data
        - input_dim: Integer number of columns the fitted preprocessor outputs

    Raises:
    -------
    ValueError
        If no feature columns remain after excluding target and ID columns
    """
    # Normalise id_cols to a list (accepts None, a single name, or any iterable)
    if id_cols is None:
        id_cols = ['CustomerID']
    elif isinstance(id_cols, str):
        id_cols = [id_cols]
    else:
        id_cols = list(id_cols)

    # Define columns to exclude from processing
    exclude_cols = ([target_col] if target_col else []) + id_cols

    # Separate numerical and categorical columns
    num_columns = []
    cat_columns = []
    for col in df.columns:
        if col in exclude_cols:
            continue
        if pd.api.types.is_numeric_dtype(df[col]):
            num_columns.append(col)
        else:
            cat_columns.append(col)

    # Build transformation pipelines
    transformers = []

    # One shared scaler for all numerical columns
    if num_columns:
        transformers.append(('num', StandardScaler(), num_columns))

    # One encoder per categorical column
    for col in cat_columns:
        encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        transformers.append((f'cat_{col}', encoder, [col]))

    if not transformers:
        raise ValueError(
            "No feature columns left to preprocess after excluding "
            f"{exclude_cols}"
        )

    # Create and fit the preprocessor
    preprocessor = ColumnTransformer(transformers=transformers, remainder='drop')
    train_features = df.drop(columns=exclude_cols, errors='ignore').copy()
    preprocessor.fit(train_features)

    # Measure the output width from the fitted transformer instead of summing
    # df[col].nunique(): nunique() skips NaN, so the hand-computed total can
    # disagree with the number of one-hot columns the encoder really emits,
    # which would make the model's input layer the wrong size.
    input_dim = int(preprocessor.transform(train_features.iloc[:1]).shape[1])

    return preprocessor, input_dim

In [4]:
def df_to_dataset(dataframe, preprocessor=None, target_col='Churn', id_cols=None, shuffle=True, batch_size=256):
    """
    Turn a pandas DataFrame into a batched, prefetched tf.data.Dataset of
    (features, label) pairs.

    Parameters:
    -----------
    dataframe : pandas.DataFrame
        Source data; must contain `target_col`. It is not modified.
    preprocessor : ColumnTransformer, default=None
        Fitted transformer applied to the feature columns. When None, the raw
        feature values are used as-is.
    target_col : str, default='Churn'
        Column holding the labels
    id_cols : list or str, default=None
        ID column(s) removed from the features (None means ['CustomerID'])
    shuffle : bool, default=True
        Shuffle the examples, using a buffer as large as the dataset
    batch_size : int, default=256
        Number of examples per batch

    Returns:
    --------
    tf.data.Dataset
        Dataset yielding (float32 features, int32 labels) batches
    """
    # Resolve the ID column list
    if id_cols is None:
        id_cols = ['CustomerID']
    elif isinstance(id_cols, str):
        id_cols = [id_cols]

    # Work on a private copy so the caller's frame keeps all its columns
    feature_frame = dataframe.copy()

    # Split off the labels, cast to int32
    label_array = feature_frame.pop(target_col).values.astype(np.int32)

    # Drop whichever ID columns are actually present
    present_ids = [name for name in id_cols if name in feature_frame.columns]
    for name in present_ids:
        feature_frame.pop(name)

    # Feature matrix: preprocessed when a preprocessor is given, raw otherwise
    if preprocessor is None:
        feature_array = feature_frame.values.astype(np.float32)
    else:
        feature_array = preprocessor.transform(feature_frame).astype(np.float32)

    dataset = tf.data.Dataset.from_tensor_slices((feature_array, label_array))

    # Buffer size equal to the row count
    if shuffle:
        dataset = dataset.shuffle(buffer_size=len(feature_frame))

    # Batch, then prefetch so input preparation overlaps with training
    dataset = dataset.batch(batch_size)
    return dataset.prefetch(tf.data.AUTOTUNE)

## Model Creation and Training

In [5]:
def create_model(input_dim, dropout_rate=0.7, l2_reg=0.15, learning_rate=0.0001):
    """
    Build and compile the churn classifier: one small tanh hidden layer with
    L2 weight penalty, dropout, and a single sigmoid output unit.

    Parameters:
    -----------
    input_dim : int
        Number of input features per example
    dropout_rate : float, default=0.7
        Dropout rate applied after the hidden layer
    l2_reg : float, default=0.15
        L2 penalty factor on the hidden layer's kernel
    learning_rate : float, default=0.0001
        Step size for the Adam optimizer

    Returns:
    --------
    tf.keras.Model
        Sequential model compiled with binary cross-entropy loss and an
        AUC metric named 'auc'
    """
    # Layers are instantiated in network order (input -> hidden -> dropout -> output)
    input_layer = layers.InputLayer(shape=(input_dim,))
    hidden_layer = layers.Dense(
        8,
        activation='tanh',
        kernel_regularizer=regularizers.l2(l2_reg),
    )
    dropout_layer = layers.Dropout(dropout_rate)
    output_layer = layers.Dense(1, activation='sigmoid')

    network = models.Sequential(
        [input_layer, hidden_layer, dropout_layer, output_layer]
    )

    # The metric name 'auc' is what shows up as 'auc' / 'val_auc' in training logs
    adam = optimizers.Adam(learning_rate=learning_rate)
    auc_metric = metrics.AUC(name='auc')
    network.compile(optimizer=adam, loss='binary_crossentropy', metrics=[auc_metric])

    return network

In [6]:
def train_model(df, target_col='Churn', id_cols=None, n_splits=4, batch_size=64, 
                epochs=5, verbose=0, model_params=None):
    """
    Train a neural network model using stratified k-fold cross-validation and
    keep the model from the fold with the highest validation AUC.

    For every fold a fresh preprocessor is fitted on that fold's training part
    only, a fresh model is built, and training uses early stopping on
    'val_auc'.

    Parameters:
    -----------
    df : pandas.DataFrame
        The dataset for training
    target_col : str, default='Churn'
        Name of the target column
    id_cols : list or str, default=None
        Column(s) containing IDs to exclude (None means ['CustomerID'])
    n_splits : int, default=4
        Number of cross-validation folds
    batch_size : int, default=64
        Batch size for training
    epochs : int, default=5
        Maximum number of training epochs per fold
    verbose : int, default=0
        Verbosity mode (0, 1, or 2) passed to model.fit
    model_params : dict, default=None
        Parameters for model creation ('dropout_rate', 'l2_reg',
        'learning_rate'); missing keys fall back to 0.7, 0.15 and 0.0001

    Returns:
    --------
    dict
        'model' and 'preprocessor' of the best fold, its 'auc', its 1-based
        'fold' number, 'all_scores' (AUC per fold, in fold order) and
        'input_dim' (input width of the best fold's model)
    """
    # Initialize default values
    if id_cols is None:
        id_cols = ['CustomerID']
    elif isinstance(id_cols, str):
        id_cols = [id_cols]

    if model_params is None:
        model_params = {}

    # Set up cross-validation
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Start below any attainable AUC so the first fold is always recorded,
    # even when its score is 0.
    best_auc = -np.inf
    best_model = None
    best_preprocessor = None
    best_input_dim = None
    best_fold = 0
    auc_scores = []

    # Loop through each fold
    for fold, (train_idx, val_idx) in enumerate(skf.split(df, df[target_col]), 1):
        print(f"\nFold {fold}/{n_splits}")

        # Split data for this fold
        train_df = df.iloc[train_idx]
        val_df = df.iloc[val_idx]

        # Fit the preprocessor on the training part only, so no statistics
        # from the held-out part leak into training
        preprocessor, input_dim = create_preprocessing_pipeline(
            train_df,
            target_col=target_col,
            id_cols=id_cols
        )

        # Convert to TensorFlow datasets
        train_ds = df_to_dataset(train_df, preprocessor, target_col, id_cols, shuffle=True, batch_size=batch_size)
        val_ds = df_to_dataset(val_df, preprocessor, target_col, id_cols, shuffle=False, batch_size=batch_size)

        # Create and compile model
        model = create_model(
            input_dim=input_dim,
            dropout_rate=model_params.get('dropout_rate', 0.7),
            l2_reg=model_params.get('l2_reg', 0.15),
            learning_rate=model_params.get('learning_rate', 0.0001)
        )

        # Set up early stopping
        cb_early = callbacks.EarlyStopping(
            monitor='val_auc',
            patience=5,
            mode='max',
            restore_best_weights=True
        )

        # Train the model
        model.fit(
            train_ds, 
            validation_data=val_ds, 
            epochs=epochs, 
            callbacks=[cb_early], 
            verbose=verbose
        )

        # Evaluate the model; evaluate() returns [loss, auc] because 'auc' is
        # the only metric the model is compiled with
        val_metrics = model.evaluate(val_ds, verbose=0)
        auc = val_metrics[1]
        auc_scores.append(auc)

        # Track best model together with everything that belongs to it
        if auc > best_auc:
            best_auc = auc
            best_model = model
            best_preprocessor = preprocessor
            best_input_dim = input_dim
            best_fold = fold

    # Report results
    print("\nFold results:")
    for i, score in enumerate(auc_scores, 1):
        print(f"Fold {i}: AUC = {score:.4f}")
    print(f"\nAverage AUC: {np.mean(auc_scores):.4f} ± {np.std(auc_scores):.4f}")
    print(f"Best model was from fold {best_fold} with AUC = {best_auc:.4f}")

    return {
        'model': best_model,
        'preprocessor': best_preprocessor,
        'auc': best_auc,
        'fold': best_fold,
        'all_scores': auc_scores,
        # Width of the *best* fold's input, not whichever fold ran last
        'input_dim': best_input_dim
    }

## Model Evaluation

In [7]:
def evaluate_model(model, preprocessor, valid_df, target_col='Churn', id_cols=None, optimize_threshold=True):
    """
    Evaluate the model on validation data.

    Rows containing any missing value are dropped before scoring. The ROC AUC
    is computed from the predicted probabilities; optionally the
    classification threshold is chosen as the ROC point that maximises
    TPR - FPR.

    Parameters:
    -----------
    model : tf.keras.Model
        Trained model to evaluate
    preprocessor : ColumnTransformer
        Preprocessor for feature transformation
    valid_df : pandas.DataFrame
        Validation dataset (not modified)
    target_col : str, default='Churn'
        Name of the target column
    id_cols : list or str, default=None
        Column(s) containing IDs to exclude (None means ['CustomerID'])
    optimize_threshold : bool, default=True
        Whether to optimize the classification threshold; when False the
        threshold is 0.5

    Returns:
    --------
    dict
        'auc', 'threshold', 'confusion_matrix', 'y_true', 'pred_probs' and
        'pred_labels'. A probability equal to the threshold is labelled
        positive.
    """
    # Initialize default value for id_cols if None
    if id_cols is None:
        id_cols = ['CustomerID']
    elif isinstance(id_cols, str):
        id_cols = [id_cols]
    else:
        id_cols = list(id_cols)

    # Clean data (on a new frame, the caller's dataframe is left untouched)
    df = valid_df.dropna()

    # Prepare features and target
    X_new = df.drop(columns=[target_col] + id_cols, errors='ignore')
    y_true = df[target_col].values

    # Apply preprocessing
    X_new_transformed = preprocessor.transform(X_new).astype(np.float32)

    # Get predictions
    pred_probs = model.predict(X_new_transformed, verbose=1).flatten()

    # Calculate AUC
    auc_score = roc_auc_score(y_true, pred_probs)

    # Find optimal threshold if requested
    threshold = 0.5
    if optimize_threshold:
        fpr, tpr, thresholds = roc_curve(y_true, pred_probs)
        gain = tpr - fpr
        # thresholds[0] is roc_curve's artificial "predict nothing" point
        # (TPR = FPR = 0), not a usable cut-off, so it is skipped whenever
        # there is any other candidate.
        if len(thresholds) > 1:
            best_idx = int(np.argmax(gain[1:])) + 1
        else:
            best_idx = 0
        threshold = thresholds[best_idx]

    # roc_curve counts a sample as positive when score >= threshold, so the
    # comparison must be inclusive to land on the operating point selected
    # above.
    pred_labels = (pred_probs >= threshold).astype(int)

    # Calculate confusion matrix
    cm = confusion_matrix(y_true, pred_labels)

    # Print results
    print(f"AUC: {auc_score:.4f}")
    print(f"Optimal threshold: {threshold:.4f}")
    print(cm)

    return {
        'auc': auc_score,
        'threshold': threshold,
        'confusion_matrix': cm,
        'y_true': y_true,
        'pred_probs': pred_probs,
        'pred_labels': pred_labels
    }

## Prediction and Deployment Functions

In [8]:
def generate_predictions(model, preprocessor, data_df, id_col='CustomerID', 
                         target_col='Churn', threshold=0.5, predictions_dir = 'Predictions', output_file='churn_predictions.csv'):
    """
    Generate and save predictions for a dataset.

    Rows with a missing value in the ID or any feature column are dropped.
    The target column, if present, is removed first and therefore never
    causes a row to be dropped, so unlabeled data can be scored.

    Parameters:
    -----------
    model : tf.keras.Model
        Trained model to use for predictions
    preprocessor : ColumnTransformer
        Preprocessor for feature transformation
    data_df : pandas.DataFrame
        Dataset to predict on (not modified)
    id_col : str, default='CustomerID'
        Name of the ID column
    target_col : str, default='Churn'
        Name of the target column (ignored if it exists)
    threshold : float, default=0.5
        Classification threshold; probabilities greater than or equal to it
        are labelled 1
    predictions_dir : str, default='Predictions'
        Directory the CSV is written to (created if missing)
    output_file : str, default='churn_predictions.csv'
        File name of the CSV inside `predictions_dir`

    Returns:
    --------
    pandas.DataFrame
        DataFrame with columns 'customer_id', 'churn_probability' and
        'churn_prediction'

    Raises:
    -------
    KeyError
        If `id_col` is not a column of `data_df`
    """
    if id_col not in data_df.columns:
        raise KeyError(f"ID column '{id_col}' not found in data")

    # Remove the target before cleaning: a missing label must not discard a
    # row that still has to be scored.
    df = data_df.copy()
    if target_col in df.columns:
        df = df.drop(columns=[target_col])

    # Clean data
    df = df.dropna()

    # Store IDs
    customer_ids = df[id_col].values

    # Prepare features
    X_new = df.drop(columns=[id_col])

    # Apply preprocessing
    X_new_transformed = preprocessor.transform(X_new).astype(np.float32)

    # Generate predictions. The comparison is inclusive so that a threshold
    # taken from an ROC curve (where score >= threshold counts as positive)
    # yields the operating point it was chosen for.
    pred_probs = model.predict(X_new_transformed, verbose=1).flatten()
    pred_labels = (pred_probs >= threshold).astype(int)

    # Create output DataFrame
    output_df = pd.DataFrame({
        'customer_id': customer_ids,
        'churn_probability': pred_probs,
        'churn_prediction': pred_labels
    })

    # Save to CSV
    os.makedirs(predictions_dir, exist_ok=True)
    predictions_path = os.path.join(predictions_dir, output_file)
    output_df.to_csv(predictions_path, index=False)

    print(f"File '{predictions_path}' saved successfully!")

    return output_df

In [9]:
def save_model(model, preprocessor, model_dir="Models", model_name="model_churn_tf"):
    """
    Persist a trained model and its fitted preprocessor side by side.

    The model is written through its own ``save`` method to
    ``<model_dir>/<model_name>.keras``; the preprocessor is pickled to
    ``<model_dir>/preprocessor.pkl``.

    Parameters:
    -----------
    model : tf.keras.Model
        Trained model to write to disk
    preprocessor : ColumnTransformer
        Fitted preprocessor to pickle
    model_dir : str, default="Models"
        Target directory (created when it does not exist yet)
    model_name : str, default="model_churn_tf"
        Base file name of the model, without extension

    Returns:
    --------
    dict
        {'model_path': ..., 'preprocessor_path': ...} with the written paths
    """
    # Make sure the target directory is there before writing anything
    os.makedirs(model_dir, exist_ok=True)

    # Work out both destinations up front
    model_path = os.path.join(model_dir, model_name + ".keras")
    preprocessor_path = os.path.join(model_dir, "preprocessor.pkl")

    # Model first, then the preprocessor
    model.save(model_path)
    with open(preprocessor_path, "wb") as pickle_file:
        pickle.dump(preprocessor, pickle_file)

    print(f"✅ Model saved to '{model_path}'")
    print(f"✅ Preprocessor saved to '{preprocessor_path}'")

    return dict(model_path=model_path, preprocessor_path=preprocessor_path)






In [10]:
def load_model(model_dir="Models", model_name="model_churn_tf"):
    """
    Load a saved model and preprocessor.

    The model is looked up as ``<model_dir>/<model_name>.keras`` (the file
    name save_model writes). If that file does not exist but
    ``<model_dir>/<model_name>`` does, that path is loaded instead, so a name
    that points at an existing artifact as-is keeps working.

    Parameters:
    -----------
    model_dir : str, default="Models"
        Directory where model artifacts are stored
    model_name : str, default="model_churn_tf"
        Name of the saved model, with or without the ".keras" extension

    Returns:
    --------
    tuple
        Loaded model and preprocessor
    """
    # Resolve the model path. save_model appends ".keras" to the name, so the
    # bare name alone does not point at the file it wrote.
    base_path = os.path.join(model_dir, model_name)
    if base_path.endswith(".keras"):
        model_path = base_path
    else:
        model_path = base_path + ".keras"
        if not os.path.exists(model_path) and os.path.exists(base_path):
            model_path = base_path

    # Load model
    model = tf.keras.models.load_model(model_path)

    # Load preprocessor
    # NOTE(review): unpickling executes code from the file; only load
    # preprocessor files from a trusted source.
    preprocessor_path = os.path.join(model_dir, "preprocessor.pkl")
    preprocessor = joblib.load(preprocessor_path)

    print(f"✅ Model loaded from '{model_path}'")
    print(f"✅ Preprocessor loaded from '{preprocessor_path}'")

    return model, preprocessor

## Example Usage

In [11]:
# Example usage: full pipeline from raw CSV to saved model and predictions
if __name__ == "__main__":
    print("Loading data...")
    # Rows with any missing value are discarded up front
    df = pd.read_csv('Dataset/customer_data.csv').dropna()  # Replace with your dataset

    print("Splitting data into training and validation sets...")
    train_df, valid_df = split_train_valid(df, test_size=0.25)

    print("Training model...")
    results = train_model(train_df)
    best_model = results['model']
    best_preprocessor = results['preprocessor']

    print("Evaluating model on validation set...")
    eval_results = evaluate_model(best_model, best_preprocessor, valid_df)

    print("Saving model...")
    save_model(best_model, best_preprocessor)

    print("Generating predictions on validation set...")
    # Reuse the threshold chosen during evaluation so the saved labels match
    # the confusion matrix reported above (the default would be 0.5).
    pred_df = generate_predictions(
        best_model,
        best_preprocessor,
        valid_df,
        threshold=eval_results['threshold']
    )

Loading data...
Splitting data into training and validation sets...
Original dataset shape: (440832, 12)
Training set shape: (330624, 12) (75.0%)
Validation set shape: (110208, 12) (25.0%)

Class distribution in training set:
Churn
1.0    0.567106
0.0    0.432894
Name: proportion, dtype: float64

Class distribution in validation set:
Churn
1.0    0.567109
0.0    0.432891
Name: proportion, dtype: float64
Training model...

Fold 1/4

Fold 2/4

Fold 3/4

Fold 4/4

Fold results:
Fold 1: AUC = 0.9547
Fold 2: AUC = 0.9551
Fold 3: AUC = 0.9553
Fold 4: AUC = 0.9555

Average AUC: 0.9552 ± 0.0003
Best model was from fold 4 with AUC = 0.9555
Evaluating model on validation set...
[1m3444/3444[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 802us/step
AUC: 0.9548
Optimal threshold: 0.5567
[[43966  3742]
 [ 8843 53657]]
Saving model...
✅ Model saved to 'Models\model_churn_tf.keras'
✅ Preprocessor saved to 'Models\preprocessor.pkl'
Generating predictions on validation set...
[1m3444/3444[0m 