<a href="https://colab.research.google.com/github/Belal-dev112/COUSTMER_BEHAVIOUR/blob/main/Coustmer_Behaviour.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from datetime import datetime, timedelta
from typing import Dict, List, Tuple, Any
import json
import pickle

# Machine Learning Libraries
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, classification_report,
    roc_curve, precision_recall_curve
)
from sklearn.cluster import KMeans, DBSCAN
from sklearn.decomposition import PCA

# Optional-dependency flags. Both are hard-coded to False in this file, so any
# code guarded by them takes its fallback path (ModelExplainability, for
# example, falls back to the model's own feature importances when
# SHAP_AVAILABLE is False). Class-imbalance handling uses the manual
# SimpleSMOTE implementation defined below instead of an external package.
XGBOOST_AVAILABLE = False
SHAP_AVAILABLE = False

# NOTE: this silences *every* warning for the rest of the session, including
# deprecation warnings raised by the libraries imported above.
warnings.filterwarnings('ignore')
# Global plotting defaults applied to all figures created afterwards.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

class SimpleSMOTE:
    """Minimal SMOTE-style oversampler without external dependencies.

    Synthetic minority samples are created by interpolating between a randomly
    chosen minority sample and one of its ``k_neighbors`` nearest minority
    neighbours (squared Euclidean distance).

    Only the least frequent and the most frequent class are considered; rows
    belonging to any other class are not part of the returned arrays.
    """

    def __init__(self, k_neighbors=5, random_state=42):
        self.k_neighbors = k_neighbors
        self.random_state = random_state

    def fit_resample(self, X, y):
        """Oversample the minority class until it matches the majority class.

        Args:
            X: 2-D array-like of numeric features, one row per sample.
            y: 1-D array-like of class labels aligned with ``X``.

        Returns:
            ``(X_resampled, y_resampled)`` as NumPy arrays in shuffled order.
            The inputs are returned unchanged (as arrays) when ``y`` is empty
            or the classes are already balanced.

        Note:
            Seeds NumPy's global random generator with ``random_state``.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        np.random.seed(self.random_state)

        # Nothing to balance on empty input (np.argmin would raise).
        if y.size == 0:
            return X, y

        # Find minority and majority classes
        unique, counts = np.unique(y, return_counts=True)
        minority_class = unique[np.argmin(counts)]
        majority_class = unique[np.argmax(counts)]

        minority_mask = y == minority_class
        majority_mask = y == majority_class
        minority_samples = X[minority_mask]
        minority_labels = y[minority_mask]
        majority_samples = X[majority_mask]
        majority_labels = y[majority_mask]

        # Number of synthetic samples to generate
        n_synthetic = len(majority_samples) - len(minority_samples)
        if n_synthetic <= 0:
            return X, y

        synthetic_samples = []
        for _ in range(n_synthetic):
            # Random sample from minority class
            idx = np.random.randint(0, len(minority_samples))
            sample = minority_samples[idx]

            # k nearest minority neighbours, explicitly excluding the sample
            # itself (with duplicate rows it is not guaranteed to sort first).
            distances = np.sum((minority_samples - sample) ** 2, axis=1)
            order = np.argsort(distances)
            k_nearest_idx = order[order != idx][:max(self.k_neighbors, 0)]

            if k_nearest_idx.size == 0:
                # A lone minority sample has no neighbour to interpolate
                # towards; fall back to duplicating it instead of crashing.
                neighbor = sample
            else:
                neighbor = minority_samples[np.random.choice(k_nearest_idx)]

            # Point on the segment between the sample and its neighbour
            alpha = np.random.random()
            synthetic_samples.append(sample + alpha * (neighbor - sample))

        # Combine all samples
        X_resampled = np.vstack([
            majority_samples,
            minority_samples,
            np.array(synthetic_samples),
        ])
        y_resampled = np.hstack([
            majority_labels,
            minority_labels,
            np.full(n_synthetic, minority_class),
        ])

        # Shuffle so that synthetic rows are not grouped at the end
        shuffle_idx = np.random.permutation(len(X_resampled))
        return X_resampled[shuffle_idx], y_resampled[shuffle_idx]


class CustomerDataGenerator:
    """Generate synthetic but realistic customer data for churn prediction.

    The constructor seeds NumPy's global random generator, so the first call
    to :meth:`generate_data` after construction is reproducible for a given
    ``random_state``.
    """

    def __init__(self, n_samples=10000, random_state=42):
        self.n_samples = n_samples
        self.random_state = random_state
        np.random.seed(random_state)

    def generate_data(self) -> pd.DataFrame:
        """Generate the customer dataset.

        Returns:
            DataFrame with ``n_samples`` rows and 20 columns (identifier,
            demographics, service, usage, engagement, billing and a binary
            ``churn`` target), including the deliberately injected missing
            values, outliers and inconsistent ``gender`` spellings.
        """

        # Demographics
        customer_ids = [f"CUST_{i:06d}" for i in range(self.n_samples)]
        age = np.random.normal(35, 12, self.n_samples).clip(18, 75).astype(int)
        gender = np.random.choice(['M', 'F', 'Other'], self.n_samples, p=[0.48, 0.48, 0.04])
        location = np.random.choice(['Urban', 'Suburban', 'Rural'], self.n_samples, p=[0.5, 0.35, 0.15])

        # Service Information
        tenure_months = np.random.exponential(24, self.n_samples).clip(0, 72).astype(int)
        contract_type = np.random.choice(['Month-to-Month', '1-Year', '2-Year'], self.n_samples, p=[0.5, 0.3, 0.2])
        monthly_charges = np.random.normal(65, 30, self.n_samples).clip(20, 150)

        # Usage Behavior
        login_frequency = np.random.poisson(15, self.n_samples).clip(0, 100)
        feature_usage_score = np.random.beta(2, 5, self.n_samples) * 100
        avg_session_duration = np.random.gamma(2, 15, self.n_samples).clip(5, 180)

        # Engagement Metrics
        support_tickets = np.random.poisson(2, self.n_samples)
        complaints = np.random.poisson(0.5, self.n_samples)
        days_since_last_login = np.random.exponential(5, self.n_samples).clip(0, 60).astype(int)

        # Billing Information
        payment_delay_days = np.random.exponential(2, self.n_samples).clip(0, 30).astype(int)
        payment_method = np.random.choice(['Credit Card', 'Bank Transfer', 'Digital Wallet'],
                                          self.n_samples, p=[0.5, 0.3, 0.2])
        total_charges = monthly_charges * tenure_months + np.random.normal(0, 50, self.n_samples)

        # Additional Features
        num_referrals = np.random.poisson(1, self.n_samples)
        promotional_offers_used = np.random.poisson(2, self.n_samples)
        download_volume_gb = np.random.lognormal(3, 1.5, self.n_samples).clip(0, 500)

        # Churn Logic: per-customer probability, then one Bernoulli draw each
        churn_probability = self._calculate_churn_probability(
            tenure_months, contract_type, support_tickets, complaints,
            payment_delay_days, login_frequency, days_since_last_login,
            monthly_charges, feature_usage_score
        )
        churn = (np.random.random(self.n_samples) < churn_probability).astype(int)

        df = pd.DataFrame({
            'customer_id': customer_ids,
            'age': age,
            'gender': gender,
            'location': location,
            'tenure_months': tenure_months,
            'contract_type': contract_type,
            'monthly_charges': monthly_charges,
            'total_charges': total_charges,
            'login_frequency': login_frequency,
            'feature_usage_score': feature_usage_score,
            'avg_session_duration': avg_session_duration,
            'support_tickets': support_tickets,
            'complaints': complaints,
            'days_since_last_login': days_since_last_login,
            'payment_delay_days': payment_delay_days,
            'payment_method': payment_method,
            'num_referrals': num_referrals,
            'promotional_offers_used': promotional_offers_used,
            'download_volume_gb': download_volume_gb,
            'churn': churn
        })

        # Introduce missing values and outliers
        return self._introduce_data_issues(df)

    def _calculate_churn_probability(self, tenure, contract, tickets, complaints,
                                     payment_delay, login_freq, last_login,
                                     charges, feature_usage):
        """Combine customer attributes into a churn probability per customer.

        All arguments are equal-length NumPy arrays. The additive risk terms
        are summed and the result is clipped to the range ``[0, 0.9]``.
        """

        prob = np.zeros(len(tenure))

        # Tenure effect (newer customers more likely to churn)
        prob += 0.3 * np.exp(-tenure / 12)

        # Contract type effect
        contract_risk = np.where(contract == 'Month-to-Month', 0.25,
                                 np.where(contract == '1-Year', 0.1, 0.05))
        prob += contract_risk

        # Support issues
        prob += 0.05 * tickets + 0.1 * complaints

        # Payment issues
        prob += 0.02 * payment_delay

        # Engagement (low engagement = high churn)
        prob += 0.01 * (30 - login_freq).clip(0, 30)
        prob += 0.01 * last_login

        # Feature usage (low usage = high churn)
        prob += 0.002 * (100 - feature_usage)

        # Price sensitivity
        prob += 0.001 * (charges - 50).clip(0, 100)

        return prob.clip(0, 0.9)

    def _introduce_data_issues(self, df: pd.DataFrame) -> pd.DataFrame:
        """Inject missing values, outliers and inconsistent labels into ``df``.

        ``df`` is modified in place and also returned.
        """

        # Missing values: each listed column independently loses ~7% of rows
        for col in ['age', 'feature_usage_score', 'total_charges', 'download_volume_gb']:
            mask = np.random.random(len(df)) < 0.07
            # Integer columns (``age``) cannot hold NaN. Cast explicitly rather
            # than relying on pandas' deprecated silent upcast on assignment.
            df[col] = df[col].astype(float)
            df.loc[mask, col] = np.nan

        # Outliers: ~2% of monthly charges are inflated by a factor of 2-5
        outlier_mask = np.random.random(len(df)) < 0.02
        df.loc[outlier_mask, 'monthly_charges'] *= np.random.uniform(2, 5, outlier_mask.sum())

        # Inconsistent entries: ~1% of gender values are redrawn from a list
        # that includes alternative spellings
        inconsistent_mask = np.random.random(len(df)) < 0.01
        df.loc[inconsistent_mask, 'gender'] = np.random.choice(
            ['M', 'F', 'Other', 'Unknown', 'male', 'female'],
            inconsistent_mask.sum())

        return df


class DataPreprocessor:
    """Handle data cleaning, feature engineering, encoding and scaling.

    The instance keeps the fitted ``StandardScaler``, one ``LabelEncoder`` per
    categorical column and the list of feature names, so that new data can be
    transformed consistently by calling the methods with ``fit=False``.
    """

    def __init__(self):
        self.scaler = StandardScaler()
        self.label_encoders = {}
        self.feature_names = None

    def clean_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a cleaned copy of ``df``; the input is left untouched.

        Steps: normalise ``gender`` spellings, fill numeric gaps with the
        column median and categorical gaps with the column mode, then cap
        ``monthly_charges``, ``total_charges`` and ``download_volume_gb`` at
        ``Q1 - 3*IQR`` / ``Q3 + 3*IQR``. Progress is printed to stdout.
        """

        df = df.copy()
        print("\n" + "="*70)
        print("DATA CLEANING")
        print("="*70)

        # Fix inconsistent gender values
        gender_mapping = {'male': 'M', 'female': 'F', 'Unknown': 'Other'}
        df['gender'] = df['gender'].replace(gender_mapping)

        # Handle missing values
        missing_counts = df.isnull().sum()
        print("\nMissing Values Before Cleaning:")
        print(missing_counts[missing_counts > 0])

        # Numerical columns - fill with median.
        # Assign the result back instead of ``df[col].fillna(..., inplace=True)``:
        # the chained in-place form operates on a temporary and does not update
        # ``df`` when pandas Copy-on-Write is active.
        num_cols = df.select_dtypes(include=[np.number]).columns
        for col in num_cols:
            if df[col].isnull().any():
                df[col] = df[col].fillna(df[col].median())

        # Categorical columns - fill with mode (the identifier is excluded;
        # ``errors='ignore'`` tolerates frames without a customer_id column)
        cat_cols = df.select_dtypes(include=['object']).columns
        cat_cols = cat_cols.drop('customer_id', errors='ignore')
        for col in cat_cols:
            if df[col].isnull().any():
                modes = df[col].mode()
                if not modes.empty:  # empty when the whole column is missing
                    df[col] = df[col].fillna(modes.iloc[0])

        # Handle outliers using IQR method
        for col in ['monthly_charges', 'total_charges', 'download_volume_gb']:
            Q1 = df[col].quantile(0.25)
            Q3 = df[col].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 3 * IQR
            upper_bound = Q3 + 3 * IQR

            outliers_before = ((df[col] < lower_bound) | (df[col] > upper_bound)).sum()
            df[col] = df[col].clip(lower_bound, upper_bound)
            print(f"\n{col}: Capped {outliers_before} outliers")

        print("\nMissing Values After Cleaning:")
        print(df.isnull().sum().sum(), "total missing values")

        return df

    def engineer_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``df`` with additional engineered feature columns.

        Several features are relative to the data passed in (column maxima and
        quantiles), so they are computed from ``df`` itself on every call.
        """

        df = df.copy()
        n_original_cols = df.shape[1]
        print("\n" + "="*70)
        print("FEATURE ENGINEERING")
        print("="*70)

        # Customer Lifetime Value (CLV)
        df['customer_lifetime_value'] = df['monthly_charges'] * df['tenure_months']

        # Average Revenue Per Month (+1 avoids division by zero at tenure 0)
        df['avg_revenue_per_month'] = df['total_charges'] / (df['tenure_months'] + 1)

        # Engagement Score (composite metric, weights 0.4 / 0.3 / 0.3)
        df['engagement_score'] = (
            (df['login_frequency'] / df['login_frequency'].max()) * 0.4 +
            (df['feature_usage_score'] / 100) * 0.3 +
            (df['avg_session_duration'] / df['avg_session_duration'].max()) * 0.3
        ) * 100

        # Engagement Decay Rate
        df['engagement_decay'] = df['days_since_last_login'] / (df['tenure_months'] + 1)

        # Support Issue Intensity (complaints weigh twice as much as tickets)
        df['support_intensity'] = (df['support_tickets'] + 2 * df['complaints']) / (df['tenure_months'] + 1)

        # Payment Reliability Score
        df['payment_reliability'] = 100 - (df['payment_delay_days'] * 2).clip(0, 100)

        # Value-to-Cost Ratio
        df['value_cost_ratio'] = df['feature_usage_score'] / (df['monthly_charges'] + 1)

        # Tenure Categories. ``include_lowest=True`` puts tenure 0 into 'New';
        # without it the first bin is (0, 6] and brand-new customers become NaN.
        df['tenure_category'] = pd.cut(df['tenure_months'],
                                       bins=[0, 6, 12, 24, 72],
                                       labels=['New', 'Growing', 'Established', 'Loyal'],
                                       include_lowest=True)

        # High-Risk Indicators
        df['is_high_complaints'] = (df['complaints'] > df['complaints'].quantile(0.75)).astype(int)
        df['is_payment_delayed'] = (df['payment_delay_days'] > 5).astype(int)
        df['is_low_engagement'] = (df['engagement_score'] < df['engagement_score'].quantile(0.25)).astype(int)

        # Referral Activity
        df['is_active_referrer'] = (df['num_referrals'] > 0).astype(int)

        # Usage Intensity
        df['usage_per_dollar'] = df['download_volume_gb'] / (df['monthly_charges'] + 1)

        # Recency Score
        df['recency_score'] = 100 - (df['days_since_last_login'] * 2).clip(0, 100)

        print(f"\nCreated {len([c for c in df.columns if c not in ['customer_id', 'churn']])} features")
        # Count against the actual input width rather than a hard-coded 20.
        print(f"New engineered features: {len(df.columns) - n_original_cols}")

        return df

    def encode_categorical(self, df: pd.DataFrame, fit: bool = True) -> pd.DataFrame:
        """Label-encode the known categorical columns on a copy of ``df``.

        Args:
            df: Frame containing any subset of the categorical columns.
            fit: When True, fit and store a new ``LabelEncoder`` per column.
                When False, reuse the stored encoders; labels that were not
                seen during fitting are encoded as ``-1``.
        """

        df = df.copy()
        categorical_cols = ['gender', 'location', 'contract_type', 'payment_method', 'tenure_category']

        for col in categorical_cols:
            if col not in df.columns:
                continue

            values = df[col].astype(str)
            if fit:
                le = LabelEncoder()
                df[col] = le.fit_transform(values)
                self.label_encoders[col] = le
            elif col in self.label_encoders:
                # One dictionary lookup per row instead of one
                # ``le.transform`` call per row; unseen labels map to -1.
                le = self.label_encoders[col]
                codes = {label: code for code, label in enumerate(le.classes_)}
                df[col] = values.map(codes).fillna(-1).astype(int)

        return df

    def prepare_features(self, df: pd.DataFrame, fit: bool = True) -> Tuple[np.ndarray, np.ndarray]:
        """Build the scaled feature matrix and the target vector.

        Args:
            df: Fully numeric frame (after encoding). ``customer_id`` and
                ``churn`` are never used as features.
            fit: When True, remember the feature column order and fit the
                scaler. When False, select the remembered columns in the
                remembered order and only transform.

        Returns:
            ``(X, y)`` where ``y`` is ``None`` if ``df`` has no ``churn`` column.

        Raises:
            KeyError: If ``fit`` is False and ``df`` lacks a fitted feature.
        """

        excluded = ('customer_id', 'churn')
        if fit or self.feature_names is None:
            feature_cols = [col for col in df.columns if col not in excluded]
        else:
            # Reuse the training-time columns so order and count match what
            # the scaler was fitted on, even if ``df`` carries extra columns.
            missing = [col for col in self.feature_names if col not in df.columns]
            if missing:
                raise KeyError(f"Missing feature columns: {missing}")
            feature_cols = list(self.feature_names)

        X = df[feature_cols].values
        y = df['churn'].values if 'churn' in df.columns else None

        if fit:
            self.feature_names = feature_cols
            X = self.scaler.fit_transform(X)
        else:
            X = self.scaler.transform(X)

        return X, y


class ExploratoryAnalysis:
    """Perform exploratory data analysis on the customer churn dataset."""

    @staticmethod
    def generate_statistical_summary(df: pd.DataFrame) -> pd.DataFrame:
        """Print headline statistics and return a per-column summary table.

        Args:
            df: Customer frame; must contain a binary ``churn`` column.

        Returns:
            ``df.describe(include='all')`` transposed (one row per column)
            with two extra columns: ``missing`` and ``missing_pct``.
        """

        print("\n" + "="*70)
        print("STATISTICAL SUMMARY")
        print("="*70)

        missing = df.isnull().sum()
        summary = df.describe(include='all').T
        summary['missing'] = missing
        summary['missing_pct'] = (missing / len(df)) * 100

        print("\nDataset Shape:", df.shape)
        print("\nChurn Distribution:")
        print(df['churn'].value_counts())
        print(f"\nChurn Rate: {df['churn'].mean() * 100:.2f}%")

        return summary

    @staticmethod
    def _hist_by_churn(df, column, ax, title, xlabel):
        """Overlay histograms of ``column`` for retained and churned customers."""
        df[df['churn'] == 0][column].hist(bins=30, alpha=0.5, label='Retained', ax=ax)
        df[df['churn'] == 1][column].hist(bins=30, alpha=0.5, label='Churned', ax=ax)
        ax.set_title(title, fontsize=10)
        ax.set_xlabel(xlabel)
        ax.legend()

    @staticmethod
    def _box_by_churn(df, column, ax, title, ylabel):
        """Draw a boxplot of ``column`` grouped by churn status."""
        df.boxplot(column=column, by='churn', ax=ax)
        ax.set_title(title, fontsize=10)
        ax.set_xlabel('Churn Status')
        ax.set_ylabel(ylabel)
        # ``boxplot(by=...)`` sets a figure-level title; clear it.
        plt.suptitle('')

    @staticmethod
    def plot_churn_analysis(df: pd.DataFrame, save_path: str = None):
        """Create a 3x4 grid of churn visualizations.

        Args:
            df: Customer frame with the raw (un-encoded) columns.
            save_path: Optional file path; when given the figure is saved
                there at 300 dpi.

        Returns:
            The matplotlib figure.
        """

        fig = plt.figure(figsize=(20, 12))

        # 1. Churn Distribution
        ax1 = plt.subplot(3, 4, 1)
        # ``value_counts`` orders by frequency, so pin the order to [0, 1];
        # otherwise the labels are swapped when churners are the majority.
        churn_counts = df['churn'].value_counts().reindex([0, 1], fill_value=0)
        ax1.pie(churn_counts, labels=['Retained', 'Churned'], autopct='%1.1f%%', startangle=90)
        ax1.set_title('Churn Distribution', fontsize=12, fontweight='bold')

        # 2. Churn by Contract Type
        ax2 = plt.subplot(3, 4, 2)
        pd.crosstab(df['contract_type'], df['churn'], normalize='index').plot(kind='bar', ax=ax2)
        ax2.set_title('Churn Rate by Contract Type', fontsize=10)
        ax2.set_xlabel('Contract Type')
        ax2.set_ylabel('Proportion')
        ax2.legend(['Retained', 'Churned'])
        plt.setp(ax2.xaxis.get_majorticklabels(), rotation=45, ha='right')

        # 3. Tenure Distribution
        ax3 = plt.subplot(3, 4, 3)
        ExploratoryAnalysis._hist_by_churn(
            df, 'tenure_months', ax3, 'Tenure Distribution', 'Tenure (months)')

        # 4. Monthly Charges
        ax4 = plt.subplot(3, 4, 4)
        ExploratoryAnalysis._hist_by_churn(
            df, 'monthly_charges', ax4, 'Monthly Charges Distribution', 'Monthly Charges ($)')

        # 5. Support Tickets
        ax5 = plt.subplot(3, 4, 5)
        df.groupby('support_tickets')['churn'].mean().plot(kind='bar', ax=ax5)
        ax5.set_title('Churn Rate by Support Tickets', fontsize=10)
        ax5.set_xlabel('Number of Support Tickets')
        ax5.set_ylabel('Churn Rate')

        # 6. Payment Delay
        ax6 = plt.subplot(3, 4, 6)
        ExploratoryAnalysis._box_by_churn(
            df, 'payment_delay_days', ax6, 'Payment Delay by Churn', 'Payment Delay (days)')

        # 7. Login Frequency
        ax7 = plt.subplot(3, 4, 7)
        ExploratoryAnalysis._box_by_churn(
            df, 'login_frequency', ax7, 'Login Frequency by Churn', 'Login Frequency')

        # 8. Feature Usage Score
        ax8 = plt.subplot(3, 4, 8)
        ExploratoryAnalysis._box_by_churn(
            df, 'feature_usage_score', ax8, 'Feature Usage by Churn', 'Feature Usage Score')

        # 9. Correlation Heatmap (top features)
        ax9 = plt.subplot(3, 4, 9)
        numeric_cols = ['tenure_months', 'monthly_charges', 'login_frequency',
                        'support_tickets', 'complaints', 'payment_delay_days', 'churn']
        corr_matrix = df[numeric_cols].corr()
        sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', ax=ax9, cbar_kws={'shrink': 0.8})
        ax9.set_title('Feature Correlation Matrix', fontsize=10)

        # 10. Churn by Location
        ax10 = plt.subplot(3, 4, 10)
        pd.crosstab(df['location'], df['churn'], normalize='index').plot(kind='bar', ax=ax10)
        ax10.set_title('Churn Rate by Location', fontsize=10)
        ax10.set_xlabel('Location')
        ax10.legend(['Retained', 'Churned'])
        plt.setp(ax10.xaxis.get_majorticklabels(), rotation=45, ha='right')

        # 11. Days Since Last Login
        ax11 = plt.subplot(3, 4, 11)
        ExploratoryAnalysis._hist_by_churn(
            df, 'days_since_last_login', ax11, 'Days Since Last Login', 'Days')

        # 12. Complaints Distribution
        ax12 = plt.subplot(3, 4, 12)
        df.groupby('complaints')['churn'].mean().plot(kind='bar', ax=ax12, color='coral')
        ax12.set_title('Churn Rate by Complaints', fontsize=10)
        ax12.set_xlabel('Number of Complaints')
        ax12.set_ylabel('Churn Rate')

        plt.tight_layout()

        if save_path:
            plt.savefig(save_path, dpi=300, bbox_inches='tight')

        return fig


class ChurnPredictionModels:
    """Train, evaluate and compare several churn classifiers.

    Attributes set by the methods:
        models: fitted estimators keyed by display name.
        results: evaluation dictionaries keyed by display name.
        best_model / best_model_name: filled in by ``select_best_model``.
    """

    def __init__(self):
        self.models = {}
        self.results = {}
        self.best_model = None
        self.best_model_name = None

    @staticmethod
    def _print_class_distribution(y):
        """Print count and percentage of every class in ``y``."""
        unique, counts = np.unique(y, return_counts=True)
        for u, c in zip(unique, counts):
            print(f"  Class {u}: {c} ({c/len(y)*100:.2f}%)")

    def handle_class_imbalance(self, X_train, y_train, method='smote'):
        """Rebalance the training data and print before/after distributions.

        Args:
            X_train: Training feature matrix.
            y_train: Training labels.
            method: ``'smote'`` oversamples with ``SimpleSMOTE``; any other
                value returns the inputs unchanged.

        Returns:
            ``(X_resampled, y_resampled)``.
        """

        print("\n" + "="*70)
        print("HANDLING CLASS IMBALANCE")
        print("="*70)

        print(f"\nOriginal class distribution:")
        self._print_class_distribution(y_train)

        if method == 'smote':
            smote = SimpleSMOTE(random_state=42)
            X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
        else:
            X_resampled, y_resampled = X_train, y_train

        print(f"\nResampled class distribution:")
        self._print_class_distribution(y_resampled)

        return X_resampled, y_resampled

    def train_models(self, X_train, y_train, X_test, y_test):
        """Fit every candidate model and evaluate it on the test split.

        Successfully trained models are stored in ``self.models`` and their
        metrics in ``self.results``. A model that fails to train or evaluate
        is reported on stdout and skipped so the remaining models still run.
        """

        print("\n" + "="*70)
        print("MODEL TRAINING")
        print("="*70)

        # Define models
        models_dict = {
            'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
            'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
            'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
            'SVM': SVC(kernel='rbf', probability=True, random_state=42),
            'Neural Network': MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42)
        }

        # Train each model
        for name, model in models_dict.items():
            print(f"\nTraining {name}...")

            # Deliberately broad: one failing model must not abort the rest.
            try:
                model.fit(X_train, y_train)
                self.models[name] = model

                # Predictions (probability of the positive class)
                y_pred = model.predict(X_test)
                y_pred_proba = model.predict_proba(X_test)[:, 1]

                # Evaluate
                self.results[name] = self._evaluate_model(y_test, y_pred, y_pred_proba, name)

                print(f"  ✓ {name} trained successfully")
                print(f"    Accuracy: {self.results[name]['accuracy']:.4f}")
                print(f"    ROC-AUC: {self.results[name]['roc_auc']:.4f}")

            except Exception as e:
                print(f"  ✗ Error training {name}: {str(e)}")

    def _evaluate_model(self, y_true, y_pred, y_pred_proba, model_name):
        """Compute the evaluation metrics for one model.

        Returns:
            Dictionary with accuracy, precision, recall, F1, ROC-AUC and the
            confusion matrix, plus ``y_true``, ``y_pred`` and ``y_pred_proba``
            so that curves can be drawn later without the test split.
        """

        return {
            'accuracy': accuracy_score(y_true, y_pred),
            'precision': precision_score(y_true, y_pred, zero_division=0),
            'recall': recall_score(y_true, y_pred, zero_division=0),
            'f1_score': f1_score(y_true, y_pred, zero_division=0),
            'roc_auc': roc_auc_score(y_true, y_pred_proba),
            'confusion_matrix': confusion_matrix(y_true, y_pred),
            # Needed by plot_model_comparison for the ROC / PR curves; without
            # it every model was silently skipped in those two panels.
            'y_true': y_true,
            'y_pred': y_pred,
            'y_pred_proba': y_pred_proba
        }

    def hyperparameter_tuning(self, X_train, y_train, model_name='Random Forest'):
        """Grid-search hyperparameters with 3-fold CV, scored by ROC-AUC.

        Only ``'Random Forest'`` is configured.

        Returns:
            The best estimator found, or ``None`` for any other model name.
        """

        print("\n" + "="*70)
        print(f"HYPERPARAMETER TUNING - {model_name}")
        print("="*70)

        if model_name == 'Random Forest':
            param_grid = {
                'n_estimators': [50, 100, 200],
                'max_depth': [10, 20, 30, None],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4]
            }
            base_model = RandomForestClassifier(random_state=42, n_jobs=-1)
        else:
            print(f"Hyperparameter tuning not configured for {model_name}")
            return None

        # GridSearchCV
        grid_search = GridSearchCV(
            estimator=base_model,
            param_grid=param_grid,
            cv=3,
            scoring='roc_auc',
            n_jobs=-1,
            verbose=1
        )

        print("Starting grid search...")
        grid_search.fit(X_train, y_train)

        print(f"\nBest parameters: {grid_search.best_params_}")
        print(f"Best ROC-AUC score: {grid_search.best_score_:.4f}")

        return grid_search.best_estimator_

    def plot_model_comparison(self, save_path=None):
        """Plot metrics, confusion matrices, ROC and PR curves for all models.

        Args:
            save_path: Optional file path; when given the figure is saved
                there at 300 dpi.

        Returns:
            The matplotlib figure.
        """

        fig = plt.figure(figsize=(18, 10))

        # Prepare data
        metrics = ['accuracy', 'precision', 'recall', 'f1_score', 'roc_auc']
        model_names = list(self.results.keys())

        # 1. Metrics Comparison (grouped bars, one group per model)
        ax1 = plt.subplot(2, 3, 1)
        metric_data = {metric: [self.results[model][metric] for model in model_names] for metric in metrics}
        x = np.arange(len(model_names))
        width = 0.15

        for i, metric in enumerate(metrics):
            ax1.bar(x + i*width, metric_data[metric], width, label=metric.replace('_', ' ').title())

        ax1.set_xlabel('Models')
        ax1.set_ylabel('Score')
        ax1.set_title('Model Performance Comparison', fontweight='bold')
        ax1.set_xticks(x + width * 2)
        ax1.set_xticklabels(model_names, rotation=45, ha='right')
        ax1.legend(loc='lower right')
        ax1.grid(axis='y', alpha=0.3)

        # 2-4. Confusion Matrices for top 3 models by ROC-AUC
        sorted_models = sorted(self.results.items(), key=lambda x: x[1]['roc_auc'], reverse=True)

        for idx, (model_name, results) in enumerate(sorted_models[:3], start=2):
            ax = plt.subplot(2, 3, idx)
            cm = results['confusion_matrix']
            sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax, cbar=False)
            ax.set_title(f'{model_name}\nAccuracy: {results["accuracy"]:.3f}', fontsize=10)
            ax.set_xlabel('Predicted')
            ax.set_ylabel('Actual')

        # 5. ROC Curves
        ax5 = plt.subplot(2, 3, 5)
        for model_name, results in self.results.items():
            y_test = results.get('y_true', None)
            if y_test is None:
                continue
            fpr, tpr, _ = roc_curve(y_test, results['y_pred_proba'])
            ax5.plot(fpr, tpr, label=f"{model_name} (AUC={results['roc_auc']:.3f})")

        ax5.plot([0, 1], [0, 1], 'k--', label='Random Classifier')
        ax5.set_xlabel('False Positive Rate')
        ax5.set_ylabel('True Positive Rate')
        ax5.set_title('ROC Curves Comparison', fontweight='bold')
        ax5.legend(loc='lower right')
        ax5.grid(alpha=0.3)

        # 6. Precision-Recall Curves
        ax6 = plt.subplot(2, 3, 6)
        for model_name, results in self.results.items():
            y_test = results.get('y_true', None)
            if y_test is None:
                continue
            precision, recall, _ = precision_recall_curve(y_test, results['y_pred_proba'])
            ax6.plot(recall, precision, label=f"{model_name}")

        ax6.set_xlabel('Recall')
        ax6.set_ylabel('Precision')
        ax6.set_title('Precision-Recall Curves', fontweight='bold')
        ax6.legend(loc='lower left')
        ax6.grid(alpha=0.3)

        plt.tight_layout()

        if save_path:
            plt.savefig(save_path, dpi=300, bbox_inches='tight')

        return fig

    def select_best_model(self):
        """Select the model with the highest ROC-AUC.

        Ties are resolved in favour of the model evaluated first.

        Returns:
            ``(best_name, best_model)``.

        Raises:
            ValueError: If no model has been evaluated yet.
        """

        if not self.results:
            raise ValueError("No evaluated models available; call train_models() first.")

        best_name, best_results = max(self.results.items(), key=lambda item: item[1]['roc_auc'])
        best_auc = best_results['roc_auc']

        self.best_model = self.models[best_name]
        self.best_model_name = best_name

        print(f"\n{'='*70}")
        print(f"BEST MODEL: {best_name}")
        print(f"ROC-AUC: {best_auc:.4f}")
        print(f"{'='*70}")

        return best_name, self.best_model


class ModelExplainability:
    """Explain a fitted model via SHAP, or via the model's own importances.

    When the module-level ``SHAP_AVAILABLE`` flag is False, or SHAP values
    have not been computed, the methods fall back to the model's
    ``feature_importances_`` or absolute ``coef_`` values.
    """

    def __init__(self, model, X_train, feature_names):
        self.model = model
        self.X_train = X_train
        self.feature_names = feature_names
        self.explainer = None
        self.shap_values = None
        # Rows the SHAP values were computed on (kept for plotting).
        self._shap_features = None

    def _model_importances(self):
        """Return the model's built-in importances, or None if it has none."""
        if hasattr(self.model, 'feature_importances_'):
            return self.model.feature_importances_
        if hasattr(self.model, 'coef_'):
            return np.abs(self.model.coef_[0])
        return None

    def compute_shap_values(self, X_test, sample_size=100):
        """Compute SHAP values for the first ``sample_size`` rows of ``X_test``.

        Returns:
            The SHAP values for the positive class, or ``None`` when SHAP is
            unavailable or the computation fails (the error is printed).
        """

        print("\n" + "="*70)
        print("MODEL EXPLAINABILITY (SHAP)")
        print("="*70)

        if not SHAP_AVAILABLE:
            print("SHAP not available. Using feature importance from model instead.")
            return None

        try:
            import shap

            model_type = str(type(self.model))
            if hasattr(self.model, 'tree_') or 'Forest' in model_type or 'XGB' in model_type:
                # Use TreeExplainer for tree-based models
                self.explainer = shap.TreeExplainer(self.model)
            else:
                # Use KernelExplainer for other models, with a k-means
                # summary of the training data as background
                background = shap.kmeans(self.X_train, 10)
                self.explainer = shap.KernelExplainer(self.model.predict_proba, background)

            # Compute SHAP values on sample
            X_sample = X_test[:sample_size] if len(X_test) > sample_size else X_test
            values = self.explainer.shap_values(X_sample)

            # For binary classification keep only the positive class. The
            # per-class values arrive either as a list of arrays or as one
            # array with a trailing class axis.
            if isinstance(values, list):
                values = values[1]
            elif getattr(values, 'ndim', 0) == 3:
                values = values[:, :, 1]

            self.shap_values = values
            self._shap_features = X_sample

            print(f"✓ SHAP values computed for {len(X_sample)} samples")

            return self.shap_values

        except Exception as e:
            print(f"✗ Error computing SHAP values: {str(e)}")
            return None

    def plot_feature_importance(self, save_path=None):
        """Plot feature importance as a bar chart.

        Uses the model's own importances (top 20) when SHAP is unavailable,
        otherwise a SHAP bar summary plot.

        Args:
            save_path: Optional file path; when given the figure is saved
                there at 300 dpi.

        Returns:
            The matplotlib figure, or ``None`` if nothing could be plotted.
        """

        if self.shap_values is None and not SHAP_AVAILABLE:
            # Use model's feature importance instead
            importances = self._model_importances()
            if importances is None:
                print("Feature importance not available.")
                return None

            # Create bar plot
            fig, ax = plt.subplots(figsize=(12, 8))

            # Sort by importance
            indices = np.argsort(importances)[-20:]  # Top 20

            ax.barh(range(len(indices)), importances[indices])
            ax.set_yticks(range(len(indices)))
            ax.set_yticklabels([self.feature_names[i] for i in indices])
            ax.set_xlabel('Importance')
            ax.set_title('Top Features Influencing Churn (Model Feature Importance)',
                         fontweight='bold', fontsize=14)
            ax.grid(axis='x', alpha=0.3)

            plt.tight_layout()

            if save_path:
                plt.savefig(save_path, dpi=300, bbox_inches='tight')

            return fig

        if self.shap_values is None:
            print("SHAP values not computed yet.")
            return None

        try:
            import shap
            fig = plt.figure(figsize=(12, 8))

            # The feature matrix must be the rows the SHAP values were
            # computed on, not a slice of the training data.
            features = self._shap_features
            if features is None:
                features = self.X_train[:len(self.shap_values)]

            # Summary plot
            shap.summary_plot(
                self.shap_values,
                features=features,
                feature_names=self.feature_names,
                plot_type='bar',
                show=False
            )
            plt.title('Top Features Influencing Churn (SHAP)', fontweight='bold', fontsize=14)
            plt.tight_layout()

            if save_path:
                plt.savefig(save_path, dpi=300, bbox_inches='tight')

            return fig

        except Exception as e:
            print(f"Error plotting SHAP: {str(e)}")
            return None

    def get_feature_importance_scores(self, top_n=15):
        """Print and return the ``top_n`` most important features.

        Importance is the mean absolute SHAP value per feature when SHAP
        values are available, otherwise the model's own importances.

        Returns:
            DataFrame with columns ``feature`` and ``importance`` sorted in
            descending order, or ``None`` if the model exposes no importances.
        """

        if self.shap_values is None:
            # Fallback to model feature importance
            importances = self._model_importances()
            if importances is None:
                print("Feature importance not available for this model.")
                return None
        else:
            # Use SHAP values
            importances = np.abs(self.shap_values).mean(axis=0)

        # Create DataFrame
        feature_imp_df = pd.DataFrame({
            'feature': self.feature_names,
            'importance': importances
        }).sort_values('importance', ascending=False).head(top_n)

        print("\nTop Features Influencing Churn:")
        print(feature_imp_df.to_string(index=False))

        return feature_imp_df


class CustomerSegmentation:
    """Cluster customers into behavioral segments with K-Means.

    After ``perform_segmentation`` the instance holds:

    * ``kmeans`` - the fitted ``KMeans`` estimator,
    * ``scaler`` - the ``StandardScaler`` fitted on the clustering columns
      (kept so the same scaling can be re-applied to the fitted centroids'
      feature space),
    * ``feature_cols`` - the columns that were clustered,
    * ``segment_profiles`` - the per-segment summary table.
    """

    # Columns used for the PCA panel when no segmentation has been run on
    # this instance yet.
    DEFAULT_FEATURE_COLS = ['tenure_months', 'monthly_charges', 'login_frequency',
                            'support_tickets', 'complaints', 'payment_delay_days']

    def __init__(self, n_clusters=4):
        self.n_clusters = n_clusters
        self.kmeans = None
        self.scaler = None
        self.feature_cols = None
        self.segment_profiles = None

    def perform_segmentation(self, df: pd.DataFrame, feature_cols: List[str]):
        """Fit K-Means on the standardized ``feature_cols`` and label each row.

        Args:
            df: Customer table. A ``segment`` column is written to it in
                place; it must also contain the columns read by
                ``_analyze_segments``.
            feature_cols: Names of the numeric columns to cluster on.

        Returns:
            The same ``df`` object, now carrying the ``segment`` column.
        """

        print("\n" + "="*70)
        print("CUSTOMER SEGMENTATION")
        print("="*70)

        # Remember what was clustered so visualize_segments can project the
        # same feature space.
        self.feature_cols = list(feature_cols)
        X_cluster = df[self.feature_cols].values

        # Standardize so that no single column dominates the distance metric;
        # the fitted scaler is kept rather than thrown away.
        self.scaler = StandardScaler()
        X_scaled = self.scaler.fit_transform(X_cluster)

        self.kmeans = KMeans(n_clusters=self.n_clusters, random_state=42, n_init=10)
        df['segment'] = self.kmeans.fit_predict(X_scaled)

        self._analyze_segments(df)

        print(f"\n✓ Customers segmented into {self.n_clusters} groups")

        return df

    @staticmethod
    def _name_segment(row) -> str:
        """Map one row of unrounded segment averages to a descriptive label."""
        if row['Churn Rate'] > 0.5:
            return "High Risk"
        if row['Avg Tenure'] > 24 and row['Churn Rate'] < 0.2:
            return "Loyal Customers"
        if row['Avg Logins'] < 10:
            return "Low Engagement"
        return "Standard"

    def _analyze_segments(self, df: pd.DataFrame):
        """Print and store a per-segment profile table.

        Reads the ``segment``, ``tenure_months``, ``monthly_charges``,
        ``login_frequency``, ``support_tickets``, ``churn`` and
        ``customer_id`` columns of ``df``. The result is stored in
        ``self.segment_profiles``; displayed numbers are rounded to two
        decimals.
        """

        print("\nSegment Profiles:")
        print("="*70)

        exact = df.groupby('segment').agg({
            'tenure_months': 'mean',
            'monthly_charges': 'mean',
            'login_frequency': 'mean',
            'support_tickets': 'mean',
            'churn': 'mean',
            'customer_id': 'count'
        })
        exact.columns = ['Avg Tenure', 'Avg Charges', 'Avg Logins',
                         'Avg Tickets', 'Churn Rate', 'Count']

        segment_summary = exact.round(2)

        # Label from the unrounded means: rounding first would turn e.g. a
        # 50.2% churn rate into 0.5 and miss the "> 0.5" High Risk rule.
        segment_summary['Segment Name'] = [
            self._name_segment(row) for _, row in exact.iterrows()
        ]

        print(segment_summary)

        self.segment_profiles = segment_summary

    def visualize_segments(self, df: pd.DataFrame, save_path=None):
        """Draw a 2x3 dashboard describing the segments.

        Panels: segment sizes, a 2-D PCA projection coloured by segment,
        churn rate per segment, and box plots of tenure, monthly charges and
        login frequency per segment.

        Args:
            df: Customer table containing ``segment``, ``churn`` and the
                plotted columns.
            save_path: Optional file path; when given the figure is saved
                there at 300 dpi.

        Returns:
            The matplotlib figure.
        """

        fig = plt.figure(figsize=(16, 10))

        # Project the columns that were actually clustered; fall back to the
        # default list when this instance has not been fitted.
        feature_cols = getattr(self, 'feature_cols', None) or self.DEFAULT_FEATURE_COLS
        X = df[feature_cols].values

        pca = PCA(n_components=2)
        X_pca = pca.fit_transform(StandardScaler().fit_transform(X))

        # 1. Segment Distribution (sorted by segment id so the x-axis order
        #    matches the other per-segment panels)
        ax1 = fig.add_subplot(2, 3, 1)
        df['segment'].value_counts().sort_index().plot(kind='bar', ax=ax1, color='skyblue')
        ax1.set_title('Segment Distribution', fontweight='bold')
        ax1.set_xlabel('Segment')
        ax1.set_ylabel('Count')

        # 2. PCA Visualization
        ax2 = fig.add_subplot(2, 3, 2)
        scatter = ax2.scatter(X_pca[:, 0], X_pca[:, 1], c=df['segment'], cmap='viridis', alpha=0.6)
        ax2.set_title('Customer Segments (PCA)', fontweight='bold')
        ax2.set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.2%} variance)')
        ax2.set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.2%} variance)')
        fig.colorbar(scatter, ax=ax2, label='Segment')

        # 3. Churn Rate by Segment
        ax3 = fig.add_subplot(2, 3, 3)
        df.groupby('segment')['churn'].mean().plot(kind='bar', ax=ax3, color='coral')
        ax3.set_title('Churn Rate by Segment', fontweight='bold')
        ax3.set_xlabel('Segment')
        ax3.set_ylabel('Churn Rate')
        ax3.axhline(df['churn'].mean(), color='red', linestyle='--', label='Overall Avg')
        ax3.legend()

        # 4-6. Box plots per segment: (subplot slot, column, title, y label)
        box_panels = [
            (4, 'tenure_months', 'Tenure Distribution by Segment', 'Tenure (months)'),
            (5, 'monthly_charges', 'Monthly Charges by Segment', 'Monthly Charges ($)'),
            (6, 'login_frequency', 'Login Frequency by Segment', 'Login Frequency'),
        ]
        for slot, column, title, y_label in box_panels:
            ax = fig.add_subplot(2, 3, slot)
            df.boxplot(column=column, by='segment', ax=ax)
            ax.set_title(title)
            ax.set_xlabel('Segment')
            ax.set_ylabel(y_label)

        # pandas adds a "Boxplot grouped by ..." figure title; remove it.
        fig.suptitle('')

        fig.tight_layout()

        if save_path:
            fig.savefig(save_path, dpi=300, bbox_inches='tight')

        return fig


class ChurnPreventionEngine:
    """Generate personalized retention strategies."""

    # Columns of the strategies table, in output order. Passing them
    # explicitly keeps the schema stable even when no customer qualifies.
    STRATEGY_COLUMNS = ['customer_id', 'churn_probability', 'segment',
                        'primary_issues', 'recommended_actions', 'priority']

    # One entry per diagnostic rule:
    # (column, value assumed when the column is absent, trigger test,
    #  issue label, recommended action)
    _RULES = (
        ('support_tickets', 0, lambda v: v > 3,
         'High support tickets', 'Priority support escalation'),
        ('complaints', 0, lambda v: v > 2,
         'Multiple complaints', 'Personal outreach from account manager'),
        ('payment_delay_days', 0, lambda v: v > 10,
         'Payment delays', 'Flexible payment plan offer'),
        ('login_frequency', 30, lambda v: v < 10,
         'Low engagement', 'Re-engagement campaign with tutorial'),
        ('days_since_last_login', 0, lambda v: v > 20,
         'Inactive account', 'Win-back offer with discount'),
        ('contract_type', '', lambda v: v == 'Month-to-Month',
         'No long-term commitment', 'Annual contract upgrade incentive'),
        ('monthly_charges', 50, lambda v: v > 100,
         'High price point', 'Customized package review'),
    )

    @staticmethod
    def _build_strategy(customer: Dict[str, Any]) -> Dict[str, Any]:
        """Return the strategy record for one customer row (a plain dict)."""
        strategy = {
            'customer_id': customer['customer_id'],
            'churn_probability': customer['churn_probability'],
            'segment': customer.get('segment', 'Unknown'),
            'primary_issues': [],
            'recommended_actions': [],
            'priority': 'High' if customer['churn_probability'] > 0.8 else 'Medium'
        }

        for column, default, triggered, issue, action in ChurnPreventionEngine._RULES:
            if triggered(customer.get(column, default)):
                strategy['primary_issues'].append(issue)
                strategy['recommended_actions'].append(action)

        # No specific rule fired: fall back to a generic recommendation.
        if not strategy['primary_issues']:
            strategy['primary_issues'].append('General churn risk')
            strategy['recommended_actions'].append('Customer satisfaction survey')

        return strategy

    @staticmethod
    def generate_strategies(df: pd.DataFrame, churn_proba: np.ndarray) -> pd.DataFrame:
        """Create personalized prevention strategies for high-risk customers.

        Customers whose churn probability is at or above the 80th percentile
        are treated as high risk. Each one is checked against the rule table
        and receives the matching issues and actions.

        Args:
            df: Customer table with at least a ``customer_id`` column. Rule
                columns that are missing are treated as not triggering. The
                input frame is not modified.
            churn_proba: Churn probability per row of ``df``, in row order.

        Returns:
            A DataFrame with the columns in ``STRATEGY_COLUMNS``, one row per
            high-risk customer. ``primary_issues`` and
            ``recommended_actions`` hold lists of strings. The columns are
            present even when there are no high-risk customers.
        """

        print("\n" + "="*70)
        print("CHURN PREVENTION STRATEGIES")
        print("="*70)

        df = df.copy()
        df['churn_probability'] = churn_proba

        # Identify high-risk customers (top 20% churn probability)
        high_risk_threshold = df['churn_probability'].quantile(0.80)
        high_risk_customers = df[df['churn_probability'] >= high_risk_threshold]

        print(f"\nIdentified {len(high_risk_customers)} high-risk customers")
        print(f"(Churn probability >= {high_risk_threshold:.2%})")

        # to_dict('records') keeps each column's own type; iterrows() would
        # upcast an all-numeric row to float and turn integer ids into floats.
        strategies = [
            ChurnPreventionEngine._build_strategy(customer)
            for customer in high_risk_customers.to_dict('records')
        ]

        strategies_df = pd.DataFrame(
            strategies, columns=ChurnPreventionEngine.STRATEGY_COLUMNS
        )

        # Display sample strategies
        print("\nSample Prevention Strategies:")
        print("="*70)
        for i, strategy in enumerate(strategies[:3], 1):
            print(f"\nCustomer {i}: {strategy['customer_id']}")
            print(f"  Churn Probability: {strategy['churn_probability']:.1%}")
            print(f"  Priority: {strategy['priority']}")
            print(f"  Issues: {', '.join(strategy['primary_issues'])}")
            print(f"  Actions: {'; '.join(strategy['recommended_actions'])}")

        return strategies_df


class ModelPipeline:
    """Bundle a preprocessor and a classifier for training prep and inference.

    Attributes:
        preprocessor: Object providing ``clean_data``, ``engineer_features``,
            ``encode_categorical`` and ``prepare_features``.
        model: Classifier exposing ``predict_proba``; ``None`` until one is
            attached (by ``save_pipeline``, ``load_pipeline`` or direct
            assignment).
        feature_names: Feature names reported by the preprocessor.
    """

    def __init__(self):
        self.preprocessor = DataPreprocessor()
        self.model = None
        self.feature_names = None

    def _preprocess(self, df: pd.DataFrame, fit: bool):
        """Run clean -> engineer -> encode -> prepare and return ``(X, y)``."""
        df_clean = self.preprocessor.clean_data(df)
        df_features = self.preprocessor.engineer_features(df_clean)
        df_encoded = self.preprocessor.encode_categorical(df_features, fit=fit)
        return self.preprocessor.prepare_features(df_encoded, fit=fit)

    def train_pipeline(self, df: pd.DataFrame):
        """Fit the preprocessing steps on ``df`` and return model inputs.

        Despite the name, no estimator is trained here: the method only runs
        the preprocessor with ``fit=True`` and records its feature names.

        Returns:
            The ``(X, y)`` pair produced by ``prepare_features``.
        """
        X, y = self._preprocess(df, fit=True)

        self.feature_names = self.preprocessor.feature_names

        return X, y

    def predict_pipeline(self, df: pd.DataFrame) -> np.ndarray:
        """Return the positive-class probability for each row of ``df``.

        The already-fitted preprocessing steps are applied with ``fit=False``
        and the result is passed to ``self.model.predict_proba``.

        Raises:
            AttributeError: If no model has been attached to the pipeline.
        """
        if self.model is None:
            # Fail before doing any preprocessing work. AttributeError is the
            # type callers already got from calling predict_proba on None.
            raise AttributeError(
                "ModelPipeline has no model attached; assign `model`, or use "
                "save_pipeline()/load_pipeline(), before calling predict_pipeline()."
            )

        X, _ = self._preprocess(df, fit=False)

        # Column 1 of predict_proba is the positive (churn) class.
        predictions = self.model.predict_proba(X)[:, 1]

        return predictions

    def save_pipeline(self, model, filepath: str):
        """Pickle the model, preprocessor and feature names to ``filepath``.

        ``model`` is also stored on this instance as ``self.model``.
        """

        self.model = model

        pipeline_data = {
            'model': model,
            'preprocessor': self.preprocessor,
            'feature_names': self.feature_names
        }

        with open(filepath, 'wb') as f:
            pickle.dump(pipeline_data, f)

        print(f"\n✓ Pipeline saved to {filepath}")

    @classmethod
    def load_pipeline(cls, filepath: str):
        """Rebuild a pipeline from a file written by ``save_pipeline``.

        Raises:
            KeyError: If the pickled payload lacks one of the expected keys.
        """

        # SECURITY: unpickling executes arbitrary code embedded in the file.
        # Only load pipeline files that come from a trusted source.
        with open(filepath, 'rb') as f:
            pipeline_data = pickle.load(f)

        pipeline = cls()
        pipeline.model = pipeline_data['model']
        pipeline.preprocessor = pipeline_data['preprocessor']
        pipeline.feature_names = pipeline_data['feature_names']

        print(f"✓ Pipeline loaded from {filepath}")

        return pipeline

def main():
    """Execute the complete churn prediction system.

    Runs, in order: synthetic data generation, cleaning / feature
    engineering / encoding, EDA, a stratified 80/20 split, class balancing of
    the training split, model training and comparison, optional
    hyperparameter tuning of the best model, SHAP-based explanation,
    customer segmentation, prevention-strategy generation, pipeline
    persistence, an inference smoke test on ten sampled customers, and the
    JSON insights report.

    All artifacts are written under ``outputs/`` (created if missing).

    Returns:
        The insights dictionary that is also written to
        ``outputs/business_insights_report.json``. Its accuracy / ROC-AUC
        figures describe the model that is actually saved, i.e. the tuned
        model when tuning replaced the initial best model.
    """

    print("\n" + "="*70)
    print("CUSTOMER CHURN PREDICTION & BEHAVIORAL ANALYTICS SYSTEM")
    print("="*70)
    print("A comprehensive end-to-end machine learning solution")
    print("="*70 + "\n")

    # Create output directory
    import os
    os.makedirs('outputs', exist_ok=True)

    print("\n[STEP 1] Generating Customer Data...")
    data_generator = CustomerDataGenerator(n_samples=10000, random_state=42)
    df_raw = data_generator.generate_data()
    df_raw.to_csv('outputs/raw_customer_data.csv', index=False)
    print(f"✓ Generated {len(df_raw)} customer records")

    print("\n[STEP 2-3] Data Preprocessing and Feature Engineering...")
    preprocessor = DataPreprocessor()
    df_clean = preprocessor.clean_data(df_raw)
    df_features = preprocessor.engineer_features(df_clean)
    df_processed = preprocessor.encode_categorical(df_features, fit=True)
    df_processed.to_csv('outputs/processed_customer_data.csv', index=False)

    print("\n[STEP 4] Exploratory Data Analysis...")
    eda = ExploratoryAnalysis()
    summary = eda.generate_statistical_summary(df_features)
    summary.to_csv('outputs/statistical_summary.csv')

    fig_eda = eda.plot_churn_analysis(df_features, save_path='outputs/eda_visualizations.png')
    plt.close(fig_eda)
    print("✓ EDA visualizations saved")

    # Prepare data for modeling
    X, y = preprocessor.prepare_features(df_processed, fit=True)
    feature_names = preprocessor.feature_names

    # Train-test split (stratified so both splits keep the churn ratio)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    print("\n[STEP 5] Handling Class Imbalance...")
    model_trainer = ChurnPredictionModels()
    # Only the training split is rebalanced; the test split stays untouched.
    X_train_balanced, y_train_balanced = model_trainer.handle_class_imbalance(
        X_train, y_train, method='smote'
    )

    print("\n[STEP 6-8] Training Multiple ML Models...")
    model_trainer.train_models(X_train_balanced, y_train_balanced, X_test, y_test)

    # Store y_test in results for plotting
    for model_name in model_trainer.results:
        model_trainer.results[model_name]['y_true'] = y_test

    # Plot model comparison
    fig_models = model_trainer.plot_model_comparison(save_path='outputs/model_comparison.png')
    plt.close(fig_models)
    print("✓ Model comparison saved")

    # Select best model
    best_model_name, best_model = model_trainer.select_best_model()

    # Metrics quoted in the final report; replaced below if tuning swaps in
    # a different estimator.
    reported_accuracy = model_trainer.results[best_model_name]['accuracy']
    reported_roc_auc = model_trainer.results[best_model_name]['roc_auc']

    print("\n[STEP 7] Hyperparameter Tuning...")
    if best_model_name in ['Random Forest', 'XGBoost']:
        tuned_model = model_trainer.hyperparameter_tuning(
            X_train_balanced, y_train_balanced, model_name=best_model_name
        )
        if tuned_model:
            best_model = tuned_model
            # Re-evaluate tuned model so the report matches what gets saved
            y_pred = best_model.predict(X_test)
            y_pred_proba = best_model.predict_proba(X_test)[:, 1]
            reported_accuracy = accuracy_score(y_test, y_pred)
            reported_roc_auc = roc_auc_score(y_test, y_pred_proba)
            print(f"\nTuned model ROC-AUC: {reported_roc_auc:.4f}")

    print("\n[STEP 9] Applying Explainable AI (SHAP)...")
    explainer = ModelExplainability(best_model, X_train_balanced, feature_names)
    shap_values = explainer.compute_shap_values(X_test, sample_size=100)

    if shap_values is not None:
        fig_shap = explainer.plot_feature_importance(save_path='outputs/shap_feature_importance.png')
        if fig_shap:
            plt.close(fig_shap)

    feature_importance_df = explainer.get_feature_importance_scores(top_n=15)
    if feature_importance_df is not None:
        feature_importance_df.to_csv('outputs/feature_importance.csv', index=False)

    print("\n[STEP 10] Customer Segmentation...")
    segmenter = CustomerSegmentation(n_clusters=4)
    cluster_features = ['tenure_months', 'monthly_charges', 'login_frequency',
                       'support_tickets', 'complaints', 'payment_delay_days']
    df_segmented = segmenter.perform_segmentation(df_features, cluster_features)

    fig_segments = segmenter.visualize_segments(df_segmented, save_path='outputs/customer_segments.png')
    plt.close(fig_segments)

    df_segmented.to_csv('outputs/segmented_customers.csv', index=False)
    print("✓ Customer segmentation completed")

    print("\n[STEP 11] Generating Churn Prevention Strategies...")
    # Score every customer (train and test rows alike) with the final model.
    churn_proba = best_model.predict_proba(X)[:, 1]
    df_segmented['churn_probability'] = churn_proba

    prevention_engine = ChurnPreventionEngine()
    strategies_df = prevention_engine.generate_strategies(df_segmented, churn_proba)
    strategies_df.to_csv('outputs/churn_prevention_strategies.csv', index=False)
    print("✓ Prevention strategies generated")

    print("\n[STEP 12] Saving Models and Pipelines...")
    pipeline = ModelPipeline()
    # Reuse the preprocessor fitted above instead of the fresh default one.
    pipeline.preprocessor = preprocessor
    pipeline.feature_names = feature_names
    pipeline.save_pipeline(best_model, 'outputs/churn_prediction_pipeline.pkl')

    # Save all models
    with open('outputs/all_trained_models.pkl', 'wb') as f:
        pickle.dump(model_trainer.models, f)
    print("✓ All models saved")

    print("\n[STEP 13] Testing Inference Pipeline...")

    # Load pipeline
    loaded_pipeline = ModelPipeline.load_pipeline('outputs/churn_prediction_pipeline.pkl')

    # Test on new data (using a small sample from raw data)
    new_customers = df_raw.sample(10, random_state=42).copy()
    new_customers = new_customers.drop('churn', axis=1)  # Remove target

    predictions = loaded_pipeline.predict_pipeline(new_customers)

    new_customers['churn_probability'] = predictions
    # include_lowest=True closes the first bin at 0, so a probability of
    # exactly 0.0 is labelled 'Low' instead of becoming NaN.
    new_customers['risk_level'] = pd.cut(predictions,
                                         bins=[0, 0.3, 0.6, 1.0],
                                         labels=['Low', 'Medium', 'High'],
                                         include_lowest=True)

    print("\nSample Predictions on New Customers:")
    print(new_customers[['customer_id', 'tenure_months', 'monthly_charges',
                         'churn_probability', 'risk_level']].to_string(index=False))

    new_customers.to_csv('outputs/sample_predictions.csv', index=False)

    print("\n[STEP 14] Generating Business Insights Report...")

    insights = {
        'executive_summary': {
            'total_customers': len(df_raw),
            'overall_churn_rate': f"{df_raw['churn'].mean() * 100:.2f}%",
            'high_risk_customers': len(strategies_df),
            'best_model': best_model_name,
            'model_accuracy': f"{reported_accuracy:.2%}",
            'model_roc_auc': f"{reported_roc_auc:.4f}"
        },
        'key_churn_drivers': feature_importance_df.to_dict('records') if feature_importance_df is not None else [],
        'segment_insights': segmenter.segment_profiles.to_dict() if segmenter.segment_profiles is not None else {},
        'recommendations': [
            "1. Focus retention efforts on month-to-month contract customers",
            "2. Proactively address customers with 2+ support tickets",
            "3. Implement re-engagement campaigns for low-activity users",
            "4. Offer payment flexibility to customers with payment delays",
            "5. Provide personalized upgrade paths to high-value segments",
            "6. Monitor engagement decay patterns weekly",
            "7. Establish early warning system for customers inactive >14 days"
        ]
    }

    # default=str stringifies any value json cannot encode natively
    # (e.g. numpy scalars coming out of the DataFrames).
    with open('outputs/business_insights_report.json', 'w', encoding='utf-8') as f:
        json.dump(insights, f, indent=2, default=str)

    print("\n" + "="*70)
    print("SYSTEM EXECUTION COMPLETED SUCCESSFULLY")
    print("="*70)
    print("\nOutputs Generated:")
    print("  1. raw_customer_data.csv - Original dataset")
    print("  2. processed_customer_data.csv - Cleaned & engineered features")
    print("  3. statistical_summary.csv - Comprehensive EDA summary")
    print("  4. eda_visualizations.png - Exploratory visualizations")
    print("  5. model_comparison.png - Model performance comparison")
    print("  6. shap_feature_importance.png - Feature importance analysis")
    print("  7. feature_importance.csv - Top churn predictors")
    print("  8. customer_segments.png - Segmentation visualizations")
    print("  9. segmented_customers.csv - Customers with segments")
    print("  10. churn_prevention_strategies.csv - Actionable strategies")
    print("  11. churn_prediction_pipeline.pkl - Production-ready pipeline")
    print("  12. all_trained_models.pkl - All trained models")
    print("  13. sample_predictions.csv - Inference demo results")
    print("  14. business_insights_report.json - Executive summary")
    print("\n" + "="*70 + "\n")

    return insights


if __name__ == "__main__":
    # Run the whole system when executed as a script or notebook cell; the
    # report dictionary returned by main() stays available as `insights`.
    insights = main()


CUSTOMER CHURN PREDICTION & BEHAVIORAL ANALYTICS SYSTEM
A comprehensive end-to-end machine learning solution


[STEP 1] Generating Customer Data...
✓ Generated 10000 customer records

[STEP 2-3] Data Preprocessing and Feature Engineering...

DATA CLEANING

Missing Values Before Cleaning:
age                    641
total_charges          713
feature_usage_score    683
download_volume_gb     748
dtype: int64

monthly_charges: Capped 109 outliers

total_charges: Capped 132 outliers

download_volume_gb: Capped 698 outliers

Missing Values After Cleaning:
0 total missing values

FEATURE ENGINEERING

Created 32 features
New engineered features: 14

[STEP 4] Exploratory Data Analysis...

STATISTICAL SUMMARY

Dataset Shape: (10000, 34)

Churn Distribution:
churn
1    7774
0    2226
Name: count, dtype: int64

Churn Rate: 77.74%
✓ EDA visualizations saved

[STEP 5] Handling Class Imbalance...

HANDLING CLASS IMBALANCE

Original class distribution:
  Class 0: 1781 (22.26%)
  Class 1: 6219 (77.74