In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (roc_auc_score, precision_recall_curve,
                           average_precision_score, confusion_matrix,
                           classification_report)
from sklearn.inspection import permutation_importance
import warnings
warnings.filterwarnings('ignore')

In [2]:
class AdvancedLeadScoringModel:
    """
    Lead Scoring Model with interpretability features.
    """

    def __init__(self):
        self.scaler = StandardScaler()
        self.model = RandomForestClassifier(
            n_estimators=100,
            class_weight='balanced',
            random_state=42
        )
        self.feature_importance = None
        self.threshold = 0.5
        self.documentation = {}

    def _safe_division(self, a, b):
        """Safely perform division handling zeros and infinities"""
        return np.where(b != 0, a / b, 0)

    def prepare_features(self, df):
        """
        Prepare features for the model
        """
        try:
            features = pd.DataFrame()
            feature_docs = []

            # Base metrics
            base_metrics = [
                'Outbound Calls (last month)',
                'Personalized Outbound Emails (last month)',
                'Demo Meeting Set (last month)',
                'Demo Meeting Completed (last month)'
            ]

            for metric in base_metrics:
                features[metric] = df[metric]
                feature_docs.append({
                    'name': metric,
                    'type': 'base_metric',
                    'description': f'Raw count of {metric.split("(")[0].strip()}'
                })

            # Activity ratios
            ratios = {
                'email_to_call_ratio': (
                    df['Personalized Outbound Emails (last month)'],
                    df['Outbound Calls (last month)']
                ),
                'contact_rate': (
                    df['Calls with Correct Contact (last month)'],
                    df['Outbound Calls (last month)']
                ),
                'demo_set_rate': (
                    df['Demo Meeting Set (last month)'],
                    df['Calls with Correct Contact (last month)']
                ),
                'demo_completion_rate': (
                    df['Demo Meeting Completed (last month)'],
                    df['Demo Meeting Set (last month)']
                )
            }

            for name, (numerator, denominator) in ratios.items():
                features[name] = self._safe_division(numerator, denominator)
                feature_docs.append({
                    'name': name,
                    'type': 'ratio',
                    'description': f'Ratio of {numerator.name} to {denominator.name}',
                    'calculation': f'{numerator.name} / {denominator.name}'
                })

            # Month-over-month changes
            metrics = ['Outbound Calls', 'Demo Meeting Set', 'Opportunity Created']
            for metric in metrics:
                current = f'{metric} (last month)'
                previous = f'{metric} (month before last)'
                change_name = f'{metric}_change'

                change = self._safe_division(
                    df[current] - df[previous],
                    df[previous]
                )
                features[change_name] = np.clip(change, -1, 1)

                feature_docs.append({
                    'name': change_name,
                    'type': 'trend',
                    'description': f'Month-over-month change in {metric}',
                    'calculation': f'({current} - {previous}) / {previous}'
                })

            # Store feature documentation
            self.documentation['features'] = feature_docs

            # Clean data
            features = features.replace([np.inf, -np.inf], 0)
            features = features.fillna(0)

            return features

        except Exception as e:
            print(f"Error in preparing features: {str(e)}")
            return pd.DataFrame()

    def train_with_validation(self, df):
        """
        Train the model with comprehensive validation
        """
        try:
            # Prepare features and target
            X = self.prepare_features(df)
            y = (df['Opportunity Created (last month)'] > 0).astype(int)

            # Split data
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=42, stratify=y
            )

            # Scale features
            X_train_scaled = self.scaler.fit_transform(X_train)
            X_test_scaled = self.scaler.transform(X_test)

            # Cross-validation with multiple metrics
            cv = KFold(n_splits=5, shuffle=True, random_state=42)
            cv_metrics = {}

            # Calculate cross-validation scores for each metric separately
            scoring_metrics = ['accuracy', 'roc_auc', 'precision', 'recall']
            for metric in scoring_metrics:
                scores = cross_val_score(
                    self.model, X_train_scaled, y_train,
                    cv=cv, scoring=metric
                )
                cv_metrics[metric] = {
                    'mean': scores.mean(),
                    'std': scores.std()
                }

            # Train final model
            self.model.fit(X_train_scaled, y_train)

            # Calculate feature importance
            self.feature_importance = pd.DataFrame({
                'feature': X.columns,
                'importance': self.model.feature_importances_
            }).sort_values('importance', ascending=False)

            # Calculate permutation importance
            perm_importance = permutation_importance(
                self.model, X_test_scaled, y_test,
                n_repeats=10, random_state=42
            )

            # Generate model documentation
            model_doc = {
                'model_type': 'RandomForestClassifier',
                'parameters': self.model.get_params(),
                'feature_count': len(X.columns),
                'training_samples': len(X_train),
                'test_samples': len(X_test),
                'class_distribution': {
                    'positive': y.sum(),
                    'negative': len(y) - y.sum()
                }
            }

            # Calculate detailed metrics
            y_pred_proba = self.model.predict_proba(X_test_scaled)[:, 1]
            y_pred = y_pred_proba > self.threshold

            metrics = {
                'cross_validation': cv_metrics,
                'test_performance': {
                    'classification_report': classification_report(
                        y_test, y_pred, output_dict=True
                    ),
                    'confusion_matrix': confusion_matrix(y_test, y_pred).tolist(),
                    'roc_auc': roc_auc_score(y_test, y_pred_proba)
                },
                'feature_importance': {
                    'random_forest': self.feature_importance.to_dict('records'),
                    'permutation': {
                        'mean': perm_importance.importances_mean.tolist(),
                        'std': perm_importance.importances_std.tolist()
                    }
                }
            }

            # Store documentation
            self.documentation.update({
                'model': model_doc,
                'metrics': metrics
            })

            return {
                'model_doc': model_doc,
                'metrics': metrics,
                'feature_importance': self.feature_importance
            }

        except Exception as e:
            print(f"Error in training model: {str(e)}")
            return None

    def score_leads(self, df):
        """
        Score new leads using the trained model
        """
        try:
            # Prepare features
            X = self.prepare_features(df)
            if X.empty:
                raise ValueError("Failed to prepare features")

            # Scale features
            X_scaled = self.scaler.transform(X)

            # Get probability scores
            scores = self.model.predict_proba(X_scaled)[:, 1]

            # Create scoring summary
            scoring_summary = pd.DataFrame({
                'user_name': df['user_name'],
                'lead_score': scores,
                'probability': scores,
                'is_likely_opportunity': scores > self.threshold
            })

            # Add score categories
            scoring_summary['category'] = pd.qcut(
                scores,
                q=5,
                labels=['Very Low', 'Low', 'Medium', 'High', 'Very High']
            )

            return scoring_summary

        except Exception as e:
            print(f"Error in scoring leads: {str(e)}")
            return None

In [3]:
def main(df):
    """
    Main function to run the lead scoring system
    """
    try:
        # Initialize model
        model = AdvancedLeadScoringModel()

        # Train and validate model
        training_results = model.train_with_validation(df)
        if training_results is None:
            raise ValueError("Model training failed")

        # Score leads
        scoring_results = model.score_leads(df)
        if scoring_results is None:
            raise ValueError("Lead scoring failed")

        # Compile complete documentation
        documentation = {
            'model_documentation': model.documentation,
            'training_results': training_results,
            'scoring_results': scoring_results
        }

        return documentation

    except Exception as e:
        print(f"Error in main execution: {str(e)}")
        return None

if __name__ == "__main__":



  # Read data
  df = pd.read_csv('monthly_report_expanded.csv')

  # Run analysis
  results = main(df)

  if results is not None:
      print("\nModel Documentation:")
      print(results['model_documentation']['model'])

      print("\nCross-validation Results:")
      print(results['model_documentation']['metrics']['cross_validation'])

      print("\nFeature Importance:")
      print(results['training_results']['feature_importance'].head())

      print("\nLead Scoring Results:")
      print(results['scoring_results'].head())
  else:
      print("Analysis failed to complete")





Model Documentation:
{'model_type': 'RandomForestClassifier', 'parameters': {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}, 'feature_count': 11, 'training_samples': 40, 'test_samples': 10, 'class_distribution': {'positive': 25, 'negative': 25}}

Cross-validation Results:
{'accuracy': {'mean': 0.925, 'std': 0.1}, 'roc_auc': {'mean': 0.9866666666666667, 'std': 0.02666666666666666}, 'precision': {'mean': 1.0, 'std': 0.0}, 'recall': {'mean': 0.9028571428571428, 'std': 0.1220237514517864}}

Feature Importance:
                               feature  importance
3  Demo Meeting Completed (last month)    0.324152
2        Demo