In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

def generate_synthetic_expense_data(num_records=1000, num_employees=50):
    """
    Generate synthetic expense report data with some intentional anomalies and policy violations.
    """
    # Static data
    departments = ['Sales', 'Engineering', 'Marketing', 'HR', 'Finance']
    expense_categories = ['Travel', 'Meals', 'Office Supplies', 'Technology', 'Training']
    vendors = ['Airlines Corp', 'Hotel Chain', 'Restaurant A', 'Office Store', 'Tech Store', 
               'Training Corp', 'Local Restaurant', 'Online Retailer']
    payment_methods = ['Corporate Card', 'Personal Card', 'Cash']
    
    # Budget limits per category
    category_limits = {
        'Travel': 5000,
        'Meals': 200,
        'Office Supplies': 500,
        'Technology': 2000,
        'Training': 1500
    }
    
    # Department-specific authorized categories
    dept_categories = {
        'Sales': ['Travel', 'Meals', 'Technology'],
        'Engineering': ['Technology', 'Training', 'Office Supplies'],
        'Marketing': ['Travel', 'Meals', 'Technology', 'Training'],
        'HR': ['Office Supplies', 'Training'],
        'Finance': ['Office Supplies', 'Technology']
    }
    
    # Generate base data
    data = []
    start_date = datetime(2024, 1, 1)
    
    for _ in range(num_records):
        employee_id = f'EMP{random.randint(1, num_employees):03d}'
        department = random.choice(departments)
        submission_date = start_date + timedelta(days=random.randint(0, 365))
        expense_date = submission_date - timedelta(days=random.randint(0, 30))
        
        # Intentionally create some policy violations
        is_violation = random.random() < 0.15  # 15% chance of violation
        
        if is_violation:
            # Generate a violation case
            violation_type = random.choice(['over_budget', 'unauthorized', 'duplicate'])
            
            if violation_type == 'over_budget':
                category = random.choice(list(category_limits.keys()))
                amount = category_limits[category] * random.uniform(1.1, 2.0)  # Exceed limit
            elif violation_type == 'unauthorized':
                category = random.choice(expense_categories)
                while category in dept_categories[department]:
                    category = random.choice(expense_categories)
                amount = random.uniform(50, 500)
            else:  # duplicate
                category = random.choice(expense_categories)
                amount = random.uniform(50, category_limits[category])
        else:
            # Generate a normal case
            category = random.choice(dept_categories[department])
            amount = random.uniform(50, category_limits[category] * 0.8)
        
        record = {
            'expense_id': f'EXP{_:06d}',
            'employee_id': employee_id,
            'department': department,
            'submission_date': submission_date.strftime('%Y-%m-%d'),
            'expense_date': expense_date.strftime('%Y-%m-%d'),
            'amount': round(amount, 2),
            'currency': 'USD',
            'vendor': random.choice(vendors),
            'category': category,
            'payment_method': random.choice(payment_methods),
            'has_justification': random.random() > 0.1,  # 10% chance of missing justification
            'is_violation': is_violation
        }
        data.append(record)
    
    df = pd.DataFrame(data)
    return df

# Generate synthetic data
df = generate_synthetic_expense_data()

# Save to CSV
df.to_csv('synthetic_expense_data.csv', index=False)

# Print sample statistics
print("\nDataset Statistics:")
print(f"Total Records: {len(df)}")
print(f"Violation Rate: {(df['is_violation'].sum() / len(df)) * 100:.2f}%")
print("\nSample Records:")
print(df.head())

In [None]:
!pip install faker

In [None]:
!pip install pandas numpy scikit-learn xgboost joblib datetime
!pip install pandas numpy scikit-learn xgboost joblib

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

def generate_synthetic_expense_data(num_records=1000, num_employees=50):
    """
    Generate synthetic expense report data with intentional anomalies and policy violations.
    """
    # Static data
    departments = ['Sales', 'Engineering', 'Marketing', 'HR', 'Finance']
    expense_categories = ['Travel', 'Meals', 'Office Supplies', 'Technology', 'Training']
    vendors = ['Airlines Corp', 'Hotel Chain', 'Restaurant A', 'Office Store', 'Tech Store', 
               'Training Corp', 'Local Restaurant', 'Online Retailer']
    payment_methods = ['Corporate Card', 'Personal Card', 'Cash']
    
    # Budget limits per category
    category_limits = {
        'Travel': 5000,
        'Meals': 200,
        'Office Supplies': 500,
        'Technology': 2000,
        'Training': 1500
    }
    
    # Department-specific authorized categories
    dept_categories = {
        'Sales': ['Travel', 'Meals', 'Technology'],
        'Engineering': ['Technology', 'Training', 'Office Supplies'],
        'Marketing': ['Travel', 'Meals', 'Technology', 'Training'],
        'HR': ['Office Supplies', 'Training'],
        'Finance': ['Office Supplies', 'Technology']
    }
    
    # Generate base data
    data = []
    start_date = datetime(2024, 1, 1)
    
    for i in range(num_records):
        employee_id = f'EMP{random.randint(1, num_employees):03d}'
        department = random.choice(departments)
        submission_date = start_date + timedelta(days=random.randint(0, 365))
        expense_date = submission_date - timedelta(days=random.randint(0, 30))
        
        # Intentionally create some policy violations
        is_violation = random.random() < 0.15  # 15% chance of violation
        
        if is_violation:
            # Generate a violation case
            violation_type = random.choice(['over_budget', 'unauthorized', 'duplicate'])
            
            if violation_type == 'over_budget':
                category = random.choice(list(category_limits.keys()))
                amount = category_limits[category] * random.uniform(1.1, 2.0)  # Exceed limit
            elif violation_type == 'unauthorized':
                category = random.choice(expense_categories)
                while category in dept_categories[department]:
                    category = random.choice(expense_categories)
                amount = random.uniform(50, 500)
            else:  # duplicate
                category = random.choice(expense_categories)
                amount = random.uniform(50, category_limits[category])
        else:
            # Generate a normal case
            category = random.choice(dept_categories[department])
            amount = random.uniform(50, category_limits[category] * 0.8)
        
        # Add more realistic details
        project_code = f'PRJ{random.randint(100,999)}' if random.random() > 0.7 else None
        receipt_number = f'RCP{random.randint(10000,99999)}' if random.random() > 0.1 else None
        
        record = {
            'expense_id': f'EXP{i:06d}',
            'employee_id': employee_id,
            'department': department,
            'submission_date': submission_date.strftime('%Y-%m-%d'),
            'expense_date': expense_date.strftime('%Y-%m-%d'),
            'amount': round(amount, 2),
            'currency': 'USD',
            'vendor': random.choice(vendors),
            'category': category,
            'payment_method': random.choice(payment_methods),
            'has_justification': random.random() > 0.1,  # 10% chance of missing justification
            'project_code': project_code,
            'receipt_number': receipt_number,
            'approval_status': random.choice(['Pending', 'Approved', 'Rejected']) if amount > 1000 else 'Approved',
            'violation_type': violation_type if is_violation else None,
            'is_violation': is_violation
        }
        data.append(record)
    
    df = pd.DataFrame(data)
    
    # Add some basic data validation
    df['days_to_submit'] = (pd.to_datetime(df['submission_date']) - 
                           pd.to_datetime(df['expense_date'])).dt.days
    
    # Generate summary statistics
    summary_stats = {
        'total_records': len(df),
        'total_amount': df['amount'].sum(),
        'average_amount': df['amount'].mean(),
        'violation_rate': (df['is_violation'].sum() / len(df)) * 100,
        'violations_by_type': df[df['is_violation']]['violation_type'].value_counts().to_dict(),
        'expenses_by_department': df.groupby('department')['amount'].sum().to_dict(),
        'average_submission_delay': df['days_to_submit'].mean()
    }
    
    # Save to CSV
    df.to_csv('synthetic_expense_data.csv', index=False)
    
    # Print statistics
    print("\nDataset Statistics:")
    print(f"Total Records: {summary_stats['total_records']}")
    print(f"Total Amount: ${summary_stats['total_amount']:,.2f}")
    print(f"Average Amount: ${summary_stats['average_amount']:.2f}")
    print(f"Violation Rate: {summary_stats['violation_rate']:.2f}%")
    print("\nViolations by Type:")
    for vtype, count in summary_stats['violations_by_type'].items():
        print(f"  {vtype}: {count}")
    print("\nExpenses by Department:")
    for dept, amount in summary_stats['expenses_by_department'].items():
        print(f"  {dept}: ${amount:,.2f}")
    print(f"\nAverage Submission Delay: {summary_stats['average_submission_delay']:.1f} days")
    print("\nSample Records:")
    print(df.head())
    
    return df, summary_stats

if __name__ == "__main__":
    # Generate synthetic data with default parameters
    df, stats = generate_synthetic_expense_data()
    
    # You can also generate with custom parameters
    # df, stats = generate_synthetic_expense_data(num_records=2000, num_employees=100)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.base import BaseEstimator, TransformerMixin
import xgboost as xgb
from datetime import datetime
import joblib

class DateFeatureExtractor(BaseEstimator, TransformerMixin):
    """Custom transformer for extracting date-based features"""
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = X.copy()
        
        # Convert date strings to datetime
        X['submission_date'] = pd.to_datetime(X['submission_date'])
        X['expense_date'] = pd.to_datetime(X['expense_date'])
        
        # Extract date features
        X['submission_delay'] = (X['submission_date'] - X['expense_date']).dt.days
        X['day_of_week'] = X['expense_date'].dt.dayofweek
        X['is_weekend'] = X['day_of_week'].isin([5, 6]).astype(int)
        X['month'] = X['expense_date'].dt.month
        X['quarter'] = X['expense_date'].dt.quarter
        X['is_month_end'] = X['expense_date'].dt.is_month_end.astype(int)
        
        # Drop original date columns
        X = X.drop(['submission_date', 'expense_date'], axis=1)
        
        return X

class AdvancedExpenseDetector:
    def __init__(self):
        self.models = {}
        self.preprocessor = None
        self.feature_importances_ = None
        
    def prepare_data(self, df):
        """Prepare data for training/prediction"""
        df = df.copy()
        
        # Define feature columns
        numeric_features = [
            'amount', 'amount_local', 'vendor_risk_score', 'receipt_quality',
            'ocr_confidence', 'num_attendees', 'employee_risk_score', 'previous_violations'
        ]
        
        categorical_features = [
            'department', 'seniority', 'category', 'currency', 'vendor_country',
            'payment_method', 'cost_center'
        ]
        
        date_features = ['submission_date', 'expense_date']
        
        binary_features = [
            'requires_approval', 'high_risk_category', 'has_receipt',
            'manual_review_required'
        ]
        
        # Create preprocessing pipeline
        self.preprocessor = ColumnTransformer(
            transformers=[
                ('num', StandardScaler(), numeric_features),
                ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
                ('date', DateFeatureExtractor(), date_features),
                ('bin', 'passthrough', binary_features)
            ])
        
        return df
    
    def train_models(self, df, target_column='is_violation'):
        """Train multiple models for expense violation detection"""
        # Prepare data
        df = self.prepare_data(df)
        
        # Remove unnecessary columns
        columns_to_drop = ['expense_id', 'employee_id', 'vendor_id', 'vendor_name', 
                          'project_code', 'notes', 'approval_status', 'approval_date']
        features = df.drop(columns_to_drop + [target_column], axis=1, errors='ignore')
        target = df[target_column]
        
        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            features, target, test_size=0.2, random_state=42, stratify=target
        )
        
        # Initialize models
        models = {
            'random_forest': Pipeline([
                ('preprocessor', self.preprocessor),
                ('classifier', RandomForestClassifier(n_estimators=100, 
                                                    max_depth=10,
                                                    class_weight='balanced',
                                                    random_state=42))
            ]),
            'gradient_boosting': Pipeline([
                ('preprocessor', self.preprocessor),
                ('classifier', GradientBoostingClassifier(n_estimators=100,
                                                        learning_rate=0.1,
                                                        max_depth=5,
                                                        random_state=42))
            ]),
            'xgboost': Pipeline([
                ('preprocessor', self.preprocessor),
                ('classifier', xgb.XGBClassifier(n_estimators=100,
                                               learning_rate=0.1,
                                               max_depth=5,
                                               random_state=42))
            ])
        }
        
        # Train and evaluate each model
        results = {}
        for name, model in models.items():
            print(f"\nTraining {name}...")
            model.fit(X_train, y_train)
            
            # Make predictions
            y_pred = model.predict(X_test)
            
            # Calculate metrics
            results[name] = {
                'classification_report': classification_report(y_test, y_pred),
                'confusion_matrix': confusion_matrix(y_test, y_pred),
                'model': model
            }
            
            print(f"\n{name} Results:")
            print(results[name]['classification_report'])
        
        # Store the best model
        best_model = max(results.items(), 
                        key=lambda x: float(x[1]['classification_report'].split('\n')[-2].split()[-2]))
        self.models['best_model'] = best_model[1]['model']
        
        # Store feature importances if available
        if hasattr(self.models['best_model'].named_steps['classifier'], 'feature_importances_'):
            feature_names = (numeric_features + 
                           self.preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features).tolist() +
                           ['submission_delay', 'day_of_week', 'is_weekend', 'month', 'quarter', 'is_month_end'] +
                           binary_features)
            
            self.feature_importances_ = pd.DataFrame({
                'feature': feature_names,
                'importance': self.models['best_model'].named_steps['classifier'].feature_importances_
            }).sort_values('importance', ascending=False)
        
        return results
    
    def predict(self, df):
        """Make predictions on new data"""
        if not self.models.get('best_model'):
            raise ValueError("Model has not been trained yet. Call train_models() first.")
        
        # Prepare data
        df = self.prepare_data(df)
        
        # Remove unnecessary columns
        columns_to_drop = ['expense_id', 'employee_id', 'vendor_id', 'vendor_name', 
                          'project_code', 'notes', 'approval_status', 'approval_date']
        features = df.drop(columns_to_drop + ['is_violation'], axis=1, errors='ignore')
        
        # Make predictions
        predictions = self.models['best_model'].predict(features)
        probabilities = self.models['best_model'].predict_proba(features)
        
        return predictions, probabilities
    
    def get_feature_importance(self):
        """Return feature importance analysis"""
        if self.feature_importances_ is None:
            raise ValueError("Feature importances not available. Train the model first.")
        return self.feature_importances_
    
    def save_model(self, filepath):
        """Save the trained model"""
        if not self.models.get('best_model'):
            raise ValueError("No trained model to save")
        joblib.dump(self.models['best_model'], filepath)
    
    def load_model(self, filepath):
        """Load a trained model"""
        self.models['best_model'] = joblib.load(filepath)
        return self

# Example usage
if __name__ == "__main__":
    # Load your data
    df = pd.read_csv('synthetic_expense_data.csv')
    
    # Initialize and train the detector
    detector = AdvancedExpenseDetector()
    results = detector.train_models(df)
    
    # Save the best model
    detector.save_model('expense_detector_model.joblib')
    
    # Get feature importance
    try:
        importance_df = detector.get_feature_importance()
        print("\nTop 10 Most Important Features:")
        print(importance_df.head(10))
    except:
        print("\nFeature importance not available for this model")
    
    # Make predictions on new data
    predictions, probabilities = detector.predict(df.head())
    print("\nSample Predictions:")
    for i, (pred, prob) in enumerate(zip(predictions, probabilities)):
        print(f"Record {i+1}: {'Violation' if pred else 'Normal'} (Confidence: {max(prob)*100:.2f}%)")

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.base import BaseEstimator, TransformerMixin
import xgboost as xgb
from datetime import datetime
import joblib

class DateFeatureExtractor(BaseEstimator, TransformerMixin):
    """Custom transformer for extracting date-based features."""
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = X.copy()
        # Convert date strings to datetime
        X['submission_date'] = pd.to_datetime(X['submission_date'])
        X['expense_date'] = pd.to_datetime(X['expense_date'])
        
        # Extract date features
        X['submission_delay'] = (X['submission_date'] - X['expense_date']).dt.days
        X['day_of_week'] = X['expense_date'].dt.dayofweek
        X['is_weekend'] = X['day_of_week'].isin([5, 6]).astype(int)
        X['month'] = X['expense_date'].dt.month
        X['quarter'] = X['expense_date'].dt.quarter
        X['is_month_end'] = X['expense_date'].dt.is_month_end.astype(int)
        
        # Drop the original date columns
        X = X.drop(['submission_date', 'expense_date'], axis=1)
        return X

class AdvancedExpenseDetector:
    def __init__(self):
        self.models = {}
        self.preprocessor = None
        self.feature_importances_ = None
        
        # Store feature lists as instance variables for later use
        self.numeric_features = []
        self.categorical_features = []
        self.date_features = []
        self.binary_features = []
    
    def prepare_data(self, df):
        """Prepare data for training/prediction by creating a preprocessing pipeline."""
        df = df.copy()
        
        # Define all potential numeric features
        all_numeric_features = [
            'amount', 'amount_local', 'vendor_risk_score', 'receipt_quality',
            'ocr_confidence', 'num_attendees', 'employee_risk_score', 'previous_violations'
        ]
        # Only keep those columns that actually exist in the DataFrame
        self.numeric_features = [feat for feat in all_numeric_features if feat in df.columns]
        
        # Define categorical features (assumes these columns exist in your data)
        self.categorical_features = [
            'department', 'seniority', 'category', 'currency', 'vendor_country',
            'payment_method', 'cost_center'
        ]
        
        # Define date features (must be present for date processing)
        self.date_features = ['submission_date', 'expense_date']
        
        # Define binary features (again, check these exist in your data)
        self.binary_features = [
            'requires_approval', 'high_risk_category', 'has_receipt',
            'manual_review_required'
        ]
        
        # Create the preprocessing pipeline
        self.preprocessor = ColumnTransformer(
            transformers=[
                ('num', StandardScaler(), self.numeric_features),
                ('cat', OneHotEncoder(handle_unknown='ignore'), self.categorical_features),
                ('date', DateFeatureExtractor(), self.date_features),
                ('bin', 'passthrough', self.binary_features)
            ]
        )
        
        return df
    
    def train_models(self, df, target_column='is_violation'):
        """Train multiple models for expense violation detection."""
        # Prepare the data and build the preprocessor
        df = self.prepare_data(df)
        
        # Remove unnecessary columns
        columns_to_drop = ['expense_id', 'employee_id', 'vendor_id', 'vendor_name', 
                           'project_code', 'notes', 'approval_status', 'approval_date']
        features = df.drop(columns=columns_to_drop + [target_column], axis=1, errors='ignore')
        target = df[target_column]
        
        # Split the data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(
            features, target, test_size=0.2, random_state=42, stratify=target
        )
        
        # Initialize the models within pipelines
        models = {
            'random_forest': Pipeline([
                ('preprocessor', self.preprocessor),
                ('classifier', RandomForestClassifier(n_estimators=100, 
                                                        max_depth=10,
                                                        class_weight='balanced',
                                                        random_state=42))
            ]),
            'gradient_boosting': Pipeline([
                ('preprocessor', self.preprocessor),
                ('classifier', GradientBoostingClassifier(n_estimators=100,
                                                          learning_rate=0.1,
                                                          max_depth=5,
                                                          random_state=42))
            ]),
            'xgboost': Pipeline([
                ('preprocessor', self.preprocessor),
                ('classifier', xgb.XGBClassifier(n_estimators=100,
                                                 learning_rate=0.1,
                                                 max_depth=5,
                                                 random_state=42))
            ])
        }
        
        # Train each model and evaluate performance
        results = {}
        for name, model in models.items():
            print(f"\nTraining {name}...")
            model.fit(X_train, y_train)
            
            # Make predictions on the test set
            y_pred = model.predict(X_test)
            
            # Calculate metrics and store results
            results[name] = {
                'classification_report': classification_report(y_test, y_pred),
                'confusion_matrix': confusion_matrix(y_test, y_pred),
                'model': model
            }
            
            print(f"\n{name} Results:")
            print(results[name]['classification_report'])
        
        # Store the best model (here, we simply choose the one with highest overall F1 score)
        # Note: Adjust the selection criteria as needed.
        best_model = max(results.items(), key=lambda x: float(x[1]['classification_report'].split()[-2]))
        self.models['best_model'] = best_model[1]['model']
        
        # Compute feature importances if the classifier supports it
        classifier = self.models['best_model'].named_steps['classifier']
        if hasattr(classifier, 'feature_importances_'):
            # Get one-hot encoder feature names
            cat_feature_names = self.preprocessor.named_transformers_['cat'] \
                                .get_feature_names_out(self.categorical_features).tolist()
            # Combine all feature names in the order they are passed to the classifier
            feature_names = (self.numeric_features +
                             cat_feature_names +
                             ['submission_delay', 'day_of_week', 'is_weekend', 'month', 'quarter', 'is_month_end'] +
                             self.binary_features)
            
            self.feature_importances_ = pd.DataFrame({
                'feature': feature_names,
                'importance': classifier.feature_importances_
            }).sort_values('importance', ascending=False)
        
        return results
    
    def predict(self, df):
        """Make predictions on new data."""
        if 'best_model' not in self.models:
            raise ValueError("Model has not been trained yet. Call train_models() first.")
        
        # Prepare data using the same preprocessor
        df = self.prepare_data(df)
        columns_to_drop = ['expense_id', 'employee_id', 'vendor_id', 'vendor_name', 
                           'project_code', 'notes', 'approval_status', 'approval_date']
        features = df.drop(columns=columns_to_drop + ['is_violation'], axis=1, errors='ignore')
        
        predictions = self.models['best_model'].predict(features)
        probabilities = self.models['best_model'].predict_proba(features)
        return predictions, probabilities
    
    def get_feature_importance(self):
        """Return feature importance analysis."""
        if self.feature_importances_ is None:
            raise ValueError("Feature importances not available. Train the model first.")
        return self.feature_importances_
    
    def save_model(self, filepath):
        """Save the trained model to disk."""
        if 'best_model' not in self.models:
            raise ValueError("No trained model to save.")
        joblib.dump(self.models['best_model'], filepath)
    
    def load_model(self, filepath):
        """Load a trained model from disk."""
        self.models['best_model'] = joblib.load(filepath)
        return self

# Example usage:
if __name__ == "__main__":
    # Load your synthetic data CSV file
    df = pd.read_csv('synthetic_expense_data.csv')
    
    # Initialize and train the detector
    detector = AdvancedExpenseDetector()
    results = detector.train_models(df)
    
    # Save the best model
    detector.save_model('expense_detector_model.joblib')
    
    # Get and print feature importances if available
    try:
        importance_df = detector.get_feature_importance()
        print("\nTop 10 Most Important Features:")
        print(importance_df.head(10))
    except Exception as e:
        print("\nFeature importance not available:", e)
    
    # Make predictions on a sample of the data
    predictions, probabilities = detector.predict(df.head())
    print("\nSample Predictions:")
    for i, (pred, prob) in enumerate(zip(predictions, probabilities)):
        label = 'Violation' if pred else 'Normal'
        confidence = max(prob) * 100
        print(f"Record {i+1}: {label} (Confidence: {confidence:.2f}%)")


In [2]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.base import BaseEstimator, TransformerMixin
import xgboost as xgb
import joblib

# ----------------------------
# Custom Transformer: DateFeatureExtractor
# ----------------------------
class DateFeatureExtractor(BaseEstimator, TransformerMixin):
    """Extract date-based features from submission_date and expense_date."""
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = X.copy()
        # Convert date strings to datetime
        X['submission_date'] = pd.to_datetime(X['submission_date'])
        X['expense_date'] = pd.to_datetime(X['expense_date'])
        
        # Extract date-based features
        X['submission_delay'] = (X['submission_date'] - X['expense_date']).dt.days
        X['day_of_week'] = X['expense_date'].dt.dayofweek
        X['is_weekend'] = X['day_of_week'].isin([5, 6]).astype(int)
        X['month'] = X['expense_date'].dt.month
        X['quarter'] = X['expense_date'].dt.quarter
        X['is_month_end'] = X['expense_date'].dt.is_month_end.astype(int)
        
        # Drop the original date columns
        X = X.drop(['submission_date', 'expense_date'], axis=1)
        return X

# ----------------------------
# Detector Class: AdvancedExpenseDetector
# ----------------------------
class AdvancedExpenseDetector:
    def __init__(self):
        self.models = {}
        self.preprocessor = None
        self.feature_importances_ = None

        # Initialize feature lists (will be filtered based on input DataFrame)
        self.numeric_features = []
        self.categorical_features = []
        self.date_features = []
        self.binary_features = []

    def prepare_data(self, df):
        """Prepare data by setting up the preprocessing pipeline, filtering out missing columns."""
        df = df.copy()

        # Define potential feature lists
        potential_numeric_features = [
            'amount', 'amount_local', 'vendor_risk_score', 'receipt_quality',
            'ocr_confidence', 'num_attendees', 'employee_risk_score', 'previous_violations'
        ]
        potential_categorical_features = [
            'department', 'seniority', 'category', 'currency', 'vendor_country',
            'payment_method', 'cost_center'
        ]
        potential_date_features = ['submission_date', 'expense_date']
        potential_binary_features = [
            'requires_approval', 'high_risk_category', 'has_receipt', 'manual_review_required'
        ]

        # Filter features based on the columns present in the DataFrame
        self.numeric_features = [col for col in potential_numeric_features if col in df.columns]
        self.categorical_features = [col for col in potential_categorical_features if col in df.columns]
        self.date_features = [col for col in potential_date_features if col in df.columns]
        self.binary_features = [col for col in potential_binary_features if col in df.columns]

        # Create the preprocessing pipeline
        self.preprocessor = ColumnTransformer(
            transformers=[
                ('num', StandardScaler(), self.numeric_features),
                ('cat', OneHotEncoder(handle_unknown='ignore'), self.categorical_features),
                ('date', DateFeatureExtractor(), self.date_features),
                ('bin', 'passthrough', self.binary_features)
            ]
        )
        return df

    def train_models(self, df, target_column='is_violation'):
        """Train multiple models to detect expense violations."""
        # Prepare the data and set up the preprocessor
        df = self.prepare_data(df)

        # Drop columns that are not used as features
        columns_to_drop = ['expense_id', 'employee_id', 'vendor_id', 'vendor_name', 
                           'project_code', 'notes', 'approval_status', 'approval_date']
        features = df.drop(columns=columns_to_drop + [target_column], axis=1, errors='ignore')
        target = df[target_column]

        # Split the dataset into training and testing subsets
        X_train, X_test, y_train, y_test = train_test_split(
            features, target, test_size=0.2, random_state=42, stratify=target
        )

        # Define models inside pipelines
        models = {
            'random_forest': Pipeline([
                ('preprocessor', self.preprocessor),
                ('classifier', RandomForestClassifier(n_estimators=100, 
                                                        max_depth=10,
                                                        class_weight='balanced',
                                                        random_state=42))
            ]),
            'gradient_boosting': Pipeline([
                ('preprocessor', self.preprocessor),
                ('classifier', GradientBoostingClassifier(n_estimators=100,
                                                          learning_rate=0.1,
                                                          max_depth=5,
                                                          random_state=42))
            ]),
            'xgboost': Pipeline([
                ('preprocessor', self.preprocessor),
                ('classifier', xgb.XGBClassifier(n_estimators=100,
                                                 learning_rate=0.1,
                                                 max_depth=5,
                                                 use_label_encoder=False,
                                                 eval_metric='logloss',
                                                 random_state=42))
            ])
        }

        results = {}
        # Train and evaluate each model
        for name, model in models.items():
            print(f"\nTraining {name}...")
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            results[name] = {
                'classification_report': classification_report(y_test, y_pred),
                'confusion_matrix': confusion_matrix(y_test, y_pred),
                'model': model
            }
            print(f"\n{name} Results:")
            print(results[name]['classification_report'])

        # Choose the best model (here, we simply select the first one)
        self.models['best_model'] = models['random_forest']

        # If the classifier supports feature importances, compute them.
        classifier = self.models['best_model'].named_steps['classifier']
        if hasattr(classifier, 'feature_importances_'):
            cat_feature_names = []
            if self.categorical_features:
                cat_feature_names = self.preprocessor.named_transformers_['cat'].get_feature_names_out(self.categorical_features).tolist()
            # Combine feature names (order must match transformation order)
            feature_names = (
                self.numeric_features +
                cat_feature_names +
                ['submission_delay', 'day_of_week', 'is_weekend', 'month', 'quarter', 'is_month_end'] +
                self.binary_features
            )
            self.feature_importances_ = pd.DataFrame({
                'feature': feature_names,
                'importance': classifier.feature_importances_
            }).sort_values('importance', ascending=False)

        return results

    def predict(self, df):
        """Make predictions on new data using the best model."""
        if 'best_model' not in self.models:
            raise ValueError("Model has not been trained yet. Call train_models() first.")
        df = self.prepare_data(df)
        columns_to_drop = ['expense_id', 'employee_id', 'vendor_id', 'vendor_name', 
                           'project_code', 'notes', 'approval_status', 'approval_date']
        features = df.drop(columns=columns_to_drop + ['is_violation'], axis=1, errors='ignore')
        predictions = self.models['best_model'].predict(features)
        probabilities = self.models['best_model'].predict_proba(features)
        return predictions, probabilities

    def get_feature_importance(self):
        """Return the computed feature importances (if available)."""
        if self.feature_importances_ is None:
            raise ValueError("Feature importances not available. Train the model first.")
        return self.feature_importances_

    def save_model(self, filepath):
        """Save the trained best model to disk."""
        if 'best_model' not in self.models:
            raise ValueError("No trained model to save.")
        joblib.dump(self.models['best_model'], filepath)

    def load_model(self, filepath):
        """Load a trained model from disk."""
        self.models['best_model'] = joblib.load(filepath)
        return self

# ----------------------------
# Synthetic Data Generation for Testing
# ----------------------------
def generate_synthetic_data(n_samples=100):
    np.random.seed(42)
    data = {}
    
    # ID and employee columns
    data['expense_id'] = np.arange(1, n_samples + 1)
    data['employee_id'] = np.random.randint(1000, 2000, n_samples)
    
    # Numeric features
    data['amount'] = np.random.uniform(10, 1000, n_samples).round(2)
    # For demonstration, we include amount_local here; you can remove it to test missing column handling.
    data['amount_local'] = (data['amount'] * np.random.uniform(0.9, 1.1, n_samples)).round(2)
    data['vendor_risk_score'] = np.random.uniform(0, 1, n_samples).round(2)
    data['receipt_quality'] = np.random.uniform(0, 1, n_samples).round(2)
    data['ocr_confidence'] = np.random.uniform(0, 1, n_samples).round(2)
    data['num_attendees'] = np.random.randint(1, 10, n_samples)
    data['employee_risk_score'] = np.random.uniform(0, 1, n_samples).round(2)
    data['previous_violations'] = np.random.randint(0, 5, n_samples)
    
    # Categorical features (intentionally leaving out "seniority" to test our fix)
    data['department'] = np.random.choice(['Sales', 'Engineering', 'HR', 'Marketing'], n_samples)
    # data['seniority'] is intentionally omitted to simulate a missing column.
    data['category'] = np.random.choice(['Travel', 'Meals', 'Supplies'], n_samples)
    data['currency'] = np.random.choice(['USD', 'EUR'], n_samples)
    data['vendor_country'] = np.random.choice(['US', 'FR', 'DE'], n_samples)
    data['payment_method'] = np.random.choice(['Credit Card', 'Cash', 'Wire Transfer'], n_samples)
    data['cost_center'] = np.random.choice(['A1', 'B2', 'C3'], n_samples)
    
    # Date features
    base_date = datetime.today()
    data['expense_date'] = [(base_date - timedelta(days=np.random.randint(1, 30))).strftime('%Y-%m-%d') for _ in range(n_samples)]
    data['submission_date'] = [(datetime.strptime(exp_date, '%Y-%m-%d') + timedelta(days=np.random.randint(0, 10))).strftime('%Y-%m-%d') for exp_date in data['expense_date']]
    
    # Binary features
    data['requires_approval'] = np.random.choice([0, 1], n_samples)
    data['high_risk_category'] = np.random.choice([0, 1], n_samples)
    data['has_receipt'] = np.random.choice([0, 1], n_samples)
    data['manual_review_required'] = np.random.choice([0, 1], n_samples)
    
    # Other non-feature columns
    data['vendor_id'] = np.random.randint(2000, 3000, n_samples)
    data['vendor_name'] = np.random.choice(['VendorA', 'VendorB', 'VendorC'], n_samples)
    data['project_code'] = np.random.choice(['P100', 'P200', 'P300'], n_samples)
    data['notes'] = [''] * n_samples
    data['approval_status'] = np.random.choice(['Approved', 'Rejected'], n_samples)
    data['approval_date'] = [base_date.strftime('%Y-%m-%d')] * n_samples
    
    # Target column
    data['is_violation'] = np.random.choice([0, 1], n_samples, p=[0.8, 0.2])
    
    df = pd.DataFrame(data)
    return df

# ----------------------------
# Main Execution
# ----------------------------
if __name__ == "__main__":
    # Generate synthetic data
    df = generate_synthetic_data(n_samples=200)
    print("Synthetic data sample:")
    print(df.head(), "\n")
    
    # Initialize and train the detector
    detector = AdvancedExpenseDetector()
    results = detector.train_models(df)
    
    # Optionally, save the best model
    detector.save_model('expense_detector_model.joblib')
    
    # Print feature importances if available
    try:
        importance_df = detector.get_feature_importance()
        print("\nTop 10 Most Important Features:")
        print(importance_df.head(10))
    except Exception as e:
        print("\nFeature importance not available:", e)
    
    # Make predictions on a sample of the data
    predictions, probabilities = detector.predict(df.head())
    print("\nSample Predictions:")
    for i, (pred, prob) in enumerate(zip(predictions, probabilities)):
        label = 'Violation' if pred else 'Normal'
        confidence = max(prob) * 100
        print(f"Record {i+1}: {label} (Confidence: {confidence:.2f}%)")


Synthetic data sample:
   expense_id  employee_id  amount  amount_local  vendor_risk_score  \
0           1         1102  698.56        648.03               0.60   
1           2         1435  147.94        137.07               0.27   
2           3         1860  608.37        665.50               0.13   
3           4         1270  544.44        567.81               0.08   
4           5         1106  211.03        191.66               0.94   

   receipt_quality  ocr_confidence  num_attendees  employee_risk_score  \
0             0.55            0.64              4                 0.04   
1             0.55            0.70              1                 0.67   
2             0.20            0.91              3                 0.95   
3             0.68            0.62              2                 0.12   
4             0.09            0.34              2                 0.90   

   previous_violations  ... high_risk_category has_receipt  \
0                    1  ...                

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



gradient_boosting Results:
              precision    recall  f1-score   support

           0       0.79      0.97      0.87        32
           1       0.00      0.00      0.00         8

    accuracy                           0.78        40
   macro avg       0.40      0.48      0.44        40
weighted avg       0.64      0.78      0.70        40


Training xgboost...

xgboost Results:
              precision    recall  f1-score   support

           0       0.78      0.91      0.84        32
           1       0.00      0.00      0.00         8

    accuracy                           0.72        40
   macro avg       0.39      0.45      0.42        40
weighted avg       0.63      0.72      0.67        40


Top 10 Most Important Features:
                feature  importance
2     vendor_risk_score    0.090091
1          amount_local    0.087846
6   employee_risk_score    0.082312
0                amount    0.082061
3       receipt_quality    0.078574
4        ocr_confidence    0.0

Parameters: { "use_label_encoder" } are not used.



In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.base import BaseEstimator, TransformerMixin
import joblib

# ----------------------------
# Custom Transformer for Date Features
# ----------------------------
class DateFeatureExtractor(BaseEstimator, TransformerMixin):
    """Extract date-based features from submission_date and expense_date."""
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = X.copy()
        X['submission_date'] = pd.to_datetime(X['submission_date'])
        X['expense_date'] = pd.to_datetime(X['expense_date'])
        
        # Create additional features
        X['submission_delay'] = (X['submission_date'] - X['expense_date']).dt.days
        X['day_of_week'] = X['expense_date'].dt.dayofweek
        X['is_weekend'] = X['day_of_week'].isin([5, 6]).astype(int)
        X['month'] = X['expense_date'].dt.month
        X['quarter'] = X['expense_date'].dt.quarter
        X['is_month_end'] = X['expense_date'].dt.is_month_end.astype(int)
        
        # Drop original date columns
        X = X.drop(['submission_date', 'expense_date'], axis=1)
        return X

# ----------------------------
# Advanced Expense Detector Class
# ----------------------------
class AdvancedExpenseDetector:
    def __init__(self):
        self.models = {}
        self.preprocessor = None
        self.feature_importances_ = None

        # Define the potential input features (if available in the data)
        self.potential_numeric = [
            'amount', 'amount_local', 'vendor_risk_score', 'receipt_quality',
            'ocr_confidence', 'num_attendees', 'employee_risk_score', 'previous_violations'
        ]
        self.potential_categorical = [
            'department', 'seniority', 'category', 'currency', 'vendor_country',
            'payment_method', 'cost_center'
        ]
        self.potential_date = ['submission_date', 'expense_date']
        self.potential_binary = [
            'requires_approval', 'high_risk_category', 'has_receipt', 'manual_review_required'
        ]
        
    def list_required_inputs(self):
        """
        Return a dictionary of required inputs by type.
        (Columns will be used only if present in the input DataFrame.)
        """
        return {
            'numeric': self.potential_numeric,
            'categorical': self.potential_categorical,
            'date': self.potential_date,
            'binary': self.potential_binary
        }
    
    def prepare_data(self, df):
        """Prepare data by filtering available columns and setting up the preprocessor."""
        df = df.copy()
        
        # Filter the columns that exist in the DataFrame.
        self.numeric_features = [col for col in self.potential_numeric if col in df.columns]
        self.categorical_features = [col for col in self.potential_categorical if col in df.columns]
        self.date_features = [col for col in self.potential_date if col in df.columns]
        self.binary_features = [col for col in self.potential_binary if col in df.columns]
        
        # Create the preprocessor pipeline
        self.preprocessor = ColumnTransformer(
            transformers=[
                ('num', StandardScaler(), self.numeric_features),
                ('cat', OneHotEncoder(handle_unknown='ignore'), self.categorical_features),
                ('date', DateFeatureExtractor(), self.date_features),
                ('bin', 'passthrough', self.binary_features)
            ]
        )
        return df

    def train_models(self, df, target_column='is_violation'):
        """Train a Random Forest model for expense violation detection."""
        # Prepare data (and build the preprocessor)
        df = self.prepare_data(df)
        
        # Drop columns that are not features
        columns_to_drop = ['expense_id', 'employee_id', 'vendor_id', 'vendor_name', 
                           'project_code', 'notes', 'approval_status', 'approval_date']
        features = df.drop(columns=columns_to_drop + [target_column], axis=1, errors='ignore')
        target = df[target_column]
        
        # Split into training and test sets
        X_train, X_test, y_train, y_test = train_test_split(
            features, target, test_size=0.2, random_state=42, stratify=target
        )
        
        # Define a Random Forest pipeline
        model = Pipeline([
            ('preprocessor', self.preprocessor),
            ('classifier', RandomForestClassifier(n_estimators=100, max_depth=10,
                                                    class_weight='balanced', random_state=42))
        ])
        
        print("Training Random Forest...")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print("\nClassification Report on Test Data:")
        print(classification_report(y_test, y_pred))
        print("Confusion Matrix:")
        print(confusion_matrix(y_test, y_pred))
        
        # Save this model as the best model
        self.models['best_model'] = model
        
        # Save feature importances if available
        classifier = self.models['best_model'].named_steps['classifier']
        if hasattr(classifier, 'feature_importances_'):
            cat_feature_names = []
            if self.categorical_features:
                cat_feature_names = self.preprocessor.named_transformers_['cat'] \
                                        .get_feature_names_out(self.categorical_features).tolist()
            # Combine feature names in the order they appear to the classifier
            feature_names = (self.numeric_features +
                             cat_feature_names +
                             ['submission_delay', 'day_of_week', 'is_weekend', 'month', 'quarter', 'is_month_end'] +
                             self.binary_features)
            self.feature_importances_ = pd.DataFrame({
                'feature': feature_names,
                'importance': classifier.feature_importances_
            }).sort_values('importance', ascending=False)
        
        return model
    
    def save_model(self, filepath):
        """Save the trained best model to disk."""
        if 'best_model' not in self.models:
            raise ValueError("No trained model to save.")
        joblib.dump(self.models['best_model'], filepath)
        print(f"Model saved to {filepath}")
    
    def load_model(self, filepath):
        """Load a trained model from disk."""
        self.models['best_model'] = joblib.load(filepath)
        print(f"Model loaded from {filepath}")
        return self
    
    def predict(self, df):
        """Predict on a DataFrame (can be multiple rows)."""
        if 'best_model' not in self.models:
            raise ValueError("Model has not been trained yet. Call train_models() first.")
        df = self.prepare_data(df)
        columns_to_drop = ['expense_id', 'employee_id', 'vendor_id', 'vendor_name', 
                           'project_code', 'notes', 'approval_status', 'approval_date']
        features = df.drop(columns=columns_to_drop + ['is_violation'], axis=1, errors='ignore')
        predictions = self.models['best_model'].predict(features)
        probabilities = self.models['best_model'].predict_proba(features)
        return predictions, probabilities
    
    def predict_single(self, new_row):
        """
        Accept a single expense report (as a dict), predict if it's a violation,
        and return a compliance report.
        """
        # Convert the dict into a DataFrame (with one row)
        df_single = pd.DataFrame([new_row])
        pred, prob = self.predict(df_single)
        prediction = pred[0]
        probability = max(prob[0])  # highest confidence score
        
        report = self.get_compliance_report(new_row, prediction, probability)
        return report
    
    def get_compliance_report(self, input_data, prediction, probability):
        """
        Generate a compliance report based on the input data and model prediction.
        (Here we simply flag based on the predicted class.)
        """
        expense_id = input_data.get('expense_id', 'N/A')
        employee_id = input_data.get('employee_id', 'N/A')
        department = input_data.get('department', 'N/A')
        category = input_data.get('category', 'N/A')
        amount = input_data.get('amount', 'N/A')
        
        # Create a basic report.
        report = f"\nPolicy Compliance Report\n"
        report += f"* Expense ID: {expense_id}\n"
        report += f"* Employee ID: {employee_id} | Department: {department}\n"
        report += f"* Expense Category & Amount: {category} - {amount}\n"
        
        if prediction == 1:
            report += "* Detected Violations:\n"
            report += "   - Violation Detected (Model flagged this expense)\n"
            report += "* Suggested Actions:\n"
            report += "   - Please review the expense policy and provide necessary justification.\n"
            report += "   - Manager review required.\n"
        else:
            report += "* No Violations Detected.\n"
        
        report += f"* Model Confidence: {probability*100:.2f}%\n"
        return report

# ----------------------------
# Synthetic Data Generation (for training)
# ----------------------------
def generate_synthetic_data(n_samples=200):
    np.random.seed(42)
    data = {}
    
    # IDs and basic information
    data['expense_id'] = np.arange(1, n_samples + 1)
    data['employee_id'] = np.random.randint(1000, 2000, n_samples)
    
    # Numeric features
    data['amount'] = np.random.uniform(10, 1000, n_samples).round(2)
    data['amount_local'] = (data['amount'] * np.random.uniform(0.9, 1.1, n_samples)).round(2)
    data['vendor_risk_score'] = np.random.uniform(0, 1, n_samples).round(2)
    data['receipt_quality'] = np.random.uniform(0, 1, n_samples).round(2)
    data['ocr_confidence'] = np.random.uniform(0, 1, n_samples).round(2)
    data['num_attendees'] = np.random.randint(1, 10, n_samples)
    data['employee_risk_score'] = np.random.uniform(0, 1, n_samples).round(2)
    data['previous_violations'] = np.random.randint(0, 5, n_samples)
    
    # Categorical features (note: "seniority" is intentionally omitted to test handling)
    data['department'] = np.random.choice(['Sales', 'Engineering', 'HR', 'Marketing'], n_samples)
    data['category'] = np.random.choice(['Travel', 'Meals', 'Supplies'], n_samples)
    data['currency'] = np.random.choice(['USD', 'EUR'], n_samples)
    data['vendor_country'] = np.random.choice(['US', 'FR', 'DE'], n_samples)
    data['payment_method'] = np.random.choice(['Credit Card', 'Cash', 'Wire Transfer'], n_samples)
    data['cost_center'] = np.random.choice(['A1', 'B2', 'C3'], n_samples)
    
    # Date features
    base_date = datetime.today()
    data['expense_date'] = [(base_date - timedelta(days=np.random.randint(1, 30))).strftime('%Y-%m-%d')
                            for _ in range(n_samples)]
    data['submission_date'] = [(datetime.strptime(exp_date, '%Y-%m-%d') +
                                timedelta(days=np.random.randint(0, 10))).strftime('%Y-%m-%d')
                               for exp_date in data['expense_date']]
    
    # Binary features
    data['requires_approval'] = np.random.choice([0, 1], n_samples)
    data['high_risk_category'] = np.random.choice([0, 1], n_samples)
    data['has_receipt'] = np.random.choice([0, 1], n_samples)
    data['manual_review_required'] = np.random.choice([0, 1], n_samples)
    
    # Other non-feature columns
    data['vendor_id'] = np.random.randint(2000, 3000, n_samples)
    data['vendor_name'] = np.random.choice(['VendorA', 'VendorB', 'VendorC'], n_samples)
    data['project_code'] = np.random.choice(['P100', 'P200', 'P300'], n_samples)
    data['notes'] = [''] * n_samples
    data['approval_status'] = np.random.choice(['Approved', 'Rejected'], n_samples)
    data['approval_date'] = [base_date.strftime('%Y-%m-%d')] * n_samples
    
    # Target column: 0 = Normal, 1 = Violation (imbalance: ~80% Normal)
    data['is_violation'] = np.random.choice([0, 1], n_samples, p=[0.8, 0.2])
    
    return pd.DataFrame(data)

# ----------------------------
# Main Execution
# ----------------------------
if __name__ == "__main__":
    # 1. List Required Inputs
    detector = AdvancedExpenseDetector()
    req_inputs = detector.list_required_inputs()
    print("Required Inputs by Type:")
    for key, val in req_inputs.items():
        print(f"  {key.capitalize()}: {val}")
    print("\n" + "-"*50 + "\n")
    
    # 2. Train the model on synthetic data
    df_train = generate_synthetic_data(n_samples=200)
    detector.train_models(df_train)
    
    # 3. Save the trained model to disk
    detector.save_model('expense_detector_model.joblib')
    
    # 4. Example: Predict on a single new expense row
    new_expense = {
        'expense_id': 201,
        'employee_id': 1500,
        'amount': 7500000000000.00,
        # 'amount_local': 7500000000000.00,
        # 'vendor_risk_score': 0.85,
        'receipt_quality': 0.65,
        'ocr_confidence': 0.90,
        # 'num_attendees': 3,
        # 'employee_risk_score': 0.70,
        'previous_violations': 1,
        'department': 'Engineering',
        # 'seniority' is omitted intentionally
        'category': 'Travel',
        'currency': 'INR',
        'vendor_country': 'US',
        'payment_method': 'Credit Card',
        # 'cost_center': 'B2',
        'expense_date': (datetime.today() - timedelta(days=5)).strftime('%Y-%m-%d'),
        'submission_date': datetime.today().strftime('%Y-%m-%d'),
        'requires_approval': 1,
        # 'high_risk_category': 1,
        'has_receipt': 1,
        'manual_review_required': 0,
        # Extra non-feature fields
        'vendor_id': 2600,
        'vendor_name': 'VendorA',
        # 'project_code': 'P200',
        'notes': '',
        'approval_status': 'Approved',
        'approval_date': datetime.today().strftime('%Y-%m-%d')
    }
    
    report = detector.predict_single(new_expense)
    print(report)


Required Inputs by Type:
  Numeric: ['amount', 'amount_local', 'vendor_risk_score', 'receipt_quality', 'ocr_confidence', 'num_attendees', 'employee_risk_score', 'previous_violations']
  Categorical: ['department', 'seniority', 'category', 'currency', 'vendor_country', 'payment_method', 'cost_center']
  Date: ['submission_date', 'expense_date']
  Binary: ['requires_approval', 'high_risk_category', 'has_receipt', 'manual_review_required']

--------------------------------------------------

Training Random Forest...

Classification Report on Test Data:
              precision    recall  f1-score   support

           0       0.80      1.00      0.89        32
           1       0.00      0.00      0.00         8

    accuracy                           0.80        40
   macro avg       0.40      0.50      0.44        40
weighted avg       0.64      0.80      0.71        40

Confusion Matrix:
[[32  0]
 [ 8  0]]
Model saved to expense_detector_model.joblib

Policy Compliance Report
* Expens

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
