In [3]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings('ignore')

class ExpenseDataGenerator:
    """Generate synthetic, labelled expense records for model training.

    Each record carries an amount drawn relative to a per-category budget
    limit, a handful of categorical attributes, receipt-related scores and
    an ``is_violation`` label derived from fixed rules (see
    :meth:`generate_data`).
    """

    def __init__(self):
        # Define Indian specific categories and their typical budget limits (in INR)
        self.expense_categories = {
            'Travel': {'local': 5000, 'outstation': 25000},
            'Meals': {'team': 15000, 'client': 30000},
            'Office Supplies': {'standard': 10000, 'technology': 50000},
            'Training': {'online': 20000, 'classroom': 40000},
            'Medical': {'routine': 15000, 'emergency': 50000}
        }

        # Value pools sampled uniformly for the categorical columns.
        self.departments = ['IT', 'Sales', 'HR', 'Finance', 'Operations']
        self.payment_methods = ['Corporate Card', 'Personal Card', 'Cash', 'UPI']
        self.currencies = ['INR', 'USD', 'EUR', 'GBP']
        self.vendor_countries = ['IN', 'US', 'SG', 'UAE']

    def generate_data(self, n_samples=1000, seed=42):
        """Build a DataFrame of ``n_samples`` synthetic expense rows.

        Parameters
        ----------
        n_samples : int, default 1000
            Number of rows to generate; ``expense_id`` runs from 1 to
            ``n_samples``.
        seed : int or None, default 42
            Value passed to ``np.random.seed`` before sampling. The default
            reproduces the historical output. Pass ``None`` to leave the
            global NumPy RNG untouched and draw from its current state.

        Returns
        -------
        pandas.DataFrame
            One row per expense, including the binary ``is_violation`` label.

        Notes
        -----
        Sampling uses the global ``np.random`` state, so with a non-``None``
        seed this call resets that state as a side effect.

        A row is labelled a violation when any of these holds: the amount is
        over the budget limit; ``receipt_quality < 0.3`` with amount above
        1000; no justification with amount above 80% of the limit; or more
        than two previous violations. The rules compare the unrounded
        amount, while the stored ``amount`` is rounded to two decimals.
        """
        if seed is not None:
            np.random.seed(seed)
        data = []

        # NOTE: the order of np.random calls below determines the generated
        # values for a given seed; reordering them changes the dataset.
        for i in range(n_samples):
            # Select random category and subcategory
            category = np.random.choice(list(self.expense_categories.keys()))
            subcategory = np.random.choice(list(self.expense_categories[category].keys()))
            budget_limit = self.expense_categories[category][subcategory]

            # Generate amount (sometimes exceeding budget for violation cases)
            is_violation_candidate = np.random.random() < 0.2
            if is_violation_candidate:
                amount = np.random.uniform(budget_limit, budget_limit * 2)
            else:
                amount = np.random.uniform(100, budget_limit)

            # Generate other fields
            receipt_quality = np.random.uniform(0, 1)
            has_justification = np.random.choice([0, 1], p=[0.1, 0.9])

            expense = {
                'expense_id': i + 1,
                'employee_id': np.random.randint(1000, 2000),
                'amount': round(amount, 2),
                'category': f"{category}-{subcategory}",
                'department': np.random.choice(self.departments),
                'payment_method': np.random.choice(self.payment_methods),
                'currency': np.random.choice(self.currencies),
                'vendor_country': np.random.choice(self.vendor_countries),
                'receipt_quality': receipt_quality,
                # OCR confidence is drawn from the upper half only when the receipt is usable.
                'ocr_confidence': np.random.uniform(0.5, 1) if receipt_quality > 0.3 else np.random.uniform(0, 0.5),
                'previous_violations': np.random.randint(0, 5),
                'requires_approval': 1 if amount > budget_limit * 0.8 else 0,
                'has_receipt': 1 if receipt_quality > 0.3 else 0,
                'has_justification': has_justification,
                'manual_review_required': 1 if amount > budget_limit else 0
            }

            # Determine if this is a violation based on multiple factors
            is_violation = (
                (amount > budget_limit) or  # Over budget
                (receipt_quality < 0.3 and amount > 1000) or  # Poor receipt quality for significant amount
                (not has_justification and amount > budget_limit * 0.8) or  # Missing justification for large amount
                (expense['previous_violations'] > 2)  # Multiple previous violations
            )

            expense['is_violation'] = 1 if is_violation else 0
            data.append(expense)

        return pd.DataFrame(data)

In [4]:
class ExpenseAnalyzer:
    """Train several classifiers on expense data and score single expenses.

    After :meth:`train_models` has run, the instance holds the fitted
    preprocessor, the fitted models keyed by name, and a per-category amount
    threshold used by the rule-based checks in :meth:`analyze_expense`.
    """

    # Identifier columns that are removed before preprocessing.
    _ID_COLUMNS = ['expense_id', 'employee_id']

    def __init__(self):
        self.models = {}               # name -> fitted classifier
        self.preprocessor = None       # fitted ColumnTransformer, set by prepare_preprocessor
        self.category_limits = None    # category -> amount threshold, set by train_models

    def prepare_preprocessor(self, X):
        """Create, fit and store the feature preprocessor; return transformed ``X``.

        Only the columns named below are handed to a transformer: numeric
        columns are standardized and categorical columns are one-hot encoded
        (categories unseen during fitting are ignored at transform time).
        """
        numeric_features = ['amount', 'receipt_quality', 'ocr_confidence', 'previous_violations']
        categorical_features = ['department', 'category', 'currency', 'vendor_country', 'payment_method']

        self.preprocessor = ColumnTransformer(
            transformers=[
                ('num', StandardScaler(), numeric_features),
                ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
            ])

        return self.preprocessor.fit_transform(X)

    def train_models(self, df, random_state=42):
        """Fit all classifiers on ``df`` and print a held-out classification report for each.

        Parameters
        ----------
        df : pandas.DataFrame
            Training data; must contain ``is_violation`` plus the feature
            columns used by :meth:`prepare_preprocessor`.
        random_state : int or None, default 42
            Seed used for the train/test split and for every classifier, so
            repeated runs give the same models regardless of the global RNG
            state.

        Side effects: populates ``self.preprocessor``, ``self.models`` and
        ``self.category_limits``.
        """
        # Prepare features and target
        y = df['is_violation']
        X = df.drop(columns=self._ID_COLUMNS + ['is_violation'], errors='ignore')

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=random_state)

        # Fit the preprocessor on the training split only, then reuse it for the test split
        X_train_processed = self.prepare_preprocessor(X_train)
        X_test_processed = self.preprocessor.transform(X_test)

        # Train multiple models
        models = {
            'random_forest': RandomForestClassifier(
                n_estimators=100, class_weight='balanced', random_state=random_state),
            'gradient_boosting': GradientBoostingClassifier(
                n_estimators=100, random_state=random_state),
            'svm': SVC(probability=True, class_weight='balanced', random_state=random_state)
        }

        print("Training and evaluating models:")
        print("-" * 50)

        for name, model in models.items():
            print(f"\nTraining {name}...")
            model.fit(X_train_processed, y_train)
            y_pred = model.predict(X_test_processed)
            print(f"\n{name.upper()} Classification Report:")
            print(classification_report(y_test, y_pred))
            self.models[name] = model

        # Store category limits for budget violation checking: the 95th
        # percentile of amounts among non-violating rows of each category.
        # The non-violating subset does not depend on the category, so it is
        # filtered once instead of once per category.
        valid_expenses = df[df['is_violation'] == 0]
        self.category_limits = {
            cat: valid_expenses.loc[valid_expenses['category'] == cat, 'amount'].quantile(0.95)
            for cat in df['category'].unique()
        }

    def analyze_expense(self, expense_data):
        """Score one expense with every trained model and the rule-based checks.

        Parameters
        ----------
        expense_data : dict
            A single expense. Must provide the feature columns used by the
            preprocessor plus ``category``, ``amount``, ``receipt_quality``
            and ``has_justification``. ``expense_id`` / ``employee_id`` are
            optional and ignored.

        Returns
        -------
        dict
            ``model_predictions`` (name -> ``{'prediction', 'confidence'}``),
            ``violations`` (list of rule codes) and ``risk_score`` (number of
            rule violations plus the mean of the model predictions).

        Raises
        ------
        RuntimeError
            If called before :meth:`train_models`.
        """
        if self.preprocessor is None or not self.models or self.category_limits is None:
            raise RuntimeError(
                "ExpenseAnalyzer is not trained; call train_models() before analyze_expense()")

        # Convert single expense to DataFrame
        expense_df = pd.DataFrame([expense_data])

        # Preprocess the data; identifier columns are dropped only if present
        features = expense_df.drop(columns=self._ID_COLUMNS, errors='ignore')
        X = self.preprocessor.transform(features)

        # Get predictions from all models
        predictions = {}
        for name, model in self.models.items():
            pred = model.predict(X)[0]
            prob = model.predict_proba(X)[0]
            # NOTE(review): confidence is the largest class probability, which is
            # not guaranteed to belong to the class returned by predict() for
            # every estimator (scikit-learn documents this for SVC) - confirm
            # this is the intended meaning.
            predictions[name] = {'prediction': pred, 'confidence': max(prob)}

        # Analyze specific violation types
        category = expense_data['category']
        amount = expense_data['amount']

        violations = []
        if category not in self.category_limits:
            violations.append("UNAUTHORIZED_CATEGORY")
        elif amount > self.category_limits[category]:
            # A NaN limit (presumably a category with no non-violating training
            # rows) compares False here, so no OVER_BUDGET flag is raised.
            violations.append("OVER_BUDGET")

        if expense_data['receipt_quality'] < 0.3:
            violations.append("POOR_RECEIPT_QUALITY")

        if not expense_data['has_justification'] and amount > 1000:
            violations.append("MISSING_JUSTIFICATION")

        # Mean of the 0/1 model votes, added to the count of rule violations.
        mean_model_vote = sum(pred['prediction'] for pred in predictions.values()) / len(predictions)

        return {
            'model_predictions': predictions,
            'violations': violations,
            'risk_score': len(violations) + mean_model_vote
        }

In [5]:
def test_expense_analyzer():
    """End-to-end demo: generate data, train the analyzer, analyze one expense and print the result.

    Prints the per-model prediction with its confidence, the rule-based
    violations that were detected, and the overall risk score. Returns
    nothing.
    """
    # Synthetic training set -> trained analyzer
    training_data = ExpenseDataGenerator().generate_data(1000)
    analyzer = ExpenseAnalyzer()
    analyzer.train_models(training_data)

    # A single hand-written expense to run through the analyzer
    new_expense = dict(
        expense_id=1001,
        employee_id=1500,
        amount=75000.00,
        category='Travel-outstation',
        department='IT',
        payment_method='Corporate Card',
        currency='INR',
        vendor_country='IN',
        receipt_quality=0.65,
        ocr_confidence=0.90,
        previous_violations=1,
        requires_approval=1,
        has_receipt=1,
        has_justification=0,
        manual_review_required=1,
    )

    analysis = analyzer.analyze_expense(new_expense)

    # Report header
    divider = "-" * 50
    print("\nExpense Analysis Results:")
    print(divider)

    # One line per model: label plus confidence
    print("\nModel Predictions:")
    for model_name, results in analysis['model_predictions'].items():
        verdict = 'Violation' if results['prediction'] == 1 else 'Normal'
        print(f"{model_name}: {verdict} (Confidence: {results['confidence']:.2f})")

    # Rule-based findings, or a placeholder line when there are none
    print("\nDetected Violations:")
    detected = analysis['violations']
    if not detected:
        print("No specific violations detected")
    for violation in detected:
        print(f"- {violation}")

    print(f"\nOverall Risk Score: {analysis['risk_score']:.2f}")

# Run the end-to-end demo: trains the models and prints the analysis of the sample expense
test_expense_analyzer()

Training and evaluating models:
--------------------------------------------------

Training random_forest...

RANDOM_FOREST Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.97      0.94        62
           1       0.99      0.96      0.97       138

    accuracy                           0.96       200
   macro avg       0.95      0.97      0.96       200
weighted avg       0.97      0.96      0.97       200


Training gradient_boosting...

GRADIENT_BOOSTING Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98        62
           1       0.99      0.99      0.99       138

    accuracy                           0.99       200
   macro avg       0.99      0.99      0.99       200
weighted avg       0.99      0.99      0.99       200


Training svm...

SVM Classification Report:
              precision    recall  f1-score   support

           0       0.91  