# Operational Risk Predictor Testing

This notebook tests the Operational Risk Random Forest model from `New_ML_Models` on test data from the `Testing` folder.


In [1]:
# Import libraries
import pandas as pd
import numpy as np
import joblib
import os
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix
)
import warnings
# Silence every warning category for the rest of the kernel session.
# NOTE(review): this also hides any warnings emitted while loading or running the model.
warnings.simplefilter('ignore')

print("✓ Libraries imported successfully")


✓ Libraries imported successfully


## 1. Load Test Data


In [2]:
# Load test data
# The directory defaults to the original workstation location, but can be
# overridden with the OP_RISK_TEST_DATA_DIR environment variable so the
# notebook can be re-run on another machine without editing this cell.
test_data_dir = os.environ.get("OP_RISK_TEST_DATA_DIR", "D:/Deloitte/Testing/")

# os.path.join tolerates a directory given with or without a trailing separator.
operational_risk_test = pd.read_csv(os.path.join(test_data_dir, "cashiers_risk_test.csv"))

print(f"✓ Operational risk test data: {len(operational_risk_test)} rows")


✓ Operational risk test data: 610 rows


## 2. Test Operational Risk Predictor (Random Forest)


In [5]:
# Load operational risk predictor model
print("Loading Operational Risk Predictor Random Forest model...")

# The directory defaults to the original workstation location, but can be
# overridden with the OP_RISK_MODEL_DIR environment variable.
op_risk_model_dir = os.environ.get(
    "OP_RISK_MODEL_DIR",
    "D:/Deloitte/New_ML_Models/Operational_risk_predictors/models/",
)

# SECURITY: joblib.load unpickles the file and can execute arbitrary code.
# Only point this cell at artifacts produced by a trusted training run.
op_risk_model = joblib.load(os.path.join(op_risk_model_dir, 'random_forest_model.pkl'))
op_risk_scaler = joblib.load(os.path.join(op_risk_model_dir, 'feature_scaler.pkl'))
op_risk_features = joblib.load(os.path.join(op_risk_model_dir, 'feature_columns.pkl'))

print("✓ Operational risk model loaded")
print(f"  Feature columns: {op_risk_features}")

# Test operational risk predictor
def predict_operational_risk(row, features=None, scaler=None, model=None):
    """Predict operational risk for one test-data row.

    The feature vector is assembled in the order given by ``features``. A
    feature that exists as a column of ``row`` is used directly; otherwise it
    is approximated from other columns (see ``fallbacks`` below) or set to 0.0.
    Missing/NaN values always become 0.0.

    Parameters
    ----------
    row : pandas.Series
        One record of the test data (as yielded by ``DataFrame.iterrows``).
    features : sequence of str, optional
        Ordered feature names. Defaults to the notebook-level ``op_risk_features``.
    scaler : object with ``transform``, optional
        Defaults to the notebook-level ``op_risk_scaler``.
    model : object with ``predict`` and ``predict_proba``, optional
        Defaults to the notebook-level ``op_risk_model``.

    Returns
    -------
    tuple
        ``(prediction, probability)`` where ``prediction`` is the first element
        returned by ``model.predict`` and ``probability`` is column 1 of
        ``model.predict_proba`` for this row.
    """
    # Fall back to the artifacts loaded earlier in the notebook when the caller
    # does not inject its own (keeps the one-argument call working).
    feature_names = op_risk_features if features is None else features
    scaler = op_risk_scaler if scaler is None else scaler
    model = op_risk_model if model is None else model

    # NOTE(review): thresholds are taken as-is from the original notebook;
    # confirm they match the ones used when the model was trained.
    discrepancy_threshold = 1000.0
    variance_threshold = 10000.0

    def _num(column):
        """Return row[column] as float; 0.0 when the column is absent or NaN."""
        raw = row.get(column, 0.0)
        return float(raw) if pd.notna(raw) else 0.0

    # Approximations for expected features that the test data may not carry.
    # Features without an entry here (e.g. closing_balance_mean,
    # cash_amount_mean) default to 0.0 when absent from the row.
    fallbacks = {
        'balance_discrepancy_pct_mean': lambda: _num('balance_discrepancy_pct_max'),
        'transaction_total_count': lambda: _num('num_transactions_sum'),
        'total_amount_mean': lambda: _num('transaction_total_sum'),
        'balance_discrepancy_risk': lambda: (
            1.0 if _num('balance_discrepancy_pct_max') > discrepancy_threshold else 0.0
        ),
        'balance_variance_risk': lambda: (
            1.0 if abs(_num('balance_diff_sum')) > variance_threshold else 0.0
        ),
    }

    feature_values = []
    for feat in feature_names:
        if feat in row.index:
            val = _num(feat)
        else:
            derive = fallbacks.get(feat)
            val = derive() if derive is not None else 0.0
        feature_values.append(val)

    # Scale features (single-row batch)
    features_scaled = scaler.transform([feature_values])

    # Predict class and the probability reported in column 1 of predict_proba
    prediction = model.predict(features_scaled)[0]
    probability = model.predict_proba(features_scaled)[0][1]

    return prediction, probability

# Test on operational risk test data
# Each row is scored independently; a failing row is logged and recorded with
# empty prediction fields so that one bad record does not abort the whole run.
op_risk_results = []
for idx, row in operational_risk_test.iterrows():
    try:
        pred, prob = predict_operational_risk(row)
        actual = int(row['risk_level']) if pd.notna(row['risk_level']) else None

        op_risk_results.append({
            'cashier_id': row.get('cashier_id', idx),
            'actual_risk': actual,
            'predicted_risk': int(pred),
            'risk_probability': float(prob)
        })
    except Exception as e:
        print(f"Error at row {idx}: {e}")
        op_risk_results.append({
            'cashier_id': row.get('cashier_id', idx),
            'actual_risk': None,
            'predicted_risk': None,
            'risk_probability': None,
            'error': str(e)
        })

op_risk_df = pd.DataFrame(op_risk_results)
if op_risk_df.empty:
    # An empty test set would otherwise yield a frame without columns and the
    # column lookups below would raise KeyError.
    op_risk_df = pd.DataFrame(
        columns=['cashier_id', 'actual_risk', 'predicted_risk', 'risk_probability']
    )

# Rows usable for scoring need both a ground-truth label and a prediction.
valid_mask = op_risk_df['actual_risk'].notna() & op_risk_df['predicted_risk'].notna()

print(f"\n✓ Operational risk predictor tested on {len(op_risk_results)} samples")
if valid_mask.sum() > 0:
    # Columns holding None are not integer-typed; cast the valid subset back
    # to int so labels and predictions share one dtype in the report.
    actual_vals = op_risk_df.loc[valid_mask, 'actual_risk'].astype(int)
    pred_vals = op_risk_df.loc[valid_mask, 'predicted_risk'].astype(int)

    accuracy = accuracy_score(actual_vals, pred_vals)

    print(f"  Valid predictions: {valid_mask.sum()}")
    print(f"  Accuracy: {accuracy:.4f}")
    print(f"\nClassification Report:")
    print(classification_report(actual_vals, pred_vals))
else:
    # Reached when there are no labels OR when no labelled row produced a
    # prediction; always report something instead of finishing silently.
    print(f"  Predictions made: {op_risk_df['predicted_risk'].notna().sum()}")


Loading Operational Risk Predictor Random Forest model...
✓ Operational risk model loaded
  Feature columns: ['balance_discrepancy_pct_mean', 'balance_discrepancy_pct_max', 'transaction_total_count', 'closing_balance_mean', 'total_amount_mean', 'cash_amount_mean', 'balance_discrepancy_risk', 'balance_variance_risk']

✓ Operational risk predictor tested on 610 samples
  Valid predictions: 610
  Accuracy: 0.8016

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.74      0.85       459
           1       0.56      1.00      0.71       151

    accuracy                           0.80       610
   macro avg       0.78      0.87      0.78       610
weighted avg       0.89      0.80      0.81       610



## 3. Export Results


In [6]:
# Resolve the export location once and reuse it for both the write and the log line
exports_dir = "Testing/exports"
results_csv = f"{exports_dir}/operational_risk_predictor_results.csv"

# Ensure the target folder is present (no error if it already exists)
os.makedirs(exports_dir, exist_ok=True)

# Write the per-row prediction results without the DataFrame index
op_risk_df.to_csv(results_csv, index=False)

print("✓ Results exported to CSV file:")
print(f"  - {results_csv}")


✓ Results exported to CSV file:
  - Testing/exports/operational_risk_predictor_results.csv
