In [None]:
# fraud_detection_procurement.py
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

# Global setup: ggplot theme for every figure, and a fixed seed for NumPy's
# global RNG so the mock data generated below is reproducible.
plt.style.use('ggplot')
np.random.seed(seed=42)

# 1. CREATE A REALISTIC MOCK DATASET
print("1. GENERATING MOCK PROCUREMENT DATA...")
n_contracts = 5000
n_fraudulent = int(n_contracts * 0.05)  # Assume 5% are fraudulent

# The random columns are drawn in dict-insertion order from NumPy's global RNG;
# keep the order stable so a fixed seed keeps producing the same dataset.
data = {
    'contract_id': range(n_contracts),
    'department': np.random.choice(['Works', 'Health', 'Education', 'Transport', 'Agriculture'], n_contracts),
    'contract_type': np.random.choice(['Goods', 'Works', 'Consultancy'], n_contracts, p=[0.6, 0.3, 0.1]),
    # Log-normal gives the long right tail typical of contract values.
    'estimated_cost_ngn': np.random.lognormal(14, 1.2, n_contracts).round(2),
    # Placeholder columns, filled in once costs are simulated. They start as
    # 0.0 (float) rather than 0 (int): the values written later are fractional,
    # and writing floats into an int64 column relies on implicit upcasting,
    # which pandas has deprecated (FutureWarning since pandas 2.1).
    'final_cost_ngn': 0.0,
    'cost_overrun_pct': 0.0,
    'num_bidders': np.random.randint(1, 8, n_contracts),  # 1..7 inclusive
    'award_criteria': np.random.choice(['Lowest Bid', 'Best Value'], n_contracts, p=[0.7, 0.3]),
    'winner_bid_amount': 0.0,
    'single_source': np.random.choice([0, 1], n_contracts, p=[0.9, 0.1]),  # 1 if Single-Sourced (red flag)
    'contractor_id': [f'CONT_{np.random.randint(1000, 9999)}' for _ in range(n_contracts)],
    'contractor_age_days': np.random.randint(100, 365*5, n_contracts),  # How long the company has existed
    # Target variable: exactly n_fraudulent ones, shuffled in place below.
    'is_fraudulent': np.array([1] * n_fraudulent + [0] * (n_contracts - n_fraudulent)),
}
np.random.shuffle(data['is_fraudulent'])  # Shuffle the fraud labels

df = pd.DataFrame(data)

# Simulate realistic final costs and relationships with fraud.
#
# Results are accumulated in float arrays and written to the DataFrame once
# after the loop. Per-cell `df.loc[i, col] = value` writes are slow, and
# writing floats into integer placeholder columns depends on implicit
# upcasting that pandas has deprecated. The random draws are made in the same
# order as a row-by-row simulation, so a fixed seed yields the same values.
final_costs = np.empty(n_contracts, dtype=float)
winner_bids = np.empty(n_contracts, dtype=float)

contract_rows = zip(
    df['estimated_cost_ngn'].to_numpy(),
    df['num_bidders'].to_numpy(),
    df['is_fraudulent'].to_numpy(),
    df['single_source'].to_numpy(),
)
for i, (base_cost, num_bidders, is_fraud, is_single_source) in enumerate(contract_rows):
    # Discount of the winning bid relative to the estimate
    if is_single_source:
        # Single source often leads to higher costs;
        # a negative discount is a price *increase*.
        discount = np.random.uniform(-0.02, 0.05)
    else:
        # More bidders -> more competition -> lower price (capped at 25%)
        comp_discount = min(0.25, num_bidders * 0.03)
        discount = np.random.uniform(comp_discount - 0.05, comp_discount)

    if is_fraud:
        # Fraudulent: final cost inflated by 15% to 80%, and the winning bid
        # padded by an extra 5% to 10% of the estimate.
        inflation = np.random.uniform(0.15, 0.8)
        final_costs[i] = base_cost * (1 + inflation)
        winner_bids[i] = base_cost * (1 + discount + np.random.uniform(0.05, 0.1))
    else:
        # Legitimate: normal variation of -5% to +10% around the estimate
        final_costs[i] = base_cost * (1 + np.random.uniform(-0.05, 0.10))
        winner_bids[i] = base_cost * (1 + discount)

df['final_cost_ngn'] = final_costs
df['winner_bid_amount'] = winner_bids

# Calculate derived metrics
df['cost_overrun_pct'] = ((df['final_cost_ngn'] - df['estimated_cost_ngn']) / df['estimated_cost_ngn']) * 100
df['winner_to_estimate_ratio'] = df['winner_bid_amount'] / df['estimated_cost_ngn']

print(f"Dataset created with {n_contracts} contracts. Fraudulent: {n_fraudulent} ({n_fraudulent/n_contracts*100:.1f}%)")

# 2. EXPLORATORY DATA ANALYSIS (EDA)
print("\n2. PERFORMING EXPLORATORY DATA ANALYSIS...")
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
ax_overrun, ax_bidders = axes[0]
ax_source, ax_ratio = axes[1]

# Plot 1: Cost Overrun by Fraud Status
sns.boxplot(x='is_fraudulent', y='cost_overrun_pct', data=df, ax=ax_overrun)
ax_overrun.set_title('Cost Overrun % by Fraud Status')
ax_overrun.set_xlabel('Is Fraudulent? (0=No, 1=Yes)')
ax_overrun.set_ylabel('Cost Overrun Percentage')

# Plot 2: Number of Bidders by Fraud Status
sns.countplot(x='num_bidders', hue='is_fraudulent', data=df, ax=ax_bidders)
ax_bidders.set_title('Number of Bidders Distribution by Fraud Status')
# Relabel the hue legend using the handles seaborn registered for the two hue
# levels (0 then 1). Passing only `labels=` would let matplotlib pair the
# labels with the first artists on the axes (individual bars), so both
# entries could end up with the same colour.
hue_handles, _ = ax_bidders.get_legend_handles_labels()
ax_bidders.legend(handles=hue_handles, labels=['No', 'Yes'], title='Is Fraudulent?')

# Plot 3: Single-Source vs. Competitive Bidding
# Mean of the 0/1 label per group, times 100, is the fraud rate in percent.
fraud_by_source = df.groupby('single_source')['is_fraudulent'].mean() * 100
fraud_by_source.plot(kind='bar', ax=ax_source, color=['skyblue', 'salmon'])
ax_source.set_title('Percentage of Fraudulent Contracts by Procurement Method')
ax_source.set_xlabel('Single-Sourced? (0=No, 1=Yes)')
ax_source.set_ylabel('% Fraudulent')
ax_source.tick_params(axis='x', rotation=0)

# Plot 4: Winner-to-Estimate Ratio, one overlaid histogram per class
for fraud_flag, class_label, class_color in ((0, 'Legitimate', 'green'), (1, 'Fraudulent', 'red')):
    sns.histplot(
        df.loc[df['is_fraudulent'] == fraud_flag, 'winner_to_estimate_ratio'],
        label=class_label, kde=True, ax=ax_ratio, alpha=0.6, color=class_color,
    )
ax_ratio.set_title('Distribution of Winner-to-Estimate Ratio')
ax_ratio.set_xlabel('Ratio (Winner Bid / Estimated Cost)')
ax_ratio.legend()

plt.tight_layout()
plt.savefig('procurement_fraud_eda.png', dpi=120, bbox_inches='tight')
plt.show()

# 3. FEATURE ENGINEERING & PREPROCESSING
print("\n3. PREPROCESSING DATA AND ENGINEERING FEATURES...")
# Encode categorical variables as integer codes in new '<col>_encoded'
# columns; each fitted encoder is kept in label_encoders under its column name.
label_encoders = {}
cat_cols = ['department', 'contract_type', 'award_criteria']
for col in cat_cols:
    le = LabelEncoder()
    df[col + '_encoded'] = le.fit_transform(df[col])
    label_encoders[col] = le

# Select features for model
features = [
    'estimated_cost_ngn', 'final_cost_ngn', 'cost_overrun_pct',
    'num_bidders', 'single_source', 'winner_to_estimate_ratio',
    'department_encoded', 'contract_type_encoded', 'award_criteria_encoded',
    'contractor_age_days'
]

X = df[features]
y = df['is_fraudulent']

# Split BEFORE scaling. Fitting the scaler on the full dataset would let the
# test rows' mean/variance leak into preprocessing. The stratified split keeps
# the fraud rate the same in both partitions.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale the features: learn mean/std from the training rows only, then apply
# that same transform everywhere else.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Every contract scaled with the training-fitted scaler, in df row order
# (used later to score the full dataset).
X_scaled = scaler.transform(X)

# 4. MODEL BUILDING - ISOLATION FOREST (Unsupervised Anomaly Detection)
print("\n4. TRAINING ISOLATION FOREST (ANOMALY DETECTION)...")
# Unsupervised: the detector is fitted on the training features only and never
# sees y_train. contamination is the share of points it should flag as outliers.
iso_forest = IsolationForest(
    n_estimators=150,
    max_samples='auto',
    contamination=0.05,
    random_state=42,
).fit(X_train)

# predict() returns -1 for anomalies and 1 for normal points; remap to the
# label convention used by is_fraudulent (1 = flagged, 0 = normal).
iso_predictions = iso_forest.predict(X_test)
iso_predictions_binary = (iso_predictions == -1).astype(int).tolist()

print("-- Isolation Forest Results --")
print(classification_report(y_test, iso_predictions_binary, target_names=['Legitimate', 'Fraudulent']))

# 5. MODEL BUILDING - RANDOM FOREST (Supervised Learning)
print("\n5. TRAINING RANDOM FOREST (SUPERVISED LEARNING)...")
# class_weight='balanced' reweights the classes so the rare fraud class is not
# drowned out by the legitimate majority.
rf_model = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=42,
)
rf_model.fit(X_train, y_train)

# Evaluate on the held-out test set
y_pred = rf_model.predict(X_test)
class_names = ['Legitimate', 'Fraudulent']

print("-- Random Forest Results --")
print(classification_report(y_test, y_pred, target_names=class_names))

# Confusion matrix: rows are true classes, columns are predicted classes
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(6,5))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
disp.plot(cmap='Blues', ax=ax)
ax.set_title('Random Forest Confusion Matrix')
plt.savefig('confusion_matrix.png', dpi=120, bbox_inches='tight')
plt.show()

# 6. ANALYZING RESULTS & KEY RISK INDICATORS
print("\n6. ANALYSIS OF KEY RISK INDICATORS (FROM MODEL):")
# Pair each feature name with the Random Forest's importance score for it,
# then rank from most to least important.
importance_table = pd.DataFrame({
    'feature': features,
    'importance': rf_model.feature_importances_,
})
feature_importance = importance_table.sort_values(by='importance', ascending=False)

print("\nTop Key Risk Indicators (Feature Importance):")
print(feature_importance.head(10))

# 7. SIMULATED INVESTIGATION
print("\n7. SIMULATED INVESTIGATION: TOP 5 SUSPICIOUS CONTRACTS")
# Score every contract (X_scaled covers the whole dataset, including rows the
# model was trained on) and store the predicted label next to the raw data.
df['model_prediction'] = rf_model.predict(X_scaled)

# Among contracts the model flags as fraud, show the five largest overruns.
report_columns = [
    'contract_id', 'department', 'contract_type', 'estimated_cost_ngn',
    'final_cost_ngn', 'cost_overrun_pct', 'num_bidders', 'single_source',
]
flagged_contracts = df[df['model_prediction'] == 1]
suspicious_contracts = flagged_contracts.nlargest(5, 'cost_overrun_pct')[report_columns]
print(suspicious_contracts.to_string(index=False))

# 8. RECOMMENDATIONS & CONCLUSION
print("\n8. KEY FINDINGS & RECOMMENDATIONS:")
# Report findings from what was actually computed rather than fixed text:
# the top-ranked features from the importance table, and the fraud-class
# precision/recall of the Random Forest on the held-out test set.
top_indicators = ", ".join(feature_importance['feature'].head(3))

# With labels=[0, 1] the flattened 2x2 matrix is (tn, fp, fn, tp).
tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
# Guard the ratios in case the model flags nothing / the test set has no fraud.
fraud_precision = tp / (tp + fp) if (tp + fp) else 0.0
fraud_recall = tp / (tp + fn) if (tp + fn) else 0.0

print(f"""
*   **Key Risk Indicators:** The strongest fraud predictors by Random Forest feature importance are: {top_indicators}.
*   **Model Performance:** On the held-out test set the Random Forest flags fraudulent contracts with precision {fraud_precision:.2f} and recall {fraud_recall:.2f}.
*   **Recommendation 1:** Implement this model as a red-flag system to prioritize audits for high-risk contracts.
*   **Recommendation 2:** Strengthen procurement guidelines based on top risk indicators (e.g., justify single-source, mandatory re-tender for high overruns).
*   **Recommendation 3:** Establish a dedicated analytics unit within the Bureau of Public Procurement for continuous monitoring.
""")