In [None]:
# Cell 1: Data Generation
import pandas as pd
import numpy as np

# --- Generate Synthetic Data ---
# Every column is drawn independently from NumPy's global RNG. With a fixed
# seed the order of the draws below determines the values, so the statements
# must stay in this sequence.
num_records = 1000
np.random.seed(42)  # fixed seed so a rerun regenerates the same dataset

data = {}
# Integer columns (randint's upper bound is exclusive).
data['BusinessAge_Years'] = np.random.randint(1, 20, size=num_records)
data['Directors_PersonalCreditScore'] = np.random.randint(300, 850, size=num_records)
# Continuous columns, uniform over the given interval.
data['AnnualTurnover_GBP'] = np.random.uniform(50000, 2000000, size=num_records)
data['CurrentRatio'] = np.random.uniform(0.5, 3.5, size=num_records)
data['DebtToEquityRatio'] = np.random.uniform(0.1, 4.0, size=num_records)
# Categorical sector label, sampled with weights 40% / 30% / 20% / 10%.
data['Industry'] = np.random.choice(
    ['Logistics & Transport', 'Construction', 'Professional Services', 'Retail'],
    size=num_records,
    p=[0.4, 0.3, 0.2, 0.1],
)
# Binary flag, 1 with probability 0.15.
data['HasCCJ'] = np.random.choice([0, 1], size=num_records, p=[0.85, 0.15])

df = pd.DataFrame(data)

# --- Engineer the Target Variable ('Default') ---
# Additive risk adjustment per industry label (positive values raise the score).
industry_risk_adjustment = {
    'Logistics & Transport': 0.1,
    'Construction': 0.2,
    'Professional Services': -0.2,
    'Retail': 0.05,
}

# Linear risk score: each numeric feature is centred on a reference value and
# weighted; negative weights lower the score as the feature grows, positive
# weights raise it.
# The industry term uses Series.map, which produces a float column directly.
# Series.replace on this string column relied on implicit object -> float
# downcasting, which pandas has deprecated (it emits a FutureWarning).
risk_score = (
    -0.1 * (df['BusinessAge_Years'] - 10) +
    -0.2 * (df['Directors_PersonalCreditScore'] - 600) / 100 +
    -0.05 * (df['AnnualTurnover_GBP'] - 1000000) / 500000 +
    -0.15 * (df['CurrentRatio'] - 1.5) +
    0.2 * (df['DebtToEquityRatio'] - 1.0) +
    df['Industry'].map(industry_risk_adjustment) +
    1.5 * df['HasCCJ']
)

# Logistic transform of the score with Gaussian noise (mean 0, std 0.5) added
# inside the exponent, then a hard 0.5 threshold to obtain the binary label.
prob_default = 1 / (1 + np.exp(-risk_score + np.random.normal(0, 0.5, size=num_records)))
df['Default_12M'] = (prob_default > 0.5).astype(int)

# --- Save the Dataset to the Colab Environment ---
# Write the frame to the working directory; the row index is not written.
df.to_csv(path_or_buf='wex_b2b_synthetic_credit_data.csv', index=False)

print("Synthetic dataset created successfully!")
# Keep the preview as the final expression so the notebook renders it.
df.head(5)

Synthetic dataset created successfully!


  df['Industry'].replace({'Logistics & Transport': 0.1, 'Construction': 0.2, 'Professional Services': -0.2, 'Retail': 0.05}) +


Unnamed: 0,BusinessAge_Years,Directors_PersonalCreditScore,AnnualTurnover_GBP,CurrentRatio,DebtToEquityRatio,Industry,HasCCJ,Default_12M
0,7,757,853655.3,2.819267,0.551235,Professional Services,0,1
1,15,461,1121751.0,1.43,0.206275,Logistics & Transport,0,0
2,11,679,1669542.0,1.196119,2.141987,Construction,1,1
3,8,386,183990.0,1.874704,0.951214,Construction,0,1
4,7,354,877378.9,2.844047,3.958784,Logistics & Transport,0,1


In [None]:
# Cell 2: Model Training
!pip install xgboost shap -q

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import xgboost as xgb

# Read back the CSV written by the data-generation cell
df = pd.read_csv('wex_b2b_synthetic_credit_data.csv')

# One-hot encode the industry label; drop_first removes one level so the
# remaining indicator columns are not redundant with each other.
df = pd.get_dummies(data=df, columns=['Industry'], drop_first=True)

# Separate the target from the features
y = df['Default_12M']
X = df.drop(columns=['Default_12M'])

# 75/25 split, stratified on the target so both parts keep the same class mix
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

# Initialize and train the model
# `use_label_encoder` is intentionally not passed: the installed XGBoost does
# not recognise it and only reports 'Parameters: { "use_label_encoder" } are
# not used.' when it is supplied.
model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='auc',
    n_estimators=150,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.8,         # each tree is fit on a row subsample
    colsample_bytree=0.8,  # each tree is fit on a column subsample
)
model.fit(X_train, y_train)

# Score the held-out rows: column 1 of predict_proba is the probability of the
# positive class (Default_12M == 1).
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Rank-based quality of those probabilities against the true labels
auc_score = roc_auc_score(y_true=y_test, y_score=y_pred_proba)
print(f"Model AUC Score: {auc_score:.4f}")

Model AUC Score: 0.9168


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [None]:
# Cell 3: AI Explanation Generation
import shap
import matplotlib.pyplot as plt
import numpy as np

# Build a tree-model SHAP explainer around the fitted classifier, then compute
# the per-feature contributions for every row of the test set.
explainer = shap.TreeExplainer(model=model)
shap_values = explainer.shap_values(X=X_test)

# --- 1. Generate and Save GLOBAL Explanation ---
# Start from a new, empty current figure; show=False keeps the plot open so it
# can be titled and written to disk.
plt.figure()
shap.summary_plot(shap_values, X_test, plot_type="bar", show=False)

# Work on the current figure through an explicit handle.
global_fig = plt.gcf()
global_fig.gca().set_title("Overall Feature Importance for Credit Risk")
global_fig.savefig('global_feature_importance.png', bbox_inches='tight')
plt.close(global_fig)
print("Saved global_feature_importance.png")

# --- 2. Generate and Save LOCAL Explanation for a high-risk case ---
# Position (for .iloc / array indexing) of the test row with the highest
# predicted probability. This is a position within X_test, not a DataFrame
# index label.
high_risk_index = np.argmax(y_pred_proba)

# No plt.figure() beforehand: in matplotlib mode force_plot draws on a figure
# it creates itself, so a pre-made figure would stay empty and would never be
# closed by the plt.close() call below.
shap.force_plot(explainer.expected_value, shap_values[high_risk_index, :], X_test.iloc[high_risk_index, :], matplotlib=True, show=False)

# Grab whichever figure is current after plotting and manage it explicitly.
force_fig = plt.gcf()
force_fig.gca().set_title(f"Explanation for High-Risk Applicant (Index: {high_risk_index})")
force_fig.savefig('local_explanation_high_risk.png', bbox_inches='tight')
plt.close(force_fig)
print("Saved local_explanation_high_risk.png")

# --- 3. Save the data for the high-risk applicant for context ---
# Indexing with a one-element list keeps the selection as a one-row DataFrame,
# so the CSV has a header row and the row's index label in the first column.
high_risk_profile = X_test.iloc[[high_risk_index]]
high_risk_profile.to_csv('high_risk_applicant_profile.csv')
print("Saved high_risk_applicant_profile.csv")

  shap.summary_plot(shap_values, X_test, plot_type="bar", show=False)


Saved global_feature_importance.png
Saved local_explanation_high_risk.png
Saved high_risk_applicant_profile.csv
