<a href="https://colab.research.google.com/github/BrindhaHema/CreditRisk_StressTesting/blob/main/ai_stress_test_app_py.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
# file: ai_stress_test_app.py

import streamlit as st
import pandas as pd
import joblib
from io import BytesIO
import plotly.express as px
from sentence_transformers import SentenceTransformer
import fitz  # PyMuPDF for PDF OCR
import shap
# Import your genai Olama API/client (pseudo-code placeholder)
# import olama

st.set_page_config(page_title="GenAI Credit Stress Testing", layout="wide")
st.title("GenAI-Powered Credit Risk Stress Testing Platform")

# ==== Portfolio Data Upload ====
uploaded_file = st.file_uploader("Upload Credit Portfolio CSV", type="csv")
if uploaded_file:
    df = pd.read_csv(uploaded_file)
    st.write("Data Preview:", df.head())

# ==== Regulatory Document Upload & OCR ====
reg_pdf_file = st.file_uploader("Upload Regulatory PDF Document")
reg_chunks = []
if reg_pdf_file:
    doc = fitz.open(stream=reg_pdf_file.read(), filetype="pdf")
    for page in doc:
        reg_chunks.append(page.get_text("text"))
    st.write(f"OCR Extracted {len(reg_chunks)} text chunks from document.")

# ==== Scenario Generation with Embeddings & Olama LLM ====
llm_query = st.text_input("Describe stress scenario to generate (e.g., 'Interest rate shock')")
if st.button("Generate Stress Scenario"):
    # (1) Embedding chunk retrieval (using query for best chunk)
    embed_model = SentenceTransformer('all-MiniLM-L6-v2')
    chunk_embeddings = embed_model.encode(reg_chunks)
    query_embedding = embed_model.encode([llm_query])
    best_chunk = reg_chunks[(chunk_embeddings @ query_embedding.T).argmax()]
    st.write("Best matched scenario context:", best_chunk)
    # (2) LLM scenario JSON (pseudo-code)
    # scenario_json = olama.complete(f"Context: {best_chunk}. Return JSON with multipliers for PD, LGD, EAD.")
    scenario_json = {'PD_multiplier': 1.4, 'LGD_multiplier': 1.25, 'EAD_multiplier': 1.07}
    st.write("Generated Scenario Multipliers:", scenario_json)
else:
    scenario_json = {'PD_multiplier': 1.0, 'LGD_multiplier': 1.0, 'EAD_multiplier': 1.0}

# ==== ML Model Scoring (Load trained models) ====
if uploaded_file:
    model_pd = joblib.load('model_pd.joblib')
    model_lgd = joblib.load('model_lgd.joblib')
    model_ead = joblib.load('model_ead.joblib')
    feature_cols = ['income', 'dti', 'credit_score', 'ead', 'collateral_value']
    df['pd_ml'] = model_pd.predict(df[feature_cols])
    df['lgd_ml'] = model_lgd.predict(df[feature_cols])
    df['ead_ml'] = model_ead.predict(df[feature_cols])

    # ==== Apply Scenario Multipliers ====
    df['pd_stressed'] = df['pd_ml'] * scenario_json['PD_multiplier']
    df['lgd_stressed'] = df['lgd_ml'] * scenario_json['LGD_multiplier']
    df['ead_stressed'] = df['ead_ml'] * scenario_json['EAD_multiplier']
    df['EL_stressed'] = df['pd_stressed'] * df['lgd_stressed'] * df['ead_stressed']

    st.header("Dashboard: Segment Breakdown & Drilldown (Plotly)")
    tab1, tab2, tab3, tab4 = st.tabs(["By Sector", "By Region", "By Product Type", "Feature Explanation"])
    # Sector view
    with tab1:
        fig_sector = px.bar(df.groupby('sector').EL_stressed.sum().reset_index(), x='sector', y='EL_stressed')
        st.plotly_chart(fig_sector)
    # Region view
    with tab2:
        fig_region = px.bar(df.groupby('region').EL_stressed.sum().reset_index(), x='region', y='EL_stressed')
        st.plotly_chart(fig_region)
    # Product view
    with tab3:
        fig_prod = px.bar(df.groupby('product_type').EL_stressed.sum().reset_index(), x='product_type', y='EL_stressed')
        st.plotly_chart(fig_prod)

    # ==== SHAP Explanations ====
    with tab4:
        explainer = shap.Explainer(model_pd, df[feature_cols])
        shap_vals = explainer(df[feature_cols])
        st.subheader("Global Feature Importance")
        shap.plots.bar(shap_vals, show=False)
        st.pyplot(bbox_inches="tight")
        st.subheader("Single Prediction Waterfall (first borrower)")
        shap.plots.waterfall(shap_vals[0], show=False)
        st.pyplot(bbox_inches="tight")

    # ==== Download Full Results ====
    st.download_button("Download Full Stressed Portfolio (.csv)", data=df.to_csv(index=False), file_name="stressed_portfolio.csv")

    st.subheader("Summary Statistics")
    st.write(df[['EL_stressed', 'pd_stressed', 'lgd_stressed']].describe())

st.markdown("""
**App Features:**
- Regulatory PDF OCR and intelligent context extraction
- GenAI/LLM-powered scenario suggestion & multiplier calculation
- ML-based PD, LGD, EAD scoring; automated stress scenario application
- Full dashboard (Plotly): sector, region, product, drilldowns
- SHAP-based explainability; interactive waterfall plots
- Export and reporting, ready for pilots or board demonstrations
""")




DeltaGenerator()

In [2]:
try:
    import sentence_transformers
    print("'sentence_transformers' is installed.")
except ModuleNotFoundError:
    print("'sentence_transformers' is NOT installed.")

try:
    import shap
    print("'shap' is installed.")
except ModuleNotFoundError:
    print("'shap' is NOT installed.")

'sentence_transformers' is installed.
'shap' is installed.


In [None]:
pip install PyMuPDF

In [18]:

!pip install streamlit plotly sentence-transformers PyMuPDF shap joblib
!pip install pyngrok
from pyngrok import ngrok

# Launch Streamlit app in a subprocess
!streamlit run ai_stress_test_app.py &

# Setup ngrok tunnel
public_url = ngrok.connect(port="8501")
print("Streamlit app running at:", public_url)



Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.125.188.186:8501[0m
[0m
[34m  Stopping...[0m




PyngrokNgrokHTTPError: ngrok client exception, API returned 400: {"error_code":102,"status_code":400,"msg":"invalid tunnel configuration","details":{"err":"yaml: unmarshal errors:\n  line 1: field port not found in type config.HTTPv2Tunnel"}}


In [4]:

!pip install streamlit plotly sentence-transformers PyMuPDF shap joblib
!pip install pyngrok
from pyngrok import ngrok

# Launch Streamlit app in a subprocess
!streamlit run ai_stress_test_app.py &

# Setup ngrok tunnel
public_url = ngrok.connect(port="8501")
print("Streamlit app running at:", public_url)



Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.125.188.186:8501[0m
[0m
[34m  Stopping...[0m




PyngrokNgrokHTTPError: ngrok client exception, API returned 400: {"error_code":102,"status_code":400,"msg":"invalid tunnel configuration","details":{"err":"yaml: unmarshal errors:\n  line 1: field port not found in type config.HTTPv2Tunnel"}}


In [29]:
#brindha
!pip install streamlit plotly sentence-transformers PyMuPDF shap joblib
!pip install pyngrok
from pyngrok import ngrok

# Launch Streamlit app in a subprocess
!streamlit run ai_stress_test_app.py &

# Setup ngrok tunnel
public_url = ngrok.connect(port="8501")
print("Streamlit app running at:", public_url)



Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.125.188.186:8501[0m
[0m




2025-11-14 07:12:39.754076: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1763104359.794355   63660 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1763104359.803594   63660 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1763104359.826960   63660 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1763104359.827011   63660 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1763104359.827017   63660 computation_placer.cc:177] computation placer alr



[31m──[0m[31m────────────────────────[0m[31m [0m[1;31mTraceback [0m[1;2;31m(most recent call last)[0m[31m [0m[31m─────────────────────────[0m[31m──[0m
[31m [0m [2;33m/usr/local/lib/python3.12/dist-packages/streamlit/runtime/scriptrunner/[0m[1;33mexec_code.py[0m: [31m [0m
[31m [0m [94m129[0m in [92mexec_func_with_error_handling[0m                                                 [31m [0m
[31m [0m                                                                                      [31m [0m
[31m [0m [2;33m/usr/local/lib/python3.12/dist-packages/streamlit/runtime/scriptrunner/[0m[1;33mscript_runner[0m [31m [0m
[31m [0m [1;33m.py[0m:[94m669[0m in [92mcode_to_exec[0m                                                              [31m [0m
[31m [0m                                                                                      [31m [0m
[31m [0m [2;33m/content/[0m[1;33mai_stress_test_app.py[0m:[94m54[0m in [92m<module>[0m    



PyngrokNgrokHTTPError: ngrok client exception, API returned 400: {"error_code":102,"status_code":400,"msg":"invalid tunnel configuration","details":{"err":"yaml: unmarshal errors:\n  line 1: field port not found in type config.HTTPv2Tunnel"}}


In [28]:
from pyngrok import ngrok

# Correct way: Pass the port number directly
# The 'addr' keyword argument is inferred.
tunnel = ngrok.connect(8501)
public_url = tunnel.public_url

print(f"ngrok URL: {public_url}")

ngrok URL: https://rainy-providencia-presumably.ngrok-free.dev


### Configure ngrok Authentication Token

To allow `ngrok` to create a public URL, you need to set your authentication token. Replace `<YOUR_AUTHTOKEN>` with the token found on your [ngrok dashboard](https://dashboard.ngrok.com/get-started/your-authtoken).

In [19]:
# Replace <YOUR_AUTHTOKEN> with your actual ngrok authtoken
!ngrok authtoken 35S7BQuaurZJSpv0SJLBpxEr8Xm_4KXzFub6j6mgbWHio1jsQ


Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


### Run Streamlit App and ngrok Tunnel

Now, with the `ai_stress_test_app.py` file available and ngrok configured, you can launch the Streamlit application and create a public URL.

In [27]:
# notworking
!pip install streamlit plotly sentence-transformers PyMuPDF shap joblib
!pip install pyngrok
from pyngrok import ngrok

# Launch Streamlit app in a subprocess
!streamlit run ai_stress_test_app.py &

# Setup ngrok tunnel
public_url = ngrok.connect()
print("Streamlit app running at:", public_url)


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.125.188.186:8501[0m
[0m
[34m  Stopping...[0m
Streamlit app running at: NgrokTunnel: "https://rainy-providencia-presumably.ngrok-free.dev" -> "http://localhost:80"


In [25]:
%%writefile ai_stress_test_app.py
# file: ai_stress_test_app.py

import streamlit as st
import pandas as pd
import joblib
from io import BytesIO
import plotly.express as px
from sentence_transformers import SentenceTransformer
import fitz  # PyMuPDF for PDF OCR
import shap
# Import your genai Olama API/client (pseudo-code placeholder)
# import olama

st.set_page_config(page_title="GenAI Credit Stress Testing", layout="wide")
st.title("GenAI-Powered Credit Risk Stress Testing Platform")

# ==== Portfolio Data Upload ====
uploaded_file = st.file_uploader("Upload Credit Portfolio CSV", type="csv")
if uploaded_file:
    df = pd.read_csv(uploaded_file)
    st.write("Data Preview:", df.head())

# ==== Regulatory Document Upload & OCR ====
reg_pdf_file = st.file_uploader("Upload Regulatory PDF Document")
reg_chunks = []
if reg_pdf_file:
    doc = fitz.open(stream=reg_pdf_file.read(), filetype="pdf")
    for page in doc:
        reg_chunks.append(page.get_text("text"))
    st.write(f"OCR Extracted {len(reg_chunks)} text chunks from document.")

# ==== Scenario Generation with Embeddings & Olama LLM ====
llm_query = st.text_input("Describe stress scenario to generate (e.g., 'Interest rate shock')")
if st.button("Generate Stress Scenario"):
    # (1) Embedding chunk retrieval (using query for best chunk)
    embed_model = SentenceTransformer('all-MiniLM-L6-v2')
    chunk_embeddings = embed_model.encode(reg_chunks)
    query_embedding = embed_model.encode([llm_query])
    best_chunk = reg_chunks[(chunk_embeddings @ query_embedding.T).argmax()]
    st.write("Best matched scenario context:", best_chunk)
    # (2) LLM scenario JSON (pseudo-code)
    # scenario_json = olama.complete(f"Context: {best_chunk}. Return JSON with multipliers for PD, LGD, EAD.")
    scenario_json = {'PD_multiplier': 1.4, 'LGD_multiplier': 1.25, 'EAD_multiplier': 1.07}
    st.write("Generated Scenario Multipliers:", scenario_json)
else:
    scenario_json = {'PD_multiplier': 1.0, 'LGD_multiplier': 1.0, 'EAD_multiplier': 1.0}

# ==== ML Model Scoring (Load trained models) ====
if uploaded_file:
    model_pd = joblib.load('model_pd.joblib')
    model_lgd = joblib.load('model_lgd.joblib')
    model_ead = joblib.load('model_ead.joblib')
    feature_cols = ['income', 'dti', 'credit_score', 'ead', 'collateral_value']
    df['pd_ml'] = model_pd.predict(df[feature_cols])
    df['lgd_ml'] = model_lgd.predict(df[feature_cols])
    df['ead_ml'] = model_ead.predict(df[feature_cols])

    # ==== Apply Scenario Multipliers ====
    df['pd_stressed'] = df['pd_ml'] * scenario_json['PD_multiplier']
    df['lgd_stressed'] = df['lgd_ml'] * scenario_json['LGD_multiplier']
    df['ead_stressed'] = df['ead_ml'] * scenario_json['EAD_multiplier']
    df['EL_stressed'] = df['pd_stressed'] * df['lgd_stressed'] * df['ead_stressed']

    st.header("Dashboard: Segment Breakdown & Drilldown (Plotly)")
    tab1, tab2, tab3, tab4 = st.tabs(["By Sector", "By Region", "By Product Type", "Feature Explanation"])
    # Sector view
    with tab1:
        fig_sector = px.bar(df.groupby('sector').EL_stressed.sum().reset_index(), x='sector', y='EL_stressed')
        st.plotly_chart(fig_sector)
    # Region view
    with tab2:
        fig_region = px.bar(df.groupby('region').EL_stressed.sum().reset_index(), x='region', y='EL_stressed')
        st.plotly_chart(fig_region)
    # Product view
    with tab3:
        fig_prod = px.bar(df.groupby('product_type').EL_stressed.sum().reset_index(), x='product_type', y='EL_stressed')
        st.plotly_chart(fig_prod)

    # ==== SHAP Explanations ====
    with tab4:
        explainer = shap.Explainer(model_pd, df[feature_cols])
        shap_vals = explainer(df[feature_cols])
        st.subheader("Global Feature Importance")
        shap.plots.bar(shap_vals, show=False)
        st.pyplot(bbox_inches="tight")
        st.subheader("Single Prediction Waterfall (first borrower)")
        shap.plots.waterfall(shap_vals[0], show=False)
        st.pyplot(bbox_inches="tight")

    # ==== Download Full Results ====
    st.download_button("Download Full Stressed Portfolio (.csv)", data=df.to_csv(index=False), file_name="stressed_portfolio.csv")

    st.subheader("Summary Statistics")
    st.write(df[['EL_stressed', 'pd_stressed', 'lgd_stressed']].describe())

st.markdown("""
**App Features:**
- Regulatory PDF OCR and intelligent context extraction
- GenAI/LLM-powered scenario suggestion & multiplier calculation
- ML-based PD, LGD, EAD scoring; automated stress scenario application
- Full dashboard (Plotly): sector, region, product, drilldowns
- SHAP-based explainability; interactive waterfall plots
- Export and reporting, ready for pilots or board demonstrations
""")


Overwriting ai_stress_test_app.py


In [24]:
import joblib
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
import numpy as np
import os

# --- 1. Load Real Data and Feature Engineering (NEW LOGIC) ---
# Load the uploaded synthetic credit risk data.
try:
    # Assuming the file is in the Colab execution environment (/content/)
    df = pd.read_csv('/content/synthetic_credit_risk_data.csv')
except FileNotFoundError:
    print("Error: 'synthetic_credit_risk_data.csv' not found. Please ensure it is uploaded to /content/.")
    exit()

# --- Drop columns that are IDs, targets or post-default outcomes ---
# Note: 'ead', 'pd', 'lgd' are kept here as target variables for the models
# but dropped from the feature set X later.
drop_cols = [
    'borrower_id', 'default_flag', 'recovery'
]
df = df.drop(columns=drop_cols, errors='ignore')

# --- Date Feature Engineering ---
# Assuming 'loan_start_date' is the correct column name from the CSV (it was 'loanstartdate' in prior history, adjusting to the one present in your provided snippet)
df['loan_start_date'] = pd.to_datetime(df['loan_start_date'], errors='coerce')
df['loan_age_years'] = (pd.to_datetime('today') - df['loan_start_date']).dt.days / 365.25

# --- Dummy encoding for all categoricals ---
cat_cols = ['sector', 'region', 'collateral_type', 'product_type', 'loan_vintage']
# Adjusting column names to match the CSV header
df = pd.get_dummies(df, columns=cat_cols, drop_first=True)

# --- Feature Selection: Retain only relevant numeric + engineered + encoded features ---
# Note: 'ead' is a feature in the model, but it is also a target variable for the EAD model.
# We'll use the original 'ead' column as a feature for PD and LGD models, and the target for the EAD model.
core_features = [
    'income', 'dti', 'credit_score',
    'collateral_value', 'loan_age_years', 'interest_rate', 'gdp_growth', 'unemployment',
    'arrears_30d', 'arrears_60d', 'arrears_90d', 'ead'
]
# Include all one-hot encoded columns for the categoricals
onehots = [col for col in df.columns if any(prefix in col for prefix in [
    'sector_', 'region_', 'collateral_type_', 'product_type_', 'loan_vintage_'])]

# Final list of feature column names (to be used later in the Streamlit app)
feature_cols = [col for col in core_features + onehots if col in df.columns]

# --- Final design matrix for training ---
X = df[feature_cols]

# Prepare targets (y)
y_pd = df['pd']
y_lgd = df['lgd']
y_ead = df['ead']


# --- 2. Train and Save Models ---

# Train and save PD Model
model_pd = RandomForestRegressor(n_estimators=10, random_state=42)
model_pd.fit(X, y_pd) # Trained on real 'pd' target
joblib.dump(model_pd, '/content/model_pd.joblib')

# Train and save LGD Model
model_lgd = RandomForestRegressor(n_estimators=10, random_state=42)
model_lgd.fit(X, y_lgd) # Trained on real 'lgd' target
joblib.dump(model_lgd, '/content/model_lgd.joblib')

# Train and save EAD Model
model_ead = RandomForestRegressor(n_estimators=10, random_state=42)
model_ead.fit(X, y_ead) # Trained on real 'ead' target
joblib.dump(model_ead, '/content/model_ead.joblib')

# *** NEW: Save the list of trained feature columns ***
joblib.dump(feature_cols, '/content/model_features.joblib')

print("Model files created and saved successfully to /content/ directory:")
print(f"Features used for training: {feature_cols}")
print(" - /content/model_pd.joblib (Trained on real 'pd' data)")
print(" - /content/model_lgd.joblib (Trained on real 'lgd' data)")
print(" - /content/model_ead.joblib (Trained on real 'ead' data)")
print(" - /content/model_features.joblib (Trained feature list saved)")

Model files created and saved successfully to /content/ directory:
Features used for training: ['income', 'dti', 'credit_score', 'collateral_value', 'loan_age_years', 'interest_rate', 'gdp_growth', 'unemployment', 'arrears_30d', 'arrears_60d', 'arrears_90d', 'ead', 'sector_Corporate', 'sector_Mortgage', 'sector_SME', 'region_North', 'region_South', 'region_West', 'collateral_type_Plant/Equipment', 'collateral_type_Property', 'collateral_type_Unsecured', 'product_type_Credit Card', 'product_type_Home Loan', 'product_type_Personal Loan', 'product_type_Term Loan', 'loan_vintage_1 yrs', 'loan_vintage_2 yrs', 'loan_vintage_3 yrs', 'loan_vintage_4 yrs', 'loan_vintage_5 yrs', 'loan_vintage_6 yrs', 'loan_vintage_7 yrs', 'loan_vintage_8 yrs']
 - /content/model_pd.joblib (Trained on real 'pd' data)
 - /content/model_lgd.joblib (Trained on real 'lgd' data)
 - /content/model_ead.joblib (Trained on real 'ead' data)
 - /content/model_features.joblib (Trained feature list saved)
