# Generating insights from the model's predictions

In [2]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
np.set_printoptions(precision=3, suppress=True)

from joblib import load

# Interpretability
import shap

import os
import openai
from openai import OpenAI
from dotenv import load_dotenv

# Set transformers output to Pandas DataFrame instead of NumPy array
from sklearn import set_config
set_config(transform_output="pandas")

IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html


## Loading and prepare the data

**Loading data**

In [3]:
X_test = pd.read_csv('../data/raw/loan-data-test.csv')
X_test.drop('Loan_ID', axis=1, inplace=True)

**Loading preprocessing pipeline and model**

In [4]:
with open('../models/model.pkl', 'rb') as f:
    model = load(f)

with open('../models/preprocessor.pkl', 'rb') as f:
    preprocessor = load(f)

**Preprocessing**

In [5]:
# Mapping number of dependents to numerical values
X_test.Dependents.replace('3+', 3, inplace=True)

# Transforming the test set
X_test = preprocessor.transform(X_test)

## Getting the SHAP values

**Unscaling the data**

In [6]:
# Get the scaler and encoder object from the pipeline
scaler = preprocessor.named_transformers_['numerical']['scaler']
encoder = preprocessor.named_transformers_['categorical']['onehot']

# Unscale the  data 
X_test_num_unscaled = scaler.inverse_transform(X_test[scaler.feature_names_in_])
X_test_num_unscaled_df = pd.DataFrame(data=X_test_num_unscaled, columns=scaler.feature_names_in_)

# Get the one-hot encoded features
X_test_cat = X_test[encoder.get_feature_names_out()]

# Reset the index before concatenation
X_test_num_unscaled_df.reset_index(drop=True, inplace=True)
X_test_cat.reset_index(drop=True, inplace=True)

# Concat the unscaled numeric data and the categorical data
X_test_unscaled = pd.concat([X_test_num_unscaled_df, X_test_cat], axis=1)

### Creating a shap explainer

In [7]:
explainer = shap.LinearExplainer(model, X_test)
shap_explanation = explainer(X_test)

# Replace the scaled values by the original ones
shap_explanation.data = X_test_unscaled.values

## Generate Insights

### Format explanations in JSON

This is a simplified version of the function [explanation_to_json](../src/reporting/utils.py) that is used in the web app.
The original functions performs additional steps to make the explanations more readable.

In [8]:
feature_names = X_test.columns
explanation_jsons = []
i = 10

# Predict the status
predicted_status = model.predict(X_test.iloc[[i]])

# Predict the probability
predicted_proba = np.round(
    model.predict_proba(X_test.iloc[[i]])[0][1] * 100
).astype(int)

# Get the explanation for the predicted status
for name, value, shap_value in zip(feature_names, shap_explanation.data[i], shap_explanation.values[i]):
    explanation_json = {}
    explanation_json["Name"] = name
    explanation_json["Value"] = value
    explanation_json["SHAP Value"] = shap_value
    explanation_json["Effect on Approval"] = "Positive" if shap_value > 0 else "Negative"

    explanation_jsons.append(explanation_json)

# explanation_jsons

In [9]:
print(shap_explanation.data[0])
print(shap_explanation.values[0])

[   0. 5720.    0.  110.  360.    1.    0.    0.    0.    1.    1.]
[-0.083  0.147 -0.317  0.029 -0.006 -0.012  0.027  0.116 -0.115 -0.047
  0.531]


In [12]:
explanation_jsons = []
feature_names = [
    'Dependents', 'Applicant Income', 'Coapplicant Income', 'Loan Amount',
    'Loan Amount Term', 'Gender', 
    # 'Married', 'Education', 
    'Self Employed', 'Property Area: Rural',
    'Property Area: Semi-urban', 'Property Area: Urban', 'Has Credit History'
]

numerical_features = [
    'Applicant Income', 'Coapplicant Income', 
    'Loan Amount', 'Loan Amount Term'
]

explanation_jsons = []

for name, value, shap_value in zip(feature_names, 
    shap_explanation.data[0], shap_explanation.values[0]):     
            
    explanation_json = {}

    # Deleted features
    # elif name == "Married":
    #     value = "Yes" if value == 1 else "No"
    # elif name == "Education":
    #     value = "Not Graduate" if value == 1 else "Graduate"
    
    # > Map the values to strings for interpretability
    if name == "Gender":
        value = "Male" if value == 1 else "Female"
    elif name == "Self Employed":
        value = "Yes" if value == 1 else "No"
    elif name == "Has Credit History":
        value = "Yes" if value == 1 else "No"

    # > Map "Property Area" to it's original category
    # keep only the value that is equal to 1 
    # since the property area is one-hot encoded
    elif name == "Property Area: Rural":
        if value == 1:
            name = "Property Area"
            value = "Rural"
        else:
            continue
    elif name == "Property Area: Semi-urban":
        if value == 1:
            name = "Property Area"
            value = "Semi-urban"
        else:
            continue
    elif name == "Property Area: Urban":
        if value == 1:
            name = "Property Area"
            value = "Urban"
        else:
            continue


    # Map the "Loan Amount" values to thousands
    # since the original data is in thousands
    elif name == "Loan Amount":
        value = value * 1000

    explanation_json["Name"] = name
    
    # Round numerical features value
    if name in numerical_features:
        explanation_json["Value"] = round(value)
    else:
        explanation_json["Value"] = value

    explanation_json["SHAP Value"] = shap_value
    explanation_json["Effect on Approval"] = "Positive" if shap_value > 0 else "Negative"
    
    explanation_jsons.append(explanation_json)

print(explanation_jsons)


[{'Name': 'Dependents', 'Value': 0.0, 'SHAP Value': -0.08346984991178112, 'Effect on Approval': 'Negative'}, {'Name': 'Applicant Income', 'Value': 5720, 'SHAP Value': 0.14706541892812847, 'Effect on Approval': 'Positive'}, {'Name': 'Coapplicant Income', 'Value': 0, 'SHAP Value': -0.3171539419119009, 'Effect on Approval': 'Negative'}, {'Name': 'Loan Amount', 'Value': 110000, 'SHAP Value': 0.029009653363208033, 'Effect on Approval': 'Positive'}, {'Name': 'Loan Amount Term', 'Value': 360, 'SHAP Value': -0.005628890210890795, 'Effect on Approval': 'Negative'}, {'Name': 'Gender', 'Value': 'Male', 'SHAP Value': -0.011940938055301362, 'Effect on Approval': 'Negative'}, {'Name': 'Self Employed', 'Value': 'No', 'SHAP Value': 0.026858270835239666, 'Effect on Approval': 'Positive'}, {'Name': 'Property Area', 'Value': 'Urban', 'SHAP Value': -0.04697147224187069, 'Effect on Approval': 'Negative'}, {'Name': 'Has Credit History', 'Value': 'Yes', 'SHAP Value': 0.5307220385544934, 'Effect on Approval':

In [11]:
# print("\n\n---------->\nshap_explanation")
# print(shap_explanation.data[0])
# # print(">")
# print(shap_explanation.values[0])
# print(feature_names)

In [13]:
pd.DataFrame(explanation_jsons)

Unnamed: 0,Name,Value,SHAP Value,Effect on Approval
0,Dependents,0.0,-0.08347,Negative
1,Applicant Income,5720,0.147065,Positive
2,Coapplicant Income,0,-0.317154,Negative
3,Loan Amount,110000,0.02901,Positive
4,Loan Amount Term,360,-0.005629,Negative
5,Gender,Male,-0.011941,Negative
6,Self Employed,No,0.026858,Positive
7,Property Area,Urban,-0.046971,Negative
8,Has Credit History,Yes,0.530722,Positive


**Visualizing the JSON arrays as a DataFrame**

In [14]:
explanation_df = pd.DataFrame(explanation_jsons)

# Sort feature by impact score
explanation_df["SHAP Value (Abs)"] = explanation_df["SHAP Value"].abs()
explanation_df.sort_values(by="SHAP Value (Abs)", ascending=False, inplace=True)
explanation_df.drop("SHAP Value (Abs)", axis=1, inplace=True)
explanation_df.reset_index(drop=True, inplace=True)
explanation_df

Unnamed: 0,Name,Value,SHAP Value,Effect on Approval
0,Has Credit History,Yes,0.530722,Positive
1,Coapplicant Income,0,-0.317154,Negative
2,Applicant Income,5720,0.147065,Positive
3,Dependents,0.0,-0.08347,Negative
4,Property Area,Urban,-0.046971,Negative
5,Loan Amount,110000,0.02901,Positive
6,Self Employed,No,0.026858,Positive
7,Gender,Male,-0.011941,Negative
8,Loan Amount Term,360,-0.005629,Negative


## Generation GPT predictions

In [15]:
_ = load_dotenv("../credentials.env")
openai.api_key = os.environ['OPENAI_API_KEY']

### Prompting GPT-3.5

In [17]:
# You are a loan eligibility officer. 
# Behind our system, we use a machine learning model to predict the loan status of the applicant.
system_prompt = """
The system evaluates loan applications using applicant data. 
You need to explain the system's decision, considering features and their impacts, and this explanation is tailored for the non-technical applicant. 
No greetings or closings are necessary. 
Emphasize the features that had the most influence on the system's decision and how they affected that decision.
When you mention a feature, include the feature's name and value.
Use the term "system" to reference the model and avoid technical jargon related to the SHAP values.

IMPORTANT
---------
Higher ApplicantIncome, CoapplicantIncome and LoanAmount are associated with a higher probability of approval. 
Higher LoanAmount and Loan_Amount_Term are associated with a lower probability of approval.
Loan Amount ranges from $9 to $700 (in thousands).
Loan Amount Term ranges from 12 to 480 months.
"""

system_prompt = """
The system evaluates loan applications using applicant data. 
You need to explain the system's decision, considering features and their impacts, and this explanation is tailored for the non-technical applicant. 
No greetings or closings are necessary. 
Emphasize the features that had the most influence on the system's decision and how they affected that decision.
When you mention a feature, include the feature's name and value.
Use the term "system" to reference the model and avoid technical jargon related to the SHAP values.

IMPORTANT
---------
Higher ApplicantIncome, CoapplicantIncome and LoanAmount are associated with a higher probability of approval. 
Higher LoanAmount and Loan_Amount_Term are associated with a lower probability of approval.
Loan Amount ranges from $9 to $700 (in thousands).
Loan Amount Term ranges from 12 to 480 months.
"""

response_template = """
Your loan application has been approved. Several factors contributed to this decision.

### What you did well
- **Income**: You have an income of \$4,235. This factor significantly boosts your chances of approval as a higher income increases the likelihood of getting the loan approved.
- **Co-applicant's Income**: You have a co-applicant with an income of \$3000. This factor significantly boosts your chances of approval, as a higher co-applicant income increases the likelihood of getting the loan approved.
- **Requested Loan Amount:** Your loan request of \$77,000 falls within the lower range of our allowable amount, which spans from \$9,000 to \$700,000. This contributed positively to the approval decision.
- **Credit History:** You have a credit history, which is required for loan approval.

### What you need to work on
- **Loan Term Duration:** The chosen loan term of 360 months (30 years) exceeds the midpoint in our range of 12 to 480 months. Opting for a longer loan term slightly diminishes your chances of approval.

IMPORTANT:
- Do not mention the gender as it is not a factor to improve or work on.
- Do not recommend improve the applican't or co-applicant's income as it is not something that can be improved.
"""
# In conclusion, the factors that most impacted the decision were your income and the co-applicant's income, the requested loan amount and its duration, along with the number of dependents.


query = f"""
Below are the definitions of the features:
- Dependents: Number of dependents of the applicant 
- ApplicantIncome: Income of the applicant
- CoapplicantIncome: Income of the co-applicant
- LoanAmount: Loan amount 
- Loan_Amount_Term: Term of the loan in months
- Gender: then gender of the applicant
- Self Employed: wheather the applicant is self-employed or not
- Property Area:Rural: "Yes" if the property is in a rural area, "No" otherwise
- PropertyArea: Semiurban: "Yes" if the property is in a semiurban area, "No" otherwise
- Property_Area: Urban: "Yes" if the property is in an urban area, "No" otherwise
- Has Credit History: "Yes" if the applicant has a credit history, "No" otherwise

Below are the names, values, SHAP values, and effects for each prediction in a JSON format:
{explanation_jsons}

Below is the prediction of the model:
Predicted status: {predicted_status}
Probability of approval: {predicted_proba}%

-----
Based on the information on feature names, values, SHAP values, and effects, 
generate a report to explain the model's decision in simple terms.
Below is an example of response so that you can get the pattern,
rewrite it to fit the current context based on the information above
but Keep the same markdown structure (e.g. for the level 3 titles ###).
The bulleted list should be ordered by magnitude of impact.
{response_template}

Conclude with a summary of the most important factors and their effects on the decision.

Recommend actions to improve the chances of approval.
"""

print(query)


Below are the definitions of the features:
- Dependents: Number of dependents of the applicant 
- ApplicantIncome: Income of the applicant
- CoapplicantIncome: Income of the co-applicant
- LoanAmount: Loan amount 
- Loan_Amount_Term: Term of the loan in months
- Gender: then gender of the applicant
- Self Employed: wheather the applicant is self-employed or not
- Property Area:Rural: "Yes" if the property is in a rural area, "No" otherwise
- PropertyArea: Semiurban: "Yes" if the property is in a semiurban area, "No" otherwise
- Property_Area: Urban: "Yes" if the property is in an urban area, "No" otherwise
- Has Credit History: "Yes" if the applicant has a credit history, "No" otherwise

Below are the names, values, SHAP values, and effects for each prediction in a JSON format:
[{'Name': 'Dependents', 'Value': 0.0, 'SHAP Value': -0.08346984991178112, 'Effect on Approval': 'Negative'}, {'Name': 'Applicant Income', 'Value': 5720, 'SHAP Value': 0.14706541892812847, 'Effect on Approval': 

### Generating the report

In [18]:
import re

def selectively_escape_dollar_sign(string):
    def replace_unescaped(match):
        # Replace unescaped dollar signs with their escaped form
        return match.group().replace('$', r'\$')

    # Use a regular expression to find unescaped dollar signs
    pattern = re.compile(r'(?<!\\)\$')
    return pattern.sub(replace_unescaped, string)

# Example usage:
query_template = "This is $ a string with $ signs, but \\$ some are already escaped: \\$ \\$."

escaped_query_template = selectively_escape_dollar_sign(query_template)

print("Original:", query_template)
print("Escaped:", escaped_query_template)


Original: This is $ a string with $ signs, but \$ some are already escaped: \$ \$.
Escaped: This is \$ a string with \$ signs, but \$ some are already escaped: \$ \$.


In [19]:
def generate_response(system_prompt, query):
	"""Generate a response to a query based on a system prompt"""
	completion = openai.chat.completions.create(
		model="gpt-3.5-turbo",
		messages=[
			{"role": "system", "content": system_prompt},
			{"role": "user", "content": query}
		]
	)

	return completion.choices[0].message.content

In [24]:
response = generate_response(system_prompt, query)
print(response)

Congratulations! Your loan application has been approved. The system evaluated several factors to make this decision. Let's take a closer look at these factors and their effects on the approval decision.

### Factors that contributed to the approval decision:

- **Income**: Your income of $5,720 positively influenced the approval decision. A higher income increases the likelihood of getting the loan approved.

- **Credit History**: Having a credit history played a significant role in the approval decision. The system considers this as a crucial factor, and your credit history being positive positively impacted the decision.

- **Requested Loan Amount**: The system took into account the loan amount you requested, which was $110,000. This amount falls within the acceptable range of loan amounts, which spans from $9,000 to $700,000. Your loan request contributed to the approval decision.

### Factors to be aware of:

- **Loan Term Duration**: The chosen loan term of 360 months (30 years) 

## Save the report

In [21]:
# Rename SHAP value to Impact Score for better understanding
explanation_df.rename(columns={"SHAP Value": "Impact Score"}, inplace=True)

# Convert my dataframe as markdown table
print(explanation_df.to_markdown())

|    | Name               | Value   |   Impact Score | Effect on Approval   |
|---:|:-------------------|:--------|---------------:|:---------------------|
|  0 | Has Credit History | Yes     |     0.530722   | Positive             |
|  1 | Coapplicant Income | 0       |    -0.317154   | Negative             |
|  2 | Applicant Income   | 5720    |     0.147065   | Positive             |
|  3 | Dependents         | 0.0     |    -0.0834698  | Negative             |
|  4 | Property Area      | Urban   |    -0.0469715  | Negative             |
|  5 | Loan Amount        | 110000  |     0.0290097  | Positive             |
|  6 | Self Employed      | No      |     0.0268583  | Positive             |
|  7 | Gender             | Male    |    -0.0119409  | Negative             |
|  8 | Loan Amount Term   | 360     |    -0.00562889 | Negative             |


In [22]:
# We will save the report as Markdown file
# The report will contains the dataframe as a table and the response from GPT-3
report = f"""
# Loan Approval Decision Report

## Applicant Information

{explanation_df.to_markdown()}

## Model Decision

{response}
"""

with open('../reports/loan_approval_decision_report.md', 'w') as f:
    f.write(report)

In [23]:
print(report)


# Loan Approval Decision Report

## Applicant Information

|    | Name               | Value   |   Impact Score | Effect on Approval   |
|---:|:-------------------|:--------|---------------:|:---------------------|
|  0 | Has Credit History | Yes     |     0.530722   | Positive             |
|  1 | Coapplicant Income | 0       |    -0.317154   | Negative             |
|  2 | Applicant Income   | 5720    |     0.147065   | Positive             |
|  3 | Dependents         | 0.0     |    -0.0834698  | Negative             |
|  4 | Property Area      | Urban   |    -0.0469715  | Negative             |
|  5 | Loan Amount        | 110000  |     0.0290097  | Positive             |
|  6 | Self Employed      | No      |     0.0268583  | Positive             |
|  7 | Gender             | Male    |    -0.0119409  | Negative             |
|  8 | Loan Amount Term   | 360     |    -0.00562889 | Negative             |

## Model Decision

Your loan application has been approved. Several factors contr