# Generating insights from the model's predictions

In [31]:
import json
import os
import sys

sys.path.append('..')

import numpy as np
import openai
import pandas as pd
# Interpretability
import shap
from dotenv import load_dotenv
from joblib import load
from openai import OpenAI
from sklearn import set_config

np.set_printoptions(precision=3, suppress=True)

# Set transformers output to Pandas DataFrame instead of NumPy array
set_config(transform_output="pandas")

## Loading and preparing the data

**Loading data**

In [32]:
# Read the raw test split and drop the Loan_ID column in a single expression,
# so the frame is never mutated in place after it has been created.
X_test = pd.read_csv('../data/raw/loan-data-test.csv').drop(columns='Loan_ID')

**Loading preprocessing pipeline and model**

In [33]:
# Restore the persisted model and preprocessing pipeline.
# NOTE: joblib artifacts are pickle-based -- only load files from a trusted source.
model = load('../models/model.pkl')
preprocessor = load('../models/preprocessor.pkl')

**Preprocessing**

In [34]:
# Map the '3+' dependents category to the number 3.
# The result is assigned back to the column on purpose: calling
# `replace(..., inplace=True)` on `X_test.Dependents` acts on an intermediate
# Series (chained assignment), which pandas deprecates and which leaves the
# DataFrame unchanged when Copy-on-Write is enabled.
X_test['Dependents'] = X_test['Dependents'].replace('3+', 3)

# Transform the test set with the loaded preprocessing pipeline.
# NOTE: this rebinds `X_test` to the transformed frame, so this cell is not
# idempotent -- re-run the data-loading cell before running it again.
X_test = preprocessor.transform(X_test)

## Getting the SHAP values

**Unscaling the data**

In [35]:
# Get the scaler and encoder objects from the pipeline
scaler = preprocessor.named_transformers_['numerical']['scaler']
encoder = preprocessor.named_transformers_['categorical']['onehot']

# Undo the scaling of the numeric features so they can be reported in their
# original units
numeric_cols = list(scaler.feature_names_in_)
X_test_num_unscaled = scaler.inverse_transform(X_test[numeric_cols])
X_test_num_unscaled_df = pd.DataFrame(data=X_test_num_unscaled, columns=numeric_cols)

# Get the one-hot encoded features. A new frame with a fresh index is built
# here instead of resetting the index of a slice of X_test in place.
X_test_cat = X_test[encoder.get_feature_names_out()].reset_index(drop=True)

# Concatenate the unscaled numeric data and the categorical data (both now
# share a 0..n-1 index, so rows line up positionally)
X_test_unscaled = pd.concat([X_test_num_unscaled_df, X_test_cat], axis=1)

# The values of this frame are later paired positionally with X_test.columns
# and with the SHAP values, so the column order must match X_test exactly.
# Fail loudly if a model feature is missing rather than mislabel the values.
missing_cols = [col for col in X_test.columns if col not in X_test_unscaled.columns]
assert not missing_cols, f"Features missing from the unscaled data: {missing_cols}"
X_test_unscaled = X_test_unscaled[list(X_test.columns)]

### Creating a SHAP explainer

In [36]:
# Build a linear SHAP explainer for the model, passing the transformed test
# set as its data argument, then explain every row of that same set.
explainer = shap.LinearExplainer(model, X_test)
shap_explanation = explainer(X_test)

# Show the original (unscaled) feature values alongside the SHAP values
# instead of the scaled ones the model consumed.
shap_explanation.data = X_test_unscaled.to_numpy()

## Generate Insights

In [37]:
feature_names = X_test.columns
i = 88  # positional index (row of X_test) of the applicant to explain

# Predict the status (kept as the one-element array returned by the model)
predicted_status = model.predict(X_test.iloc[[i]])

# Probability of the second class (column 1 of predict_proba), expressed as
# a whole-number percentage; the prompt reports it as the approval probability
predicted_proba = np.round(
    model.predict_proba(X_test.iloc[[i]])[0][1] * 100
).astype(int)

# Convert the selected row to native Python scalars. NumPy scalars would be
# rendered as e.g. `np.float64(2.0)` by NumPy >= 2 when the records are
# embedded in the prompt, and are not all JSON-serializable.
row_values = np.asarray(shap_explanation.data[i]).tolist()
row_shap_values = np.asarray(shap_explanation.values[i]).tolist()

# zip() silently truncates to the shortest input, which would mislabel or
# drop features; make any mismatch visible instead.
assert len(feature_names) == len(row_values) == len(row_shap_values), (
    "Feature names, feature values and SHAP values must have the same length"
)

# Build one record per feature describing its value and its contribution
explanations_json = []
for name, value, shap_value in zip(feature_names, row_values, row_shap_values):
    # A contribution of exactly zero pushes the decision in neither direction
    if shap_value > 0:
        effect = "Positive"
    elif shap_value < 0:
        effect = "Negative"
    else:
        effect = "Neutral"

    explanations_json.append({
        "Name": name,
        "Value": value,
        "SHAP Value": shap_value,
        "Effect on Approval": effect,
    })

In [38]:
# Tabulate the per-feature records and order them by impact: a temporary
# absolute-value column drives the sort and is dropped again afterwards.
explanations_df = (
    pd.DataFrame(explanations_json)
    .assign(**{"SHAP Value (Abs)": lambda frame: frame["SHAP Value"].abs()})
    .sort_values(by="SHAP Value (Abs)", ascending=False)
    .drop(columns="SHAP Value (Abs)")
)
explanations_df

Unnamed: 0,Name,Value,SHAP Value,Effect on Approval
10,Credit_History_1.0,1.0,0.530722,Positive
8,Property_Area_Semiurban,1.0,0.328659,Positive
2,CoapplicantIncome,2015.0,0.25393,Positive
3,LoanAmount,77.0,0.144911,Positive
0,Dependents,2.0,0.121532,Positive
7,Property_Area_Rural,0.0,0.116017,Positive
9,Property_Area_Urban,0.0,0.034014,Positive
6,Self_Employed_Yes,0.0,0.026858,Positive
5,Gender_Male,1.0,-0.011941,Negative
4,Loan_Amount_Term,360.0,-0.005629,Negative


## Generating the GPT explanation

In [39]:
# Load the credentials file into the process environment. load_dotenv()
# returns whether any variable was set; keep that in a named variable rather
# than `_`, which IPython uses for the last cell output.
env_file_loaded = load_dotenv("../credentials.env")

# Fail with an actionable message (same exception type as a plain
# os.environ lookup) when the key is not available.
if "OPENAI_API_KEY" not in os.environ:
    raise KeyError(
        "OPENAI_API_KEY is not set; define it in '../credentials.env' or in the "
        f"environment (variables loaded from the env file: {env_file_loaded})"
    )
openai.api_key = os.environ["OPENAI_API_KEY"]

In [60]:
# Earlier draft of the system prompt, kept for reference:
# You are a loan eligibility officer.
# Behind our system, we use a machine learning model to predict the loan status of the applicant.

# Instructions that fix the tone and scope of the generated explanation
system_prompt = """
The system evaluates loan applications using applicant data. 
You need to explain the system's decision, considering features and their impacts, and this explanation is tailored for the non-technical applicant. 
No greetings or closings are necessary. 
Emphasize the features that had the most influence on the system's decision and how they affected that decision.
When you mention a feature, include the feature's name and value.
Use the term "system" to reference the model and avoid technical jargon related to the SHAP values.
"""

# Removed features
# - Married_Yes: 1 if the applicant is married, 0 otherwise
# - Education_Not Graduate: 1 if the applicant is not a graduate, 0 otherwise

# The prompt announces JSON, so serialize the records as real JSON instead of
# relying on the Python repr of a list of dicts (single quotes, and
# `np.float64(...)` wrappers with NumPy >= 2). `default=float` converts any
# NumPy scalar that the json module cannot serialize natively.
explanations_payload = json.dumps(explanations_json, default=float)

# model.predict returns a one-element array; show the bare label rather than
# its array representation (also works if a scalar is supplied).
predicted_label = np.ravel(predicted_status)[0]

query = f"""
Below are the definitions of the features:
- Dependents: Number of dependents of the applicant 
- ApplicantIncome: Income of the applicant
- CoapplicantIncome: Income of the co-applicant
- LoanAmount: Loan amount in thousands
- Loan_Amount_Term: Term of the loan in months
- Gender_Male: 1 if the applicant is a male, 0 otherwise
- Self_Employed_Yes: 1 if the applicant is self-employed, 0 otherwise
- Property_Area_Rural: 1 if the property is in a rural area, 0 otherwise
- Property_Area_Semiurban: 1 if the property is in a semiurban area, 0 otherwise
- Property_Area_Urban: 1 if the property is in an urban area, 0 otherwise
- Credit_History_1.0: 1 if the applicant has a credit history, 0 otherwise

Below are the names, values, SHAP values, and effects for each feature in a JSON format:
{explanations_payload}

Below is the prediction of the model:
Predicted status: {predicted_label}
Probability of approval: {predicted_proba}%

Based on the information on feature names, values, SHAP values, and effects, 
generate a report to explain the model's decision in simple terms.
"""

print(query)


Below are the definitions of the features:
- Dependents: Number of dependents of the applicant 
- ApplicantIncome: Income of the applicant
- CoapplicantIncome: Income of the co-applicant
- LoanAmount: Loan amount in thousands
- Loan_Amount_Term: Term of the loan in months
- Gender_Male: 1 if the applicant is a male, 0 otherwise
- Self_Employed_Yes: 1 if the applicant is self-employed, 0 otherwise
- Property_Area_Rural: 1 if the property is in a rural area, 0 otherwise
- Property_Area_Semiurban: 1 if the property is in a semiurban area, 0 otherwise
- Property_Area_Urban: 1 if the property is in an urban area, 0 otherwise
- Credit_History_1.0: 1 if the applicant has a credit history, 0 otherwise

Below are the names, values, SHAP values, and effects for each prediction in a JSON format:
[{'Name': 'Dependents', 'Value': 2.0, 'SHAP Value': 0.12153182486780074, 'Effect on Approval': 'Positive'}, {'Name': 'ApplicantIncome', 'Value': 3235.000000000001, 'SHAP Value': 0.003905107913558934, 'Ef

In [61]:
def generate_response(system_prompt, query, model="gpt-3.5-turbo", client=None):
    """Generate a chat-completion response to a query based on a system prompt.

    Parameters
    ----------
    system_prompt : str
        Instructions sent with the "system" role.
    query : str
        The request sent with the "user" role.
    model : str, optional
        Name of the chat model to call. Defaults to "gpt-3.5-turbo", the
        model this notebook originally hard-coded.
    client : object, optional
        Object exposing ``chat.completions.create(...)``, e.g. an
        ``openai.OpenAI()`` instance. When omitted, the module-level
        ``openai`` client (configured through ``openai.api_key``) is used.

    Returns
    -------
    str
        The message content of the first returned choice.
    """
    # Fall back to the module-level client so existing two-argument calls
    # behave exactly as before.
    api = openai if client is None else client

    completion = api.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": query},
        ],
    )

    return completion.choices[0].message.content

# Send the system prompt and the query to the chat model and keep the
# returned text for display and for the saved report
response = generate_response(system_prompt, query)

In [62]:
# Display the text returned by generate_response
print(response)

The system has evaluated your loan application and made a decision based on various factors. Let's take a closer look at the features and their impacts:

1. Dependents: This feature represents the number of dependents you have. In your case, you have 2 dependents. This feature has a positive impact on the approval decision, meaning that having more dependents increases the likelihood of approval.

2. ApplicantIncome: This feature represents your income as the primary applicant. The system considers this feature as one of the factors for the decision. Your income of $3,235 has a positive but relatively small impact on the approval decision.

3. CoapplicantIncome: This feature represents the income of the co-applicant, if applicable. In your case, the co-applicant's income is $2,015. This feature has a significant positive impact on the approval decision, indicating that higher co-applicant income increases the chances of approval.

4. LoanAmount: This feature represents the requested lo

## Save the report

In [14]:
# Expose SHAP values under the more reader-friendly name "Impact Score"
explanations_df = explanations_df.rename(columns={"SHAP Value": "Impact Score"})

# Render the dataframe as a Markdown table
print(explanations_df.to_markdown())

|    | Name                    |     Value |   Impact Score | Effect on Approval   |
|---:|:------------------------|----------:|---------------:|:---------------------|
| 12 | Credit_History_1.0      |  1        |      0.534333  | Positive             |
|  2 | CoapplicantIncome       | -1.10284  |     -0.237314  | Negative             |
|  6 | Married_Yes             |  1        |      0.181702  | Positive             |
|  7 | Education_Not Graduate  |  0        |      0.119013  | Positive             |
| 10 | Property_Area_Semiurban |  0        |     -0.113641  | Negative             |
|  9 | Property_Area_Rural     |  0        |      0.106275  | Positive             |
|  1 | ApplicantIncome         |  0.510576 |      0.0998845 | Positive             |
| 11 | Property_Area_Urban     |  1        |     -0.0612738 | Negative             |
|  5 | Gender_Male             |  1        |     -0.0386152 | Negative             |
|  3 | LoanAmount              | -0.325043 |      0.032121  | Pos

In [15]:
# Save the report as a Markdown file.
# The report contains the explanation dataframe rendered as a table, followed
# by the response generated by the GPT model.
report = f"""
# Loan Approval Decision Report

## Applicant Information

{explanations_df.to_markdown()}

## Model Decision

{response}
"""

report_path = '../reports/loan_approval_decision_report.md'

# Make sure the output directory exists so a fresh checkout can run this cell
os.makedirs(os.path.dirname(report_path), exist_ok=True)

# Write as UTF-8 explicitly: the generated text may contain non-ASCII
# characters that the platform default encoding cannot represent.
with open(report_path, 'w', encoding='utf-8') as f:
    f.write(report)

In [16]:
# Display the Markdown report that was written to disk
print(report)


# Loan Approval Decision Report

## Applicant Information

|    | Name                    |     Value |   Impact Score | Effect on Approval   |
|---:|:------------------------|----------:|---------------:|:---------------------|
| 12 | Credit_History_1.0      |  1        |      0.534333  | Positive             |
|  2 | CoapplicantIncome       | -1.10284  |     -0.237314  | Negative             |
|  6 | Married_Yes             |  1        |      0.181702  | Positive             |
|  7 | Education_Not Graduate  |  0        |      0.119013  | Positive             |
| 10 | Property_Area_Semiurban |  0        |     -0.113641  | Negative             |
|  9 | Property_Area_Rural     |  0        |      0.106275  | Positive             |
|  1 | ApplicantIncome         |  0.510576 |      0.0998845 | Positive             |
| 11 | Property_Area_Urban     |  1        |     -0.0612738 | Negative             |
|  5 | Gender_Male             |  1        |     -0.0386152 | Negative             |
|  3 