# Generating insights from the model's predictions

In [1]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
np.set_printoptions(precision=3, suppress=True)

from joblib import load

# Interpretability
import shap

import os
import openai
from openai import OpenAI
from dotenv import load_dotenv

# Set transformers output to Pandas DataFrame instead of NumPy array
from sklearn import set_config
set_config(transform_output="pandas")

IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html


## Loading and prepare the data

**Loading data**

In [2]:
X_test = pd.read_csv('../data/raw/loan-data-test.csv')
X_test.drop('Loan_ID', axis=1, inplace=True)

**Loading preprocessing pipeline and model**

In [3]:
with open('../models/model.pkl', 'rb') as f:
    model = load(f)

with open('../models/preprocessor.pkl', 'rb') as f:
    preprocessor = load(f)

**Preprocessing**

In [4]:
# Mapping number of dependents to numerical values
X_test.Dependents.replace('3+', 3, inplace=True)

# Transforming the test set
X_test = preprocessor.transform(X_test)

## Getting the SHAP values

**Unscaling the data**

In [5]:
# Get the scaler and encoder object from the pipeline
scaler = preprocessor.named_transformers_['numerical']['scaler']
encoder = preprocessor.named_transformers_['categorical']['onehot']

# Unscale the  data 
X_test_num_unscaled = scaler.inverse_transform(X_test[scaler.feature_names_in_])
X_test_num_unscaled_df = pd.DataFrame(data=X_test_num_unscaled, columns=scaler.feature_names_in_)

# Get the one-hot encoded features
X_test_cat = X_test[encoder.get_feature_names_out()]

# Reset the index before concatenation
X_test_num_unscaled_df.reset_index(drop=True, inplace=True)
X_test_cat.reset_index(drop=True, inplace=True)

# Concat the unscaled numeric data and the categorical data
X_test_unscaled = pd.concat([X_test_num_unscaled_df, X_test_cat], axis=1)

### Creating a shap explainer

In [6]:
explainer = shap.LinearExplainer(model, X_test)
shap_explanation = explainer(X_test)

# Replace the scaled values by the original ones
shap_explanation.data = X_test_unscaled.values

## Generate Insights

In [7]:
feature_names = X_test.columns
explanations_json = []
i = 0

# Predict the status
predicted_status = model.predict(X_test.iloc[[i]])

# Predict the probability
predicted_proba = np.round(
    model.predict_proba(X_test.iloc[[i]])[0][1] * 100
).astype(int)

# Get the explanation for the predicted status
for name, value, shap_value in zip(feature_names, X_test.values[i], shap_explanation.values[i]):
    explanation_json = {}
    explanation_json["Name"] = name
    explanation_json["Value"] = value
    explanation_json["SHAP Value"] = shap_value
    explanation_json["Effect on Approval"] = "Positive" if shap_value > 0 else "Negative"

    explanations_json.append(explanation_json)

# explanations_json

In [8]:
explanations_df = pd.DataFrame(explanations_json)

# Sort feature by impact score
explanations_df["SHAP Value (Abs)"] = explanations_df["SHAP Value"].abs()
explanations_df.sort_values(by="SHAP Value (Abs)", ascending=False, inplace=True)
explanations_df.drop("SHAP Value (Abs)", axis=1, inplace=True)
explanations_df

Unnamed: 0,Name,Value,SHAP Value,Effect on Approval
12,Credit_History_1.0,1.0,0.548731,Positive
2,CoapplicantIncome,-1.102837,-0.249998,Negative
6,Married_Yes,1.0,0.160357,Positive
7,Education_Not Graduate,0.0,0.119659,Positive
10,Property_Area_Semiurban,0.0,-0.117327,Negative
1,ApplicantIncome,0.510576,0.108182,Positive
9,Property_Area_Rural,0.0,0.105457,Positive
11,Property_Area_Urban,1.0,-0.070489,Negative
5,Gender_Male,1.0,-0.041878,Negative
0,Dependents,-0.827104,-0.035477,Negative


## Generation GPT predictions

In [9]:
_ = load_dotenv("../credentials.env")
openai.api_key = os.environ['OPENAI_API_KEY']

In [10]:
system_prompt = """
You are the assistant of a loan eligibility officer who doesn't know much about machine learning. 
A data scientist built a machine learning model to predict whether or not a loan applicant is eligible for a loan.
You are tasked to explain the model's predictions based on the SHAP (SHapley Additive exPlanations) values of the model's features. 
Your report should focus more on the features that most impacted the model's decision and how they impacted it.
Remember, you are explaining the model's decision to a non-technical person.
"""

query = f"""
Below are the definitions of the features:
- Dependents: Number of dependents of the applicant 
- ApplicantIncome: Income of the applicant
- CoapplicantIncome: Income of the co-applicant
- LoanAmount: Loan amount in thousands
- Loan_Amount_Term: Term of the loan in months
- Gender_Male: 1 if the applicant is a male, 0 otherwise
- Married_Yes: 1 if the applicant is married, 0 otherwise
- Education_Not Graduate: 1 if the applicant is not a graduate, 0 otherwise
- Self_Employed_Yes: 1 if the applicant is self-employed, 0 otherwise
- Property_Area_Rural: 1 if the property is in a rural area, 0 otherwise
- Property_Area_Semiurban: 1 if the property is in a semiurban area, 0 otherwise
- Property_Area_Urban: 1 if the property is in an urban area, 0 otherwise
- Credit_History_1.0: 1 if the applicant has a credit history, 0 otherwise

Below are the names, values, SHAP values, and effects for each prediction in a JSON format:
{explanations_json}

Below is the prediction of the model:
Predicted status: {predicted_status}
Probability of approval: {predicted_proba}%

Based on the information on feature names, values, SHAP values, and effects, generate a report to explain the model's decision.
"""

print(query)


Below are the definitions of the features:
- Dependents: Number of dependents of the applicant 
- ApplicantIncome: Income of the applicant
- CoapplicantIncome: Income of the co-applicant
- LoanAmount: Loan amount in thousands
- Loan_Amount_Term: Term of the loan in months
- Gender_Male: 1 if the applicant is a male, 0 otherwise
- Married_Yes: 1 if the applicant is married, 0 otherwise
- Education_Not Graduate: 1 if the applicant is not a graduate, 0 otherwise
- Self_Employed_Yes: 1 if the applicant is self-employed, 0 otherwise
- Property_Area_Rural: 1 if the property is in a rural area, 0 otherwise
- Property_Area_Semiurban: 1 if the property is in a semiurban area, 0 otherwise
- Property_Area_Urban: 1 if the property is in an urban area, 0 otherwise
- Credit_History_1.0: 1 if the applicant has a credit history, 0 otherwise

Below are the names, values, SHAP values, and effects for each prediction in a JSON format:
[{'Name': 'Dependents', 'Value': -0.8271043056395267, 'SHAP Value': -

In [11]:
def generate_response(system_prompt, query):
	"""Generate a response to a query based on a system prompt"""
	completion = openai.chat.completions.create(
		model="gpt-3.5-turbo",
		messages=[
			{"role": "system", "content": system_prompt},
			{"role": "user", "content": query}
		]
	)

	return completion.choices[0].message.content

response = generate_response(system_prompt, query)

In [12]:
print(response)

Based on the machine learning model's predictions, it has determined that the loan application is eligible for approval with a probability of 77%.

Now let's analyze the impact of each feature on the model's decision:

1. Dependents: The number of dependents that the applicant has. A negative SHAP value (-0.04) indicates that having more dependents has a slightly negative effect on the approval decision.

2. ApplicantIncome: The income of the applicant. A positive SHAP value (0.11) indicates that higher income has a positive impact on the approval decision.

3. CoapplicantIncome: The income of the co-applicant. A negative SHAP value (-0.25) suggests that a higher co-applicant income has a negative effect on the approval decision.

4. LoanAmount: The amount of the loan requested. A positive SHAP value (0.03) suggests that a larger loan amount has a slightly positive impact on the approval decision.

5. Loan_Amount_Term: The term of the loan in months. A negative SHAP value (-0.007) sugg

## Save the report

In [19]:
# Rename SHAP value to Impact Score for better understanding
explanations_df.rename(columns={"SHAP Value": "Impact Score"}, inplace=True)

# Convert my dataframe as markdown table
print(explanations_df.to_markdown())

|    | Name                    |     Value |   Impact Score | Effect on Approval   |
|---:|:------------------------|----------:|---------------:|:---------------------|
| 12 | Credit_History_1.0      |  1        |     0.548731   | Positive             |
|  2 | CoapplicantIncome       | -1.10284  |    -0.249998   | Negative             |
|  6 | Married_Yes             |  1        |     0.160357   | Positive             |
|  7 | Education_Not Graduate  |  0        |     0.119659   | Positive             |
| 10 | Property_Area_Semiurban |  0        |    -0.117327   | Negative             |
|  1 | ApplicantIncome         |  0.510576 |     0.108182   | Positive             |
|  9 | Property_Area_Rural     |  0        |     0.105457   | Positive             |
| 11 | Property_Area_Urban     |  1        |    -0.0704887  | Negative             |
|  5 | Gender_Male             |  1        |    -0.0418783  | Negative             |
|  0 | Dependents              | -0.827104 |    -0.0354772  | Neg

In [20]:
# We will save the report as Markdown file
# The report will contains the dataframe as a table and the response from GPT-3
report = f"""
# Loan Approval Decision Report

## Applicant Information

{explanations_df.to_markdown()}

## Model Decision

{response}
"""

with open('../reports/loan_approval_decision_report.md', 'w') as f:
    f.write(report)

In [21]:
print(report)


# Loan Approval Decision Report

## Applicant Information

|    | Name                    |     Value |   Impact Score | Effect on Approval   |
|---:|:------------------------|----------:|---------------:|:---------------------|
| 12 | Credit_History_1.0      |  1        |     0.548731   | Positive             |
|  2 | CoapplicantIncome       | -1.10284  |    -0.249998   | Negative             |
|  6 | Married_Yes             |  1        |     0.160357   | Positive             |
|  7 | Education_Not Graduate  |  0        |     0.119659   | Positive             |
| 10 | Property_Area_Semiurban |  0        |    -0.117327   | Negative             |
|  1 | ApplicantIncome         |  0.510576 |     0.108182   | Positive             |
|  9 | Property_Area_Rural     |  0        |     0.105457   | Positive             |
| 11 | Property_Area_Urban     |  1        |    -0.0704887  | Negative             |
|  5 | Gender_Male             |  1        |    -0.0418783  | Negative             |
|  0 