# Generating insights from the model's predictions

In [35]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
np.set_printoptions(precision=3, suppress=True)

from joblib import load

# Interpretability
import shap

import os
import openai
from openai import OpenAI
from dotenv import load_dotenv

# Set transformers output to Pandas DataFrame instead of NumPy array
from sklearn import set_config
set_config(transform_output="pandas")

## Loading and prepare the data

**Loading data**

In [5]:
X_test = pd.read_csv('../data/raw/loan-data-test.csv')
X_test.drop('Loan_ID', axis=1, inplace=True)

**Loading preprocessing pipeline and model**

In [6]:
with open('../models/model.pkl', 'rb') as f:
    model = load(f)

with open('../models/preprocessor.pkl', 'rb') as f:
    preprocessor = load(f)

**Preprocessing**

In [7]:
# Mapping number of dependents to numerical values
X_test.Dependents.replace('3+', 3, inplace=True)

# Transforming the test set
X_test = preprocessor.transform(X_test)

## Getting the SHAP values

**Unscaling the data**

In [8]:
# Get the scaler and encoder object from the pipeline
scaler = preprocessor.named_transformers_['numerical']['scaler']
encoder = preprocessor.named_transformers_['categorical']['onehot']

# Unscale the  data 
X_test_num_unscaled = scaler.inverse_transform(X_test[scaler.feature_names_in_])
X_test_num_unscaled_df = pd.DataFrame(data=X_test_num_unscaled, columns=scaler.feature_names_in_)

# Get the one-hot encoded features
X_test_cat = X_test[encoder.get_feature_names_out()]

# Reset the index before concatenation
X_test_num_unscaled_df.reset_index(drop=True, inplace=True)
X_test_cat.reset_index(drop=True, inplace=True)

# Concat the unscaled numeric data and the categorical data
X_test_unscaled = pd.concat([X_test_num_unscaled_df, X_test_cat], axis=1)

### Creating a shap explainer

In [9]:
explainer = shap.LinearExplainer(model, X_test)
shap_explanation = explainer(X_test)

# Replace the scaled values by the original ones
shap_explanation.data = X_test_unscaled.values

## Generate Insights

In [48]:
feature_names = X_test.columns
explanations_json = []
i = 0

# Predict the status
predicted_status = model.predict(X_test.iloc[[i]])

# Predict the probability
predicted_proba = model.predict_proba(X_test.iloc[[i]])[0][1]

# Get the explanation for the predicted status
for name, value, shap_value in zip(feature_names, X_test.values[i], shap_explanation.values[i]):
    explanation_json = {}
    explanation_json["Feature name"] = name
    explanation_json["Value"] = value
    explanation_json["Score"] = shap_value

    if shap_value > 0:
        explanation_json["Effect on Approval"] = "Increase"
    else:
        explanation_json["Effect on Approval"] = "Decrease"

    explanations_json.append(explanation_json)

# explanations_json

In [24]:
predicted_proba

0.7657376776609758

In [55]:
explanations_df = pd.DataFrame(explanations_json)

# Sort feature by impact score
explanations_df["Score (Abs)"] = explanations_df["Score"].abs()
explanations_df.sort_values(by="Score (Abs)", ascending=False, inplace=True)
explanations_df.drop('Score (Abs)', axis=1, inplace=True)

explanations_df

Unnamed: 0,Feature name,Value,Score,Effect on Approval
12,Credit_History_1.0,1.0,0.548731,Increase
2,CoapplicantIncome,-1.102837,-0.249998,Decrease
6,Married_Yes,1.0,0.160357,Increase
7,Education_Not Graduate,0.0,0.119659,Increase
10,Property_Area_Semiurban,0.0,-0.117327,Decrease
1,ApplicantIncome,0.510576,0.108182,Increase
9,Property_Area_Rural,0.0,0.105457,Increase
11,Property_Area_Urban,1.0,-0.070489,Decrease
5,Gender_Male,1.0,-0.041878,Decrease
0,Dependents,-0.827104,-0.035477,Decrease


## Generation GPT predictions

In [37]:
_ = load_dotenv("../credentials.env")
openai.api_key = os.environ['OPENAI_API_KEY']

In [50]:
explanations_json

[{'Feature name': 'Dependents',
  'Value': -0.8271043056395267,
  'Score': -0.03547715913485509,
  'Effect on Approval': 'Decrease'},
 {'Feature name': 'ApplicantIncome',
  'Value': 0.5105760017940986,
  'Score': 0.10818185353730546,
  'Effect on Approval': 'Increase'},
 {'Feature name': 'CoapplicantIncome',
  'Value': -1.1028368386382617,
  'Score': -0.24999798202926116,
  'Effect on Approval': 'Decrease'},
 {'Feature name': 'LoanAmount',
  'Value': -0.3250427419068526,
  'Score': 0.026137169856863533,
  'Effect on Approval': 'Increase'},
 {'Feature name': 'Loan_Amount_Term',
  'Value': 0.17554003661788928,
  'Score': -0.006982345180569146,
  'Effect on Approval': 'Decrease'},
 {'Feature name': 'Gender_Male',
  'Value': 1.0,
  'Score': -0.04187830975371162,
  'Effect on Approval': 'Decrease'},
 {'Feature name': 'Married_Yes',
  'Value': 1.0,
  'Score': 0.16035732394109922,
  'Effect on Approval': 'Increase'},
 {'Feature name': 'Education_Not Graduate',
  'Value': 0.0,
  'Score': 0.119

In [59]:
system_prompt = """
You are the assistant of a loan eligibility officer who doesn't know much about machine learning. 
A data scientist built a machine learning model to predict whether or not a loan applicant is eligible for a loan.
You are tasked to explain the model's predictions based on the SHAP values of the model's features. 
Your report should focus more on the features that most impacted the model's decision, and how they impacted it.
Remember, you are explaining the model's decision to a non-technical person.
"""

query = f""""
Here are the features of the model:
- Dependents: Number of dependents of the applicant 
- ApplicantIncome: Income of the applicant
- CoapplicantIncome: Income of the co-applicant
- LoanAmount: Loan amount in thousands
- Loan_Amount_Term: Term of the loan in months
- Gender_Male: 1 if the applicant is a male, 0 otherwise
- Married_Yes: 1 if the applicant is married, 0 otherwise
- Education_Not Graduate: 1 if the applicant is not a graduate, 0 otherwise
- Self_Employed_Yes: 1 if the applicant is self-employed, 0 otherwise
- Property_Area_Rural: 1 if the property is in a rural area, 0 otherwise
- Property_Area_Semiurban: 1 if the property is in a semiurban area, 0 otherwise
- Property_Area_Urban: 1 if the property is in an urban area, 0 otherwise
- Credit_History_1.0: 1 if the applicant has a credit history, 0 otherwise

Here are the expanations for the applicant:
{explanation_json}

Note that the "Score" is simply the SHAP value of the feature. 

Here is what the model predicted:
Predicted status: {predicted_status}
Probability of approval: {predicted_proba}

Base on this information, generate a report to explain the model's decision.
"""

In [60]:
def generate_response(system_prompt, query):
	"""Generate a response to a query based on a system prompt"""
	completion = openai.chat.completions.create(
		model="gpt-3.5-turbo",
		messages=[
			{"role": "system", "content": system_prompt},
			{"role": "user", "content": query}
		]
	)

	return completion.choices[0].message.content

response = generate_response(system_prompt, query)

In [None]:
print(response)

Based on the analysis of the loan application, the model predicts that the applicant is eligible for a loan. The most influential factor in this decision is the applicant's credit history. 

The model considers the applicant's credit history to be an important indicator of their reliability when it comes to repaying the loan. In this case, the applicant has a credit history, which greatly increases the chances of loan approval. 

The model assigns a score to each feature, called the SHAP value, which represents the impact of that feature on the final decision. A higher SHAP value indicates a stronger influence on the decision. In this case, the credit history of the applicant has the highest SHAP value, indicating its significant role in the model's decision-making process.

Based on the prediction of the model, the applicant has a 76.6% chance of loan approval. This probability is a result of various factors, with the credit history being the most influential one.

It is important to 

## Save the report

In [None]:
# Convert my dataframe as markdown table
explanations_df.to_markdown()

In [None]:
# We will save the report as Markdown file
# The report will contains the dataframe as a table and the response from GPT-3
report = f"""
# Loan Approval Decision Report

## Applicant Information

{explanations_df.to_markdown()}

## Model Decision

{response}
"""

with open('../reports/loan_approval_decision_report.md', 'w') as f:
    f.write(report)