# Réalisez un dashboard
## Notebook - Test de l'API  
OpenClassrooms - Parcours Data Scientist - Projet 08  

In [1]:
import pandas as pd
import numpy as np
import requests
import json
import os
import pickle
# import matplotlib.pyplot as plt

In [None]:
# SHAP value bar chart
def plot_shap_values(local_importance, feature_names, top_n=20):
    """Draw a horizontal bar chart of the strongest local SHAP values.

    Features are ranked by the absolute value of their SHAP value and only
    the first ``top_n`` of that ranking are drawn, strongest at the top.

    Relies on ``plt`` (``matplotlib.pyplot``) being available in the
    enclosing namespace.

    Args:
        local_importance: SHAP values, paired element by element with
            ``feature_names``.
        feature_names: Labels of the features.
        top_n: Maximum number of features to display.
    """
    ranking = pd.DataFrame({'Feature': feature_names, 'SHAP Value': local_importance})
    ranking['Abs SHAP Value'] = np.abs(ranking['SHAP Value'])
    ranking = ranking.sort_values(by='Abs SHAP Value', ascending=False)
    strongest = ranking.head(top_n)

    plt.figure(figsize=(10, 8))
    axes = plt.gca()
    axes.barh(strongest['Feature'], strongest['SHAP Value'])
    axes.set_xlabel('SHAP Value')
    axes.set_title(f'Top {top_n} Feature importance locale (SHAP values)')
    # barh places the first row at the bottom; flip the axis so that the
    # strongest feature ends up at the top.
    axes.invert_yaxis()
    plt.show()

In [None]:
# Local vs. global feature importance comparison chart
def compare_feature_importance(local_importance, global_importance_df, top_n=20):
    """Plot global and local feature importance side by side.

    The local values are paired with the ``Feature`` column of
    ``global_importance_df``, joined back onto the global table on that
    column, ranked by absolute local importance and cut to ``top_n`` rows.

    Relies on ``plt`` (``matplotlib.pyplot``) being available in the
    enclosing namespace.

    Args:
        local_importance: Local importance values. They are paired with
            ``global_importance_df['Feature']``, so they are assumed to be
            in the same feature order -- TODO confirm against the API.
        global_importance_df: DataFrame providing a ``Feature`` column and
            a ``Global importance`` column.
        top_n: Maximum number of features to display.
    """
    local_table = pd.DataFrame({
        'Feature': global_importance_df['Feature'],
        'Local importance': local_importance,
    }).set_index('Feature')
    global_table = global_importance_df.set_index('Feature')

    merged = global_table.join(local_table)
    merged['Abs Local importance'] = np.abs(merged['Local importance'])
    ranked = merged.sort_values(by='Abs Local importance', ascending=False)
    top_rows = ranked.head(top_n)

    _, axis = plt.subplots(figsize=(10, 8))
    top_rows[['Global importance', 'Local importance']].plot(kind='barh', ax=axis)
    axis.set_title("Comparaison de l'importance des features (Globale vs Locale)")
    axis.set_xlabel('Importance')
    # Flip the axis so that the highest-ranked feature is drawn at the top.
    plt.gca().invert_yaxis()
    plt.show()


In [2]:
# Number of test-set rows to draw and send to the API
n_samples = 10

In [3]:
# Load the test data.
# The CSV path is resolved relative to the current working directory, so the
# notebook has to be run from its own folder for the '..' segments to match.
base_dir = os.getcwd()
df_test_path = os.path.join(base_dir, '..', '..', 'P08 - Dashboard', 'data', 'raw', 'application_test.csv')
df_test = pd.read_csv(df_test_path)

In [4]:
# Load the classification threshold.
# The whole file content is parsed as a single float.
threshold_path = os.path.join(base_dir, '..', 'data', 'processed', 'best_threshold.txt')
with open(threshold_path, 'r') as threshold_file:
    best_threshold = float(threshold_file.read())

In [5]:
# Randomly draw n_samples rows from the test data.
# No random_state is passed, so the draw is not reproducible unless NumPy's
# global seed is set elsewhere.
df_subset = df_test.sample(n_samples)

In [6]:
# Keep the customer ids of the drawn rows, cast to int
sk_id_curr = df_subset['SK_ID_CURR'].astype('int')

In [7]:
# Remove the customer id column from the subset
df_subset = df_subset.drop(columns='SK_ID_CURR')

In [8]:
# Replace missing values with None so that they are serialised as JSON null
# (NaN is not valid JSON).
# The frame is cast to object dtype first: an element-wise map() re-infers
# the dtype of every column, which turns None back into NaN in numeric
# columns and leaves the missing values untouched there.
df_subset = df_subset.astype(object).where(df_subset.notna(), None)

In [9]:
# Serialise the subset to JSON: column names under "columns" and the rows,
# as a list of lists, under "data".
# Note: json.dumps emits the non-standard token NaN for any float NaN that is
# still present in the data.
data_json = json.dumps({
    "columns": df_subset.columns.tolist(),
    "data": df_subset.values.tolist()}
)

In [10]:
# Send the POST request to the API.
# heroku_url points at the Heroku-hosted endpoint and local_url at a server on
# localhost; only heroku_url is used by the call below.
heroku_url = 'https://failurescore-bc9f53f25e58.herokuapp.com/predict'
local_url = 'http://127.0.0.1:5000/predict'
response = requests.post(
    heroku_url,
    headers={'Content-Type': 'application/json'},
    data=data_json,
    # requests applies no timeout by default; bound the wait so that an
    # unresponsive server cannot block the notebook indefinitely.
    timeout=60,
)

In [11]:
# Show the response object; its repr carries the HTTP status code
display(response)

<Response [200]>

In [12]:
# Table of failure probabilities and the resulting credit decision
predictions = pd.DataFrame({
    'Customer id': sk_id_curr.tolist(),
    'Failure probability': response.json()['prediction_proba'],
})
# The decision is taken on the unrounded probability; rounding to two
# decimals afterwards is for display only.
predictions['Credit agreement'] = predictions['Failure probability'] < best_threshold
predictions['Failure probability'] = predictions['Failure probability'].round(2)

In [13]:
# Display the predictions table
display(predictions)

Unnamed: 0,Customer id,Failure probability,Credit agreement
0,157804,0.33,True
1,241965,0.52,True
2,125111,0.38,True
3,407839,0.3,True
4,196436,0.43,True
5,241804,0.42,True
6,270324,0.54,False
7,274155,0.07,True
8,175402,0.18,True
9,411938,0.31,True


In [None]:
# Local feature importance returned by the API under 'feature_importance'
# (presumably one list of values per submitted sample -- verify against the API)
local_importance = response.json()['feature_importance']

In [None]:
# Feature names returned by the API under 'feature_names'
feature_names = response.json()['feature_names']

In [None]:
# Inspect the raw local importance values
local_importance

In [None]:
# Local feature importance chart for the first entry of local_importance
plot_shap_values(local_importance[0], feature_names)

In [17]:
# Scratch check: `~` is the bitwise NOT on ints, i.e. ~x == -x - 1
~0

-1

In [18]:
# Scratch check: bitwise NOT of 1 is -2 (~x == -x - 1)
~1

-2

In [20]:
# NOTE: `~` is the bitwise NOT, applied to the integer value of the bool
# (True == 1), so this evaluates to -2, not False; use `not` for logical
# negation. Bitwise inversion of a bool is deprecated since Python 3.12.
~(1>0)

  ~(1>0)


-2