<a href="https://colab.research.google.com/github/FranciscoOcampoPredictiva/azureml_course/blob/main/Lecture_1_Model_Explainer_on_Local_Machine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Step 1 - Installation and setup

In [None]:
# Install azureml interpret package
! pip install -q azureml-interpret

# Step 2 - Training the ML model

In [None]:
# Import pandas for tabular data handling
import pandas as pd

# Read the churn dataset into a dataframe.
# NOTE(review): the absolute /content path looks Colab-specific — adjust it
# when running the notebook elsewhere.
df = pd.read_csv('/content/Churn_Modelling.csv')

In [None]:
# Drop the identifier-like columns (row number, customer id, surname) so they
# are not used as model features. errors='ignore' keeps this cell safe to
# re-run: df is reassigned here, so on a second run the columns are already
# gone and a plain drop would raise KeyError.
df = df.drop(columns=['RowNumber', 'CustomerId', 'Surname'], errors='ignore')

In [None]:
# Dummy variables: one-hot encode the categorical columns; drop_first=True
# drops the first level of each so only the remaining levels get a column
df = pd.get_dummies(df, drop_first=True)

In [None]:
# Separate the target column ('Exited') from the predictor columns
Y = df['Exited']
X = df.drop('Exited', axis=1)

In [None]:
# Hold out 30% of the rows as a test set, stratified on the target; the fixed
# random_state makes the split repeatable
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    stratify=Y,
    random_state=0,
)

In [None]:
# Fit a random forest on the training split (seeded for repeatability)
from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier(random_state=0)
classifier.fit(X_train, Y_train)

# fit() returns the estimator itself, so this is the same fitted object under
# the name the explainer cells use
trained_ml_model = classifier

In [None]:
# Predict class labels for the held-out test set
y_pred = classifier.predict(X_test)

In [None]:
# Model evaluation on the test set: confusion matrix and the classifier's score
from sklearn.metrics import confusion_matrix

score = classifier.score(X_test, Y_test)
cm = confusion_matrix(Y_test, y_pred)

print(cm)
print(score)

[[2299   90]
 [ 341  270]]
0.8563333333333333


# Step 3 - Model explainers

## Global explanations

In [None]:
from interpret_community.tabular_explainer import TabularExplainer

In [None]:
# help(TabularExplainer)

In [None]:
# Preview the first rows of the encoded dataframe
df.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_Germany,Geography_Spain,Gender_Male
0,619,42,2,0.0,1,1,1,101348.88,1,0,0,0
1,608,41,1,83807.86,1,0,1,112542.58,0,0,1,0
2,502,42,8,159660.8,3,1,0,113931.57,1,0,0,0
3,699,39,1,0.0,2,0,0,93826.63,0,0,0,0
4,850,43,2,125510.82,1,1,1,79084.1,0,0,1,0


In [None]:
# Define variables
# Class names must follow the order of the model's outputs. The classifier
# sorts its class labels and the target column is 0/1 (1 = customer exited),
# so index 0 is 'Not Exited' and index 1 is 'Exited'.
classes = ['Not Exited', 'Exited']

# Feature names, in the same column order the model was trained on
features = list(X.columns)

In [None]:
# Build the tabular explainer around the trained model. X_train is passed as
# the initialization examples, together with the feature and class names.

tab_explainer = TabularExplainer(model=trained_ml_model,
                                 initialization_examples=X_train,
                                 features=features,
                                 classes=classes)

In [None]:
# Global explanations: explain the model's overall behaviour on the training data
global_explaination = tab_explainer.explain_global(X_train)

In [None]:
# Feature importance data as a dictionary keyed by feature name
global_feature_imp = global_explaination.get_feature_importance_dict()

In [None]:
# Display the global feature-importance dictionary
global_feature_imp

{'Age': 0.11569068889397732,
 'NumOfProducts': 0.08005822342620106,
 'IsActiveMember': 0.05316461383325616,
 'Geography_Germany': 0.03189695697579134,
 'Balance': 0.0281066806257185,
 'Gender_Male': 0.027968630058755946,
 'EstimatedSalary': 0.01950655967810963,
 'CreditScore': 0.019100968001856518,
 'Tenure': 0.01330174404392769,
 'Geography_Spain': 0.0063662267453276165,
 'HasCrCard': 0.006194165250576628}

## Local explanations

In [None]:
# Take the first five test observations (by position) for local explanation
feature_explain = X_test.iloc[:5]

In [None]:
# Show the observations selected for local explanation
feature_explain

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_Germany,Geography_Spain,Gender_Male
3714,823,34,3,105057.33,1,1,0,9217.92,0,0,1
7308,663,53,6,150200.23,1,0,1,151317.27,0,1,0
7156,632,65,6,129472.33,1,1,1,85179.48,1,0,1
2572,711,58,9,91285.13,2,1,1,26767.85,0,0,1
2711,608,28,9,0.0,2,1,1,125062.02,0,0,0


In [None]:
# Local explanation object for the selected observations
local_explanation = tab_explainer.explain_local(feature_explain)

In [None]:
# Extract the ranked feature names and their importance values
local_features = local_explanation.get_ranked_local_names() # feature names
local_importance = local_explanation.get_ranked_local_values() # corresponding importance values

In [None]:
print(local_features)
# Nested list, from the outside in:
#   1. one entry per class
#   2. one entry per explained record
#   3. that record's ranked feature names

[[['Age', 'Gender_Male', 'Geography_Germany', 'Balance', 'Tenure', 'Geography_Spain', 'HasCrCard', 'EstimatedSalary', 'NumOfProducts', 'CreditScore', 'IsActiveMember'], ['IsActiveMember', 'Geography_Spain', 'Geography_Germany', 'CreditScore', 'Tenure', 'NumOfProducts', 'HasCrCard', 'Balance', 'Gender_Male', 'EstimatedSalary', 'Age'], ['IsActiveMember', 'HasCrCard', 'EstimatedSalary', 'Gender_Male', 'Tenure', 'Geography_Spain', 'CreditScore', 'NumOfProducts', 'Balance', 'Geography_Germany', 'Age'], ['IsActiveMember', 'NumOfProducts', 'Gender_Male', 'CreditScore', 'Geography_Germany', 'EstimatedSalary', 'Tenure', 'HasCrCard', 'Geography_Spain', 'Balance', 'Age'], ['Age', 'NumOfProducts', 'IsActiveMember', 'Geography_Germany', 'Balance', 'EstimatedSalary', 'Tenure', 'Geography_Spain', 'CreditScore', 'HasCrCard', 'Gender_Male']], [['IsActiveMember', 'CreditScore', 'NumOfProducts', 'EstimatedSalary', 'HasCrCard', 'Geography_Spain', 'Tenure', 'Balance', 'Geography_Germany', 'Gender_Male', 'A

In [None]:
# Importance values, nested by class, then record, then ranked feature
print(local_importance)

[[[0.13311105086909705, 0.03686916287488229, 0.032613749768319725, 0.016073260694934192, 0.015483477768264054, -0.0011664072431800856, -0.0032769090409888227, -0.009009549511677864, -0.025340646756049832, -0.0393603267496369, -0.04195400553110867], [0.147186757757026, 0.034840501743273355, 0.03202028031122659, 0.013240776952768646, 0.007166744828203445, -0.007694366141858893, -0.01141777604672139, -0.023916426823660998, -0.025938798170547277, -0.0380988732240131, -0.2433459640428394], [0.13206855911958343, 0.01279653983574506, 0.0125867986734373, 0.009131576514784533, 0.008435256715377435, -0.009633113347106401, -0.01192648475867878, -0.0431783596256372, -0.05263617405911316, -0.12536950979854514, -0.19823223212699126], [0.12858831466484646, 0.0804256956690939, 0.051105248484034506, 0.01808398850814837, 0.010988945105780152, 0.005243743935099556, 0.0028851525363816238, 0.0028653691586665877, -0.00473653034443572, -0.010693671528021921, -0.19071339904673718], [0.07750453309098854, 0.067

In [None]:
# Predicted labels for the first five test observations, i.e. the same rows
# that were selected for local explanation
y_pred[0:5]

array([0, 0, 0, 0, 0])

In [None]:
# Report the local explanations: for each class, list the ranked feature
# contributions of every observation the model assigned to that class
for class_idx, names_per_observation in enumerate(local_features):
    print("\n Feature suppport values for : ", classes[class_idx])

    for obs_idx, ranked_names in enumerate(names_per_observation):
        # Only report an observation under its predicted class
        if y_pred[obs_idx] != class_idx:
            continue

        print("\n\tObservation number : ", obs_idx + 1)
        print("\t\t", "Feature Name".ljust(30), "  Value")
        print("\t\t", "-"*30, "-"*10)

        # Names and values are ranked in parallel, so the same position is
        # used to look up the importance value for each feature name
        for rank, feature_name in enumerate(ranked_names):
            importance = local_importance[class_idx][obs_idx][rank]
            print("\t\t", feature_name.ljust(30), round(importance, 6))


 Feature suppport values for :  Exited

	Observation number :  1
		 Feature Name                     Value
		 ------------------------------ ----------
		 Age                            0.133111
		 Gender_Male                    0.036869
		 Geography_Germany              0.032614
		 Balance                        0.016073
		 Tenure                         0.015483
		 Geography_Spain                -0.001166
		 HasCrCard                      -0.003277
		 EstimatedSalary                -0.00901
		 NumOfProducts                  -0.025341
		 CreditScore                    -0.03936
		 IsActiveMember                 -0.041954

	Observation number :  2
		 Feature Name                     Value
		 ------------------------------ ----------
		 IsActiveMember                 0.147187
		 Geography_Spain                0.034841
		 Geography_Germany              0.03202
		 CreditScore                    0.013241
		 Tenure                         0.007167
		 NumOfProducts                  -0.007694