In [None]:
import pandas as pd
import numpy as np
from sklearn import preprocessing, svm, metrics, tree, decomposition, svm
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso, Ridge, Perceptron, SGDClassifier, OrthogonalMatchingPursuit
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import plot_confusion_matrix, accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, roc_curve, auc
from sklearn.model_selection import train_test_split, ParameterGrid, KFold, StratifiedKFold
from sklearn.dummy import DummyClassifier
from sklearn import tree

import matplotlib.pyplot as plt

import yaml
import pickle
from statistics import mean
import random

import seaborn as sns

import shap

pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)


import warnings
warnings.filterwarnings('ignore')

## Inputs

In [None]:
# Load the model inputs; the first CSV column is used as the row index.
X = pd.read_csv("matrices/por_X.csv", index_col=0)
# `read_csv(..., squeeze=True)` was deprecated in pandas 1.4 and removed in 2.0.
# Squeezing the column axis afterwards gives the same Series when the file has a
# single data column (which the original `squeeze=True` implied).
y = pd.read_csv("matrices/por_y.csv", index_col=0).squeeze("columns")

In [None]:
# Hold out 40% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)

# Tree-based Models

## Model

In [None]:
# Random forest configuration; n_jobs=-1 lets scikit-learn use all available cores.
clf_params = {
    'max_depth': 5,
    'max_features': 'sqrt',
    'min_samples_split': 2,
    'n_estimators': 100,
    'n_jobs': -1,
}
# Keep the fitted forest under its own name as well: `clf` is reassigned by later
# cells, while cells further down refer to the forest as `rf`.
rf = RandomForestClassifier(**clf_params, random_state=42).fit(X_train, y_train)
clf = rf

In [None]:
# Depth-3 decision tree; after this cell `clf` refers to this model.
clf_params = dict(max_depth=3)
clf = DecisionTreeClassifier(random_state=42, **clf_params)
clf = clf.fit(X_train, y_train)

## Feature importances

In [None]:
# Explain the currently fitted tree model on the held-out rows.
explainer = shap.TreeExplainer(clf)
shap_values = explainer.shap_values(X_test)
# Older shap releases return a list with one (n_samples, n_features) array per class;
# shap >= 0.45 returns a single (n_samples, n_features, n_classes) array instead.
# Normalise to the per-class list so that `shap_values[1]` always selects class 1.
if isinstance(shap_values, np.ndarray) and shap_values.ndim == 3:
    shap_values = [shap_values[:, :, k] for k in range(shap_values.shape[2])]
shap.summary_plot(shap_values[1], X_test)

In [None]:
# Draw the fitted decision tree on an explicitly created axes object.
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(4, 4), dpi=300)

tree.plot_tree(clf,
               # Plain list: some scikit-learn releases reject a pandas Index here.
               feature_names=list(X_test.columns),
               class_names=['Pass', 'Fail'],
               filled=True,
               # Target the axes created above instead of relying on the current axes.
               ax=axes);

In [None]:
# Rank features by their mean absolute SHAP value (index 1 of `shap_values`)
# over the test rows, largest first.
mean_abs_shap = np.abs(np.array(shap_values[1])).mean(axis=0)
temp = pd.DataFrame({'names': X_test.columns, 'abs_vals': mean_abs_shap})
temp = temp.sort_values(by=['abs_vals'], ascending=False)
temp.head(5)

In [None]:
# Names of the five highest-ranked features, best first.
top_features = list(temp['names'].iloc[:5])
top_features

## Get profiles

In [None]:
def get_random_index(np_arr):
    """Return a randomly chosen position at which ``np_arr`` equals 1.

    Parameters
    ----------
    np_arr : array-like
        One-dimensional indicator sequence (entries equal to 1 mark candidates).

    Returns
    -------
    int
        Index of one of the entries equal to 1, drawn uniformly with the
        ``random`` module (so ``random.seed`` controls the draw).

    Raises
    ------
    ValueError
        If no entry equals 1, which includes an empty input. The previous
        rejection-sampling loop never terminated when no entry was 1.
    """
    candidates = np.flatnonzero(np.asarray(np_arr) == 1)
    if candidates.size == 0:
        raise ValueError("get_random_index: no element equal to 1 to choose from")
    # Sample directly from the matching positions instead of retrying blind draws.
    return int(random.choice(candidates.tolist()))

In [None]:
# Get profiles: pick four illustrative test rows from the model scores.

# Fixed seed so the randomly drawn example profiles are the same on every run.
random.seed(42)

# Get threshold: the top-k% boundary plus the 90th and 1st percentiles of the scores.
y_pred_proba = clf.predict_proba(X_test)[:, 1]
k = 25
threshold = np.percentile(y_pred_proba, (100 - k))
print(threshold)
threshold_high = np.percentile(y_pred_proba, 90)
print(threshold_high)
threshold_low = np.percentile(y_pred_proba, 1)
print(threshold_low)


# Get index of 2 profiles close to threshold.
# A model with few distinct scores can leave one side of the threshold empty;
# fail with a clear message rather than an opaque min()/max() error.
scores_above = y_pred_proba[y_pred_proba > threshold]
scores_below = y_pred_proba[y_pred_proba < threshold]
if scores_above.size == 0 or scores_below.size == 0:
    raise ValueError(
        "No predicted score lies strictly above and below the threshold; "
        "choose a different k."
    )

next_score = scores_above.min()
print(next_score)
arr = (y_pred_proba == next_score).astype(int)
i_above_threshold = get_random_index(arr)

prev_score = scores_below.max()
print(prev_score)
arr = (y_pred_proba == prev_score).astype(int)
i_below_threshold = get_random_index(arr)

# Get index of low profile (score at or below the 1st percentile)
arr = (y_pred_proba <= threshold_low).astype(int)
i_low = get_random_index(arr)

# Get index of high profile (score at or above the 90th percentile)
arr = (y_pred_proba >= threshold_high).astype(int)
i_high = get_random_index(arr)

print(i_above_threshold)
print(i_below_threshold)
print(i_low)
print(i_high)

In [None]:
# Top-feature values of the profile that scores just above the threshold.
print(f"Slightly above threshold | {i_above_threshold}")
profile_features = X_test.filter(items=top_features)
profile_features.iloc[i_above_threshold]

In [None]:
# Display labels passed as `feature_names` to the SHAP waterfall plots.
# NOTE(review): presumably one entry per column of X_test, in column order — confirm
# against X_test.columns, since a mismatch would mislabel the plotted features.
feat_names_long = ['Number of absences',
 'Number of failures',
 'First year grade',
 'traveltime',
 'Study time',
 'Quality of family relationships',
 'freetime',
 'Frequency of going out with friends',
 'Workday alcohol consumption',
 'Weekly alcoholic drinks',
 'Health status',
 'activities',
 'nursery',
 'Has internet',
 'Has romantic relationship',
 'Attends school Gabriel Pereira',
 'address_urban',
 'Family size greater than 3',
 'Pstatus_togethor',
 'sex_female',
 'health_mjob',
 'other_mjob',
 'services_mjob',
 'teacher_mjob',
 'health_fjob',
 "Father's job not listed on survey",
 "Father's job in services",
 "Father's job is teacher",
 'home_reason',
 'other_reason',
 'School chosen based on reputation',
 'mother_guardian',
 'Guardian not mother or father']

In [None]:
# Waterfall of SHAP contributions for the profile just above the threshold.
explanation = shap.Explanation(
    values=shap_values[1][i_above_threshold],
    base_values=explainer.expected_value[1],
    data=X_test.iloc[i_above_threshold],
    feature_names=feat_names_long,
)
shap.waterfall_plot(explanation, max_display=len(top_features) + 1)

In [None]:
# Top-feature values of the profile that scores just below the threshold.
print(f"Slightly below threshold | ID {i_below_threshold}")
profile_features = X_test.filter(items=top_features)
profile_features.iloc[i_below_threshold]

In [None]:
# Waterfall of SHAP contributions for the profile just below the threshold.
explanation = shap.Explanation(
    values=shap_values[1][i_below_threshold],
    base_values=explainer.expected_value[1],
    data=X_test.iloc[i_below_threshold],
    feature_names=feat_names_long,
)
shap.waterfall_plot(explanation, max_display=len(top_features) + 1)

In [None]:
# Top-feature values of the high-score profile.
print(f"High likelihood of failure | ID {i_high}")
profile_features = X_test.filter(items=top_features)
profile_features.iloc[i_high]

In [None]:
# Waterfall of SHAP contributions for the high-score profile.
explanation = shap.Explanation(
    values=shap_values[1][i_high],
    base_values=explainer.expected_value[1],
    data=X_test.iloc[i_high],
    feature_names=feat_names_long,
)
shap.waterfall_plot(explanation, max_display=len(top_features) + 1)

In [None]:
# Top-feature values of the low-score profile.
print(f"High likelihood of passing | ID {i_low}")
profile_features = X_test.filter(items=top_features)
profile_features.iloc[i_low]

In [None]:
# Waterfall of SHAP contributions for the low-score profile.
explanation = shap.Explanation(
    values=shap_values[1][i_low],
    base_values=explainer.expected_value[1],
    data=X_test.iloc[i_low],
    feature_names=feat_names_long,
)
shap.waterfall_plot(explanation, max_display=len(top_features) + 1)

# Linear model

In [None]:
# Lasso with alpha=0.01; after this cell `clf` refers to the linear model.
clf_params = dict(alpha=0.01)
clf = Lasso(random_state=42, **clf_params)
clf = clf.fit(X_train, y_train)

In [None]:
# Inspect the raw coefficient vector of the fitted model.
clf.coef_

In [None]:
# Table of coefficients with their feature names, sorted by magnitude;
# features whose coefficient is exactly zero are dropped.
temp = pd.DataFrame({
    'coef': clf.coef_,
    'names': X_train.columns,
    'coef_abs': np.abs(clf.coef_),
})
temp = temp.sort_values(by=['coef_abs'], ascending=False)
temp = temp.loc[temp['coef'] != 0]
top_features = temp['names'].tolist()

In [None]:
# Show the first 15 rows of the current `temp` table.
temp.head(15)

In [None]:
# Horizontal bar chart: one bar per row of `temp`, bar length = coefficient.
sns.barplot(data=temp, x='coef', y='names')

## Get profiles

In [None]:
# Get profiles: pick four illustrative test rows from the model scores.

# Fixed seed so the randomly drawn example profiles are the same on every run.
random.seed(42)

# Get threshold: the top-k% boundary plus the 90th and 1st percentiles of the scores.
# NOTE: these are raw `predict` outputs used as scores, not `predict_proba`
# probabilities; the variable name is kept because other cells refer to it.
y_pred_proba = clf.predict(X_test)
k = 25
threshold = np.percentile(y_pred_proba, (100 - k))
print(threshold)
threshold_high = np.percentile(y_pred_proba, 90)
print(threshold_high)
threshold_low = np.percentile(y_pred_proba, 1)
print(threshold_low)


# Get index of 2 profiles close to threshold.
# Fail with a clear message if no score lies on one side of the threshold,
# rather than an opaque min()/max() error on an empty selection.
scores_above = y_pred_proba[y_pred_proba > threshold]
scores_below = y_pred_proba[y_pred_proba < threshold]
if scores_above.size == 0 or scores_below.size == 0:
    raise ValueError(
        "No predicted score lies strictly above and below the threshold; "
        "choose a different k."
    )

next_score = scores_above.min()
print(next_score)
arr = (y_pred_proba == next_score).astype(int)
i_above_threshold = get_random_index(arr)

prev_score = scores_below.max()
print(prev_score)
arr = (y_pred_proba == prev_score).astype(int)
i_below_threshold = get_random_index(arr)

# Get index of low profile (score at or below the 1st percentile)
arr = (y_pred_proba <= threshold_low).astype(int)
i_low = get_random_index(arr)

# Get index of high profile (score at or above the 90th percentile)
arr = (y_pred_proba >= threshold_high).astype(int)
i_high = get_random_index(arr)

print(i_above_threshold)
print(i_below_threshold)
print(i_low)
print(i_high)

In [None]:
# Top-feature values of the profile that scores just above the threshold.
print(f"Slightly above threshold | {i_above_threshold}")
profile_features = X_test.filter(items=top_features)
profile_features.iloc[i_above_threshold]

In [None]:
# Top-feature values of the profile that scores just below the threshold.
print(f"Slightly below threshold | ID {i_below_threshold}")
profile_features = X_test.filter(items=top_features)
profile_features.iloc[i_below_threshold]

In [None]:
# Top-feature values of the high-score profile.
print(f"High likelihood of failure | ID {i_high}")
profile_features = X_test.filter(items=top_features)
profile_features.iloc[i_high]

In [None]:
# `i_low` is drawn from the lowest-scoring rows, so this is the profile most likely
# to pass; the label previously said "failure", duplicating the `i_high` cell.
print(f"High likelihood of passing | ID {i_low}")
X_test.filter(items=top_features).iloc[i_low]

## Deprecated

In [None]:
# profiles:
# i_above_threshold = 28
# i_below_threshold = 142
# i_low = 14
# i_high = 190

In [None]:
# Scratch: display the most recently computed score vector.
y_pred_proba

In [None]:
# Scratch: full feature row of the profile just below the threshold.
X_test.iloc[i_below_threshold]

In [None]:
# Scratch: draw another index from whatever indicator mask `arr` currently holds.
i = get_random_index(arr)
print(i)

In [None]:
# NOTE(review): requires `rf` to already hold a fitted tree-based model in the
# kernel (presumably the random forest) — confirm before running this cell.
explainer = shap.TreeExplainer(rf)
shap_values = explainer.shap_values(X_test)
# Older shap releases return a list with one (n_samples, n_features) array per class;
# shap >= 0.45 returns a single (n_samples, n_features, n_classes) array instead.
# Normalise to the per-class list so that `shap_values[1]` always selects class 1.
if isinstance(shap_values, np.ndarray) and shap_values.ndim == 3:
    shap_values = [shap_values[:, :, k] for k in range(shap_values.shape[2])]
shap.summary_plot(shap_values[1], X_test, plot_type='bar')

In [None]:
# Rank features by their mean absolute SHAP value (index 1 of `shap_values`)
# over the test rows, largest first, and show the top ten.
mean_abs_shap = np.abs(np.array(shap_values[1])).mean(axis=0)
temp = pd.DataFrame({'names': X_test.columns, 'abs_vals': mean_abs_shap})
temp = temp.sort_values(by=['abs_vals'], ascending=False)
temp.head(10)

In [None]:
# Disabled experiment kept as a bare string literal: none of the code inside the
# quotes runs; the cell only evaluates to the string itself.
'''
y_pred_proba = rf.predict_proba(X_test)[:,1]
k = 25
y_pred = np.where(y_pred_proba > np.percentile(y_pred_proba,(100-k)), 1, 0)
i = np.where(y_pred == 1)

temp = pd.DataFrame()
temp['names'] = X_train.columns
a = np.array(shap_values[1][i])
temp['vals'] = np.mean(a, axis=0)
temp['abs_vals'] = temp['vals'].abs()
temp = temp.sort_values(by=['abs_vals'],ascending=False)
temp.head()

sns.barplot(x='vals', y='names',data=temp[0:10])
'''

In [None]:
# Waterfall for the low-score profile. The SHAP values and the displayed feature
# values must come from the same row; previously the values were taken from `i_low`
# but the data from `i_below_threshold`, mixing two different samples in one plot.
shap.waterfall_plot(shap.Explanation(values=shap_values[1][i_low], 
                                     base_values=explainer.expected_value[1], 
                                     data=X_test.iloc[i_low],
                                     feature_names=X_train.columns.tolist()))