In [47]:
import pandas as pd
import numpy as np
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from lime import lime_tabular

import pickle

In [48]:
data_train = pd.read_csv("../data/alt_maccsfp_after_preprocessing.csv")

y = data_train['ALT']
del data_train['ALT']
X = data_train.values
features = data_train.columns.values
y = y.values

cv_outer = KFold(n_splits=5, shuffle=True, random_state=234)

models = []
best_predicted_explanations = []
worst_predicted_explanations = []


folder_path = "../best_model_analysis"

In [73]:
def draw_actual_vs_predicted_plot(folder_path, number, y_test, y_pred_test, result):
    x = np.arange(start=0, stop=len(y_test), step=1)
    y_test = np.expm1(y_test)
    y_pred_test = np.expm1(y_pred_test)
    fontsize = 15
    plt.figure(figsize=(10, 6))
    plt.plot(x, y_test, 'go', label="ALT", c = "tab:blue")
    plt.plot(x, y_pred_test, 'bD', label="ALT - predicted", c = "tab:orange")
    plt.title(f"PREDICTED VS ACTUAL", fontsize=fontsize)
    plt.suptitle("R2 = {:.3f}".format(result), fontsize=fontsize)
    plt.xlabel("SAMPLE NUMBER", fontsize=fontsize)
    plt.ylabel("ALT  VALUE", fontsize=fontsize)
    plt.legend(loc="best", fontsize=fontsize)
    plt.xticks(range(0, 20))
    plt.yticks(range(0, 300, 25))
    plt.grid()
    plt.savefig(f'{folder_path}/actual_vs_predicted_{number}.jpg')
    plt.show()
    
    
def draw_actual_vs_predicted_parity_plot(folder_path, number, y_test, y_pred, result, text):
    x = np.arange(start=0, stop=len(y_test), step=1)
    y_test = np.expm1(y_test)
    y_pred = np.expm1(y_pred)
    fontsize = 15
    plt.figure(figsize=(8, 8))
    plt.plot(y_test, y_pred, 'bD', label="ALT - predicted", c = "tab:orange")
    plt.plot(y_test, y_test, 'k-', label="ALT - predicted", c = "tab:blue")
    plt.title(f"PREDICTED VS ACTUAL", fontsize=fontsize)
    plt.suptitle(f" {text} \n R2 ACCURACY = {result}  ", fontsize=fontsize)
    plt.xlabel("ALT VALUE", fontsize=fontsize)
    plt.ylabel("PREDICTED ALT VALUE", fontsize=fontsize)
    plt.savefig(f'{folder_path}/parity_plot_{text}_actual_vs_predicted_{number}.jpg')
    plt.show()

    
def explanation_worst_vs_best_predictions(model, x_train, x_test, y_test, y_pred_test, features):
    """
    Compare worst vs best predictions - check for differences in significant fingerprints
    """
    sorted_absolute_error_idx = np.argsort(np.absolute(np.subtract(y_test, y_pred_test)))
    explainer = lime_tabular.LimeTabularExplainer(x_train, mode="regression",
                                                  feature_names=features)
    
    explanations_best = []
    explanations_worst = []
    
    for i in range(0,3):
        explanation_best = explainer.explain_instance(x_test[sorted_absolute_error_idx[i]],
                                                                model.predict,
                                                                num_features=10)

        explanation_worst = explainer.explain_instance(
            x_test[sorted_absolute_error_idx[::-1][i]], model.predict,
            num_features=5)
        explanations_best.append(explanation_best.as_list())
        explanations_worst.append(explanation_worst.as_list())
    
    
    return explanations_best, explanations_worst
    

def model_explanation_generate_html_for_best_and_worst_model(folder_path, number, model, x_train, x_test, y_test, y_pred_test, features):
    """
    Function generates html reports using LIME framework (https://github.com/marcotcr/lime)
    for best and worst prediction
    """
    sorted_absolute_error_idx = np.argsort(np.absolute(np.subtract(y_test, y_pred_test)))
    explainer = lime_tabular.LimeTabularExplainer(x_train, mode="regression",
                                                  feature_names=features)
    explanation_best_predicted = explainer.explain_instance(x_test[sorted_absolute_error_idx[0]],
                                                            model.predict,
                                                            num_features=len(features))

    explanation_worst_predicted = explainer.explain_instance(
        x_test[sorted_absolute_error_idx[::-1][0]], model.predict,
        num_features=len(features))
    explanation_best_predicted.save_to_file(
        f"{folder_path}/explanation_best_prediction_{number}.html")
    explanation_worst_predicted.save_to_file(
        f"{folder_path}/explanation_worst_prediction_{number}.html")

    return explanation_best_predicted.as_list(), explanation_worst_predicted.as_list()


def explanations_random_predictions(model, x_train, x_test, features):
    """
    Generate list of list of explanations for random predictions
    """
    explainer = lime_tabular.LimeTabularExplainer(x_train, mode="regression", feature_names=features)
    
    explanations = []
    
    for i in [2,4,5,8,12]:
        explanation = explainer.explain_instance(x_test[i], model.predict, num_features=len(features))
        explanations.append(explanation.as_list())
    
    
    return explanations

In [78]:
explanations_rank = []

for i, (train_index, test_index) in enumerate(cv_outer.split(X)):
    # split data
    x_train, x_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
#     loaded_model = pickle.load(
#         open(f"../models/finalized_GradienBoostingRegressor_model_{i}.pickle", 'rb'))
    loaded_model = pickle.load(
        open(f"../models/finalized_RandomForestRegressor_model_{i}.pickle", 'rb'))
    models.append(loaded_model)
    y_pred_test = loaded_model.predict(x_test)
    y_pred_train = loaded_model.predict(x_train)
    result = round(loaded_model.score(x_test, y_test), 3)
    result_train = round(loaded_model.score(x_train, y_train), 3)
#     draw_actual_vs_predicted_plot(folder_path, i, y_test, y_pred_test, result)
#     draw_actual_vs_predicted_parity_plot(folder_path, i, y_test, y_pred_test, result, f"Random Forest Regressor ")
#     draw_actual_vs_predicted_parity_plot(folder_path, i, y_test, y_pred_test, result, f"Gradien Boosting Regressor ")
#     draw_actual_vs_predicted_parity_plot(folder_path, i, y_train, y_pred_train, result_train, "train")
#     exp_best_pred, exp_worst_pred = model_explanation(folder_path, i, loaded_model, x_train, x_test,
#                                                       y_test, y_pred_test, features)
#     exp_best_pred, exp_worst_pred = explanation_worst_vs_best_predictions(loaded_model, x_train, x_test,
#                                                       y_test, y_pred_test, features)
    explanations = explanations_random_predictions(loaded_model, x_train, x_test, features)
    explanations_rank.extend(explanations)

# Test hypothesis number one from the presentation

#### For the worst predictions, are the fingerprint bits most important, which are unimportant for the best predictions?

In [57]:
best_predicted_explanations

[[('MACCSFP81 <= 0.00', 0.3109850764050915),
  ('MACCSFP46 <= 0.00', -0.2533475844552842),
  ('MACCSFP88 <= 0.00', 0.2273119187827543),
  ('MACCSFP47 <= 0.00', 0.18151226868809017),
  ('MACCSFP36 <= 0.00', 0.13228936446799408),
  ('0.00 < MACCSFP144 <= 1.00', -0.12498089440994423),
  ('MACCSFP26 <= 0.00', -0.0978423426653486),
  ('MACCSFP139 <= 0.00', -0.07857734360892456),
  ('MACCSFP50 <= 0.00', -0.06939683118255532),
  ('MACCSFP123 <= 0.00', -0.06699505259490929)],
 [('MACCSFP46 > 0.00', 0.32474759509169077),
  ('MACCSFP81 <= 0.00', 0.3046343692040811),
  ('MACCSFP88 <= 0.00', 0.2011197386008882),
  ('MACCSFP47 <= 0.00', 0.18335197821455715),
  ('MACCSFP36 <= 0.00', 0.1335817530371841),
  ('0.00 < MACCSFP144 <= 1.00', -0.12230337865281177),
  ('MACCSFP26 <= 0.00', -0.10952392305933739),
  ('MACCSFP139 > 0.00', 0.0823014569239425),
  ('MACCSFP50 <= 0.00', -0.07575086130498002),
  ('MACCSFP123 > 0.00', 0.07451946324989271)],
 [('MACCSFP81 <= 0.00', 0.2876275384179476),
  ('MACCSFP46 <

In [58]:
worst_predicted_explanations

[[('MACCSFP81 <= 0.00', 0.3149262857723902),
  ('MACCSFP88 <= 0.00', 0.20999245205193115),
  ('MACCSFP47 <= 0.00', 0.16514568438249758),
  ('MACCSFP36 <= 0.00', 0.14057161147464323),
  ('0.00 < MACCSFP144 <= 1.00', -0.1188474546232587)],
 [('MACCSFP81 <= 0.00', 0.291221963485308),
  ('MACCSFP88 <= 0.00', 0.2093948418759681),
  ('MACCSFP47 <= 0.00', 0.17901543668566383),
  ('MACCSFP36 <= 0.00', 0.1443054461500639),
  ('MACCSFP144 <= 0.00', 0.12527539881669025)],
 [('MACCSFP81 <= 0.00', 0.2922108557419052),
  ('MACCSFP88 <= 0.00', 0.2216210907850527),
  ('MACCSFP47 <= 0.00', 0.18046811831328222),
  ('MACCSFP36 <= 0.00', 0.14756041155632021),
  ('MACCSFP144 <= 0.00', 0.11782217494063618)],
 [('MACCSFP46 <= 0.00', -0.4504296024172371),
  ('MACCSFP123 <= 0.00', -0.1615975166261317),
  ('MACCSFP81 > 0.00', -0.15954932342134495),
  ('MACCSFP88 > 0.00', -0.11811424445700261),
  ('MACCSFP47 > 0.00', -0.10890441237810454)],
 [('MACCSFP46 <= 0.00', -0.48783458483037745),
  ('MACCSFP123 <= 0.00', 

In [63]:
best_predicted_most_significant = set([el[0][el[0].index('MACCSFP'):].split(' ')[0] for l in best_predicted_explanations for el in l])
best_predicted_most_significant
                                      

{'MACCSFP110',
 'MACCSFP117',
 'MACCSFP123',
 'MACCSFP139',
 'MACCSFP144',
 'MACCSFP26',
 'MACCSFP29',
 'MACCSFP30',
 'MACCSFP36',
 'MACCSFP45',
 'MACCSFP46',
 'MACCSFP47',
 'MACCSFP50',
 'MACCSFP79',
 'MACCSFP81',
 'MACCSFP86',
 'MACCSFP88',
 'MACCSFP89',
 'MACCSFP97'}

In [64]:
worst_predicted_most_significant = set([el[0][el[0].index('MACCSFP'):].split(' ')[0] for l in worst_predicted_explanations for el in l])
worst_predicted_most_significant

{'MACCSFP123',
 'MACCSFP144',
 'MACCSFP36',
 'MACCSFP46',
 'MACCSFP47',
 'MACCSFP81',
 'MACCSFP88'}

In [67]:
print(worst_predicted_most_significant.issubset(worst_predicted_most_significant))

True


In [68]:
best_predicted_most_significant_df = pd.DataFrame({
    'significant_fingerprint_bits': [*list(best_predicted_most_significant)]})

worst_predicted_most_significant_df = pd.DataFrame({
    'significant_fingerprint_bits': [*list(worst_predicted_most_significant)]})


best_predicted_most_significant_df.to_csv(f"../best_model_analysis/best_predicted_most_fingerprints.csv", index=False)
worst_predicted_most_significant_df.to_csv(f"../best_model_analysis/worst_predicted_most_significant_fingerprints.csv", index=False)


In [69]:
best_predicted_most_significant_df

Unnamed: 0,significant_fingerprint_bits
0,MACCSFP123
1,MACCSFP46
2,MACCSFP29
3,MACCSFP36
4,MACCSFP139
5,MACCSFP47
6,MACCSFP50
7,MACCSFP81
8,MACCSFP30
9,MACCSFP97


In [70]:
worst_predicted_most_significant_df

Unnamed: 0,significant_fingerprint_bits
0,MACCSFP144
1,MACCSFP47
2,MACCSFP123
3,MACCSFP46
4,MACCSFP36
5,MACCSFP88
6,MACCSFP81


# TOP 10 significant and most insignificant fingerprints

In [81]:
explanations_rank

[[('MACCSFP81 <= 0.00', 0.30705718032862983),
  ('MACCSFP46 <= 0.00', -0.24729460430321862),
  ('MACCSFP88 <= 0.00', 0.22097734407025998),
  ('MACCSFP47 <= 0.00', 0.18307964699236495),
  ('MACCSFP36 <= 0.00', 0.13798279939383054),
  ('0.00 < MACCSFP144 <= 1.00', -0.11786082160316819),
  ('MACCSFP26 <= 0.00', -0.09234369993667212),
  ('MACCSFP50 <= 0.00', -0.08810932061177966),
  ('MACCSFP139 <= 0.00', -0.07036855256682845),
  ('MACCSFP117 <= 0.00', 0.07000508161729516),
  ('MACCSFP42 <= 0.00', -0.0692074242845066),
  ('MACCSFP123 <= 0.00', -0.06640333661470492),
  ('MACCSFP75 <= 0.00', 0.05995688193601462),
  ('MACCSFP23 <= 0.00', -0.05851900799610932),
  ('MACCSFP92 <= 0.00', 0.05267218644109707),
  ('MACCSFP160 <= 0.00', 0.0498810756942134),
  ('0.00 < MACCSFP98 <= 1.00', -0.04786057320894415),
  ('MACCSFP97 <= 0.00', -0.046242965670963795),
  ('MACCSFP38 <= 0.00', 0.044537483285077305),
  ('MACCSFP76 <= 0.00', 0.04450685391008629),
  ('MACCSFP45 <= 0.00', -0.04420425068424489),
  ('

In [84]:
explanations_rank_only_bits_from_local_top_10 = [el[0][el[0].index('MACCSFP'):].split(' ')[0] for l in explanations_rank for el in l[:10]]
explanations_rank_only_bits_from_local_top_10

['MACCSFP81',
 'MACCSFP46',
 'MACCSFP88',
 'MACCSFP47',
 'MACCSFP36',
 'MACCSFP144',
 'MACCSFP26',
 'MACCSFP50',
 'MACCSFP139',
 'MACCSFP117',
 'MACCSFP81',
 'MACCSFP46',
 'MACCSFP88',
 'MACCSFP47',
 'MACCSFP36',
 'MACCSFP144',
 'MACCSFP26',
 'MACCSFP50',
 'MACCSFP123',
 'MACCSFP139',
 'MACCSFP81',
 'MACCSFP46',
 'MACCSFP88',
 'MACCSFP47',
 'MACCSFP36',
 'MACCSFP144',
 'MACCSFP26',
 'MACCSFP50',
 'MACCSFP117',
 'MACCSFP139',
 'MACCSFP81',
 'MACCSFP46',
 'MACCSFP88',
 'MACCSFP47',
 'MACCSFP36',
 'MACCSFP144',
 'MACCSFP26',
 'MACCSFP50',
 'MACCSFP123',
 'MACCSFP139',
 'MACCSFP81',
 'MACCSFP46',
 'MACCSFP88',
 'MACCSFP47',
 'MACCSFP36',
 'MACCSFP144',
 'MACCSFP26',
 'MACCSFP139',
 'MACCSFP117',
 'MACCSFP50',
 'MACCSFP46',
 'MACCSFP123',
 'MACCSFP81',
 'MACCSFP50',
 'MACCSFP88',
 'MACCSFP47',
 'MACCSFP36',
 'MACCSFP144',
 'MACCSFP26',
 'MACCSFP116',
 'MACCSFP46',
 'MACCSFP123',
 'MACCSFP81',
 'MACCSFP50',
 'MACCSFP88',
 'MACCSFP47',
 'MACCSFP36',
 'MACCSFP144',
 'MACCSFP45',
 'MACCSFP26',


In [89]:
from collections import Counter

most_significant_dict = Counter(explanations_rank_only_bits_from_local_top_10)
most_significant_dict

Counter({'MACCSFP81': 25,
         'MACCSFP46': 25,
         'MACCSFP88': 25,
         'MACCSFP47': 25,
         'MACCSFP36': 25,
         'MACCSFP144': 15,
         'MACCSFP26': 9,
         'MACCSFP50': 18,
         'MACCSFP139': 21,
         'MACCSFP117': 17,
         'MACCSFP123': 15,
         'MACCSFP116': 2,
         'MACCSFP45': 8,
         'MACCSFP110': 6,
         'MACCSFP79': 5,
         'MACCSFP86': 3,
         'MACCSFP97': 1,
         'MACCSFP89': 3,
         'MACCSFP29': 1,
         'MACCSFP30': 1})

In [91]:
top10_most_significant = [key for key, value in most_significant_dict.items() if value in sorted(list(most_significant_dict.values()))[::-1][:10]]
top10_most_significant

['MACCSFP81',
 'MACCSFP46',
 'MACCSFP88',
 'MACCSFP47',
 'MACCSFP36',
 'MACCSFP144',
 'MACCSFP50',
 'MACCSFP139',
 'MACCSFP117',
 'MACCSFP123']

# Insignificant fingerprints

In [94]:
all_bits_with_zero_value = [el[0][el[0].index('MACCSFP'):].split(' ')[0] for l in explanations_rank for el in l if el[1] == 0.0]
all_bits_with_zero_value

['MACCSFP133',
 'MACCSFP129',
 'MACCSFP137',
 'MACCSFP156',
 'MACCSFP158',
 'MACCSFP161',
 'MACCSFP138',
 'MACCSFP142',
 'MACCSFP150',
 'MACCSFP151',
 'MACCSFP153',
 'MACCSFP145',
 'MACCSFP100',
 'MACCSFP111',
 'MACCSFP121',
 'MACCSFP34',
 'MACCSFP162',
 'MACCSFP163',
 'MACCSFP164',
 'MACCSFP165',
 'MACCSFP100',
 'MACCSFP111',
 'MACCSFP137',
 'MACCSFP138',
 'MACCSFP129',
 'MACCSFP150',
 'MACCSFP151',
 'MACCSFP153',
 'MACCSFP158',
 'MACCSFP34',
 'MACCSFP121',
 'MACCSFP145',
 'MACCSFP156',
 'MACCSFP161',
 'MACCSFP133',
 'MACCSFP142',
 'MACCSFP162',
 'MACCSFP163',
 'MACCSFP164',
 'MACCSFP165',
 'MACCSFP142',
 'MACCSFP34',
 'MACCSFP111',
 'MACCSFP133',
 'MACCSFP129',
 'MACCSFP100',
 'MACCSFP150',
 'MACCSFP151',
 'MACCSFP153',
 'MACCSFP156',
 'MACCSFP158',
 'MACCSFP145',
 'MACCSFP161',
 'MACCSFP121',
 'MACCSFP137',
 'MACCSFP138',
 'MACCSFP163',
 'MACCSFP165',
 'MACCSFP162',
 'MACCSFP164',
 'MACCSFP137',
 'MACCSFP34',
 'MACCSFP100',
 'MACCSFP121',
 'MACCSFP138',
 'MACCSFP142',
 'MACCSFP111',

In [95]:
all_bits_with_zero_value_dict = Counter(all_bits_with_zero_value)
all_bits_with_zero_value_dict

Counter({'MACCSFP133': 25,
         'MACCSFP129': 25,
         'MACCSFP137': 25,
         'MACCSFP156': 25,
         'MACCSFP158': 25,
         'MACCSFP161': 25,
         'MACCSFP138': 20,
         'MACCSFP142': 25,
         'MACCSFP150': 15,
         'MACCSFP151': 25,
         'MACCSFP153': 25,
         'MACCSFP145': 25,
         'MACCSFP100': 25,
         'MACCSFP111': 20,
         'MACCSFP121': 25,
         'MACCSFP34': 5,
         'MACCSFP162': 25,
         'MACCSFP163': 25,
         'MACCSFP164': 25,
         'MACCSFP165': 25,
         'MACCSFP104': 5,
         'MACCSFP52': 5,
         'MACCSFP66': 5,
         'MACCSFP29': 4,
         'MACCSFP30': 4})

In [99]:
top10_most_insignificant = [key for key, value in all_bits_with_zero_value_dict.items() if value in sorted(list(all_bits_with_zero_value_dict.values()))[::-1][:10]][:10]
top10_most_insignificant

['MACCSFP133',
 'MACCSFP129',
 'MACCSFP137',
 'MACCSFP156',
 'MACCSFP158',
 'MACCSFP161',
 'MACCSFP142',
 'MACCSFP151',
 'MACCSFP153',
 'MACCSFP145']

In [101]:
results = pd.DataFrame({
    'top 10 significant_fingerprint_bits': [*list(top10_most_significant)],
   'top 10 insignificant_fingerprint_bits': [*list(top10_most_insignificant)]})


results.to_csv(f"../best_model_analysis/significant_and_insignificant_fingerprints.csv", index=False)
results

Unnamed: 0,top 10 significant_fingerprint_bits,top 10 insignificant_fingerprint_bits
0,MACCSFP81,MACCSFP133
1,MACCSFP46,MACCSFP129
2,MACCSFP88,MACCSFP137
3,MACCSFP47,MACCSFP156
4,MACCSFP36,MACCSFP158
5,MACCSFP144,MACCSFP161
6,MACCSFP50,MACCSFP142
7,MACCSFP139,MACCSFP151
8,MACCSFP117,MACCSFP153
9,MACCSFP123,MACCSFP145
