In [54]:
import ast
import math
import pandas as pd
import numpy as np
from sympy.printing.pretty.pretty_symbology import line_width

from Independent_code.Blosum import blosum_45

In [55]:
results_file = "Independent_code/Results/results_all.csv"
with open(results_file) as f:
    lines = f.readlines()

results = [(line.split("\t")[0], ast.literal_eval(line.split("\t")[1][:-1])) for line in lines]
results_R2 = [(result[0], [r2 for r2, rmse in result[1]]) for result in results]
results_RMSE = [(result[0], [rmse for r2, rmse in result[1]]) for result in results]


In [56]:
results_R2_sorted = sorted(results_R2, key=lambda x: sum(x[1]) / len(x[1]), reverse=True)
results_RMSE_sorted = sorted(results_RMSE, key=lambda x: sum(x[1]) / len(x[1]), reverse=False)

print(results_R2_sorted[0])

('blosum80_rf ', [0.818, 0.756, 0.591, 0.595, 0.777, 0.586, 0.698, 0.716, 0.793, 0.796, 0.753, 0.803, 0.839, 0.638, 0.655, 0.706, 0.694, 0.871, 0.77, 0.807, 0.731, 0.668, 0.777, 0.651, 0.776, 0.806, 0.749, 0.804, 0.633, 0.695, 0.837, 0.665, 0.687, 0.817, 0.743, 0.77, 0.746, 0.641, 0.619, 0.689, 0.649, 0.651, 0.675, 0.649, 0.768, 0.703, 0.709, 0.807, 0.806, 0.75])


In [57]:
# plot R2 Scores with plotly
import plotly.graph_objects as go
from plotly.subplots import make_subplots

plot_height = 1300
plot_width = 3000

colors = {"one_hot": 'rgba(93, 164, 214, 0.5)',
          "blosum45": 'rgba(255, 144, 14, 0.5)',
          "blosum50": 'rgba(44, 160, 101, 0.5)',
          "blosum62": 'rgba(255, 65, 54, 0.5)',
          "blosum80": 'rgba(207, 114, 255, 0.5)',
          "blosum90": 'rgba(127, 96, 0, 0.5)',
          "esm2": 'rgba(255, 179, 186, 0.5)',
          "esm1b": 'rgba(161, 237, 161, 0.5)',
          "georgiev": 'rgba(255, 221, 51, 0.5)'}
encodings = ["one_hot", "blosum45", "blosum50", "blosum62", "blosum80", "blosum90", "esm1b", "esm2", "georgiev"]
models = ["svr", "rf", "adaboost", "gboost", "xgboost", "xgboost_rf"]
# fig = make_subplots(rows=len(rows_e), cols=len(columns_m))



In [58]:
fig_RMSE = make_subplots(rows=1, cols=1)

for result in results_RMSE_sorted:
    name = result[0].split("_")
    if name[0] == "one":
        encoding = '_'.join(name[:2])

        if len(name[2:]) > 1:
            model = "_".join(name[2:])[:-1]
        else:
            model = "".join(name[2:])[:-1]
    else:
        model = ("".join(name[1:]))[:-1]
        encoding = ('_'.join(name[:1]))

    values = result[1]
    # for i, encoding in enumerate(encodings):
    fig_RMSE.add_trace(
        go.Box(y=values, name="_".join([model, encoding]), boxmean="sd",
               marker_color=colors[encoding]))

fig_RMSE.update_yaxes(autorange="reversed")
fig_RMSE.update_layout(
    title_text="Summary: RMSE-Performance for Validationsets of each 50 Model instances \n Predicting Activity for Proteus' NOD Dataset",
    title_font=dict(color="black", size=30),
    showlegend=True,
    legend=dict(font=dict(color="black")),
    height=plot_height,
    width=plot_width,
    paper_bgcolor='rgb(233,233,233)',
    plot_bgcolor='rgb(233,233,233)',
    xaxis=dict(color='black',
               showgrid=True,
               gridcolor='lightgrey',
               griddash="dot",
               gridwidth=1
               ),
    yaxis=dict(color='black',
               showgrid=True,
               gridcolor='grey',
               griddash="dash",
               gridwidth=0.5
               )
)

In [59]:
fig_R2 = make_subplots(rows=1, cols=1)

for result in results_R2_sorted:
    combination = result[0]
    name = result[0].split("_")
    if name[0] == "one":
        encoding = '_'.join(name[:2])

        if len(name[2:]) > 1:
            model = "_".join(name[2:])[:-1]
        else:
            model = "".join(name[2:])[:-1]
    else:
        model = ("_".join(name[1:]))[:-1]
        encoding = ('_'.join(name[:1]))

    values = result[1]
    # for i, encoding in enumerate(encodings):
    fig_R2.add_trace(
        go.Box(y=values, name="_".join([model, encoding]), boxmean="sd",
               marker_color=colors[encoding]))
fig_R2.update_layout(
    title_text="Summary: R2-Performance for Validationsets of each 50 Model instances \n Predicting Activity for Proteus' NOD Dataset",
    title_font=dict(color="black", size=30),
    showlegend=True,
    legend=dict(font=dict(color="black")),
    height=plot_height,
    width=plot_width,
    paper_bgcolor='rgb(233,233,233)',
    plot_bgcolor='rgb(233,233,233)',
    xaxis=dict(color='black',
               showgrid=True,
               gridcolor='lightgrey',
               griddash="dot",
               gridwidth=1
               ),
    yaxis=dict(color='black',
               showgrid=True,
               gridcolor='grey',
               griddash="dash",
               gridwidth=0.5
               )
)



In [60]:
import kaleido

fig_R2.write_image("boxplot_Summary_R2.jpg")
fig_RMSE.write_image("boxplot_Summary_RMSE.jpg")

In [61]:
"""Create a Dataframe for each Performance-Metric"""

columns_m = set()
rows_e = set()

for result in results:
    name = result[0].split("_")
    if name[0] == "one":
        encoding = '_'.join(name[:2])

        if len(name[2:]) > 1:
            model = "_".join(name[2:])[:-1]
        else:
            model = "".join(name[2:])[:-1]
    else:
        model = ("_".join(name[1:]))[:-1]
        encoding = ('_'.join(name[:1]))

    rows_e.add(encoding)
    columns_m.add(model)

columns_m = list(columns_m)
rows_e = list(rows_e)

r2_df = pd.DataFrame(columns=columns_m, index=rows_e)
rmse_df = pd.DataFrame(columns=columns_m, index=rows_e)
for result in results:
    r2 = [a for a, b in result[1]]
    rmse = [b for a, b in result[1]]

    name = result[0].split("_")
    if name[0] == "one":
        encoding = '_'.join(name[:2])

        if len(name[2:]) > 1:
            model = "_".join(name[2:])[:-1]
        else:
            model = "".join(name[2:])[:-1]
    else:
        model = ("_".join(name[1:]))[:-1]
        encoding = ('_'.join(name[:1]))

    r2_df.loc[encoding][model] = r2
    rmse_df.loc[encoding][model] = rmse


ChainedAssignmentError: behaviour will change in pandas 3.0!
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



ChainedAssignmentError: behaviour will change in pandas 3.0!
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the defaul

In [62]:
"""
Creating Plots for ModelPerformance
"""



encodings = ["one_hot", "blosum45", "blosum50", "blosum62", "blosum80", "blosum90", "esm1b", "esm2", "georgiev"]
models = ["svr", "rf", "adaboost", "gboost", "xgboost", "xgboost_rf"]

R2_per_model = dict()
RMSE_per_model = dict()

for model in models:
    _results_R2 = []
    _results_RMSE = []
    for encoding in encodings:
        _result_R2 = r2_df.loc[encoding][model]
        _result_RMSE = rmse_df.loc[encoding][model]

        for i in range(len(_result_R2)):
            _results_R2.append(_result_R2[i])
            _results_RMSE.append(_result_RMSE[i])

    R2_per_model.update({model: _results_R2})
    RMSE_per_model.update({model: _results_RMSE})

av_R2_per_model = dict()
sd_R2_per_model = dict()
av_RMSE_per_model = dict()
sd_RMSE_per_model = dict()

for key in R2_per_model.keys():
    av_R2_per_model.update({key: round(sum(R2_per_model[key]) / len(R2_per_model[key]), 3)})
    av_RMSE_per_model.update({key: round(sum(RMSE_per_model[key]) / len(RMSE_per_model[key]), 3)})
    sd_R2_per_model.update({key: round(float(np.std(R2_per_model[key])), 3)})
    sd_RMSE_per_model.update({key: round(float(np.std(RMSE_per_model[key])), 3)})

print(av_RMSE_per_model)
print(av_R2_per_model)

av_R2_per_model = dict(sorted(av_R2_per_model.items(), key=lambda x: x[1], reverse=True))
av_RMSE_per_model = dict(sorted(av_RMSE_per_model.items(), key=lambda x: x[1], reverse=False))

print(av_RMSE_per_model)
print(av_R2_per_model)

sd_R2_per_model = {key: sd_R2_per_model[key] for key in av_R2_per_model.keys()}
sd_RMSE_per_model = {key: sd_RMSE_per_model[key] for key in av_R2_per_model.keys()}

print(sd_RMSE_per_model)
print(sd_R2_per_model)


{'svr': 0.253, 'rf': 0.204, 'adaboost': 0.234, 'gboost': 0.212, 'xgboost': 0.215, 'xgboost_rf': 0.207}
{'svr': 0.526, 'rf': 0.697, 'adaboost': 0.599, 'gboost': 0.674, 'xgboost': 0.657, 'xgboost_rf': 0.684}
{'rf': 0.204, 'xgboost_rf': 0.207, 'gboost': 0.212, 'xgboost': 0.215, 'adaboost': 0.234, 'svr': 0.253}
{'rf': 0.697, 'xgboost_rf': 0.684, 'gboost': 0.674, 'xgboost': 0.657, 'adaboost': 0.599, 'svr': 0.526}
{'rf': 0.026, 'xgboost_rf': 0.025, 'gboost': 0.026, 'xgboost': 0.025, 'adaboost': 0.026, 'svr': 0.048}
{'rf': 0.082, 'xgboost_rf': 0.083, 'gboost': 0.083, 'xgboost': 0.096, 'adaboost': 0.092, 'svr': 0.174}


In [63]:
plot_height = 750
plot_width = 1500

colors = ['rgba(102, 178, 255, 0.5)',
          'rgba(255, 123, 85, 0.5)',
          'rgba(48, 103, 255, 0.5)',
          'rgba(250, 50, 132, 0.5)',
          'rgba(165, 105, 188, 0.5)',
          'rgba(255, 190, 58, 0.5)']

models_ranked = make_subplots(subplot_titles=("R2 Performance", "RMSE Performance"), rows=1, cols=2)
for i, model in enumerate(av_R2_per_model.keys()):
    models_ranked.add_trace(
        go.Bar(x=[model], y=[av_R2_per_model[model]], marker_color=colors[i], name=model, width=0.5), row=1, col=1)
    models_ranked.add_trace(
        go.Bar(x=[model], y=[av_RMSE_per_model[model]], marker_color=colors[i], name=model, width=0.5),
        row=1, col=2)

# fig_RMSE.update_yaxes(autorange="reversed")
models_ranked.update_annotations(font=dict(color="black"))
models_ranked.update_layout(
    title_text="Averaged Performance per Model-Type",
    title_font=dict(color="black", size=30),
    showlegend=False,
    legend=dict(font=dict(color="black")),
    height=plot_height,
    width=plot_width,

    paper_bgcolor='rgb(233,233,233)',
    plot_bgcolor='rgb(233,233,233)'
)
models_ranked.update_yaxes(range=[0.4, 0.827], row=1, col=1)
models_ranked.update_yaxes(range=[0.15, 0.31], row=1, col=2)



models_ranked.update_yaxes(dict(color='black',
                                showgrid=True,
                                gridcolor='grey',
                                griddash="dot",
                                minor_griddash="dot",
                                gridwidth=0.5
                                )
                           )

models_ranked.update_xaxes(dict(color='black',
                                showgrid=False,
                                gridcolor='lightgrey',
                                griddash="dot",
                                minor_griddash="dot",
                                showline=True,
                                zerolinecolor='black',
                                linewidth=2,
                                gridwidth=1
                                ),
                           )

In [64]:
models_ranked.write_image("Barplot_ModelRanking.jpg")

In [65]:
plot_height = 750
plot_width = 1500

colors = ['rgba(102, 178, 255, 0.5)',
          'rgba(255, 123, 85, 0.5)',
          'rgba(48, 103, 255, 0.5)',
          'rgba(250, 50, 132, 0.5)',
          'rgba(165, 105, 188, 0.5)',
          'rgba(255, 190, 58, 0.5)']

models_sd = make_subplots(subplot_titles=("SD for R2", "SD for RMSE"), rows=1, cols=2)
for i, model in enumerate(sd_R2_per_model.keys()):
    models_sd.add_trace(
        go.Bar(x=[model], y=[sd_R2_per_model[model]], marker_color=colors[i], name=model, width=0.5), row=1, col=1)
    models_sd.add_trace(
        go.Bar(x=[model], y=[sd_RMSE_per_model[model]], marker_color=colors[i], name=model, width=0.5),
        row=1, col=2)

# fig_RMSE.update_yaxes(autorange="reversed")
models_sd.update_annotations(font=dict(color="black"))
models_sd.update_layout(
    title_text="Standard Deviation (SD) per Model-Type, ordered by Average Performance",
    title_font=dict(color="black", size=30),
    showlegend=False,
    legend=dict(font=dict(color="black")),
    height=plot_height,
    width=plot_width,

    paper_bgcolor='rgb(233,233,233)',
    plot_bgcolor='rgb(233,233,233)'
)
# models_ranked.update_yaxes(range=[0.4, 0.827], row=1, col=1)
# models_ranked.update_yaxes(range=[0.15, 0.31], row=1, col=2)



models_sd.update_yaxes(dict(color='black',
                                showgrid=True,
                                gridcolor='grey',
                                griddash="dot",
                                minor_griddash="dot",
                                gridwidth=0.5
                                )
                           )

models_sd.update_xaxes(dict(color='black',
                                showgrid=False,
                                gridcolor='lightgrey',
                                griddash="dot",
                                minor_griddash="dot",
                                showline=True,
                                zerolinecolor='black',
                                linewidth=2,
                                gridwidth=1
                                ),
                           )

In [66]:
models_sd.write_image("Barplot_ModelSD.jpg")

In [67]:
"""
Creating Plots for EncodingPerformance
"""


encodings = ["one_hot", "blosum45", "blosum50", "blosum62", "blosum80", "blosum90", "esm1b", "esm2", "georgiev"]
models = ["svr", "rf", "adaboost", "gboost", "xgboost", "xgboost_rf"]

R2_per_encoding = dict()
RMSE_per_encoding = dict()

for encoding in encodings:
    _results_R2 = []
    _results_RMSE = []
    """Removed SVR, as it performs extremely bad with ESM (1b/2) Encodings"""
    for model in models[1:]:
        _result_R2 = r2_df.loc[encoding][model]
        _result_RMSE = rmse_df.loc[encoding][model]

        for i in range(len(_result_R2)):
            _results_R2.append(_result_R2[i])
            _results_RMSE.append(_result_RMSE[i])

    R2_per_encoding.update({encoding: _results_R2})
    RMSE_per_encoding.update({encoding: _results_RMSE})

av_R2_per_encoding = dict()
sd_R2_per_encoding = dict()
av_RMSE_per_encoding = dict()
sd_RMSE_per_encoding = dict()

for key in R2_per_encoding.keys():
    av_R2_per_encoding.update({key: round(sum(R2_per_encoding[key]) / len(R2_per_encoding[key]), 3)})
    av_RMSE_per_encoding.update({key: round(sum(RMSE_per_encoding[key]) / len(RMSE_per_encoding[key]), 3)})
    sd_R2_per_encoding.update({key: round(float(np.std(R2_per_encoding[key])), 3)})
    sd_RMSE_per_encoding.update({key: round(float(np.std(RMSE_per_encoding[key])), 3)})

av_R2_per_encoding = dict(sorted(av_R2_per_encoding.items(), key=lambda x: x[1], reverse=True))
av_RMSE_per_encoding = dict(sorted(av_RMSE_per_encoding.items(), key=lambda x: x[1], reverse=False))


sd_R2_per_encoding = {key: sd_R2_per_encoding[key] for key in av_R2_per_encoding.keys()}
sd_RMSE_per_encoding = {key: sd_RMSE_per_encoding[key] for key in av_R2_per_encoding.keys()}

print(av_RMSE_per_encoding)
print(av_R2_per_encoding)


{'blosum45': 0.21, 'blosum80': 0.211, 'esm1b': 0.213, 'esm2': 0.213, 'blosum50': 0.214, 'blosum90': 0.214, 'blosum62': 0.216, 'georgiev': 0.216, 'one_hot': 0.222}
{'blosum45': 0.674, 'esm1b': 0.673, 'blosum80': 0.671, 'blosum90': 0.663, 'esm2': 0.663, 'blosum62': 0.662, 'blosum50': 0.66, 'georgiev': 0.652, 'one_hot': 0.639}


In [68]:
plot_height = 750
plot_width = 1500

colors = {"one_hot": 'rgba(93, 164, 214, 0.5)',
          "blosum45": 'rgba(255, 144, 14, 0.5)',
          "blosum50": 'rgba(44, 160, 101, 0.5)',
          "blosum62": 'rgba(255, 65, 54, 0.5)',
          "blosum80": 'rgba(207, 114, 255, 0.5)',
          "blosum90": 'rgba(127, 96, 0, 0.5)',
          "esm2": 'rgba(255, 179, 186, 0.5)',
          "esm1b": 'rgba(161, 237, 161, 0.5)',
          "georgiev": 'rgba(255, 221, 51, 0.5)'}

encodings_ranked = make_subplots(subplot_titles=("R2 Performance", "RMSE Performance"), rows=1, cols=2)
for i, encoding in enumerate(av_R2_per_encoding.keys()):
    encodings_ranked.add_trace(
        go.Bar(x=[encoding], y=[av_R2_per_encoding[encoding]], marker_color=colors[encoding], name=encoding, width=0.5), row=1, col=1)
    encodings_ranked.add_trace(
        go.Bar(x=[encoding], y=[av_RMSE_per_encoding[encoding]], marker_color=colors[encoding], name=encoding, width=0.5),
        row=1, col=2)

encodings_ranked.update_annotations(font=dict(color="black"))
encodings_ranked.update_layout(
    title_text="Averaged Performance per Encoding-Type - excluding SVR",
    title_font=dict(color="black", size=30),
    showlegend=False,
    legend=dict(font=dict(color="black")),
    height=plot_height,
    width=plot_width,

    paper_bgcolor='rgb(233,233,233)',
    plot_bgcolor='rgb(233,233,233)'
)
encodings_ranked.update_yaxes(range=[0.4, 0.827], row=1, col=1)
encodings_ranked.update_yaxes(range=[0.15, 0.31], row=1, col=2)



encodings_ranked.update_yaxes(dict(color='black',
                                showgrid=True,
                                gridcolor='grey',
                                griddash="dot",
                                minor_griddash="dot",
                                gridwidth=0.5
                                )
                           )

encodings_ranked.update_xaxes(dict(color='black',
                                showgrid=False,
                                gridcolor='lightgrey',
                                griddash="dot",
                                minor_griddash="dot",
                                showline=True,
                                zerolinecolor='black',
                                linewidth=2,
                                gridwidth=1
                                ),
                           )

In [69]:
encodings_ranked.write_image("Barplot_EncodingRanking_noSVR.jpg")

In [70]:
plot_height = 750
plot_width = 1500

colors = {"one_hot": 'rgba(93, 164, 214, 0.5)',
          "blosum45": 'rgba(255, 144, 14, 0.5)',
          "blosum50": 'rgba(44, 160, 101, 0.5)',
          "blosum62": 'rgba(255, 65, 54, 0.5)',
          "blosum80": 'rgba(207, 114, 255, 0.5)',
          "blosum90": 'rgba(127, 96, 0, 0.5)',
          "esm2": 'rgba(255, 179, 186, 0.5)',
          "esm1b": 'rgba(161, 237, 161, 0.5)',
          "georgiev": 'rgba(255, 221, 51, 0.5)'}

encodings_sd = make_subplots(subplot_titles=("SD for R2", "SD for RMSE"), rows=1, cols=2)
for i, encoding in enumerate(sd_R2_per_encoding.keys()):
    encodings_sd.add_trace(
        go.Bar(x=[encoding], y=[sd_R2_per_encoding[encoding]], marker_color=colors[encoding], name=encoding, width=0.5), row=1, col=1)
    encodings_sd.add_trace(
        go.Bar(x=[encoding], y=[sd_RMSE_per_encoding[encoding]], marker_color=colors[encoding], name=encoding, width=0.5),
        row=1, col=2)

# fig_RMSE.update_yaxes(autorange="reversed")
encodings_sd.update_annotations(font=dict(color="black"))
encodings_sd.update_layout(
    title_text="Standard Deviation (SD) per Encoding-Type, ordered by Average Performance - excluding SVR",
    title_font=dict(color="black", size=30),
    showlegend=False,
    legend=dict(font=dict(color="black")),
    height=plot_height,
    width=plot_width,

    paper_bgcolor='rgb(233,233,233)',
    plot_bgcolor='rgb(233,233,233)'
)
encodings_sd.update_yaxes(range=[0.05, 0.15], row=1, col=1)
encodings_sd.update_yaxes(range=[0.02, 0.04], row=1, col=2)



encodings_sd.update_yaxes(dict(color='black',
                                showgrid=True,
                                gridcolor='grey',
                                griddash="dot",
                                minor_griddash="dot",
                                gridwidth=0.5
                                )
                           )

encodings_sd.update_xaxes(dict(color='black',
                                showgrid=False,
                                gridcolor='lightgrey',
                                griddash="dot",
                                minor_griddash="dot",
                                showline=True,
                                zerolinecolor='black',
                                linewidth=2,
                                gridwidth=1
                                ),
                           )

In [71]:
encodings_sd.write_image("Barplot_EncodingSD_noSVR.jpg")