In [None]:
import os
import sys
from pathlib import Path
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.cm as cm

sys.path.append('../..')
from data.constants import BASE_PATH_EXPERIMENTS

# Global matplotlib defaults applied to every figure created in this notebook.
# Font type 42 asks the PDF backend for TrueType fonts (see matplotlib's
# "pdf.fonttype" rcParam documentation).
plt.rcParams["pdf.fonttype"] = 42
plt.rcParams["font.family"] = "sans-serif"
plt.rcParams["font.sans-serif"] = "Arial"
plt.rcParams["font.size"] = 10

In [None]:
# Directory that holds the performance CSVs and receives the generated figures.
base_path = Path(BASE_PATH_EXPERIMENTS) / 'comparable_score_ranges/performance_plots'

In [None]:
# Collect every 'performances_*.csv' below ``base_path`` into one long-format
# table with one row per (scoring method, task difficulty, overlap, metric).

# Metric suffixes for which a "Scale imbalance" column is derived as
# (hard labeling on scaled scores) - (hard labeling on raw scores).
scale_imbalance_metrics = ('F1-score weighted', 'Balanced accuracy', 'Jaccard-score weighted')

dfs = []
# sorted(): rglob yields files in filesystem order; sorting makes the row order
# of the combined table reproducible.
for path in sorted(base_path.rglob('performances_*.csv')):
    # Match the markers only against the part of the path below ``base_path``,
    # so that a parent directory containing e.g. "old" can neither drop nor
    # mislabel every file.
    rel_path = str(path.relative_to(base_path))
    if 'old' in rel_path:
        continue

    perf = pd.read_csv(path)
    # Drop the index column written by ``to_csv``; tolerate files saved without it.
    perf = perf.drop(columns=['Unnamed: 0'], errors='ignore')
    perf['Task difficulty'] = 'Easy task' if 'b_mono_nk' in rel_path else 'Hard task'
    perf['Overlapping Signatures'] = 'non-overlapping' if 'non_over' in rel_path else 'overlapping'

    for metric_suffix in scale_imbalance_metrics:
        scaled_col = f'Hard labeling on scaled scores ({metric_suffix})'
        raw_col = f'Hard labeling on scores ({metric_suffix})'
        perf[f'Scale imbalance ({metric_suffix})'] = perf[scaled_col] - perf[raw_col]
    # The scaled-score columns are only needed to derive the imbalance.
    perf = perf.drop(
        columns=[f'Hard labeling on scaled scores ({metric_suffix})'
                 for metric_suffix in scale_imbalance_metrics]
    )

    dfs.append(perf)

if not dfs:
    # pd.concat([]) would fail with an unhelpful "No objects to concatenate".
    raise FileNotFoundError(f"No 'performances_*.csv' files found below {base_path}")

performance_wide = pd.concat(dfs, axis=0).reset_index(drop=True)

# Every column that is not an identifier becomes one (metric description, score) row.
df = pd.melt(
    performance_wide,
    id_vars=['Scoring method', 'Task difficulty', 'Overlapping Signatures'],
    var_name='Performance metric description long',
    value_name='score',
)
df

In [None]:
def _metric_family(long_description):
    """Return the metric-family label for a long metric description."""
    if 'F1-score' in long_description:
        return 'F1-score (weighted)'
    if 'Balanced accuracy' in long_description:
        return 'Balanced accuracy'
    # Anything that mentions neither of the above is labelled as Jaccard.
    return 'Jaccard-score (weighted)'


long_descriptions = df['Performance metric description long']
df['Performance metric'] = long_descriptions.map(_metric_family)
# Short description: the long one with its parenthesised suffix stripped.
df['Performance metric description'] = long_descriptions.str.replace(r' \(.*\)', '', regex=True)

In [None]:
# Ordering of the short metric descriptions (not referenced by the cells shown here).
order_metrics = [
    'Information quantity',
    'Hard labeling on scores',
    'Scale imbalance',
    'Rediscovery score',
]

In [None]:
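# Optional export, currently disabled: writes one pivot table CSV per
# 'Overlapping Signatures' group into base_path. Uncomment to regenerate them.
# for group_name, data in df.groupby('Overlapping Signatures'):
#     curr_pivoted_table = data.pivot_table(index=['Performance metric', 'Scoring method'],
#                                           columns=['Task difficulty', 'Performance metric description'])
#     curr_pivoted_table.to_csv(base_path/f'pivot_table_{group_name}.csv')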

In [None]:
# Each panel is described by a tuple
#   (long metric description, task difficulty, signature overlap, sort ascending?)
# and each figure is a list of such panels, drawn left to right.
_HARD_LABELING = 'Hard labeling on scores (F1-score weighted)'
_INFORMATION_QUANTITY = 'Information quantity (cross-validated F1-score)'
_SCALE_IMBALANCE = 'Scale imbalance (F1-score weighted)'
_REDISCOVERY = 'Rediscovery score (F1-score weighted for unsupervised clustering)'


def _four_panel_layout(overlap):
    """Return the four-panel layout shared by the main and first supplementary figure."""
    return [
        (_HARD_LABELING, 'Easy task', overlap, False),
        (_HARD_LABELING, 'Hard task', overlap, False),
        (_INFORMATION_QUANTITY, 'Hard task', overlap, False),
        (_SCALE_IMBALANCE, 'Hard task', overlap, True),
    ]


main_figure = _four_panel_layout('non-overlapping')

suppl_figure_0 = _four_panel_layout('overlapping')

suppl_figure_1 = [
    (_INFORMATION_QUANTITY, 'Easy task', 'non-overlapping', False),
    (_SCALE_IMBALANCE, 'Easy task', 'non-overlapping', True),
    (_REDISCOVERY, 'Easy task', 'non-overlapping', False),
]

suppl_figure_2 = [
    (_REDISCOVERY, 'Hard task', 'non-overlapping', False),
]

# (output file stem, panel list) pairs, in the order the figures are produced.
configs = [
    ('main_figure', main_figure),
    ('suppl_figure_0', suppl_figure_0),
    ('suppl_figure_1', suppl_figure_1),
    ('suppl_figure_2', suppl_figure_2),
]

In [None]:
# Scoring methods in a fixed order; a method's position decides its colour, so
# every method keeps the same colour in every panel.
sc_method_list = ['ANS', 'Seurat', 'Seurat_AG', 'Seurat_LVG', 'Scanpy', 'Jasmine_LH', 'Jasmine_OR', 'UCell', 'ph1', 'ph2']
# Number of colors you want from the "tab10" colormap
num_colors = 10
# Indexing the colormap past its last entry would repeat a colour.
assert len(sc_method_list) <= num_colors, "more scoring methods than distinct colors"

# Get the "tab10" colormap. ``plt.get_cmap`` is used because
# ``matplotlib.cm.get_cmap`` was removed in Matplotlib 3.9, and because it
# does not depend on the name ``cm`` being bound to the colormap module.
tab10 = plt.get_cmap('tab10', num_colors)

# Map each scoring method to an RGBA tuple.
colors = {method: tab10(i) for i, method in enumerate(sc_method_list)}

In [None]:
# Centimetres -> inches conversion factor (matplotlib figure sizes are in inches).
# Deliberately not called ``cm``: that name is the ``matplotlib.cm`` module
# imported at the top of the notebook and must not be shadowed.
CM_TO_INCH = 1 / 2.54


def _create_one_figure(df, metric, col_order, first_scores=True):
    """Draw one row of bar-plot panels, one panel per entry of ``col_order``.

    Each panel shows the 'score' of every scoring method for a single
    (metric, task difficulty, overlap) setting, with bars sorted by score and
    annotated with their value. Bar colours come from the module-level
    ``colors`` mapping (scoring method -> colour).

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format table with the columns 'Performance metric description long',
        'Task difficulty', 'Overlapping Signatures', 'Scoring method', 'score'
        and 'Performance metric description'.
    metric : str
        y-axis label; only shown on the first panel.
    col_order : sequence of tuple
        One tuple per panel. Elements 0-2 select the rows (long metric
        description, task difficulty, overlap); the last element is passed as
        ``ascending`` when sorting the bars by score.
    first_scores : bool, default True
        If True, the y-axis of the first panel starts at 0.85 and its bar
        labels use the smaller offset that goes with that zoomed range.

    Returns
    -------
    matplotlib.figure.Figure
        The figure holding all panels.

    Raises
    ------
    ValueError
        If a setting in ``col_order`` selects no rows of ``df``.
    """
    ncols = len(col_order)
    # squeeze=False always returns a 2-D axes array, so a single-panel figure
    # needs no special case.
    fig, axes = plt.subplots(
        nrows=1,
        ncols=ncols,
        figsize=(ncols * 5.5 * CM_TO_INCH, 7 * CM_TO_INCH),
        sharey=False,
        squeeze=False,
    )
    for i, setting in enumerate(col_order):
        curr_ax = axes[0, i]
        # Only the first panel can use the zoomed y-range.
        zoomed = i == 0 and first_scores

        curr_sort_order = setting[-1]
        curr_df = df[
            (df['Performance metric description long'] == setting[0])
            & (df['Task difficulty'] == setting[1])
            & (df['Overlapping Signatures'] == setting[2])
        ]
        if curr_df.empty:
            # Without this guard the title lookup below fails with an
            # uninformative IndexError.
            plt.close(fig)
            raise ValueError(f"No rows in df match panel setting {setting!r}")

        sc_method_order = curr_df.sort_values(by='score', ascending=curr_sort_order)['Scoring method'].tolist()

        sns.barplot(
            data=curr_df,
            x="Scoring method",
            y="score",
            ax=curr_ax,
            order=sc_method_order,
            palette=[colors[method] for method in sc_method_order],
        )
        # Title: short metric description plus the lower-cased first word of
        # the task difficulty, e.g. "... (easy)".
        title = f"{curr_df['Performance metric description'].unique()[0]} ({setting[1].split(' ')[0].lower()})"
        curr_ax.set_title(title, fontsize=10)
        curr_ax.tick_params(axis='x', labelsize=10, rotation=90)
        curr_ax.tick_params(axis='y', labelsize=8)
        curr_ax.set_xlabel('')

        # Lower limit: 0.85 on a zoomed panel, otherwise 0 or the most
        # negative score. Upper limit: 0.1 when all scores are below 0.1, else 1.
        ymin = 0.85 if zoomed else min(0, curr_df['score'].min())
        ymax = 0.1 if curr_df['score'].max() < 0.1 else 1
        curr_ax.set_ylim(ymin, ymax)

        curr_ax.set_ylabel(metric if i == 0 else '', fontsize=10)

        for p in curr_ax.patches:
            x = p.get_x() + p.get_width() / 2.
            y = p.get_height()
            # Bars lower than 0.4 are labelled at the bar top; taller bars get
            # the label anchored below the top (0.045 lower on a zoomed panel,
            # 0.3 lower otherwise).
            if y < 0.4:
                y1 = y
            elif zoomed:
                y1 = y - 0.045
            else:
                y1 = y - 0.3
            curr_ax.annotate(f'{y:.2f}', (x, y1), ha='center', fontsize=8, color='black', rotation=90, xytext=(0, 5), textcoords='offset points')

    fig.tight_layout()

    return fig

In [None]:
# y-axis label used for every figure.
metric = 'F1 score (weighted)'
for config_name, curr_config in configs:
    # Every figure except 'suppl_figure_2' zooms the y-axis of its first panel.
    fig = _create_one_figure(df, metric, curr_config, first_scores=(config_name != 'suppl_figure_2'))
    fig.savefig(base_path / f'{config_name}.pdf')
    # plt.show() has no figure parameter (its only documented argument is the
    # keyword-only ``block``), so the figure must not be passed positionally.
    plt.show()
    # Release the figure explicitly so figures do not accumulate across the loop.
    plt.close(fig)