Preparation and plot functions

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
# Route all figure text through LaTeX. The labels and titles in this notebook
# rely on it (math mode such as '$|tuples(D)|$' and escaped underscores).
# NOTE(review): usetex needs a working LaTeX installation on the machine.
matplotlib.rcParams['text.usetex'] = True
from itertools import cycle
from numpy import trapz

# Matplotlib single-letter color codes; the plot helpers index this list by
# the position of each curve in the figure.
COLORS = ['b', 'g', 'r', 'c', 'm', 'y', 'k']


def plot_pr(line_dfs, labels, title, filename):
    """Plot precision-recall curves on one figure, save it and show it.

    Parameters
    ----------
    line_dfs : sequence of pandas.DataFrame
        One frame per curve; each must provide 'recall' and 'precision'
        columns.
    labels : sequence of str
        Legend label of each curve, aligned with ``line_dfs``.
    title : str
        Figure title.
    filename : str
        Target passed to ``plt.savefig``; the image is written at 480 dpi.

    Notes
    -----
    A dashed gray 'Random Classifier' line is drawn at the precision found in
    the last row of the first frame, rounded to 3 decimals. Presumably that
    row is the full-recall point, whose precision equals the share of
    positives -- TODO confirm against how the curve CSVs are generated.
    """
    lw = 2
    plt.figure()
    for i, df in enumerate(line_dfs):
        plt.plot(df['recall'], df['precision'],
                 label=labels[i],
                 # Wrap around the palette so that more than len(COLORS)
                 # curves reuse colors instead of raising IndexError.
                 color=COLORS[i % len(COLORS)], linewidth=lw)

    # Positional access to the last row: a label lookup with len - 1 only
    # works while the frame still carries the default 0..n-1 index.
    ppn = round(line_dfs[0]['precision'].iloc[-1], 3)
    plt.axhline(y=ppn, color='gray', linestyle='--', label='Random Classifier')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.grid()
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title(title)
    plt.legend(loc="upper right")
    plt.savefig(filename, dpi=480)
    plt.show()


def plot_roc(line_dfs, labels, title, filename):
    """Plot ROC curves with their AUC on one figure, save it and show it.

    Parameters
    ----------
    line_dfs : sequence of pandas.DataFrame
        One frame per curve; each must provide 'fpr' (x axis) and 'recall'
        (y axis, i.e. the true positive rate) columns.
    labels : sequence of str
        Legend label of each curve, aligned with ``line_dfs``; the AUC,
        rounded to 3 decimals, is appended to it.
    title : str
        Figure title.
    filename : str
        Target passed to ``plt.savefig``; the image is written at 480 dpi.

    Notes
    -----
    The AUC is the trapezoidal integral of 'recall' over 'fpr' in row order.
    NOTE(review): this assumes rows are sorted by increasing 'fpr'; a
    descending order would flip the sign of the reported AUC.
    """
    lw = 2
    # NumPy 2.0 renamed trapz to trapezoid and deprecated the old name; use
    # the new one when it exists and fall back to the imported trapz otherwise.
    integrate = getattr(np, 'trapezoid', trapz)
    plt.figure()
    for i, df in enumerate(line_dfs):
        auc = integrate(df['recall'], df['fpr'])
        plt.plot(df['fpr'], df['recall'],
                 label=labels[i]+', AUC = '+str(round(auc, 3)),
                 # Wrap around the palette so that more than len(COLORS)
                 # curves reuse colors instead of raising IndexError.
                 color=COLORS[i % len(COLORS)], linewidth=lw)

    # The diagonal is the expected ROC curve of a random classifier.
    plt.plot([0, 1], [0, 1], 'k--', lw=lw, label='Random Classifier')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.grid()
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(title)
    plt.legend(loc="lower right", prop={'size': 6})
    plt.savefig(filename, dpi=480)
    plt.show()
    
# Visible confirmation that this cell (imports and helper definitions) ran
# through without raising.
print('SUCCESS')

Plot RA dataset evaluations

In [None]:
# Location and naming scheme of the pre-computed RA curve CSVs:
#   <DIR>{emb_|plain_}[NR_]{pr_curve.csv|roc_curve.csv}
# The 'NR_' files are the runs plotted as 'Without ICs' below.
DIR = '/Users/wang/Documents/git/lets_learn/infclean/data/tsv/eval/pdb_emb/rent_curves/'
EMB_PREFIX = DIR + 'emb_'
PLAIN_PREFIX = DIR + 'plain_'
NR_PREFIX = 'NR_'
PR_SUFFIX = 'pr_curve.csv'
ROC_SUFFIX = 'roc_curve.csv'

# Load the PR and ROC frames of each of the four configurations.
emb_pr_df, emb_roc_df = (
    pd.read_csv(EMB_PREFIX + PR_SUFFIX),
    pd.read_csv(EMB_PREFIX + ROC_SUFFIX),
)
plain_pr_df, plain_roc_df = (
    pd.read_csv(PLAIN_PREFIX + PR_SUFFIX),
    pd.read_csv(PLAIN_PREFIX + ROC_SUFFIX),
)
emb_nr_pr_df, emb_nr_roc_df = (
    pd.read_csv(EMB_PREFIX + NR_PREFIX + PR_SUFFIX),
    pd.read_csv(EMB_PREFIX + NR_PREFIX + ROC_SUFFIX),
)
plain_nr_pr_df, plain_nr_roc_df = (
    pd.read_csv(PLAIN_PREFIX + NR_PREFIX + PR_SUFFIX),
    pd.read_csv(PLAIN_PREFIX + NR_PREFIX + ROC_SUFFIX),
)

# Embeddings vs. no embeddings.
plot_roc(
    [emb_roc_df, plain_roc_df],
    ['With embeddings', 'Without embeddings'],
    'With vs. without embeddings with RA dataset - ROC Curves',
    'ra_emb_roc.png',
)
plot_pr(
    [emb_pr_df, plain_pr_df],
    ['With embeddings', 'Without embeddings'],
    'With vs. without embeddings with RA dataset - PR Curves',
    'ra_emb_pr.png',
)

# ICs vs. no ICs, with embeddings.
plot_roc(
    [emb_roc_df, emb_nr_roc_df],
    ['With ICs', 'Without ICs'],
    'With vs. without ICs with RA dataset - With embeddings  - ROC Curves',
    'ra_emb_ic_roc.png',
)
plot_pr(
    [emb_pr_df, emb_nr_pr_df],
    ['With ICs', 'Without ICs'],
    'With vs. without ICs with RA dataset - With embeddings  - PR Curves',
    'ra_emb_ic_pr.png',
)

# ICs vs. no ICs, without embeddings.
plot_roc(
    [plain_roc_df, plain_nr_roc_df],
    ['With ICs', 'Without ICs'],
    'With vs. without ICs with RA dataset - No embeddings - ROC Curves',
    'ra_plain_ic_roc.png',
)
plot_pr(
    [plain_pr_df, plain_nr_pr_df],
    ['With ICs', 'Without ICs'],
    'With vs. without ICs with RA dataset - No embeddings - PR Curves',
    'ra_plain_ic_pr.png',
)

In [None]:
# Dataset-size sweep: one curve per RA dataset size (number of tuples).
ss = [500, 3000, 6000, 10000]
pr_list = []
roc_list = []
label_list = []

# The file suffixes do not depend on the loop variable, so set them once.
PR_SUFFIX = 'pr.csv'
ROC_SUFFIX = 'roc.csv'

for s in ss:
    # Each size has its own result directory named after the size.
    DIR = '/Users/wang/Documents/git/lets_learn/infclean/data/tsv/eval/pdb_params/ra/s/' + str(s) + '/'
    EMB_PREFIX = DIR + 'ra_emb_'

    emb_pr_df = pd.read_csv(EMB_PREFIX+PR_SUFFIX)
    emb_roc_df = pd.read_csv(EMB_PREFIX+ROC_SUFFIX)
    pr_list.append(emb_pr_df)
    roc_list.append(emb_roc_df)
    label_list.append('$|tuples(D)|$ = ' + str(s))

# Dedicated output names: 'ra_emb_roc.png' / 'ra_emb_pr.png' are already
# written by the embeddings-vs-plain RA comparison, so reusing them here would
# overwrite those figures when the notebook is run top to bottom.
plot_roc(roc_list, label_list, 'RA datasets of different sizes - ROC Curves', 'ra_emb_s_roc.png')
plot_pr(pr_list, label_list, 'RA datasets of different sizes - PR Curves', 'ra_emb_s_pr.png')

In [None]:
def plot_sp_pr(line_dfs, labels, title, filename, k_list):
    """Plot precision-recall curves with one baseline per curve.

    Every curve gets its own dashed horizontal 'Random Classifier' line in
    the color of the curve, instead of a single baseline for the whole figure.

    Parameters
    ----------
    line_dfs : sequence of pandas.DataFrame
        One frame per curve; each must provide 'recall' and 'precision'
        columns.
    labels : sequence of str
        Legend label of each curve, aligned with ``line_dfs``; also used in
        the legend entry of the matching baseline.
    title : str
        Figure title.
    filename : str
        Target passed to ``plt.savefig``; the image is written at 480 dpi.
    k_list : sequence of float
        One value per curve; the baseline of curve ``i`` is drawn at
        ``k_list[i] / (1 + k_list[i])``. Presumably k is the ratio of
        positives to negatives, which makes this the share of positives --
        TODO confirm.
    """
    lw = 2
    plt.figure()
    for i, df in enumerate(line_dfs):
        ppn = k_list[i] / (1 + k_list[i])
        # Wrap around the palette so that more than len(COLORS) curves reuse
        # colors instead of raising IndexError.
        color = COLORS[i % len(COLORS)]
        plt.plot(df['recall'], df['precision'],
                 label=labels[i],
                 color=color, linewidth=lw)
        plt.axhline(y=ppn, color=color, linestyle='--', label='Random Classifier for '+labels[i])

    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.grid()
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title(title)
    plt.legend(loc="upper right", prop={'size': 6})
    plt.savefig(filename, dpi=480)
    plt.show()

# Sweep over k / |tuples(D)|: one curve per ratio.
kk = [0.01, 0.05, 0.2, 0.4]
pr_list = []
roc_list = []
label_list = []

# The file suffixes do not depend on the loop variable, so set them once.
PR_SUFFIX = 'pr.csv'
ROC_SUFFIX = 'roc.csv'

for k in kk:
    # Each ratio has its own result directory named after the ratio.
    DIR = '/Users/wang/Documents/git/lets_learn/infclean/data/tsv/eval/pdb_params/ra/k/' + str(k) + '/'
    EMB_PREFIX = DIR + 'ra_emb_'

    emb_pr_df = pd.read_csv(EMB_PREFIX+PR_SUFFIX)
    emb_roc_df = pd.read_csv(EMB_PREFIX+ROC_SUFFIX)
    pr_list.append(emb_pr_df)
    roc_list.append(emb_roc_df)
    label_list.append('$k / |tuples(D)|$ = ' + str(k))

# Dedicated output names: 'ra_emb_roc.png' / 'ra_emb_pr.png' are already
# written by the embeddings-vs-plain RA comparison, so reusing them here would
# overwrite those figures when the notebook is run top to bottom.
plot_roc(roc_list, label_list, 'Effect of $k / |tuples(D)|$ - ROC Curves', 'ra_emb_k_roc.png')
# The PR baseline depends on k, hence the per-curve baseline variant.
plot_sp_pr(pr_list, label_list, 'Effect of $k / |tuples(D)|$ - PR Curves', 'ra_emb_k_pr.png', kk)

In [None]:
# Sweep over the parameter p: one curve per value.
# NOTE(review): the directory name 'p_k005' suggests k is fixed at 0.05 for
# this sweep -- confirm.
pp = [0.2, 0.5, 0.8, 1.0]
pr_list = []
roc_list = []
label_list = []

# The file suffixes do not depend on the loop variable, so set them once.
PR_SUFFIX = 'pr.csv'
ROC_SUFFIX = 'roc.csv'

for p in pp:
    # Each value of p has its own result directory named after the value.
    DIR = '/Users/wang/Documents/git/lets_learn/infclean/data/tsv/eval/pdb_params/ra/p_k005/' + str(p) + '/'
    EMB_PREFIX = DIR + 'ra_emb_'

    emb_pr_df = pd.read_csv(EMB_PREFIX+PR_SUFFIX)
    emb_roc_df = pd.read_csv(EMB_PREFIX+ROC_SUFFIX)
    pr_list.append(emb_pr_df)
    roc_list.append(emb_roc_df)
    label_list.append('$p$ = ' + str(p))

# Dedicated output names: 'ra_emb_roc.png' / 'ra_emb_pr.png' are already
# written by the embeddings-vs-plain RA comparison, so reusing them here would
# overwrite those figures when the notebook is run top to bottom.
plot_roc(roc_list, label_list, 'Effect of $p$ - ROC Curves', 'ra_emb_p_roc.png')
plot_pr(pr_list, label_list, 'Effect of $p$ - PR Curves', 'ra_emb_p_pr.png')

Plot Mercateo dataset evaluations

In [None]:
# Location and naming scheme of the pre-computed Mercateo curve CSVs:
#   <DIR>{ek_emb_|ek_plain_}[NR_]{pr.csv|roc.csv}
# The 'NR_' files are the runs plotted as 'Without ICs' below.
DIR = '/Users/wang/Documents/git/lets_learn/infclean/data/tsv/eval/correct_pdb_emb/merc/curves/'
EMB_PREFIX = DIR + 'ek_emb_'
PLAIN_PREFIX = DIR + 'ek_plain_'
NR_PREFIX = 'NR_'
PR_SUFFIX = 'pr.csv'
ROC_SUFFIX = 'roc.csv'

# Load the PR and ROC frames of each of the four configurations.
emb_pr_df, emb_roc_df = (
    pd.read_csv(EMB_PREFIX + PR_SUFFIX),
    pd.read_csv(EMB_PREFIX + ROC_SUFFIX),
)
plain_pr_df, plain_roc_df = (
    pd.read_csv(PLAIN_PREFIX + PR_SUFFIX),
    pd.read_csv(PLAIN_PREFIX + ROC_SUFFIX),
)
emb_nr_pr_df, emb_nr_roc_df = (
    pd.read_csv(EMB_PREFIX + NR_PREFIX + PR_SUFFIX),
    pd.read_csv(EMB_PREFIX + NR_PREFIX + ROC_SUFFIX),
)
plain_nr_pr_df, plain_nr_roc_df = (
    pd.read_csv(PLAIN_PREFIX + NR_PREFIX + PR_SUFFIX),
    pd.read_csv(PLAIN_PREFIX + NR_PREFIX + ROC_SUFFIX),
)

# Embeddings vs. no embeddings.
plot_roc(
    [emb_roc_df, plain_roc_df],
    ['With embeddings', 'Without embeddings'],
    'With vs. without embeddings with Mercateo dataset - ROC Curves',
    'merc_emb_roc.png',
)
plot_pr(
    [emb_pr_df, plain_pr_df],
    ['With embeddings', 'Without embeddings'],
    'With vs. without embeddings with Mercateo dataset - PR Curves',
    'merc_emb_pr.png',
)

# ICs vs. no ICs, with embeddings.
plot_roc(
    [emb_roc_df, emb_nr_roc_df],
    ['With ICs', 'Without ICs'],
    'With vs. without ICs with Mercateo dataset - With embeddings  - ROC Curves',
    'merc_emb_ic_roc.png',
)
plot_pr(
    [emb_pr_df, emb_nr_pr_df],
    ['With ICs', 'Without ICs'],
    'With vs. without ICs with Mercateo dataset - With embeddings  - PR Curves',
    'merc_emb_ic_pr.png',
)

# ICs vs. no ICs, without embeddings.
plot_roc(
    [plain_roc_df, plain_nr_roc_df],
    ['With ICs', 'Without ICs'],
    'With vs. without ICs with Mercateo dataset - No embeddings - ROC Curves',
    'merc_plain_ic_roc.png',
)
plot_pr(
    [plain_pr_df, plain_nr_pr_df],
    ['With ICs', 'Without ICs'],
    'With vs. without ICs with Mercateo dataset - No embeddings - PR Curves',
    'merc_plain_ic_pr.png',
)

In [None]:
# Embedding-dimension sweep: one curve per value of dim_e.
edims = [5, 30, 100, 500]
pr_list, roc_list, label_list = [], [], []

# File suffixes shared by every run of the sweep.
PR_SUFFIX, ROC_SUFFIX = 'pr.csv', 'roc.csv'

for edim in edims:
    # Results of each run live in a directory named after the dimension.
    DIR = '/Users/wang/Documents/git/lets_learn/infclean/data/tsv/eval/tf_emb_model/dime/' + str(edim) + '/'
    EMB_PREFIX = DIR + 'ek_emb_'

    emb_pr_df, emb_roc_df = (
        pd.read_csv(EMB_PREFIX + PR_SUFFIX),
        pd.read_csv(EMB_PREFIX + ROC_SUFFIX),
    )
    pr_list.append(emb_pr_df)
    roc_list.append(emb_roc_df)
    label_list.append('$dim_e$ = ' + str(edim))

plot_roc(
    roc_list,
    label_list,
    'Effect of $dim_e$ - ROC Curves',
    'ek_emb_roc.png',
)
plot_pr(
    pr_list,
    label_list,
    'Effect of $dim_e$ - PR Curves',
    'ek_emb_pr.png',
)

In [None]:
# Batch-size sweep: one curve per batch size.
bss = [256, 1024, 4096, 10000]
pr_list = []
roc_list = []
label_list = []

# The file suffixes do not depend on the loop variable, so set them once.
PR_SUFFIX = 'pr.csv'
ROC_SUFFIX = 'roc.csv'

for bs in bss:
    # Results of each run live in a directory named after the batch size.
    DIR = '/Users/wang/Documents/git/lets_learn/infclean/data/tsv/eval/tf_emb_model/batchsize/' + str(bs) + '/'
    EMB_PREFIX = DIR + 'ek_emb_'

    emb_pr_df = pd.read_csv(EMB_PREFIX+PR_SUFFIX)
    emb_roc_df = pd.read_csv(EMB_PREFIX+ROC_SUFFIX)
    pr_list.append(emb_pr_df)
    roc_list.append(emb_roc_df)
    # Raw strings: '\_' is not a valid Python escape sequence, so a plain
    # literal triggers an invalid-escape warning. The resulting text is the
    # same backslash-underscore that LaTeX needs.
    if bs == 10000:
        # The largest batch size is labelled as the full dataset size.
        # NOTE(review): presumably the dataset holds 10000 tuples -- confirm.
        label_list.append(r'$batch\_size$ = $|tuples(D)|$')
    else:
        label_list.append(r'$batch\_size$ = ' + str(bs))

# Dedicated output names: 'ek_emb_roc.png' / 'ek_emb_pr.png' are already
# written by the embedding-dimension sweep, so reusing them here would
# overwrite those figures when the notebook is run top to bottom.
plot_roc(roc_list, label_list, r'Effect of $batch\_size$ - ROC Curves', 'ek_emb_batchsize_roc.png')
plot_pr(pr_list, label_list, r'Effect of $batch\_size$ - PR Curves', 'ek_emb_batchsize_pr.png')