In [None]:
import warnings
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from matplotlib.patches import Patch

# Silence every warning raised by the cells below (the filter is process-wide).
warnings.filterwarnings('ignore')

# Base path of the results folder. Later cells build paths both as
# f'{BASE_PATH}results/...' and as f'{BASE_PATH}/results/...'.
BASE_PATH = '../'

In [None]:
def sort_df(df, method):
    """Return a copy of ``df`` ordered by where each row's algorithm sits in ``method``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``algorithm`` column. It is not modified.
    method : list
        Algorithm names in the desired order.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` sorted by the index of their algorithm in
        ``method``. Rows whose algorithm is not listed keep rank 0.
    """
    ranked = df.copy()
    # Temporary integer key, removed again before returning.
    ranked['sort'] = 0
    for position, name in enumerate(method):
        ranked.loc[ranked['algorithm'] == name, 'sort'] = position
    return ranked.sort_values('sort', ascending=True).drop('sort', axis=1)

In [None]:
def create_chart(avg, dev, w_to_plot, pdfName, legpos, title=""):
    """Draw recall, precision and F1 as three bar charts, one bar per algorithm.

    Parameters
    ----------
    avg : pandas.DataFrame
        Needs an ``algorithm`` column plus ``recall``, ``precision`` and ``F1``.
    dev : any
        Not used; kept so that existing calls keep working.
    w_to_plot : list of str
        Algorithms to draw, in bar and legend order.
    pdfName : str
        Only referenced by the commented-out ``savefig`` call below.
    legpos : tuple
        ``bbox_to_anchor`` of the legend, which is attached to the middle panel.
    title : str, optional
        Figure super-title.
    """
    hatches = ['--', '///', '\\\\\\', 'ooo', 'xxx']
    wdf = sort_df(avg[avg['algorithm'].isin(w_to_plot)], w_to_plot)
    fields_to_plot = ['recall', 'precision', 'F1']
    fields_titles = ["Recall", "Precision", "F1"]
    fig, ax = plt.subplots(1, len(fields_to_plot))

    plt.suptitle(title, fontsize=20)

    fig.subplots_adjust(left=None, bottom=None, right=None, top=0.85, wspace=0.5, hspace=None)
    fig.set_size_inches(10,5)
    methods = wdf['algorithm'].values

    for j, field in enumerate(fields_to_plot):
        val = wdf[field].values

        # Bars restart at x = 0 in every panel instead of carrying the
        # offset over from the previous one.
        for i in range(len(val)):
            # Cycle the hatch patterns so that more than five algorithms
            # do not raise IndexError.
            ax[j].bar(i * 0.1, val[i], hatch=hatches[i % len(hatches)], width=0.08,
                      color='#FFFFFF', alpha=1, align='center', edgecolor="black",
                      label=methods[i])

        ax[j].set_title(fields_titles[j], fontsize=20)
        # Hide the x tick labels without relying on a fixed number of ticks.
        ax[j].tick_params(axis='x', labelbottom=False)
        ax[j].yaxis.grid(True)
        ax[j].set_axisbelow(True)
        ax[j].set_xlabel("("+chr(ord('a')+j)+")", fontsize=20)

    # Fixed y ranges for recall, precision and F1 respectively.
    ax[0].set_ylim([0.5,1])
    ax[0].set_yticks(np.arange(0.5,1.01,0.05))

    ax[1].set_ylim([0,0.30])
    ax[1].set_yticks(np.arange(0,0.31,0.05))

    ax[2].set_ylim([0,0.40])
    ax[2].set_yticks(np.arange(0,0.41,0.05))

    ax[1].legend(bbox_to_anchor=legpos, loc='lower right', ncol=4, frameon = False, fontsize=20)
    #fig.savefig(pdfName+".pdf", format='pdf', dpi=1200, pad_inches=.05, bbox_inches="tight")
    plt.show()
    

# Section 5.1, Blocking Performance (Table 1a)

In [None]:
# Display the blocking-performance results exactly as stored in the CSV.
pd.read_csv(f'{BASE_PATH}results/01_blocking_performance.csv')

# Section 5.2, Pruning algorithm selection 

In [None]:
# Weight-based pruning algorithms (drawn in Figure 6).
w_methods = ["bcl", "wep", "wnp", "rwnp", "blast"]
# Cardinality-based pruning algorithms (drawn in Figure 7).
c_methods = ["cep", "cnp", "rcnp"]

In [None]:
# Mean of every remaining metric per algorithm, over all datasets, runs and
# configurations in the file.
df = (
    pd.read_csv(f'{BASE_PATH}/results/algorithm_selection_java.csv', sep=";")
    .drop(['dataset', 'train_size', 'conf_id', 'run', 'comparisons', 'matches'], axis=1)
    .groupby('algorithm')
    .mean()
    .reset_index()
)
# Lower-case the labels so they match the names listed in w_methods / c_methods.
df['algorithm'] = df['algorithm'].str.lower()

### Figure 6
Average performance of weight-based algorithms

In [None]:
# Figure 6: weight-based algorithms; `df` is passed for both `avg` and `dev`.
create_chart(df, df, w_methods, "w_based", (2.6, -0.4), "")

### Figure 7
Average performance of cardinality-based algorithms

In [None]:
# Figure 7: cardinality-based algorithms; `df` is passed for both `avg` and `dev`.
create_chart(df, df, c_methods, "c_based", (2.0, -0.3), "")

# Section 5.3, feature selection

In [None]:
# Read the results and average each metric per (algorithm, feature set).
df = (
    pd.read_csv(f'{BASE_PATH}results/feature_selection_java.csv', sep=";")
    .drop(['dataset', 'train_size', 'run', 'matches', 'comparisons', 'RT'], axis=1)
    .groupby(['algorithm', 'conf_id'])
    .mean()
    .reset_index()
)
# Round the three quality metrics to three decimals.
for metric in ('recall', 'precision', 'F1'):
    df[metric] = df[metric].apply(lambda value: round(value, 3))

## Top blast features by F1 score (Table 2)
Those selected in the paper were [72, 74, 75, 78, 79, 82, 86, 89, 96, 190]

We can see that they are all there in the top-20

In [None]:
# The 20 'blast' feature sets (conf_id) with the highest mean F1.
df[df['algorithm'] == 'blast'].sort_values('F1', ascending=False).head(20)

## Top RCNP features by F1 score (Table 3)
Those selected in the paper were [184, 187, 193, 200, 227, 228, 231, 235, 239, 250]

Because of ties in the probabilities, the results obtained here may differ from those in the paper.

In [None]:
# The 20 'RCNP' feature sets (conf_id) with the highest mean F1.
df[(df['algorithm'] == 'RCNP')].sort_values('F1', ascending=False).head(20)

In [None]:
# Feature sets (conf_id values) selected in the paper; used below to filter
# the run-time plots.
top_blast_features = [72, 74, 75, 78, 79, 82, 86, 89, 96, 190]
top_rcnp_features = [184, 187, 193, 200, 227, 228, 231, 235, 239, 250]
# Single feature set per algorithm, used as a conf_id filter in the
# comparison with supervised meta-blocking.
top_blast = 78
top_rcnp = 187

In [None]:
def plot_run_time(df, title=""):
    """Draw one bar chart per dataset with one bar per feature set (``conf_id``).

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``dataset``, ``conf_id`` and ``RT``. The y axis is
        labelled in minutes, so ``RT`` is expected to be in minutes already.
    title : str, optional
        Figure super-title.
    """
    hatches = ['--', '///', '\\\\\\', 'ooo', 'xxx', '...', 'OOO', '|||', '+++', '+o']
    datasets = sorted(set(df['dataset'].values))
    # One panel per dataset actually present (at least one, so an empty frame
    # still yields a figure); squeeze=False keeps `ax` indexable in all cases.
    fig, ax = plt.subplots(1, max(len(datasets), 1), squeeze=False)
    ax = ax[0]
    plt.suptitle(title, fontsize=20)

    fig.subplots_adjust(left=None, bottom=None, right=None, top=0.85, wspace=0.5, hspace=None)
    fig.set_size_inches(10,5)

    for j, dataset in enumerate(datasets):
        df1 = df[df['dataset'] == dataset].sort_values('conf_id')
        val = df1['RT'].values
        # Tick positions follow the number of bars, so the label count
        # always matches the tick count.
        positions = np.arange(len(val)) * 0.1
        for i in range(len(val)):
            ax[j].bar(positions[i], val[i], width=0.08, color='#FFFFFF', alpha=1,
                      align='center', edgecolor="black", hatch=hatches[i % len(hatches)])

        # Title each panel with the dataset it really shows.
        ax[j].set_title(dataset, fontsize=20)
        ax[j].set_xticks(positions)
        ax[j].set_xticklabels(df1['conf_id'].values, rotation=90)
        ax[j].yaxis.grid(True)
        ax[j].set_axisbelow(True)
        ax[j].set_xlabel("Feature set \n ("+chr(ord('a')+j)+")", fontsize=20)
        ax[j].set_ylabel("Running time (min)", fontsize=20)

    plt.show()

In [None]:
# Mean RT per dataset / feature set / algorithm, restricted to the two
# datasets plotted below.
df = pd.read_csv(f'{BASE_PATH}results/feature_selection_java.csv', sep=";")
df = (
    df[df['dataset'].isin(["Movies", "WalmartAmazon"])]
    .drop(['comparisons', 'run', 'matches', 'train_size', 'precision', 'recall', 'F1'], axis=1)
    .groupby(['dataset', 'conf_id', 'algorithm'])
    .agg('mean')
    .reset_index()
)
# Blocking and feature-computation times; merged on whichever columns the
# two frames share.
rts = pd.read_csv(f'{BASE_PATH}results/features_calc_time.csv', sep=";").drop('features', axis=1)
data = rts.merge(df)
# Overall time = RT + blocking + feature computation, divided by 60
# (presumably seconds to minutes, matching the "Running time (min)" axis label).
data['RT'] = (data['RT'] + data['blockingTime'] + data['featuresTime']) / 60.0
data = data.drop(['blockingTime', 'featuresTime'], axis=1)

## BLAST selected features runtime (Figure 8)
In the original paper the fastest feature set was 78.

Here we obtain different results; the running time depends heavily on the CPU and memory settings used.

In [None]:
# Figure 8: run time of the selected BLAST feature sets, per dataset.
plot_run_time(data[(data['algorithm']=='blast') & (data['conf_id'].isin(top_blast_features))], "BLAST Run time")

## RCNP selected features runtime (Figure 9)
In the original paper the fastest feature set was 187.

Here we obtain different results; the running time depends heavily on the CPU and memory settings used.

In [None]:
# Figure 9: run time of the selected RCNP feature sets, per dataset.
plot_run_time(data[(data['algorithm']=='RCNP') & (data['conf_id'].isin(top_rcnp_features))], "RCNP Run time")

## Comparison with Supervised meta-blocking

In [None]:
# Per-algorithm means from the algorithm-selection experiment.
avg = (
    pd.read_csv(f'{BASE_PATH}/results/algorithm_selection_java.csv', sep=";")
    .fillna(1e-6)
    .drop(['dataset', 'train_size', 'run'], axis=1)
    .groupby('algorithm')
    .mean()
    .reset_index()
)
# NOTE(review): `feature_set_id` is not part of the column selection two lines
# below, so this value never reaches `res` -- confirm whether `conf_id` was meant.
avg['feature_set_id'] = 128
avg['algorithm'] = avg['algorithm'].str.lower()
avg = avg[['algorithm', 'recall', 'precision', 'F1', 'conf_id']]

# Per (algorithm, feature set) means from the feature-selection experiment.
df = (
    pd.read_csv(f'{BASE_PATH}/results/feature_selection_java.csv', sep=";")
    .fillna(1e-6)
    .drop(['dataset', 'train_size'], axis=1)
    .groupby(['algorithm', 'conf_id'])
    .mean()
    .reset_index()
)
df['algorithm'] = df['algorithm'].str.lower()
df = df[['algorithm', 'recall', 'precision', 'F1', 'conf_id', 'RT']]

is_blast_pick = (df['algorithm'] == 'blast') & (df['conf_id'] == top_blast)
is_rcnp_pick = (df['algorithm'] == 'rcnp') & (df['conf_id'] == top_rcnp)

# NOTE(review): `avg` carries no RT column, so after the concat the bcl / cnp
# rows get RT = 0 from fillna(0) -- TODO confirm this is intended.
res = pd.concat([
    avg[avg['algorithm'].isin(['bcl', 'cnp'])],
    df[is_blast_pick],
    df[is_rcnp_pick],
]).fillna(0)

### Figure 10
Here the results differ: BCL has a higher precision than in the original experiments.

In [None]:
# Figure 10: the four compared algorithms; the empty list fills the `dev` argument.
create_chart(res, [], ["bcl", "blast", "cnp", "rcnp"], "", (2.4, -0.3), title="")

### Figure 11
Again, the running times (RT) are completely different from those in the original paper.

In [None]:
# Blocking and feature-computation times (columns blockingTime / featuresTime are used below).
rt = pd.read_csv(f'{BASE_PATH}/results/features_calc_time.csv', sep=";")

In [None]:
# Join the timing table with the selected algorithms (on their shared
# columns), add the three time components and divide the total by 60.
data = (
    rt.drop(['features'], axis=1)
    .merge(res)
    .assign(RT=lambda frame: (frame['RT'] + frame['blockingTime'] + frame['featuresTime']) / 60)
    .drop(['conf_id', 'blockingTime', 'featuresTime', 'recall', 'precision', 'F1'], axis=1)
)

In [None]:
# Figure 11: overall running time per dataset, one bar per algorithm.
hatches = ['--', '///', '\\\\\\', 'ooo', 'xxx', '...', 'OOO', '|||', '+++', '+o']
datasets = sorted(set(data['dataset'].values))
# One panel per dataset present in `data`; squeeze=False keeps `ax` indexable.
fig, ax = plt.subplots(1, max(len(datasets), 1), squeeze=False)
ax = ax[0]

fig.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=0.5, hspace=None)
fig.set_size_inches(10,5)

for j, dataset in enumerate(datasets):
    df1 = data[data['dataset'] == dataset].sort_values('algorithm')
    val = df1['RT'].values
    # Tick positions follow the number of bars, so labels and ticks always match.
    positions = np.arange(len(val)) * 0.1
    for i in range(len(val)):
        ax[j].bar(positions[i], val[i], width=0.08, color='#FFFFFF', alpha=1,
                  align='center', edgecolor="black", hatch=hatches[i % len(hatches)])

    # Title each panel with the dataset it really shows.
    ax[j].set_title(dataset, fontsize=20)
    ax[j].set_xticks(positions)
    ax[j].set_xticklabels(df1['algorithm'].values, rotation=0)
    ax[j].yaxis.grid(True)
    ax[j].set_axisbelow(True)
    ax[j].set_xlabel("("+chr(ord('a')+j)+")", fontsize=20)
    ax[j].set_ylabel("Running time (min)", fontsize=20)

plt.show()

# Section 5.4 Training set size selection

In [None]:
# Mean metrics per (algorithm, training-set size).
df = (
    pd.read_csv(f'{BASE_PATH}/results/train_size_selection_java.csv', sep=";")
    .drop(['dataset', 'conf_id', 'run', 'matches', 'comparisons'], axis=1)
    .groupby(['algorithm', 'train_size'])
    .mean()
    .reset_index()
)

In [None]:
def plot_train_change(df, algorithm, title=""):
    """Plot recall, precision and F1 of one algorithm against the training-set size.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``algorithm``, ``train_size``, ``recall``,
        ``precision`` and ``F1``.
    algorithm : str
        Value of the ``algorithm`` column to plot.
    title : str, optional
        Figure super-title.
    """
    # Sort explicitly so points and tick labels line up whatever the input order.
    top_feat = df[df['algorithm'] == algorithm].sort_values('train_size')

    fields_to_plot = ['recall', 'precision', 'F1']
    names = ['Recall', 'Precision', 'F1']
    fig, ax = plt.subplots(1, len(fields_to_plot))

    plt.suptitle(title, fontsize=20)

    fig.subplots_adjust(left=None, bottom=None, right=None, top=0.75, wspace=0.5, hspace=None)
    fig.set_size_inches(10,2.5)

    # One evenly spaced x slot per row of the selected algorithm, instead of
    # a fixed eleven points.
    positions = np.arange(len(top_feat)) * 0.1
    labels = top_feat['train_size'].values

    for j, field in enumerate(fields_to_plot):
        ax[j].plot(positions, top_feat[field].values, marker='o', color='black')

        ax[j].set_title(names[j], fontsize=20)
        ax[j].set_xticks(positions)
        ax[j].set_xticklabels(labels, rotation=90)
        ax[j].yaxis.grid(True)
        ax[j].set_axisbelow(True)
        ax[j].set_xlabel("Training set size \n ("+chr(ord('a')+j)+")", fontsize=15)

    plt.show()

## Figure 12 - Training set size effects on BLAST

In [None]:
# Figure 12: uses the 'blast' rows of the train-size averages in `df`.
plot_train_change(df, 'blast')

## Figure 13 - Training set size effects on RCNP

In [None]:
# Figure 13: uses the 'RCNP' rows of the train-size averages in `df`.
plot_train_change(df, 'RCNP')

### Figure 14 - BLAST Threshold

In [None]:
def get_df(dataset, train_size):
    """Load the match probabilities stored for one dataset and training-set size.

    Reads every parquet file matching
    ``{BASE_PATH}probabilities/{dataset}/{train_size}/{dataset}_fs78_*.parquet``,
    averages ``p_match`` and ``is_match`` per ``(p1, p2)`` pair and keeps the
    pairs whose mean ``p_match`` is at least 0.5.

    Returns
    -------
    pandas.DataFrame
        Columns ``p_match``, ``is_match`` and ``train_size`` (the latter set
        to ``train_size / 100``).

    Raises
    ------
    FileNotFoundError
        If no file matches the pattern.
    """
    pattern = f'{BASE_PATH}probabilities/{dataset}/{train_size}/{dataset}_fs78_*.parquet'
    # Sorted so the concatenation order does not depend on the file system.
    paths = sorted(glob.glob(pattern))
    if not paths:
        raise FileNotFoundError(f'no probability files match {pattern}')

    # Single concat instead of growing the frame file by file.
    df = pd.concat([pd.read_parquet(path) for path in paths])

    df = df[['p1', 'p2', 'p_match', 'is_match']].groupby(['p1', 'p2']).mean().reset_index()
    # .copy() so the column assignment below does not write into a slice.
    df = df.loc[df['p_match'] >= 0.5, ['p_match', 'is_match']].copy()
    df['train_size'] = train_size/100
    return df
    
def create_chart(dataset):
    """Scatter the BLAST match probabilities of ``dataset`` per training-set size.

    Matches are drawn in red and non-matches in blue; the average and maximum
    thresholds read from ``blast_thresholds.csv`` (feature set 78) are drawn
    on top as lines.

    NOTE(review): this reuses the name of the bar-chart helper
    ``create_chart(avg, dev, w_to_plot, pdfName, legpos, title)`` defined
    earlier in the notebook and replaces it once this cell has run, so the
    earlier figure cells can no longer be re-run afterwards without
    re-running that definition.
    """
    train_sizes = [20, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500]
    fig, ax = plt.subplots(1, 1)
    fig.set_size_inches(10, 5)

    # One x slot per training-set size (0.1, 0.2, ...), derived from the list
    # length rather than from a float-step arange or a running sum.
    x_ax = 0.1 + 0.1 * np.arange(len(train_sizes))
    position_of = dict(zip(train_sizes, x_ax))

    for train_size in train_sizes:
        df = get_df(dataset, train_size)
        df['train_size'] = position_of[train_size]
        matches = df[df['is_match'] == 1]
        non_matches = df[df['is_match'] == 0]
        ax.scatter(x=non_matches['train_size'], y=non_matches['p_match'], marker="x", alpha=0.1, color='blue')
        ax.scatter(x=matches['train_size'], y=matches['p_match'], marker="x", alpha=0.1, color='red')

    ax.set_title(dataset+" (blast)", fontsize=20)
    ax.yaxis.grid(True)
    ax.set_axisbelow(True)
    ax.set_xlabel("Training set size", fontsize=20)
    ax.set_xticks(x_ax)
    ax.set_xticklabels(train_sizes)
    ax.set_ylabel("Probabilities", fontsize=20)

    thres = pd.read_csv(f'{BASE_PATH}results/blast_thresholds.csv', sep=";")
    t = thres[(thres['dataset']==dataset) & (thres['feature_set_id']==78)].groupby(['dataset', 'feature_set_id', 'train_size', 'algorithm']).agg('mean').reset_index()
    t = t[t['train_size'].isin(train_sizes)].sort_values('train_size')
    # Place every threshold at the slot of its own training-set size, so a
    # missing size no longer causes a length mismatch.
    threshold_x = t['train_size'].map(position_of)
    ax.plot(threshold_x, t['avg'], lw=3, color='lightgreen')
    ax.plot(threshold_x, t['max'], lw=3, color='darkgreen')

    legend_elements = [Line2D([0], [0], lw=3, color='lightgreen', label="Avg threshold"),
                       Line2D([0], [0], lw=3, color='darkgreen', label="Max threshold"),
                       Patch(facecolor='red', edgecolor='r', label='Matches'),
                       Patch(facecolor='blue', edgecolor='b', label='Non matches')
                      ]

    ax.legend(bbox_to_anchor=(0.5, -0.25), handles=legend_elements, loc='lower center', ncol=4, fontsize=12)

    # Show the figure, as the other plotting helpers do.
    plt.show()

In [None]:
# Figure 14 for the AbtBuy dataset (calls the single-argument create_chart defined just above).
create_chart('AbtBuy')

### Figure 15 - Comparison with BCl

In [None]:
# Figure 15: recall (a) and precision (b) of BLAST vs BCl per training-set size.
fig, ax = plt.subplots(1, 2)
fig.subplots_adjust(left=None, bottom=None, right=None, top=0.75, wspace=0.3, hspace=None)
fig.set_size_inches(10,5)
x = np.arange(0, 1.1, 0.1)

# Both panels share the same axis decoration; only title and letter differ.
for panel, heading, letter in zip(ax, ("Recall", "Precision"), ("a", "b")):
    panel.set_title(heading, fontsize=20)
    panel.set_xticks(x)
    panel.set_xticklabels([20, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500], rotation=90)
    panel.yaxis.grid(True)
    panel.set_axisbelow(True)
    panel.set_xlabel(f"Training set size \n ({letter})", fontsize=15)

# `df` holds the per (algorithm, train_size) means loaded for Section 5.4.
bcl_rows = df[df['algorithm']=="bcl"].sort_values('train_size')
blast_rows = df[df['algorithm']=="blast"].sort_values('train_size')

pc_bcl = bcl_rows['recall'].values
pc_blast = blast_rows['recall'].values

pq_bcl = bcl_rows['precision'].values
pq_blast = blast_rows['precision'].values

# BLAST is drawn first and BCl second on each panel.
for panel, blast_values, bcl_values in zip(ax, (pc_blast, pq_blast), (pc_bcl, pq_bcl)):
    panel.plot(x, blast_values, marker="o", color="blue", label="BLAST")
    panel.plot(x, bcl_values, marker="*", color="red", label="BCl")

ax[0].set_ylim([0.75, 1])
ax[0].set_yticks(np.arange(0.75,1.01,0.05))

custom_lines = [Line2D([0], [0], color='blue', marker="o"),
                Line2D([0], [0], color='red', marker="*")]

ax[0].legend(custom_lines, ['Blast', 'BCl'], bbox_to_anchor=(1.7, -0.5), loc='lower right',
          ncol=4, frameon = False, fontsize=20)

plt.show()

# Figures 17 and 18
Distribution of common blocks.

In [None]:
# Datasets whose common-block statistics are loaded from cbs_stats/<name>.csv below.
datasets = ["DblpAcm", "ScholarDblp", "WalmartAmazon", "Movies", "AbtBuy", "AmazonGP", "ImdbTmdb", "ImdbTvdb", "TmdbTvdb"]

In [None]:
# Load the common-block statistics of every dataset into one frame with the
# columns dataset / cbs / pairs.
columns = ['dataset', 'cbs', 'pairs']
frames = []
for d in datasets:
    try:
        dft = pd.read_csv(f'{BASE_PATH}cbs_stats/{d}.csv')
        dft = dft.astype('int').rename({'CBS':'cbs', 'cnt':'pairs'}, axis=1)
    except (OSError, ValueError) as err:
        # A missing, empty or non-integer file is skipped, but reported
        # instead of being swallowed silently.
        print(f'Skipping {d}: {err}')
        continue
    dft['dataset'] = d
    frames.append(dft)

if frames:
    # Single concat keeps the integer dtypes and avoids quadratic growth.
    df = pd.concat(frames)
    df = df.reindex(columns=columns + [c for c in df.columns if c not in columns])
else:
    df = pd.DataFrame(columns=columns)

In [None]:
# Two dataset groups, each drawn with plotFig below.
overt_t = ["DblpAcm", "ScholarDblp", "WalmartAmazon", "Movies"]
under_t = ["AbtBuy", "AmazonGP", "ImdbTmdb", "ImdbTvdb", "TmdbTvdb"]

In [None]:
def plot(df, ax, title, let=0):
    """Draw the share of pairs per number of common blocks as a bar chart on ``ax``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``cbs`` and ``pairs``. It is not modified.
    ax : matplotlib axes
        Axes to draw on.
    title : str
        Panel title.
    let : int, optional
        Offset from ``'a'`` of the letter appended to the x label.
    """
    df = df.copy().sort_values('cbs')
    tot = df['pairs'].sum()
    # Turn counts into percentages; skipped when there is nothing to
    # normalise (empty frame or all-zero counts) to avoid dividing by zero.
    if tot:
        df['pairs'] = df['pairs'] / tot * 100

    # One slot every 0.2 per row. Built from an integer range because the
    # length of a float-step arange is not guaranteed.
    x = np.arange(len(df)) * 0.2
    ax.bar(x, df['pairs'], width=0.08, color='grey', alpha=1, align='center', edgecolor="black")

    ax.set_title(title, fontsize=40)
    ax.set_xticks(x)
    ax.set_xticklabels(df['cbs'].values, rotation=90)
    ax.yaxis.grid(True)
    ax.set_axisbelow(True)
    ax.set_ylabel("Portion of \nmatching pairs (%)", fontsize=40)
    ax.set_xlabel("Number of common blocks\n("+chr(ord('a')+let)+")", fontsize=40)

    # Round the tallest bar up to the next multiple of `base` for the y ticks.
    base = 5
    peak = df['pairs'].max() if len(df) else 0
    if pd.isna(peak):
        peak = 0
    max_y = base * round(peak/base)
    if max_y < peak:
        max_y += base
    # Small margin so that arange includes the top tick itself.
    max_y += 0.1

    ax.set_yticks(np.arange(0, max_y, base))


def plotFig(dati, datasets, pdfName):
    """Draw the common-block distribution of the first four ``datasets`` on a 2x2 grid.

    ``dati`` must contain a ``dataset`` column; panels are filled row by row
    and lettered (a) to (d). ``pdfName`` is only referenced by the
    commented-out ``savefig`` call.
    """
    fig, ax = plt.subplots(2, 2)
    fig.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=0.2, hspace=0.45)
    fig.set_size_inches(40, 20)

    # ax.flat walks the grid in row-major order.
    for d, panel in enumerate(ax.flat):
        plot(dati[dati['dataset']==datasets[d]], panel, datasets[d], let=d)

    #fig.savefig(pdfName+".pdf", format='pdf', dpi=1200, pad_inches=.05, bbox_inches="tight")
    plt.show()

In [None]:
# 2x2 grid for the four datasets in overt_t.
plotFig(df, overt_t, "dist_over_t")

In [None]:
# 2x2 grid for the first four datasets in under_t; the fifth is drawn in the next cell.
plotFig(df, under_t, "dist_under_t")

In [None]:
# TmdbTvdb is the fifth under_t dataset and does not fit in the 2x2 grid
# drawn by plotFig, so it gets its own figure, lettered (e) via let=4.
fig, ax = plt.subplots(1, 1)
fig.set_size_inches(40, 20)
plot(df[df['dataset']=="TmdbTvdb"], ax, "TmdbTvdb", let=4)
plt.show()

# Scalability analysis.
Figures 16 and 19

In [None]:
def scalability_plot(df, title, algo, figname, letter, legpos=(1.5,-0.9)):
    """Plot recall, precision and F1 of up to two algorithms over the d10K..d300K datasets.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``dataset``, ``algorithm``, ``recall``,
        ``precision`` and ``F1``.
    title, figname : str
        Accepted but not used by the body.
    algo : list of str
        Algorithms to draw; each gets its own marker and colour.
    letter : str
        Text used as x label of the middle panel.
    legpos : tuple, optional
        ``bbox_to_anchor`` of the legend attached to the middle panel.
    """
    markers = ['s', "D"]
    colors = ['darkgreen', 'navy']
    metrics = [('recall', 'Recall'), ('precision', 'Precision'), ('F1', 'F1')]
    # Lookup table fixing the left-to-right order of the datasets.
    dfo = pd.DataFrame({
        'dataset': ['d10K', 'd50K', 'd100K', 'd200K', 'd300K'],
        'order': [0, 1, 2, 3, 4],
    })
    fig, ax = plt.subplots(1, len(metrics))

    fig.subplots_adjust(left=None, bottom=None, right=None, top=0.75, wspace=0.5, hspace=None)
    fig.set_size_inches(10,2.5)

    for i, name in enumerate(algo):
        df1 = pd.merge(df[df['algorithm']==name], dfo).sort_values('order')
        for j, (field, heading) in enumerate(metrics):
            panel = ax[j]
            panel.plot(np.arange(0, 0.5, 0.1), df1[field].values, marker=markers[i],
                       color=colors[i], label=name, markersize=8)

            panel.set_title(heading, fontsize=20)
            panel.set_xticks(np.arange(0, 0.5, 0.1))
            panel.set_xticklabels(df1['dataset'].values, rotation=90)
            panel.yaxis.grid(True)
            panel.set_axisbelow(True)

    # Fixed y ranges for recall, precision and F1 respectively.
    ax[0].set_ylim([0.0,1.01])
    ax[0].set_yticks(np.arange(0.0,1.02,0.1))

    ax[1].set_ylim([0,0.80])
    ax[1].set_yticks(np.arange(0,0.81,0.1))
    ax[1].set_xlabel(letter, fontsize=20)

    ax[2].set_ylim([0,0.85])
    ax[2].set_yticks(np.arange(0,0.85,0.1))

    ax[1].legend(bbox_to_anchor=legpos, loc='lower right', ncol=4, frameon = False, fontsize=20)

    plt.show()

In [None]:
# Mean of every remaining column per (dataset, algorithm).
df = (
    pd.read_csv(f'{BASE_PATH}results/scalability.csv', sep=";")
    .drop(['train_size', 'conf_id', 'run'], axis=1)
    .groupby(['dataset', 'algorithm'])
    .mean()
    .reset_index()
)

In [None]:
# Panel (a): the two weight-based algorithms.
title = "Weight-based Pruning Algorithms"
algo = ['blast', 'bcl']
scalability_plot(df, title, algo, "w_based_scalability", "(a)")

In [None]:
# Panel (b): the two cardinality-based algorithms.
title = "Cardinality-based Pruning Algorithms"
algo = ['CNP', 'RCNP']
scalability_plot(df, title, algo, "c_based_scalability", "(b)")

In [None]:
def calc_speedup(dsm, dlg):
    """Return ``(dlg.comparisons / dsm.comparisons) * (dsm.RT / dlg.RT)``.

    Parameters
    ----------
    dsm : row-like
        Measurements of the smaller dataset, with ``comparisons`` and ``RT``.
        A one-row DataFrame is accepted and reduced to that row.
    dlg : row-like
        Measurements of the larger dataset: a single row or a whole frame.

    Returns
    -------
    A scalar when ``dlg`` is a single row, otherwise one value per row of ``dlg``.
    """
    # A one-row frame would broadcast into a one-element Series instead of a
    # scalar, so reduce it to its only row first.
    if isinstance(dsm, pd.DataFrame) and len(dsm) == 1:
        dsm = dsm.iloc[0]
    return (dlg['comparisons']/dsm['comparisons'])*(dsm['RT']/dlg['RT'])

# Speed-up of every algorithm on each larger dataset, relative to its d10K run.
# The earlier `res` frame is no longer overwritten by this cell.
speedups = pd.DataFrame()
for d in set(df['algorithm'].values):
    per_algo = df[df['algorithm'] == d]
    sm = per_algo[per_algo['dataset'] == 'd10K']
    if sm.empty:
        raise ValueError(f"no 'd10K' baseline row for algorithm {d!r}")
    larger = per_algo[per_algo['dataset'] != 'd10K'].set_index('dataset')
    # Pass the baseline as a single row so the result is one value per
    # dataset, computed without writing into a slice of `df`.
    speedups[d] = calc_speedup(sm.iloc[0], larger)

# Order the rows by dataset size.
df_order = [('d50K', 1), ('d100K', 2), ('d200K', 3), ('d300K', 4)]
dfo = pd.DataFrame(df_order, columns=['dataset', 'order'])
speedups = pd.merge(speedups.reset_index(), dfo).sort_values('order')

In [None]:
# Panel (c): speed-up of the four algorithms over the larger datasets.
# `speedups` is read directly, so `df` is left untouched by this cell.
markers = ['o', 'v', 's', "D"]
colors = ['black', 'dimgrey', 'darkgreen', 'navy']
fields_to_plot = ["blast", "bcl", "RCNP", "CNP"]
labels = ["BLAST", "BCl", "RCNP", "CNP"]
fig, ax = plt.subplots(1, 1)

fig.set_size_inches(10,4)

# One x slot per dataset row, so ticks and labels always match.
positions = np.arange(len(speedups)) * 0.1

for i in range(0, len(fields_to_plot)):
    val = speedups[fields_to_plot[i]].values
    ax.plot(positions, val, marker=markers[i], color=colors[i], label=labels[i], lw=4, markersize=10, alpha=0.7)

# Axis decoration and captions are set once rather than on every iteration.
ax.set_xticks(positions)
ax.set_xticklabels(speedups['dataset'].values, rotation=90)
ax.yaxis.grid(True)
ax.set_axisbelow(True)

fig.text(0.45, -0.45, "(c)", fontsize=25, transform=plt.gcf().transFigure)
# White dot below the caption; presumably there to extend the figure's bounding box.
fig.text(0.45, -0.55, ".", color="white", fontsize=25, transform=plt.gcf().transFigure)

ax.legend(bbox_to_anchor=(1.05,-0.7), loc='lower right', ncol=4, frameon = False, fontsize=23)

plt.show()

# Table 4

In [None]:
# Mean metrics per (dataset, train_size, conf_id, algorithm).
df = (
    pd.read_csv(f'{BASE_PATH}results/table4.csv', sep=";")
    .drop(['matches', 'run', 'comparisons'], axis=1)
    .groupby(['dataset', 'train_size', 'conf_id', 'algorithm'])
    .mean()
    .reset_index()
)

In [None]:
def create_table(df, algorithm, conf_id, train_size, title):
    """Print ``title`` and display one metrics-by-dataset table.

    Selects the rows of ``df`` matching ``algorithm``, ``conf_id`` and
    ``train_size``, turns the datasets into columns and appends an ``avg``
    column holding the mean of each row.
    """
    print(title)
    selected = (
        (df['conf_id'] == conf_id)
        & (df['algorithm'] == algorithm)
        & (df['train_size'] == train_size)
    )
    table = (
        df[selected]
        .drop(['train_size', 'conf_id', 'algorithm'], axis=1)
        .set_index('dataset')
        .transpose()
    )
    # Row-wise mean over the dataset columns.
    table['avg'] = table.apply(lambda row: row.mean(), axis=1)
    display(table)

In [None]:
# Table 4(a): 'blast' rows with conf_id 78 and train_size 50.
create_table(df, 'blast', 78, 50, '(a) BLAST with 50 labelled pairs and {𝐶𝐹-𝐼𝐵𝐹,𝑅𝐴𝐶𝐶𝐵,𝑅𝑆,𝑁𝑅𝑆}')

In [None]:
# Table 4(b): 'bcl' rows with conf_id 78 and train_size 50.
create_table(df, 'bcl', 78, 50, '(b) BCl1 with 50 labelled pairs and {𝐶𝐹-𝐼𝐵𝐹,𝑅𝐴𝐶𝐶𝐵,𝑅𝑆,𝑁𝑅𝑆}')

In [None]:
# Table 4(c): 'bcl' rows with conf_id 128 and train_size 500.
create_table(df, 'bcl', 128, 500, '(c) BCl2 with the training set and the features of [13], i.e., {𝐶𝐹-𝐼𝐵𝐹,𝑅𝐴𝐶𝐶𝐵, 𝐽𝑆,𝐿𝐶𝑃 }')

In [None]:
# Table 4(d): 'RCNP' rows with conf_id 187 and train_size 50.
create_table(df, 'RCNP', 187, 50, '(d) RCNP with 50 labelled pairs and {𝐶𝐹-𝐼𝐵𝐹,𝑅𝐴𝐶𝐶𝐵, 𝐽𝑆,𝐿𝐶𝑃 ,𝑊𝐽𝑆}')

In [None]:
# Table 4(e): 'CNP' rows with conf_id 187 and train_size 50.
create_table(df, 'CNP', 187, 50, '(e) CNP1 with 50 labelled pairs and {𝐶𝐹-𝐼𝐵𝐹,𝑅𝐴𝐶𝐶𝐵, 𝐽𝑆,𝐿𝐶𝑃 ,𝑊𝐽𝑆}')

In [None]:
# Table 4(f): 'CNP' rows with conf_id 128 and train_size 500.
create_table(df, 'CNP', 128, 500, '(f) CNP2 with the training set and the features of [13], i.e., {𝐶𝐹-𝐼𝐵𝐹,𝑅𝐴𝐶𝐶𝐵, 𝐽𝑆,𝐿𝐶𝑃 }')

# Progressive experiments 
Plot Figures 20 and 21

In [None]:
def plot_clean_progressive(df, datasets, d_names, figname):
    """Draw, per dataset, grouped bars comparing ``sup_mb`` with the mean of all other methods.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``method``, ``dataset``, ``auc_1``, ``auc_5``,
        ``auc_10`` and ``auc_20``.
    datasets : list of str
        Dataset keys, one panel each.
    d_names : list of str
        Panel titles, parallel to ``datasets``.
    figname : str
        Only referenced by the commented-out ``savefig`` call.
    """
    # Collapse every method other than sup_mb into one averaged 'others' row
    # per dataset, then append the sup_mb rows.
    df4 = df[df['method'] != "sup_mb"].drop('method', axis=1).groupby('dataset').mean().reset_index()
    df4['method'] = 'others'
    df4 = pd.concat([df4, df[df['method']=="sup_mb"]], ignore_index=True)

    fig, ax = plt.subplots(1, len(datasets))
    fig.set_size_inches(20, 2.5)
    fig.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=0.5, hspace=None)
    aucs = [1, 5, 10, 20]
    auc_columns = ['auc_1', 'auc_5', 'auc_10', 'auc_20']

    x = np.array([0, 4, 8, 12])
    w = 1
    # (method key, legend label, bar centres): sup_mb sits right of each
    # tick and 'others' left of it.
    series = [
        ("sup_mb", "PPS + Gen. Sup. MB", x+w/2),
        ("others", "PPS", x-w/2),
    ]

    for i, dataset in enumerate(datasets):
        panel = ax[i]
        panel.set_title(d_names[i], fontsize=15)

        for key, legend_label, centres in series:
            data = df4[(df4['method']==key) & (df4['dataset']==dataset)][auc_columns].values
            panel.bar(centres, data[0], width=w, label=legend_label, edgecolor='black')

        panel.set_xticks(x)
        panel.set_yticks(np.arange(0, 1.01, 0.2))
        panel.set_yticklabels([0.0, 0.2, 0.4, 0.6, 0.8, 1.0], fontsize=15)
        panel.set_xticklabels(aucs, fontsize=15)
        panel.set_axisbelow(True)
        panel.yaxis.grid(True)
        panel.set_xlabel('$ec*$', fontsize=15)
    ax[0].set_ylabel('$AUC_m^*$', fontsize=15)

    # The legend is attached to a panel near the middle of the row.
    if len(datasets) > 7:
        ax[4].legend(bbox_to_anchor=(2.3, -0.55), loc='lower right', ncol=3, frameon = False, fontsize=15)
    else:
        ax[2].legend(bbox_to_anchor=(1.4, -0.55), loc='lower right', ncol=3, frameon = False, fontsize=15)
    plt.show()

    #fig.savefig(f"{figname}.pdf", format='pdf', dpi=1200, pad_inches=.05, bbox_inches="tight")

def plot_dirty_progressive(df_dirty, dirty_datasets, dirty_names):
    """Plot, per AUC cut-off, ``sup_mb`` against the mean of all other methods across datasets.

    Parameters
    ----------
    df_dirty : pandas.DataFrame
        Needs the columns ``method``, ``dataset``, ``auc_1``, ``auc_5``,
        ``auc_10`` and ``auc_20``.
    dirty_datasets : list of str
        Dataset keys in the left-to-right order of the x axis.
    dirty_names : list of str
        Display names used as x tick labels, parallel to ``dirty_datasets``.
        If the lengths differ, the keys themselves are used as labels.
    """
    # Only the requested datasets are plotted, so stray rows cannot cause a
    # length mismatch with the x positions.
    wanted = df_dirty[df_dirty['dataset'].isin(dirty_datasets)]

    others = wanted[wanted['method'] != "sup_mb"]
    others = others.drop('method', axis=1).groupby('dataset').mean().reset_index()
    others['method'] = 'others'
    sup_mb = wanted[wanted['method']=="sup_mb"].reset_index(drop=True)

    # Order both frames by the position of their dataset in dirty_datasets.
    others['order'] = 0
    sup_mb['order'] = 0
    for i, name in enumerate(dirty_datasets):
        others.loc[others['dataset']==name, 'order'] = i
        sup_mb.loc[sup_mb['dataset']==name, 'order'] = i
    sup_mb = sup_mb.sort_values('order')
    others = others.sort_values('order')

    aucs = [1, 5, 10, 20]
    # dirty_names was previously accepted but never used; it now labels the ticks.
    tick_labels = dirty_names if len(dirty_names) == len(dirty_datasets) else dirty_datasets

    fig, ax = plt.subplots(1, len(aucs))
    fig.set_size_inches(20, 2.5)
    fig.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=0.3, hspace=None)

    x = np.arange(0, len(dirty_datasets))

    for i in range(0, len(aucs)):
        ax[i].plot(x, sup_mb[f'auc_{aucs[i]}'].values, marker='o', label="PPS + Gen. Sup. MB")
        ax[i].plot(x, others[f'auc_{aucs[i]}'].values, marker='o', label="PPS")
        ax[i].set_yticks(np.arange(0, 1.01, 0.2))
        ax[i].set_yticklabels([0.0, 0.2, 0.4, 0.6, 0.8, 1.0], fontsize=15)
        ax[i].set_xticks(x)
        ax[i].set_xticklabels(tick_labels, fontsize=15)
        ax[i].set_axisbelow(True)
        ax[i].yaxis.grid(True)
        ax[i].set_title(f'$AUC^*_m@{aucs[i]}$', fontsize=15)
        ax[i].set_xlabel("\n("+chr(ord('a')+i)+")", fontsize=20)

    ax[1].legend(bbox_to_anchor=(1.85, -0.8), loc='lower right', ncol=3, frameon = False, fontsize=15)

    #fig.savefig("dirty_2.pdf", format='pdf', dpi=1200, pad_inches=.05, bbox_inches="tight")
    plt.show()

In [None]:
# Dataset keys used to filter the results, and the titles shown above each
# panel (the two lists are identical here).
clean_datasets = ["AbtBuy", "DblpAcm", "ScholarDblp", "AmazonGP", "ImdbTmdb", "ImdbTvdb", "TmdbTvdb", "Movies", "WalmartAmazon"]
clean_names = ["AbtBuy", "DblpAcm", "ScholarDblp", "AmazonGP", "ImdbTmdb", "ImdbTvdb", "TmdbTvdb", "Movies", "WalmartAmazon"]

df_clean = pd.read_csv(f'{BASE_PATH}results/progressive_clean_results.csv', sep=";")
plot_clean_progressive(df_clean, clean_datasets, clean_names, "progressive_clean")

In [None]:
# Dataset keys used in the results file and their display names.
dirty_datasets = ["d10K", "d50K", "d100K", "d200K", "d300K"]
dirty_names = ["D10K", "D50K", "D100K", "D200K", "D300K"]
df_dirty = pd.read_csv(f'{BASE_PATH}results/progressive_dirty_results.csv', sep=";")
plot_dirty_progressive(df_dirty, dirty_datasets, dirty_names)

# Table 5
Comparison with other blocking frameworks

In [None]:
# Same aggregation as for Table 4: mean metrics per
# (dataset, train_size, conf_id, algorithm).
df = (
    pd.read_csv(f'{BASE_PATH}results/table4.csv', sep=";")
    .drop(['matches', 'run', 'comparisons'], axis=1)
    .groupby(['dataset', 'train_size', 'conf_id', 'algorithm'])
    .mean()
    .reset_index()
)

In [None]:
# Table 5(a): 'blast' rows with conf_id 78 and train_size 50.
create_table(df, 'blast', 78, 50, '(a) BLAST with 50 labelled pairs and {𝐶𝐹-𝐼𝐵𝐹,𝑅𝐴𝐶𝐶𝐵,𝑅𝑆,𝑁𝑅𝑆}')

In [None]:
# Table 5(b): 'RCNP' rows with conf_id 187 and train_size 50.
create_table(df, 'RCNP', 187, 50, '(b) RCNP with 50 labelled pairs and {𝐶𝐹-𝐼𝐵𝐹,𝑅𝐴𝐶𝐶𝐵, 𝐽𝑆,𝐿𝐶𝑃 ,𝑊𝐽𝑆}')

In [None]:
# Table 5(c): one column per dataset plus the row-wise mean in `avg`.
df = pd.read_csv(f'{BASE_PATH}results/deepblocker.csv', sep=";")
tmp = (
    df.drop(['k', 'model_name', 'candidates', 'train_time', 'block_time'], axis=1)
    .set_index('dataset')
    .T
)
tmp['avg'] = tmp.apply(lambda row: row.mean(), axis=1)
print('(c) DeepBlocker')
tmp

In [None]:
# Table 5(d): one column per dataset plus the row-wise mean in `avg`.
df = pd.read_csv(f'{BASE_PATH}results/sudowoodo_results_50.csv', sep=";")
tmp = (
    df.drop(['train_time', 'block_time'], axis=1)
    .set_index('dataset')
    .T
)
tmp['avg'] = tmp.apply(lambda row: row.mean(), axis=1)
print("(d) Sudowoodo with 50 labelled pairs")
tmp

In [None]:
# Table 5(e): one column per dataset plus the row-wise mean in `avg`.
df = pd.read_csv(f'{BASE_PATH}results/sudowoodo_results_500.csv', sep=";")
tmp = (
    df.drop(['train_time', 'block_time'], axis=1)
    .set_index('dataset')
    .T
)
tmp['avg'] = tmp.apply(lambda row: row.mean(), axis=1)
print("(e) Sudowoodo with 500 labelled pairs")
tmp

In [None]:
# Table 5(f): one column per dataset plus the row-wise mean in `avg`.
df = pd.read_csv(f'{BASE_PATH}results/contextual_blocker.csv', sep=";")
tmp = df.set_index('dataset').T
tmp['avg'] = tmp.apply(lambda row: row.mean(), axis=1)
print("(f) ContextualBlocker")
tmp

In [None]:
# Table 5(g): one column per dataset plus the row-wise mean in `avg`.
df = pd.read_csv(f'{BASE_PATH}results/sparkly_k_best.csv', sep=";")
tmp = (
    df.drop('k', axis=1)
    .set_index('dataset')
    .T
)
tmp['avg'] = tmp.apply(lambda row: row.mean(), axis=1)
print("(g) Sparkly with fine-tuned k")
tmp

In [None]:
# Table 5(h): one column per dataset plus the row-wise mean in `avg`.
df = pd.read_csv(f'{BASE_PATH}results/sparkly_k10.csv', sep=";")
tmp = (
    df.drop('k', axis=1)
    .set_index('dataset')
    .T
)
tmp['avg'] = tmp.apply(lambda row: row.mean(), axis=1)
print("(h) Sparkly with k = 10")
tmp