In [None]:
import csv
import matplotlib.pyplot as plt
import numpy as np

from datetime import datetime

In [None]:
def read_data(dir):
    """Load ``<dir>/results.csv`` as a timestamp list plus one list per value column.

    Every non-empty row must start with a ``%Y-%m-%d %H:%M:%S`` timestamp,
    followed by fields that parse as floats. The number of value columns is
    taken from the first non-empty row.

    Args:
        dir: Directory that contains ``results.csv``. (The name shadows the
            builtin ``dir``; it is kept so existing callers keep working.)

    Returns:
        A tuple ``(x, y_1, ..., y_k)`` where ``x`` is the list of parsed
        ``datetime`` values and each ``y_i`` is the list of floats read from
        value column ``i``. A file without data rows yields ``([],)``.

    Raises:
        ValueError: If a timestamp or a value field cannot be parsed.
        IndexError: If a row has more value fields than the first row.
    """
    x = []
    y = []
    # newline='' lets the csv module do its own newline handling, as its
    # documentation recommends for files passed to csv.reader.
    with open(dir + '/results.csv', 'r', newline='') as csvfile:
        for row in csv.reader(csvfile):
            if not row:
                # csv.reader yields [] for blank lines (e.g. a trailing empty
                # line); indexing row[0] on those would raise IndexError.
                continue
            x.append(datetime.strptime(row[0], "%Y-%m-%d %H:%M:%S"))
            values = row[1:]
            if not y:
                # Separate list objects, one per value column.
                y = [[] for _ in values]
            for i, value in enumerate(values):
                y[i].append(float(value))
    return tuple([x] + y)

In [None]:
def smooth(y, N):
    """Return the trailing moving average of ``y`` over windows of ``N`` samples.

    Window sums are obtained as differences of a zero-prefixed cumulative sum,
    so the result has ``len(y) - N + 1`` entries (none when ``y`` is shorter
    than ``N``).

    Args:
        y: Sequence of numbers.
        N: Window length in samples.

    Returns:
        A list with one mean per complete window, in input order.
    """
    # prefix[k] is the sum of the first k samples; prefix[0] == 0.
    prefix = np.concatenate(([0], np.cumsum(y)))
    data = []
    for end in range(N, len(prefix)):
        window_total = prefix[end] - prefix[end - N]
        data.append(window_total / N)
    return data

## References distribution

### Static data

In [None]:
def plot_distribution(data, ax, title, legend=False, pos=(1.1, 1)):
    """Draw ``data[1]`` .. ``data[5]`` as a five-layer stacked bar chart on ``ax``.

    Bars are placed at integer positions ``0 .. len(data[0]) - 1`` with unit
    width; each series is stacked on top of the element-wise sum of the
    series before it. The y axis is fixed to ``[0, 1]`` and the x axis is
    labelled ``'Days'``.

    Args:
        data: Indexable whose item 0 determines the number of bars and whose
            items 1-5 are the stacked series.
        ax: Axes-like object to draw on.
        title: Title for the axes.
        legend: When true, add a legend naming the five series.
        pos: ``bbox_to_anchor`` value passed to the legend.
    """
    positions = np.arange(0, len(data[0]), 1)
    palette = ('#d8d2c4', '#4f5858', '#3eb1c8', '#ffc72c', '#ef3340')

    layers = []
    offsets = None  # running element-wise sum of the series already drawn
    for column, colour in enumerate(palette, start=1):
        series = data[column]
        if offsets is None:
            # The first layer sits on the axis, so no explicit bottom is given.
            layers.append(ax.bar(positions, series, width=1, color=colour))
            offsets = series
        else:
            layers.append(ax.bar(positions, series, width=1, bottom=offsets, color=colour))
            offsets = [base + height for base, height in zip(offsets, series)]

    ax.set_title(title)
    ax.set_xlim(positions[0], positions[-1])
    ax.set_ylim(0, 1)
    ax.set_xlabel('Days')

    if not legend:
        return
    names = ('publisher', 'crossref unstructured', 'crossref structured', 'no match unstructured',
             'no match structured')
    ax.legend(tuple(layer[0] for layer in layers), names, bbox_to_anchor=pos)

In [None]:
# Load the static-itemset results and plot value column 1 against the timestamps in column 0.
data = read_data('plugins/references_distribution_const')
plt.plot(data[0], data[1])
plt.ylabel('Number of references')
# Rotate the date labels; binding the return value presumably just keeps it out of the cell output.
xticks = plt.xticks(rotation=90)

In [None]:
# Drop value column 1 (the reference count plotted separately) so the remaining columns
# land at indices 1-5, which are the five series plot_distribution stacks.
plot_distribution([data[0]] + list(data[2:]), plt.gca(), 'Static itemset', legend=True, pos=(1.1, 0.63))

### Varying data

In [None]:
data_raw = read_data('plugins/references_distribution')

N = 10  # moving-average window length, in samples
# Smooth a shallow copy of the rows already loaded instead of parsing the CSV
# a second time; data_raw itself is left untouched because every entry of the
# copy is replaced by a new list below.
data_smooth = list(data_raw)
# smooth() returns len - N + 1 values, so drop the first N-1 timestamps to keep
# the x values aligned with the averaged series.
data_smooth[0] = data_smooth[0][(N-1):]
for i in range(1, len(data_smooth)):
    data_smooth[i] = smooth(data_smooth[i], N)

plt.rcParams.update({'font.size': 12})
# Raw and smoothed distributions side by side on a shared y axis.
f, axes = plt.subplots(1, 2, sharey=True, figsize=(12, 4))
plot_distribution(data_raw, axes[0], 'Raw')
axes[0].set_ylabel('Fraction')
plot_distribution(data_smooth, axes[1], 'Smoothed', True)

## Matcher evaluation

### Real data

In [None]:
def plot_eval(data, ax, title='', ylabel='', legend=False, xticks=True):
    """Plot ``data[1]``, ``data[2]`` and ``data[3]`` against ``data[0]`` on ``ax``.

    The three lines are drawn in a fixed colour order; when ``legend`` is true
    they are labelled 'precision', 'recall' and 'F1' in that order.

    Args:
        data: Indexable with the x values at index 0 and three series at 1-3.
        ax: Axes-like object to draw on.
        title: Axes title.
        ylabel: Y-axis label.
        legend: Whether to add the three-entry legend.
        xticks: When true, rotate the x tick labels by 90 degrees; otherwise
            hide the x tick marks and labels.
    """
    line_colours = ('#3eb1c8', '#ffc72c', '#ef3340')
    for column, colour in enumerate(line_colours, start=1):
        ax.plot(data[0], data[column], color=colour)

    ax.set_title(title)
    ax.set_ylabel(ylabel)

    if not xticks:
        ax.xaxis.set_ticks_position('none')
        ax.set_xticklabels([])
    else:
        for label in ax.get_xticklabels():
            label.set_rotation(90)

    if legend:
        ax.legend(['precision', 'recall', 'F1'], bbox_to_anchor=(0.4, 1.35), ncol=3)

# Font size applied through matplotlib's global rcParams.
plt.rcParams.update({'font.size': 12})
# Two panels with a shared y axis: Python implementation on the left, Java on the right.
f, axes = plt.subplots(1, 2, sharey=True, figsize=(12, 4))
plot_eval(read_data('plugins/ref_matching_eval_real_python'), axes[0], title='Python impl')
# Only the right-hand panel carries the legend.
plot_eval(read_data('plugins/ref_matching_eval_real_java'), axes[1], title='Java impl', legend=True)

### Artificial data

In [None]:
data_raw_python = read_data('plugins/ref_matching_eval_artificial_python')
N = 10
# Smoothed copy: the timestamps lose their first N-1 entries so they stay
# aligned with the shorter moving-average series produced by smooth().
data_smooth_python = [data_raw_python[0][(N-1):]]
for i in range(1, len(data_raw_python)):
    data_smooth_python.append(smooth(data_raw_python[i], N))
data_raw_java = read_data('plugins/ref_matching_eval_artificial_java')
# Same treatment for the Java results.
data_smooth_java = [data_raw_java[0][(N-1):]]
for i in range(1, len(data_raw_java)):
    data_smooth_java.append(smooth(data_raw_java[i], N))

plt.rcParams.update({'font.size': 12})
# 2x2 grid with a shared y axis: raw data on the top row, smoothed data below;
# Python implementation in the left column, Java in the right.
f, axes = plt.subplots(2, 2, sharey=True, figsize=(12, 8))
plot_eval(data_raw_python, axes[0, 0], title='Python impl', ylabel='Raw', xticks=False)
plot_eval(data_raw_java, axes[0, 1], title='Java impl', xticks=False, legend=True)
plot_eval(data_smooth_python, axes[1, 0], ylabel='Smoothed')
plot_eval(data_smooth_java, axes[1, 1])

## Thresholds

### Real data

In [None]:
def plot_threshold(data, plt, title=''):
    """Plot ``data[1]`` against ``data[0]`` and rotate the x tick labels.

    Args:
        data: Indexable with the x values at index 0 and the series at index 1.
        plt: Axes-like object to draw on. Despite its name it is used as an
            axes (``set_title``, ``get_xticklabels``), and inside this
            function it shadows the module-level ``plt`` alias.
        title: Axes title.
    """
    target = plt  # local alias that makes the axes role explicit
    x_values, series = data[0], data[1]
    target.plot(x_values, series, color='#3eb1c8')
    target.set_title(title)
    for label in target.get_xticklabels():
        label.set_rotation(90)

data_raw = read_data('plugins/ref_matching_threshold_real')
N = 9
# Build the smoothed data set next to the raw one: the timestamps are trimmed
# by N-1 entries so they match the length of the moving-average output.
data_smooth = [data_raw[0][(N-1):]]
for i in range(1, len(data_raw)):
    data_smooth.append(smooth(data_raw[i], N))

plt.rcParams.update({'font.size': 12})
# Raw (left) and smoothed (right) thresholds on a shared y axis.
f, axes = plt.subplots(1, 2, sharey=True, figsize=(12, 4))
plot_threshold(data_raw, axes[0], title='Raw')
plot_threshold(data_smooth, axes[1], title='Smoothed')

### Artificial data

In [None]:
data_raw = read_data('plugins/ref_matching_threshold_artificial')
N = 10
# Build the smoothed data set next to the raw one: the timestamps are trimmed
# by N-1 entries so they match the length of the moving-average output.
data_smooth = [data_raw[0][(N-1):]]
for i in range(1, len(data_raw)):
    data_smooth.append(smooth(data_raw[i], N))

# Raw (left) and smoothed (right) thresholds on a shared y axis.
f, axes = plt.subplots(1, 2, sharey=True, figsize=(12, 4))
plot_threshold(data_raw, axes[0], title='Raw')
plot_threshold(data_smooth, axes[1], title='Smoothed')