# Summary statistics figure

In [1]:
# Select Matplotlib's interactive "notebook" backend for the figures below.
%matplotlib notebook

import matplotlib.ticker as mticker
import numpy as np
import pandas as pd
import pylab as plt

# Apply the style sheet stored in 'matplotlib_style.txt' to every figure below.
# The path is relative, so it is resolved against the current working directory.
plt.style.use('matplotlib_style.txt')

In [2]:
# Load the data

# ICLR table; judging by how later cells count its rows, one row per
# submission. Later cells read its `year`, `decision`, `abstract`, `title`
# and `scores` columns.
iclr = pd.read_parquet(path='../data/iclr24v2a.parquet')

# Inferred author genders. Later cells read `first_author_gender` and
# `last_author_gender`, and filter this frame with boolean masks built from
# `iclr`, so it is expected to share `iclr`'s row index.
genders = pd.read_parquet(
    path='../results/variables/iclr24v2a/names_and_genders.parquet'
)

In [3]:
# Compute gender ratios
#
# For every year, compute the share of papers whose first (resp. last) author
# is labelled "female" among the papers labelled either "female" or "male".
# Papers with any other label are left out of both numerator and denominator.

years = np.arange(2017, 2025)


def _female_ratio(labels):
    """Return the fraction of "female" among the "female"/"male" entries of `labels`.

    Entries with any other value are ignored. Returns NaN when no entry is
    "female" or "male", instead of dividing zero by zero.
    """
    labels = np.asarray(labels)
    n_female = np.sum(labels == "female")
    n_inferred = np.sum(np.isin(labels, ["female", "male"]))
    return n_female / n_inferred if n_inferred > 0 else np.nan


gender_ratio_first = np.full(years.shape, np.nan)
gender_ratio_last = np.full(years.shape, np.nan)

for i, year in enumerate(years):
    # Select this year's rows once and reuse them for both author positions.
    genders_year = genders[iclr.year == year]
    gender_ratio_first[i] = _female_ratio(genders_year["first_author_gender"].values)
    gender_ratio_last[i] = _female_ratio(genders_year["last_author_gender"].values)

In [4]:
# Four-panel summary figure (left to right): submissions per year, acceptance
# rate per year, inferred female author ratio per year, and title length
# against abstract length.
fig, axs = plt.subplots(ncols=4, figsize=(6.0, 1.2))

years = np.arange(2017, 2025)
# Shared x-range for the three per-year panels so that their year axes line up.
year_xlim = [2016.5, 2024.5]

# Panel 1: number of submissions per year, shown in thousands.
n_submissions = [np.sum(iclr.year == y) for y in years]

axs[0].plot(years, np.array(n_submissions) / 1000, '.-')
axs[0].set_ylabel('Submissions (thousands)')
axs[0].set_ylim([0, 8])
axs[0].set_xlim(year_xlim)

# Panel 2: share of each year's submissions whose decision starts with 'Acc'.
# Missing decisions count as not accepted instead of raising.
is_accepted = iclr.decision.str.startswith('Acc', na=False)
accept_rate = [is_accepted[iclr.year == y].mean() for y in years]

axs[1].plot(years, accept_rate, '.-')
axs[1].set_ylabel('Acceptance rate')
axs[1].set_ylim([0, 1])
axs[1].set_xlim(year_xlim)

# Panel 3: inferred female ratio among first and last authors
# (arrays computed in the gender-ratio cell).
axs[2].plot(years, gender_ratio_first, '.-', label='First authors')
axs[2].plot(years, gender_ratio_last, '.--', label='Last authors')
axs[2].set_ylabel('Inferred female ratio')
axs[2].set_ylim([0, .22])
axs[2].set_xlim(year_xlim)
axs[2].legend()

# Panel 4: title length against abstract length (in characters), log-log.
# The vectorised .str.len() yields NaN for missing text, and those points are
# simply not drawn.
len_abstract = iclr.abstract.str.len().astype(float)
len_title = iclr.title.str.len().astype(float)

axs[3].scatter(len_abstract, len_title, s=1, ec='none', rasterized=True)
axs[3].set_xscale('log')
axs[3].set_yscale('log')
axs[3].set_xlim([100, 10000])
axs[3].set_ylim([5, 500])

# Show plain numbers on the log axes instead of powers of ten.
axs[3].xaxis.set_major_formatter(mticker.ScalarFormatter())
axs[3].yaxis.set_major_formatter(mticker.ScalarFormatter())

# The axis labels of this panel are drawn as text inside the axes, positioned
# in data coordinates.
axs[3].text(1000, 6, 'Abstract length (char)', ha='center')
axs[3].text(120, 50, 'Title length (char)', va='center', rotation=90)

plt.savefig('../results/figures/final_figures/summary-stats.png', dpi=300)
plt.savefig('../results/figures/final_figures/summary-stats.pdf', dpi=300)

<IPython.core.display.Javascript object>

In [5]:
# Average acceptance rate
#
# Fraction of submissions from years before 2024 whose decision string starts
# with 'Acc', printed with two decimals.

decisions_before_2024 = iclr.loc[iclr.year < 2024, 'decision']
acc = np.mean([decision.startswith('Acc') for decision in decisions_before_2024])
print(f'{acc:.2f}')

0.31


In [6]:
# Score consistency
#
# Treat every ordered pair of distinct scores of the same paper as one
# (reviewer A, reviewer B) observation and measure how well the two agree.

# All ordered pairs (i != j): each unordered pair appears twice, once in each
# order, so the scatter and the correlation are symmetric in A and B.
# reshape(-1, 2) keeps the array two-column even if no paper has two scores.
pairs = np.array([
    [score[i], score[j]]
    for score in iclr.scores
    for i in range(len(score))
    for j in range(len(score))
    if i != j
]).reshape(-1, 2)
ns = np.array([len(s) for s in iclr.scores])  # number of scores per paper

print(ns[ns>0].size)                                       # papers with at least one score
print(f'{np.mean(ns[ns>0]):.2f}')                          # mean number of scores among those
print(f'{np.sum((ns>=3) & (ns<=4)) / np.sum(ns>0):.2f}')   # share of those with 3 or 4 scores
print(len(pairs))                                          # number of ordered score pairs

rho = np.corrcoef(pairs.T)[0,1]

print(f'{rho:.2f}')

# Jitter the plotted scores so that coinciding points do not overplot. A seeded
# generator makes the figure identical on every run; the jitter is only used
# for plotting and does not enter `rho`.
rng = np.random.default_rng(42)
jitter = rng.standard_normal(pairs.shape) / 5

plt.figure(figsize=(2, 2), layout='constrained')
plt.scatter(pairs[:,0] + jitter[:,0],
            pairs[:,1] + jitter[:,1], s=.5, ec='none')

plt.xlabel('Reviewer A')
plt.ylabel('Reviewer B')
plt.axis('equal')

# Expected correlation between two averages of k scores each, if every pair of
# individual scores correlates with r. The expression simplifies to
# k*r / (1 + (k - 1)*r), i.e. the Spearman-Brown form.
r = rho
k = 4
avcorr = (r * k**2) / (r * k * (k - 1) + k)
print(f'Correlation between the average of {k} reviews if all have rho={r:.2f}: {avcorr:.2f}')

24104
3.66
0.93
244226
0.40


<IPython.core.display.Javascript object>

Correlation between the average of 4 reviews if all have rho=0.40: 0.73
