In [1]:
import pandas as pd
import gzip
import json
from tqdm import tqdm
from glob import glob

In [2]:
def get_nth_key(dictionary, n=0):
    if n < 0:
        n += len(dictionary)
    for i, key in enumerate(dictionary.keys()):
        if i == n:
            return key
    raise IndexError("dictionary index out of range")

is_ca = lambda oa_details: not oa_details[get_nth_key(oa_details, -1)]['is_oa']

def is_wiley_ca(entry):
        return entry.get("publisher_normalized") == "Wiley"
def is_wiley_oa(entry):
    is_wiley = False
    latest_observation = entry['oa_details'][get_nth_key(entry['oa_details'], -1)]
    for oa_location in latest_observation['oa_locations']:
        if ('url_for_pdf' in oa_location) and oa_location['url_for_pdf']:
                if "wiley" in oa_location["url_for_pdf"]:
                    is_wiley = True
    return is_wiley

def is_elsevier(entry):
    return entry.get("publisher_normalized") == "Elsevier"


def is_hal(entry):
    oa_location = entry['oa_details'][get_nth_key(entry['oa_details'], -1)]
    return oa_location.get("repository_normalized") == "HAL"

def safe_add(obj, inc):
    if not inc.empty:
        try:
            obj += inc.doi.to_list()
        except:
            obj += ['no_doi'] * len(inc.index)

def count_newlines(fname):
    def _make_gen(reader):
        while True:
            b = reader(2 ** 16)
            if not b: break
            yield b

    with open(fname, "rb") as f:
        count = sum(buf.count(b"\n") for buf in _make_gen(f.raw.read))
    return count

def checkpoint(k):
    pd.Series(ca, name='ca').to_csv(f'../tmp/checkpoint_ca_{k}.csv', index=None)
    pd.Series(oa, name='oa').to_csv(f'../tmp/checkpoint_oa_{k}.csv', index=None)
    pd.Series(wiley_ca, name='wiley_ca').to_csv(f'../tmp/checkpoint_wiley_ca_{k}.csv', index=None)
    pd.Series(wiley_oa, name='wiley_oa').to_csv(f'../tmp/checkpoint_wiley_oa_{k}.csv', index=None)
    pd.Series(elsevier_ca, name='elsevier_ca').to_csv(f'../tmp/checkpoint_elsevier_ca_{k}.csv', index=None)

In [3]:
column_names = ["doi", "uuid", "is_harvested", "softcite_version", "grobid_version", "harvester_used", "domain", "url_used", "harvesting_date", "datastet_version"]
db = pd.read_csv('../tmp/pg_dump/harvested_status_table.csv.gz', header=None, names=column_names)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [4]:
metadata_files = glob('/Users/antoine.bres/dev/BSO/tmp/bso-publications-latest-*.jsonl')
metadata_files

['/Users/antoine.bres/dev/BSO/tmp/bso-publications-latest-ad.jsonl',
 '/Users/antoine.bres/dev/BSO/tmp/bso-publications-latest-ab.jsonl',
 '/Users/antoine.bres/dev/BSO/tmp/bso-publications-latest-ae.jsonl',
 '/Users/antoine.bres/dev/BSO/tmp/bso-publications-latest-aa.jsonl',
 '/Users/antoine.bres/dev/BSO/tmp/bso-publications-latest-ac.jsonl']

In [5]:
ca = []
oa = []
wiley_ca = []
wiley_oa = []
elsevier_ca = []
elsevier_oa = []
hal = []

for k, metadata_file in enumerate(metadata_files):
    nb_lines = count_newlines(metadata_file)
    print(nb_lines, metadata_file)
    chunk_size= 10_000
    tqdm_total = (nb_lines // chunk_size) + 1
    with pd.read_json(metadata_file, lines=True, chunksize=chunk_size) as reader:
        for i, chunk in enumerate(tqdm(reader, total=tqdm_total)):
            ca_entries = chunk.oa_details.apply(is_ca)

            safe_add(ca, chunk[ca_entries])
            safe_add(oa, chunk[~ca_entries])

            wiley_ca_mask = chunk[ca_entries].apply(is_wiley_ca, axis=1)
            wiley_ca_entries = chunk[chunk.index.isin(wiley_ca_mask.index)][wiley_ca_mask]
            safe_add(wiley_ca, wiley_ca_entries)

            wiley_oa_mask = chunk[~ca_entries].apply(is_wiley_oa, axis=1)
            wiley_oa_entries = chunk[chunk.index.isin(wiley_oa_mask.index)][wiley_oa_mask]
            safe_add(wiley_oa, wiley_oa_entries)

            elsevier_mask = chunk[ca_entries].apply(is_elsevier, axis=1)
            elsevier_entries = chunk[chunk.index.isin(elsevier_mask.index)][elsevier_mask]
            safe_add(elsevier_ca, elsevier_entries)
    
    checkpoint(k)
    ca = []
    oa = []
    wiley_ca = []
    wiley_oa = []
    elsevier_ca = []

716867 /Users/antoine.bres/dev/BSO/tmp/bso-publications-latest-ad.jsonl


100%|██████████| 72/72 [07:52<00:00,  6.56s/it]


716867 /Users/antoine.bres/dev/BSO/tmp/bso-publications-latest-ab.jsonl


100%|██████████| 72/72 [03:33<00:00,  2.97s/it]


2 /Users/antoine.bres/dev/BSO/tmp/bso-publications-latest-ae.jsonl


100%|██████████| 1/1 [00:00<00:00,  2.36it/s]


716867 /Users/antoine.bres/dev/BSO/tmp/bso-publications-latest-aa.jsonl


100%|██████████| 72/72 [03:27<00:00,  2.89s/it]


716867 /Users/antoine.bres/dev/BSO/tmp/bso-publications-latest-ac.jsonl


100%|██████████| 72/72 [07:07<00:00,  5.94s/it]


In [6]:
ca = []
oa = []
wiley_ca = []
wiley_oa = []
elsevier_ca = []
for i in range(len(metadata_files)):
    ca += pd.read_csv(f'../tmp/checkpoint_ca_{i}.csv')['ca'].to_list()
    oa += pd.read_csv(f'../tmp/checkpoint_oa_{i}.csv')['oa'].to_list()
    wiley_ca += pd.read_csv(f'../tmp/checkpoint_wiley_ca_{i}.csv')['wiley_ca'].to_list()
    wiley_oa += pd.read_csv(f'../tmp/checkpoint_wiley_oa_{i}.csv')['wiley_oa'].to_list()
    elsevier_ca += pd.read_csv(f'../tmp/checkpoint_elsevier_ca_{i}.csv')['elsevier_ca'].to_list()

oa = set(oa)
ca = set(ca)
nb_ca = len(ca)
nb_oa = len(oa)
total_nb = nb_ca + nb_oa

% de publi que l'on a réussi à télécharger au global, et en fonction du statut ouvert / fermé
et avec une déclinaison pour Elsevier et Wiley
ca permettrait de valider si on a bien 90% d'elsevier fermé (par ex)

In [7]:
doi_in_db = db.doi.to_list()
oa_in_db = oa & set(doi_in_db)
ca_in_db = ca & set(doi_in_db)

In [8]:
db['oa'] = db.doi.isin(oa_in_db)
db['ca'] = db.doi.isin(ca_in_db)

In [9]:
print(
    f"{total_nb:,} publications in total\n"
    f"{nb_oa:,} publications in open access ({nb_oa/total_nb:.2%})\n"
    f"{nb_ca:,} publications in closed access ({nb_ca/total_nb:.2%})"
)

1,612,793 publications in total
767,589 publications in open access (47.59%)
845,204 publications in closed access (52.41%)


In [10]:
print(
    f"{len(db):,} publications harvested ({len(db)/total_nb:.2%} of {total_nb:,})\n"
    f"{db.oa.sum():,} OA ({db.oa.sum()/nb_oa:.2%} of {nb_oa:,})\n"
    f"{db.ca.sum():,} CA ({db.ca.sum()/nb_ca:.2%} of {nb_ca:,})\n"
    f"({len(db) - db.oa.sum() - db.ca.sum():,} neither OA nor CA according to the file, mostly OA according to the harvester used)"
)

1,055,395 publications harvested (65.44% of 1,612,793)
668,359 OA (87.07% of 767,589)
385,418 CA (45.60% of 845,204)
(1,618 neither OA nor CA according to the file, mostly OA according to the harvester used)


In [11]:
db[~(db.oa | db.ca)].query("harvester_used == 'standard'")['oa'] = True

In [12]:
# hal_in_db = set(db.doi.to_list()) & set(hal)
w_oa_in_db = set(doi_in_db) & set(wiley_oa)
w_ca_in_db = set(doi_in_db) & set(wiley_ca)
e_ca_in_db = set(doi_in_db) & set(elsevier_ca)

In [13]:
print(f"Elsevier: {len(e_ca_in_db):,} in db, {len(e_ca_in_db)/len(set(elsevier_ca)):.2%} of total ({len(set(elsevier_ca)):,})")
print(f"Wiley OA: {len(w_oa_in_db):,} in db, {len(w_oa_in_db)/len(set(wiley_oa)):.2%} of total ({len(set(wiley_oa)):,})")
print(f"Wiley CA: {len(w_ca_in_db):,} in db, {len(w_ca_in_db)/len(set(wiley_ca)):.2%} of total ({len(set(wiley_ca)):,})")

Elsevier: 256,090 in db, 95.13% of total (269,205)
Wiley OA: 26,157 in db, 97.05% of total (26,951)
Wiley CA: 50,711 in db, 86.74% of total (58,464)


In [None]:
df = pd.DataFrame(db.doi)
df['is_harvested'] = True

In [None]:
oa_df = pd.DataFrame(oa, columns=['doi'])
oa_df['is_oa'] = True
ca_df = pd.DataFrame(ca, columns=['doi'])
ca_df['is_oa'] = False
df = oa_df.append(ca_df)
harvested = df.doi.isin(db.doi)
df['is_harvested'] = False
df.loc[harvested, 'is_harvested'] = True

In [None]:
repo_hal = df.doi.isin(hal)
repo_wiley = df.doi.isin(wiley_ca + wiley_oa)
df['repo'] = "NC"
df.loc[repo_hal, 'repo'] = "HAL"
df.loc[repo_wiley, 'repo'] = "Wiley"

In [None]:
agg_df = df.groupby(['is_oa', 'is_harvested', 'repo']).agg('count')
agg_df = agg_df.reset_index()

In [None]:
import seaborn as sns
sns.set_style()

In [None]:
sns.catplot(x="is_harvested", y="doi",
                data=agg_df[['is_oa', 'is_harvested', 'doi']]\
            .groupby(['is_oa', 'is_harvested']).sum().reset_index(),
                kind="bar", ci=None)

In [None]:
g = sns.catplot(x="is_harvested", y="doi", col="repo",
                data=agg_df.query('repo!="NC"'),
                kind="bar", col_order=['Wiley', 'HAL'],  ci=None)
g.set_titles("OA publications repo {col_name}")

In [None]:
g = sns.catplot(x="is_harvested", y="doi", col="is_oa",
                data=agg_df.query('repo=="Wiley"'),
                kind="bar",  ci=None)
g.set_titles("Wiley OA {col_name}")