In [1]:
import copy
import os
import warnings

import pandas as pd
from elasticsearch import Elasticsearch, exceptions

# Connection settings for the Elasticsearch cluster queried by this notebook.
ES_HOST = "https://cluster.elasticsearch.dataesr.ovh/"
ES_INDEX = "bso-publications"
# The password is a secret: it is read from the environment instead of being
# stored in the notebook, where it would leak through version control.
# NOTE(review): the password that used to be hardcoded here should be rotated.
ES_PASSWORD = os.environ.get("ES_PASSWORD")
ES_USER = "BSO"

if not ES_PASSWORD:
    raise RuntimeError(
        "The ES_PASSWORD environment variable must be set before running this notebook."
    )

# Client used by the cells of this notebook, authenticated with basic auth.
es = Elasticsearch(ES_HOST, http_auth=(ES_USER, ES_PASSWORD))
# Ignore warnings of category ElasticsearchWarning so they do not clutter the cell outputs.
warnings.simplefilter('ignore', exceptions.ElasticsearchWarning)

### Publications in ES
All publications indexed in ES, including foreign publications, publications from HAL, etc.

In [2]:
# Number of documents in the index, with no query filter applied.
es_count_response = es.count(index=ES_INDEX)
count_es = es_count_response.get("count")
print(f"{format(count_es, ',')} publications are in ES")

3,043,483 publications are in ES


### Publications in the BSO
Publications from France, with a DOI from crossref and of type "journal-article", "proceedings", "book-chapter", "book" or "preprint"

In [3]:
# Base query selecting the BSO perimeter. Every clause under "must" has to
# match; later cells deep-copy this body and append extra clauses to it.
body_bso = {
    "query": {
        "bool": {
            "must": [
                # Country flag equal to "fr".
                {"term": {"bso_country.keyword": "fr"}},
                # Identifier type among the listed values (only "doi").
                {"terms": {"id_type.keyword": ["doi"]}},
                # Publication genre among the listed values.
                {
                    "terms": {
                        "genre.keyword": [
                            "journal-article",
                            "proceedings",
                            "book-chapter",
                            "book",
                            "preprint",
                        ]
                    }
                },
            ]
        }
    }
}

In [4]:
# Number of documents matching the BSO query, and its share of the whole index.
bso_count_response = es.count(index=ES_INDEX, body=body_bso)
count_bso = bso_count_response.get("count")
count_bso_percent = count_bso / count_es * 100
print(f"{format(count_bso, ',')} publications are in the BSO, so {count_bso_percent:.2f}% of the ES")

1,608,839 publications are in the BSO, so 52.86% of the ES


### Publications in the BSO and downloaded

In [5]:
# Work on a deep copy so the shared BSO query is left untouched, then add
# a term clause requiring bso3_downloaded == "1".
body_copy = copy.deepcopy(body_bso)
must_clauses = body_copy.get("query").get("bool").get("must")
must_clauses.append({"term": {"bso3_downloaded": "1"}})
downloaded_response = es.count(index=ES_INDEX, body=body_copy)
count_bso_downloaded = downloaded_response.get("count")
# Share of the BSO publications matching the extra clause.
count_bso_downloaded_percent = count_bso_downloaded / count_bso * 100
print(f"{format(count_bso_downloaded, ',')} publications in the BSO have been downloaded ie. {count_bso_downloaded_percent:.2f}%")

958,973 publications in the BSO have been downloaded ie. 59.61%


### Publications in the BSO and analyzed by GROBID

In [6]:
# Work on a deep copy so the shared BSO query is left untouched, then add
# a term clause requiring bso3_analyzed_grobid == "1".
body_copy = copy.deepcopy(body_bso)
must_clauses = body_copy.get("query").get("bool").get("must")
must_clauses.append({"term": {"bso3_analyzed_grobid": "1"}})
grobid_response = es.count(index=ES_INDEX, body=body_copy)
count_bso_grobid = grobid_response.get("count")
# Share of the BSO publications matching the extra clause.
count_bso_grobid_percent = count_bso_grobid / count_bso * 100
print(f"{format(count_bso_grobid, ',')} publications in the BSO have been analyzed by GROBID ie. {count_bso_grobid_percent:.2f}%")

778,569 publications in the BSO have been analyzed by GROBID ie. 48.39%


### Publications in the BSO and analyzed by Datastet

In [7]:
# Work on a deep copy so the shared BSO query is left untouched, then add
# a term clause requiring bso3_analyzed_datastet == "1".
body_copy = copy.deepcopy(body_bso)
must_clauses = body_copy.get("query").get("bool").get("must")
must_clauses.append({"term": {"bso3_analyzed_datastet": "1"}})
datastet_response = es.count(index=ES_INDEX, body=body_copy)
count_bso_datastet = datastet_response.get("count")
# Share of the BSO publications matching the extra clause.
count_bso_datastet_percent = count_bso_datastet / count_bso * 100
print(f"{format(count_bso_datastet, ',')} publications in the BSO have been analyzed by DataStet ie. {count_bso_datastet_percent:.2f}%")

655,954 publications in the BSO have been analyzed by DataStet ie. 40.77%


### Publications in the BSO and analyzed by Softcite

In [8]:
# Work on a deep copy so the shared BSO query is left untouched, then add
# a term clause requiring bso3_analyzed_softcite == "1".
body_copy = copy.deepcopy(body_bso)
must_clauses = body_copy.get("query").get("bool").get("must")
must_clauses.append({"term": {"bso3_analyzed_softcite": "1"}})
softcite_response = es.count(index=ES_INDEX, body=body_copy)
count_bso_softcite = softcite_response.get("count")
# Share of the BSO publications matching the extra clause.
count_bso_softcite_percent = count_bso_softcite / count_bso * 100
print(f"{format(count_bso_softcite, ',')} publications in the BSO have been analyzed by Softcite ie. {count_bso_softcite_percent:.2f}%")

777,132 publications in the BSO have been analyzed by Softcite ie. 48.30%


In [9]:
def _count_with_terms(base_body, extra_terms):
    """Count the documents matching `base_body` plus extra `term` clauses.

    Args:
        base_body: query body whose ``query.bool.must`` list is extended.
            It is deep-copied first, so the caller's dict is never mutated.
        extra_terms: mapping of field name to required value; one ``term``
            clause is appended per item, in iteration order.

    Returns:
        The ``count`` value of the Elasticsearch count response.
    """
    body = copy.deepcopy(base_body)
    must_clauses = body["query"]["bool"]["must"]
    must_clauses.extend({"term": {field: value}} for field, value in extra_terms.items())
    return es.count(index=ES_INDEX, body=body).get("count")


# Output column -> index field whose value "1" is required for that column.
step_fields = {
    "downloaded": "bso3_downloaded",
    "grobid": "bso3_analyzed_grobid",
    "datastet": "bso3_analyzed_datastet",
    "softcite": "bso3_analyzed_softcite",
}
columns = ["year", "bso", *step_fields]

data = []
for year in range(2013, 2022):
    # First the BSO count for the year, then one count per step flag.
    row = [year, _count_with_terms(body_bso, {"year": year})]
    row.extend(
        _count_with_terms(body_bso, {"year": year, field: "1"})
        for field in step_fields.values()
    )
    data.append(row)

# The last row reuses the overall counts computed in earlier cells; those
# queries carry no year filter, so it is not restricted to the years above.
data.append(["total", count_bso, count_bso_downloaded, count_bso_grobid, count_bso_datastet, count_bso_softcite])
df = pd.DataFrame(data, columns=columns)
df

Unnamed: 0,year,bso,downloaded,grobid,datastet,softcite
0,2013,143327,85920,66526,57553,66386
1,2014,147068,88216,66990,58501,66840
2,2015,147129,89674,72617,61568,72513
3,2016,158009,97708,79558,67147,79401
4,2017,160673,103054,83172,70872,83078
5,2018,167089,108948,86822,74674,86634
6,2019,168910,111618,98477,76969,98243
7,2020,176265,119667,105496,82609,105300
8,2021,162763,103323,83705,71105,83559
9,total,1608839,958973,778569,655954,777132


In [10]:
# Same table as `df`, with each step column expressed as a percentage of the
# "bso" column of its row. `assign` returns a new frame, so `df` is unchanged.
df_percent = df.assign(
    **{
        step: df[step] / df["bso"] * 100
        for step in ["downloaded", "grobid", "datastet", "softcite"]
    }
)
df_percent

Unnamed: 0,year,bso,downloaded,grobid,datastet,softcite
0,2013,143327,59.946835,46.415539,40.15503,46.317861
1,2014,147068,59.983137,45.550358,39.778198,45.448364
2,2015,147129,60.949235,49.356007,41.846271,49.285321
3,2016,158009,61.836984,50.350297,42.495681,50.250935
4,2017,160673,64.138965,51.764764,44.109465,51.706261
5,2018,167089,65.203574,51.961529,44.691153,51.849015
6,2019,168910,66.081345,58.301462,45.568054,58.162927
7,2020,176265,67.890392,59.850793,46.866366,59.739597
8,2021,162763,63.480644,51.427536,43.686219,51.337835
9,total,1608839,59.606524,48.39322,40.771886,48.303901
