In [12]:
import copy
import os
import warnings

import pandas as pd
from elasticsearch import Elasticsearch, exceptions

# Connection settings for the Elasticsearch cluster queried by this notebook.
ES_HOST = "https://cluster.elasticsearch.dataesr.ovh/"
ES_INDEX = "bso-publications-20230403"
ES_USER = "BSO"

# The password is a secret: read it from the environment so it is never stored
# in the notebook (or in its version-control history).
# NOTE(review): a password was previously committed in this cell; it should be
# treated as compromised and rotated.
ES_PASSWORD = os.environ.get("ES_PASSWORD")
if not ES_PASSWORD:
    raise RuntimeError(
        "Set the ES_PASSWORD environment variable before running this notebook"
    )

# Client shared by every query cell below.
es = Elasticsearch(ES_HOST, http_auth=(ES_USER, ES_PASSWORD))
# Silence Elasticsearch warnings so that they do not clutter cell outputs.
warnings.simplefilter('ignore', exceptions.ElasticsearchWarning)

### Publications in ES
All documents in the index, including foreign publications, publications from HAL, etc.

In [2]:
# Count every document of the index: no query is sent, so nothing is filtered out.
response_es = es.count(index=ES_INDEX)
count_es = response_es.get("count")
print(f"{format(count_es, ',')} publications are in ES")

2,961,498 publications are in ES


### Publications in the BSO
Publications from France, with a DOI from crossref and of type "journal-article", "proceedings", "book-chapter", "book" or "preprint"

In [3]:
# Publication types kept in the BSO perimeter.
bso_genres = ["journal-article", "proceedings", "book-chapter", "book", "preprint"]

# Base query of the BSO perimeter. The three clauses sit under `bool.must`,
# so a document has to satisfy all of them: country "fr", identifier type "doi"
# and one of the genres listed above.
body_bso = {
    "query": {
        "bool": {
            "must": [
                {"term": {"bso_country.keyword": "fr"}},
                {"terms": {"id_type.keyword": ["doi"]}},
                {"terms": {"genre.keyword": bso_genres}},
            ]
        }
    }
}

In [4]:
# Number of documents matching the BSO base query, and its share of the whole index.
response_bso = es.count(index=ES_INDEX, body=body_bso)
count_bso = response_bso.get("count")
share_of_es = count_bso / count_es
count_bso_percent = share_of_es * 100
print(f"{format(count_bso, ',')} publications are in the BSO, so {count_bso_percent:.2f}% of the ES")

1,593,783 publications are in the BSO, so 53.82% of the ES


### Publications in BSO and downloaded

In [5]:
# BSO base query plus one extra `term` clause on the download flag.
# The base query is deep-copied so that `body_bso` itself is left untouched.
body_copy = copy.deepcopy(body_bso)
must_clauses = body_copy["query"]["bool"]["must"]
must_clauses.append({"term": {"bso3_downloaded": "1"}})
response = es.count(index=ES_INDEX, body=body_copy)
count_bso_downloaded = response.get("count")
share_of_bso = count_bso_downloaded / count_bso
count_bso_downloaded_percent = share_of_bso * 100
print(f"{format(count_bso_downloaded, ',')} publications in the BSO have been downloaded ie. {count_bso_downloaded_percent:.2f}%")

959,336 publications in the BSO have been downloaded ie. 60.19%


### Publications in the BSO and analyzed by GROBID

In [8]:
# BSO base query plus one extra `term` clause on the GROBID analysis flag.
# The base query is deep-copied so that `body_bso` itself is left untouched.
body_copy = copy.deepcopy(body_bso)
must_clauses = body_copy["query"]["bool"]["must"]
must_clauses.append({"term": {"bso3_analyzed_grobid": "1"}})
response = es.count(index=ES_INDEX, body=body_copy)
count_bso_grobid = response.get("count")
share_of_bso = count_bso_grobid / count_bso
count_bso_grobid_percent = share_of_bso * 100
print(f"{format(count_bso_grobid, ',')} publications in the BSO have been analyzed by GROBID ie. {count_bso_grobid_percent:.2f}%")

778,853 publications in the BSO have been analyzed by GROBID ie. 48.87%


### Publications in the BSO and analyzed by Datastet

In [6]:
# BSO base query plus one extra `term` clause on the DataStet analysis flag.
# The base query is deep-copied so that `body_bso` itself is left untouched.
body_copy = copy.deepcopy(body_bso)
must_clauses = body_copy["query"]["bool"]["must"]
must_clauses.append({"term": {"bso3_analyzed_datastet": "1"}})
response = es.count(index=ES_INDEX, body=body_copy)
count_bso_datastet = response.get("count")
share_of_bso = count_bso_datastet / count_bso
count_bso_datastet_percent = share_of_bso * 100
print(f"{format(count_bso_datastet, ',')} publications in the BSO have been analyzed by DataStet ie. {count_bso_datastet_percent:.2f}%")

656,209 publications in the BSO have been analyzed by DataStet ie. 41.17%


### Publications in the BSO and analyzed by Softcite

In [7]:
# BSO base query plus one extra `term` clause on the Softcite analysis flag.
# The base query is deep-copied so that `body_bso` itself is left untouched.
body_copy = copy.deepcopy(body_bso)
must_clauses = body_copy["query"]["bool"]["must"]
must_clauses.append({"term": {"bso3_analyzed_softcite": "1"}})
response = es.count(index=ES_INDEX, body=body_copy)
count_bso_softcite = response.get("count")
share_of_bso = count_bso_softcite / count_bso
count_bso_softcite_percent = share_of_bso * 100
print(f"{format(count_bso_softcite, ',')} publications in the BSO have been analyzed by Softcite ie. {count_bso_softcite_percent:.2f}%")

777,414 publications in the BSO have been analyzed by Softcite ie. 48.78%


In [65]:
def count_bso_matching(*extra_terms):
    """Count the BSO publications that also match every extra `term` filter.

    Args:
        *extra_terms: one-entry dicts such as ``{"year": 2020}``. Each one is
            wrapped in a ``term`` clause and appended, in order, to the
            ``must`` list of a deep copy of ``body_bso`` (the shared base
            query is never mutated).

    Returns:
        The ``count`` value of the Elasticsearch count response.
    """
    body = copy.deepcopy(body_bso)
    body["query"]["bool"]["must"].extend({"term": term} for term in extra_terms)
    return es.count(index=ES_INDEX, body=body).get("count")


# Index field used as the flag of each processing stage, keyed by the
# DataFrame column it fills (insertion order = column order).
stage_fields = {
    "downloaded": "bso3_downloaded",
    "grobid": "bso3_analyzed_grobid",
    "datastet": "bso3_analyzed_datastet",
    "softcite": "bso3_analyzed_softcite",
}
columns = ["year", "bso", *stage_fields]

# One row per publication year: BSO count for that year, then the count of
# each processing stage restricted to the same year.
data = []
for year in range(2013, 2023):
    year_term = {"year": year}
    row = [year, count_bso_matching(year_term)]
    row.extend(
        count_bso_matching(year_term, {field: "1"})
        for field in stage_fields.values()
    )
    data.append(row)

# Last row reuses the overall counts computed by the earlier cells.
# NOTE(review): the saved cell output does not show this "total" row - re-run
# the notebook to refresh it.
data.append(["total", count_bso, count_bso_downloaded, count_bso_grobid, count_bso_datastet, count_bso_softcite])
df = pd.DataFrame(data, columns=columns)
df

Unnamed: 0,year,bso,downloaded,grobid,datastet,softcite
0,2013,143095,85914,66520,57546,66380
1,2014,146834,88208,66985,58492,66835
2,2015,146874,89668,72615,61561,72511
3,2016,157978,97913,79728,67285,79570
4,2017,160574,103248,83321,70993,83228
5,2018,166721,109059,86913,74748,86725
6,2019,168433,111703,98550,77043,98315
7,2020,175445,119644,105467,82610,105270
8,2021,160261,103212,83602,71029,83456
9,2022,161378,50409,34907,34664,34879


In [67]:
# Express each processing-stage count as a percentage of the row's BSO count.
# `assign` returns a new frame, so `df` keeps its raw counts.
stage_columns = ["downloaded", "grobid", "datastet", "softcite"]
df_percent = df.assign(
    **{name: df[name] / df["bso"] * 100 for name in stage_columns}
)
df_percent

Unnamed: 0,year,bso,downloaded,grobid,datastet,softcite
0,2013,143095,60.039834,46.4866,40.215242,46.388763
1,2014,146834,60.07328,45.619543,39.83546,45.517387
2,2015,146874,61.050969,49.440337,41.914158,49.369528
3,2016,157978,61.978883,50.467787,42.591373,50.367773
4,2017,160574,64.299326,51.889472,44.212014,51.831554
5,2018,166721,65.414075,52.130805,44.834184,52.018042
6,2019,168433,66.318952,58.509912,45.741036,58.370391
7,2020,175445,68.194591,60.113996,47.085981,60.00171
8,2021,160261,64.402444,52.166154,44.320827,52.075053
9,2022,161378,31.2366,21.630582,21.480003,21.613231
