In [1]:
import copy
import getpass
import os
import warnings

from elasticsearch import Elasticsearch, exceptions

# Connection settings for the Elasticsearch cluster queried by this notebook.
ES_HOST = "https://cluster.elasticsearch.dataesr.ovh/"
ES_INDEX = "bso-publications-20230403"

# Credentials are taken from the environment so that no secret is stored in the
# notebook or its version-control history. Export ES_USER / ES_PASSWORD before
# starting the kernel; when ES_PASSWORD is absent the user is prompted instead.
# NOTE(review): a password was previously committed in this cell and should be
# treated as compromised and rotated.
ES_USER = os.environ.get("ES_USER", "BSO")
ES_PASSWORD = os.environ.get("ES_PASSWORD") or getpass.getpass(
    f"Elasticsearch password for user {ES_USER!r}: "
)

# Single client instance shared by every cell below.
es = Elasticsearch(ES_HOST, http_auth=(ES_USER, ES_PASSWORD))
# Hide the warnings of category ElasticsearchWarning so they do not clutter cell outputs.
warnings.simplefilter('ignore', exceptions.ElasticsearchWarning)

### Publications in ES
All publications indexed in Elasticsearch, including foreign publications, publications from HAL, etc.

In [2]:
# Unfiltered document count for the whole index: the denominator for the BSO share below.
response_es = es.count(index=ES_INDEX)
count_es = response_es.get("count")
print(f"{count_es:,} publications are in ES")

2,961,498 publications are in ES


### Publications in the BSO
Publications from France, with a DOI from Crossref and of type "journal-article", "proceedings", "book-chapter", "book" or "preprint"

In [3]:
# Elasticsearch query selecting the BSO perimeter. All three clauses sit in a
# bool/must list, so a publication has to satisfy every one of them.
body_bso = {
    "query": {
        "bool": {
            "must": [
                # Country flag must be "fr".
                {"term": {"bso_country.keyword": "fr"}},
                # Identifier type must be a DOI.
                {"terms": {"id_type.keyword": ["doi"]}},
                # Publication genre must be one of the listed types.
                {
                    "terms": {
                        "genre.keyword": [
                            "journal-article",
                            "proceedings",
                            "book-chapter",
                            "book",
                            "preprint",
                        ]
                    }
                },
            ]
        }
    }
}

In [4]:
# Number of publications matching the BSO query, and their share of the whole index.
response_bso = es.count(index=ES_INDEX, body=body_bso)
count_bso = response_bso.get("count")
count_bso_percent = count_bso / count_es * 100
print(f"{count_bso:,} publications are in the BSO, so {count_bso_percent:.2f}% of the ES")

1,593,783 publications are in the BSO, so 53.82% of the ES


### Publications in the BSO and downloaded

In [5]:
# Work on a deep copy so the shared BSO query is left untouched, then add one
# extra must-clause on the bso3_downloaded field.
body_copy = copy.deepcopy(body_bso)
downloaded_clause = {"term": {"bso3_downloaded": "1"}}
body_copy["query"]["bool"]["must"].append(downloaded_clause)

# Count the matching publications and express them as a share of the BSO.
count_bso_downloaded = es.count(index=ES_INDEX, body=body_copy).get("count")
count_bso_downloaded_percent = count_bso_downloaded / count_bso * 100
print(f"{count_bso_downloaded:,} publications in the BSO have been downloaded ie. {count_bso_downloaded_percent:.2f}%")

959,336 publications in the BSO have been downloaded ie. 60.19%


### Publications in the BSO and analyzed by DataStet

In [6]:
# Work on a deep copy so the shared BSO query is left untouched, then add one
# extra must-clause on the bso3_analyzed_datastet field.
body_copy = copy.deepcopy(body_bso)
datastet_clause = {"term": {"bso3_analyzed_datastet": "1"}}
body_copy["query"]["bool"]["must"].append(datastet_clause)

# Count the matching publications and express them as a share of the BSO.
count_bso_datastet = es.count(index=ES_INDEX, body=body_copy).get("count")
count_bso_datastet_percent = count_bso_datastet / count_bso * 100
print(f"{count_bso_datastet:,} publications in the BSO have been analyzed by DataStet ie. {count_bso_datastet_percent:.2f}%")

656,209 publications in the BSO have been analyzed by DataStet ie. 41.17%


### Publications in the BSO and analyzed by Softcite

In [7]:
# Work on a deep copy so the shared BSO query is left untouched, then add one
# extra must-clause on the bso3_analyzed_softcite field.
body_copy = copy.deepcopy(body_bso)
softcite_clause = {"term": {"bso3_analyzed_softcite": "1"}}
body_copy["query"]["bool"]["must"].append(softcite_clause)

# Count the matching publications and express them as a share of the BSO.
count_bso_softcite = es.count(index=ES_INDEX, body=body_copy).get("count")
count_bso_softcite_percent = count_bso_softcite / count_bso * 100
print(f"{count_bso_softcite:,} publications in the BSO have been analyzed by Softcite ie. {count_bso_softcite_percent:.2f}%")

777,414 publications in the BSO have been analyzed by Softcite ie. 48.78%
