In [1]:
import os
import json
import logging
from glob import glob
import pandas as pd
from infrastructure.storage.swift import Swift
from config.harvester_config import config_harvester

# Grab the root logger (getLogger() without a name) and set its level to WARNING.
logger = logging.getLogger()
logger.setLevel(logging.WARNING)
# Local scratch directory for downloaded '*.software.json' files; the helper
# functions below glob this directory and delete those files from it.
PROCESSED_PUBLICATIONS_DIR = './processed_publications/'

In [2]:
def load_softcite_output_data(processed_files, batch_size=100):
    """Download '.software.json' outputs batch by batch and collect their mentions.

    Only names ending in '.software.json' are kept. They are then split into
    consecutive batches of at most ``batch_size`` names; each batch is
    downloaded, parsed, reduced to its software mentions and finally removed
    from the local scratch directory before the next batch starts.

    Args:
        processed_files: object names to consider (any iterable of str).
        batch_size: maximum number of files downloaded per batch.

    Returns:
        A list of ``(file_id, [normalized software name, ...])`` tuples, as
        produced by ``extract_mentions``.
    """
    # Filter first so that every batch really contains up to `batch_size`
    # relevant files (a flattened list would yield one file name per iteration).
    software_files = [name for name in processed_files if name.endswith('.software.json')]

    # Start from an empty scratch directory: download_batch returns whatever
    # matching files are present locally, leftovers included.
    clean_up()
    mentions = []
    for start in range(0, len(software_files), batch_size):
        batch = software_files[start:start + batch_size]
        # NOTE(review): a list of object names is handed to the storage handler
        # here - confirm Swift.download_files accepts a list.
        local_files = download_batch(batch, PROCESSED_PUBLICATIONS_DIR)
        mentions.extend(extract_mentions(load_content(local_files)))
        # Drop the batch from disk so it is not re-read with the next one.
        clean_up()
    return mentions


def clean_up():
    """Delete every '*.software.json' file located directly in PROCESSED_PUBLICATIONS_DIR."""
    pattern = os.path.join(PROCESSED_PUBLICATIONS_DIR, '*.software.json')
    leftovers = glob(pattern)
    for path in leftovers:
        os.remove(path)

def extract_mentions(files_content):
    """Reduce parsed documents to the normalized software names they mention.

    Args:
        files_content: iterable of ``(file_id, document)`` pairs where each
            document has a 'mentions' list whose items expose
            ``['software-name']['normalizedForm']``.

    Returns:
        A list of ``(file_id, [normalized name, ...])`` tuples, in input order.
    """
    return [
        (file_id, [entry['software-name']['normalizedForm'] for entry in document['mentions']])
        for file_id, document in files_content
    ]
    
def load_content(local_files):
    """Parse local JSON files and pair each document with its file id.

    The file id is the part of the file's base name that precedes the first
    '.' (e.g. '<id>.software.json' -> '<id>').

    Args:
        local_files: iterable of paths to JSON files.

    Returns:
        A list of ``(file_id, parsed_json)`` tuples, in input order.
    """
    files_content = []
    for file in local_files:
        file_id = os.path.basename(file).split('.')[0]
        # JSON is UTF-8 by specification; do not rely on the platform's
        # default text encoding, which may differ (e.g. cp1252 on Windows).
        with open(file, 'r', encoding='utf-8') as f:
            files_content.append((file_id, json.load(f)))
    return files_content

def download_batch(batch, dest_dir):
    """Download a batch of objects into ``dest_dir`` and list the local results.

    Args:
        batch: object name(s) forwarded as-is to ``storage_handler.download_files``.
        dest_dir: local directory receiving the files; created if missing.

    Returns:
        Paths of all '*.software.json' files present in ``dest_dir`` after the
        download (files already there before the call are included).
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(dest_dir, exist_ok=True)
    storage_handler.download_files(config_harvester['publications_dump'], batch, dest_path=dest_dir)
    # Look in the directory that was actually written to, not in a global default.
    return glob(os.path.join(dest_dir, '*.software.json'))

In [3]:
# Module-level storage client built from the harvester configuration;
# download_batch() reads this global name when it is called.
storage_handler = Swift(config_harvester)

using output SWIFT bso3_publications_dump container: {'count': 26573, 'last_modified': '2022-02-08T15:52:53.709830', 'bytes': 19343955200, 'name': 'bso3_publications_dump'}
using input SWIFT bso_dump container: {'count': 117, 'last_modified': '2022-02-05T17:36:08.026270', 'bytes': 4833008355, 'name': 'bso_dump'}
2022-02-08 17:02:56,329 | infrastructure.storage.swift | DEBUG | container already exists on SWIFT object storage: bso3_publications_dump


In [4]:
# Ask the storage handler for the listing of the publications-dump container
# with dir_name='processed' (exact filtering is defined by Swift.get_swift_list).
processed_files = storage_handler.get_swift_list(config_harvester['publications_dump'], dir_name='processed')
# Message is in French: "There are N files".
print(f"Il y a {len(processed_files)} fichiers")

Il y a 7297 fichiers


In [5]:
# 7297 files => ~15 min
%time mentions = load_softcite_output_data(processed_files)
# Faster alternative for a quick check on the first 200 listed files:
# %time mentions = load_softcite_output_data(processed_files[:200])

In [6]:
# One row per file: the list of mentions, its length, and its number of distinct names.
mentions_by_files = (
    pd.DataFrame(mentions)
    .rename(columns={0: 'file_id', 1: 'mentions'})
    .set_index('file_id')
    .assign(
        nb_mentions=lambda frame: frame['mentions'].apply(len),
        nb_unique_mentions=lambda frame: frame['mentions'].apply(lambda names: len(set(names))),
    )
)
mentions_by_files.describe()

Unnamed: 0,nb_mentions,nb_unique_mentions
count,7293.0,7293.0
mean,4.147127,1.898396
std,15.118544,3.374254
min,0.0,0.0
25%,0.0,0.0
50%,1.0,1.0
75%,3.0,2.0
max,502.0,37.0


In [10]:
# Preview the first rows of the per-file mentions table.
mentions_by_files.head()

Unnamed: 0_level_0,mentions,nb_mentions,nb_unique_mentions
file_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
00054b55-3bf8-45a5-9c44-9bdbaf4d60df,"[Cutadapt, fastQC_0, GraphPad Prism, DESeq]",4,4
000849c1-e55e-48e2-ab83-6dfdbd323d5b,"[SPSSv, MedCalc]",2,2
000fe7b5-8930-45e4-8d55-bfbdb37c4165,"[Volocity, DIAS, ImageJ, MATLAB, Photo- shop, ...",12,7
00161c7a-ac7f-47c4-af6c-9bb81e9ec80e,[],0,0
001f0168-f4a3-4ff4-a98a-cf38216e9aec,[],0,0


In [7]:
# Total number of occurrences of each normalized name, all files pooled
# (a name repeated inside one file is counted every time).
mentions_count = pd.Series(
    [name for _file_id, names in mentions for name in names]
).value_counts()
print(mentions_count.describe())
mentions_count

count    7709.000000
mean        3.923336
std        13.142169
min         1.000000
25%         1.000000
50%         1.000000
75%         2.000000
max       479.000000
dtype: float64


ImageJ                      479
SAS                         297
SPEDAS                      285
MATLAB                      255
GraphPad Prism              254
                           ... 
MS Office Excel               1
snpEff                        1
vcf-annotate-filter Qual      1
Viewer                        1
GESTE                         1
Length: 7709, dtype: int64

In [8]:
# Number of files mentioning each normalized name: names are de-duplicated
# per file (via set) before counting.
u_mentions_count = pd.Series(
    [name for _file_id, names in mentions for name in set(names)]
).value_counts()
print(u_mentions_count.describe())
u_mentions_count

count    7709.000000
mean        1.795953
std         6.806065
min         1.000000
25%         1.000000
50%         1.000000
75%         1.000000
max       288.000000
dtype: float64


ImageJ                     288
SAS                        235
GraphPad Prism             219
SPSS                       210
Matlab                     139
                          ... 
Surveillance R-package       1
Surveillance R- package      1
NIS-elements AR              1
xspec                        1
GESTE                        1
Length: 7709, dtype: int64