In [46]:
import os
import json
import bibtexparser
def load_json(path):
    '''
    Load a JSON file and return its parsed content.

    path: location of the JSON file to read.
    Returns the object produced by json.load for that document.
    Raises OSError if the file cannot be opened and json.JSONDecodeError
    if its content is not valid JSON.
    '''
    # Decode explicitly as UTF-8 instead of relying on the platform default codec,
    # which differs between operating systems and breaks non-ASCII content.
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


In [2]:
# First sample input for the DOI-extraction experiments below.
# NOTE(review): the relative path uses backslash separators, so as written it resolves only on Windows.
data_1 = load_json(r'corpus\papers_with_code_somef\1903.10583v1\felipelouza_bwsd.json')
# Another file in the same paper folder, currently not loaded:
# corpus\papers_with_code_somef\1903.10583v1\felipelouza_gsa-is_.json

In [3]:
# Second sample input; it lives outside this project's corpus folder (note the leading '../').
# NOTE(review): mixes '/' and '\' separators, so as written it resolves only on Windows.
data_2 = load_json(r'../pdf_info_extractor\data_somef\10.1007_978-3-319-68204-4_9\dgarijo_Widoco_.json')

In [4]:
def easy_doi_finder(data:dict):
    '''
    Return the value stored at data['result']['doi'].

    data: mapping expected to hold a 'result' mapping with a 'doi' entry.
    Returns the stored value, or None when that path cannot be followed
    (missing key, or a level that is not a mapping).
    '''
    try:
        return data['result']['doi']
    except (KeyError, IndexError, TypeError):
        # Only lookup failures mean "no DOI here"; anything else
        # (e.g. KeyboardInterrupt) is allowed to propagate.
        return None

In [5]:
def text_excerpt_parser(cite_list: list):
    '''
    Parse the lines of a citation text excerpt into a dict of `key = value` pairs.

    cite_list: list of strings, one per line of the excerpt.
    Returns a dict mapping each stripped key to its stripped value.

    Each line has '@', '{' and '}' removed, is stripped, and loses one trailing
    comma. Only lines that then contain exactly one '=' are kept; every other
    line (no '=', or more than one) is ignored. A repeated key keeps the value
    from the last line that defines it.
    '''
    parsed_dict = {}
    for element in cite_list:
        # drop the BibTeX-style decoration, then surrounding whitespace
        element = element.replace('@', '').replace('{', '').replace('}', '').strip()
        if element == '':
            continue
        # remove final comma
        if element[-1] == ',':
            element = element[:-1]
        # exactly one '=' guarantees the split below yields two parts,
        # so no ValueError fallback is needed
        if element.count('=') != 1:
            continue
        key, value = element.split('=')
        parsed_dict[key.strip()] = value.strip()

    return parsed_dict

In [6]:

def cff_parser(cite_list: list):
    '''
    Turn the lines of a cff citation file given by somef into a flat dict.

    cite_list: list of strings, one per line of the cff content.
    Returns a dict whose keys are the text before the first ':' of each line
    and whose values are everything after it, both stripped of whitespace.

    Double quotes are removed from every line and lines that are then empty
    are skipped. A line without ':' maps its stripped text to ''. Nesting is
    not interpreted: a repeated key keeps the value of its last occurrence.
    '''
    fields = {}
    for raw_line in cite_list:
        line = raw_line.replace('"', '')
        if line == '':
            continue
        # split on the first ':' only, so later colons (e.g. in URLs) stay in the value
        name, _, content = line.partition(':')
        fields[name.strip()] = content.strip()
    return fields

In [7]:
def bibtex_parser(cite_list: list):
    '''
    Parse the citation list of a bibtex ref given by somef

    cite_list: list of strings, one per line of the bibtex entry. The list
    is not modified.
    Returns a dict of stripped keys and values. The header line
    `@type{citekey,` becomes the pair `type -> citekey`; every other line is
    split on its first '='. A line without '=' maps its text to ''. An empty
    list gives an empty dict.
    '''
    # work on a copy so the caller's list is left untouched
    lines = list(cite_list)
    if lines:
        # parse first element: only the brace that opens the entry becomes the separator
        lines[0] = lines[0].replace('{', '=', 1)

    parsed_dict = {}
    for element in lines:
        # remove @ and {}, then surrounding whitespace
        element = element.replace('@', '').replace('{', '').replace('}', '').strip()
        if element == '':
            continue
        # remove final comma
        if element[-1] == ',':
            element = element[:-1]
        # split on the first '=' only so values containing '=' (e.g. URLs) survive;
        # both sides are stripped regardless of how many '=' the line holds
        key, _, value = element.partition('=')
        parsed_dict[key.strip()] = value.strip()
    return parsed_dict

In [8]:
# Third sample input; find_doi reports a bibtex-sourced DOI for it further down.
# NOTE(review): mixes '/' and '\' separators, so as written it resolves only on Windows.
data_3 = load_json(r'corpus/papers_with_code_somef\2204.08775v3\JuliaPlots_Plots.jl.json')

In [47]:
def _normalize_doi(doi: str) -> str:
    '''
    Reduce a DOI string to the identifier form returned by find_doi.

    Removes the 'https://doi.org/' resolver prefix and the arXiv DOI prefix
    (lower- or upper-case spelling), then replaces '/' with '_'.
    '''
    return (
        doi.replace('https://doi.org/', '')
        .replace('10.48550/arxiv.', '')
        .replace('10.48550/ARXIV.', '')
        .replace('/', '_')
    )


def find_doi(somef_data: dict):
    '''
    Find the doi in somef data

    somef_data: dict that may hold a 'citation' list whose items carry a
    'result' dict with 'value' plus 'format' and/or 'type'.

    Returns a tuple (doi, source) for the first citation that yields a DOI,
    where source is 1 for a cff entry, 2 for a bibtex entry and 3 for a
    'Text_excerpt' entry. Returns False when there is no 'citation' key or
    no citation yields a DOI.
    '''
    try:
        citations = somef_data['citation']
    except KeyError:
        return False

    for cite in citations:
        try:
            result = cite['result']
            # Look 'format' up with .get: indexing it directly raised KeyError for
            # entries that have no 'format', which skipped the Text_excerpt check.
            cite_format = result.get('format')
            if cite_format == 'cff':
                cff = cff_parser(result['value'].split('\n'))
                return _normalize_doi(cff['doi']), 1
            elif cite_format == 'bibtex':
                bibtex = bibtexparser.loads(result['value']).entries[0]
                return _normalize_doi(bibtex['doi']), 2
            elif result.get('type') == 'Text_excerpt':
                text = text_excerpt_parser(result['value'].split('\n'))
                return _normalize_doi(text['doi']), 3
        except (KeyError, IndexError):
            # KeyError: no 'result'/'value'/'doi' in this citation.
            # IndexError: bibtexparser produced no entries for the value.
            continue
    return False

In [51]:
# Try the DOI lookup on the first sample; the recorded output is False (no DOI extracted).
find_doi(data_1)

False

In [49]:
# Second sample: the recorded output carries source code 1, i.e. the DOI came from a cff entry.
find_doi(data_2)

('10.1007_978-3-319-68204-4_9', 1)

In [50]:
# Third sample: the recorded output carries source code 2, i.e. the DOI came from a bibtex entry.
find_doi(data_3)

('2204.08775', 2)

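## BibTeX analysis

The cells below compare the hand-written `bibtex_parser` with the `bibtexparser` library (https://pypi.org/project/bibtexparser/) on the same citation entries.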

In [37]:
# Hand-written parser on the first citation of data_3, fed line by line.
# In its output the entry type becomes a key ('misc') whose value is the cite key.
bibtex_parser(data_3["citation"][0]['result']['value'].split('\n'))

{'misc': 'https://doi.org/10.48550/arxiv.2204.08775',
 'doi': '10.48550/ARXIV.2204.08775',
 'url': 'https://arxiv.org/abs/2204.08775',
 'author': 'Christ, Simon and Schwabeneder, Daniel and Rackauckas, Christopher and Borregaard, Michael Krabbe and Breloff, Thomas',
 'keywords': 'Graphics (cs.GR), FOS: Computer and information sciences, FOS: Computer and information sciences, I.3.3',
 'title': 'Plots.jl -- a user extendable plotting API for the julia programming language',
 'publisher': 'arXiv',
 'year': '2022',
 'copyright': 'Creative Commons Attribution 4.0 International'}

In [29]:
import bibtexparser

In [41]:
# Same citation parsed with the bibtexparser library; keep the first parsed entry.
bib_dict = bibtexparser.loads(data_3["citation"][0]["result"]["value"]).entries[0]

In [40]:
# Library result: entry type and cite key appear under the 'ENTRYTYPE' and 'ID' keys.
bib_dict

{'copyright': 'Creative Commons Attribution 4.0 International',
 'year': '2022',
 'publisher': 'arXiv',
 'title': 'Plots.jl -- a user extendable plotting API for the julia programming language',
 'keywords': 'Graphics (cs.GR), FOS: Computer and information sciences, FOS: Computer and information sciences, I.3.3',
 'author': 'Christ, Simon and Schwabeneder, Daniel and Rackauckas, Christopher and Borregaard, Michael Krabbe and Breloff, Thomas',
 'url': 'https://arxiv.org/abs/2204.08775',
 'doi': '10.48550/ARXIV.2204.08775',
 'ENTRYTYPE': 'misc',
 'ID': 'https://doi.org/10.48550/arxiv.2204.08775'}

In [44]:
# Hand-written parser on the second citation of data_2, for comparison with the library below.
bibtex_parser(data_2["citation"][1]['result']['value'].split('\n'))

{'inproceedings': 'garijo2017widoco',
 'title': 'WIDOCO: a wizard for documenting ontologies',
 'author': 'Garijo, Daniel',
 'booktitle': 'International Semantic Web Conference',
 'pages': '94--102',
 'year': '2017',
 'organization': 'Springer, Cham',
 'doi': '10.1007/978-3-319-68204-4_9',
 'funding': 'USNSF ICER-1541029, NIH 1R01GM117097-01',
 'url': 'http://dgarijo.com/papers/widoco-iswc2017.pdf'}

In [45]:
# Library parse of the same data_2 citation; this rebinds bib_dict from the earlier cell.
bib_dict = bibtexparser.loads(data_2["citation"][1]["result"]["value"]).entries[0]
# Display the entry so it can be compared with the bibtex_parser output above.
bib_dict

{'url': 'http://dgarijo.com/papers/widoco-iswc2017.pdf',
 'funding': 'USNSF ICER-1541029, NIH 1R01GM117097-01',
 'doi': '10.1007/978-3-319-68204-4_9',
 'organization': 'Springer, Cham',
 'year': '2017',
 'pages': '94--102',
 'booktitle': 'International Semantic Web Conference',
 'author': 'Garijo, Daniel',
 'title': 'WIDOCO: a wizard for documenting ontologies',
 'ENTRYTYPE': 'inproceedings',
 'ID': 'garijo2017widoco'}