In [1]:
import json
import os
import re
def load_json(path):
    '''
    Read the file at `path` and return its parsed JSON content.

    The file is opened explicitly as UTF-8 (the encoding JSON documents are
    expected to use) instead of relying on the platform's locale encoding,
    so non-ASCII text is decoded the same way on every operating system.

    Raises whatever `open` or `json.load` raise (e.g. FileNotFoundError,
    json.JSONDecodeError).
    '''
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


In [2]:
# Load one sample JSON document from a hard-coded relative path into data_1.
# NOTE(review): the raw string uses backslash separators, which are only
# treated as directory separators on Windows.
data_1 = load_json(r'corpus\papers_with_code_somef\1903.10583v1\felipelouza_bwsd.json')
# Another file in the same folder, kept for reference:
# corpus\papers_with_code_somef\1903.10583v1\felipelouza_gsa-is_.json

In [4]:
# Load a second sample JSON document into data_2.
# NOTE(review): the path mixes '/' and '\' separators, so it only resolves on Windows.
data_2 = load_json(r'../pdf_info_extractor\data_somef\10.1007_978-3-319-68204-4_9\dgarijo_Widoco_.json')

In [5]:
def easy_doi_finder(data:dict):
    '''
    Return data['result']['doi'], or None when it cannot be looked up.

    None is returned when either key is missing or when `data` (or
    data['result']) is not subscriptable by key. Only lookup and type errors
    are caught, so KeyboardInterrupt, SystemExit and unrelated bugs are no
    longer silently swallowed.
    '''
    try:
        return data['result']['doi']
    except (LookupError, TypeError):
        return None

In [64]:
def text_excerpt_parser(cite_list: list):
    '''
    Parse the lines of a citation text excerpt into a dict of fields.

    Each line has every '@', '{' and '}' removed, is stripped of surrounding
    whitespace and loses one trailing comma. Lines that are empty after
    stripping are ignored. Only lines containing exactly one '=' are kept:
    the text before it becomes the key and the text after it the value
    (both stripped). Lines with no '=' or with several '=' are skipped.
    When a key appears more than once, the last occurrence wins.
    '''
    parsed_dict = {}
    for element in cite_list:
        # remove @ and {}, then surrounding whitespace
        element = element.replace('@', '').replace('{', '').replace('}', '').strip()
        if element == '':
            continue
        # remove final comma
        if element.endswith(','):
            element = element[:-1]
        # keep only unambiguous "key = value" lines
        if element.count('=') != 1:
            continue
        # exactly one '=' guarantees two parts, so no error handling is needed
        key, value = element.split('=')
        parsed_dict[key.strip()] = value.strip()

    return parsed_dict

In [12]:

def cff_parser(cite_list: list):
    '''
    Turn the lines of a cff citation file given by somef into a flat dict.

    Double quotes are removed from every line and lines that are then empty
    are skipped. The text before the first ':' (stripped) is the key and the
    text after it (stripped) is the value; a line without ':' becomes a key
    with an empty value. Later lines overwrite earlier ones with the same key.
    '''
    parsed_dict = {}
    for raw_line in cite_list:
        line = raw_line.replace('"', '')
        if line == '':
            continue
        # split once on the first ':' so values containing ':' stay intact
        name, _, content = line.partition(':')
        parsed_dict[name.strip()] = content.strip()
    return parsed_dict

In [13]:
def bibtex_parser(cite_list: list):
    '''
    Parse the citation list of a bibtex ref given by somef

    The first line has its '{' turned into '=' so that an entry header such
    as '@article{key,' is stored as {'article': 'key'}. Every line then has
    '@', '{' and '}' removed, is stripped, and loses one trailing comma;
    lines that are empty after stripping are ignored. The text before the
    first '=' is the key and everything after it is the value (both
    stripped); a line without '=' becomes a key with an empty value.

    The caller's list is not modified, and an empty list returns {}.
    '''
    # work on a copy so the header rewrite does not leak into the caller's list
    lines = list(cite_list)
    if lines:
        # parse first element
        lines[0] = lines[0].replace('{', '=')

    parsed_dict = {}
    for element in lines:
        # remove @ and {}, then surrounding whitespace
        element = element.replace('@', '').replace('{', '').replace('}', '').strip()
        if element == '':
            continue
        # remove final comma
        if element.endswith(','):
            element = element[:-1]
        # split on the first '=' only, so values containing '=' (e.g. URLs) are kept whole
        key, _, value = element.partition('=')
        parsed_dict[key.strip()] = value.strip()
    return parsed_dict

In [88]:
# Load a third sample JSON document into data_3.
# NOTE(review): the path mixes '/' and '\' separators, so it only resolves on Windows.
data_3 = load_json(r'corpus/papers_with_code_somef\2204.08775v3\JuliaPlots_Plots.jl.json')

In [93]:
def _normalize_doi(raw_doi: str):
    '''
    Convert a raw doi string into the identifier format returned by find_doi
    '''
    doi = raw_doi.replace('https://doi.org/', '')
    # drop the arXiv DOI prefix whatever its casing (e.g. '10.48550/arXiv.')
    doi = re.sub(r'10\.48550/arxiv\.', '', doi, flags=re.IGNORECASE)
    return doi.replace('/', '_')


def find_doi(somef_data: dict):
    '''
    Find the doi in somef data

    Walks the entries of somef_data['citation'] in order and returns the
    first doi found as a tuple (identifier, source), where source is
    1 for a cff entry, 2 for a bibtex entry and 3 for a 'Text_excerpt'
    entry. The identifier has the 'https://doi.org/' and arXiv DOI prefixes
    removed and every '/' replaced by '_'.

    Returns False when there is no 'citation' key or no entry yields a doi.
    '''
    try:
        data = somef_data['citation']
    except KeyError:
        return False

    for cite in data:
        try:
            result = cite['result']
            # 'format' may be absent; a plain lookup would raise KeyError
            # and skip the entry before its 'type' could be inspected
            cite_format = result.get('format')
            if cite_format == 'cff':
                parsed = cff_parser(result['value'].split('\n'))
                source = 1
            elif cite_format == 'bibtex':
                parsed = bibtex_parser(result['value'].split('\n'))
                source = 2
            elif result.get('type') == 'Text_excerpt':
                parsed = text_excerpt_parser(result['value'].split('\n'))
                source = 3
            else:
                continue
            return _normalize_doi(parsed['doi']), source
        except KeyError:
            # entry without 'result', 'value' or a parsed 'doi': try the next one
            continue
    return False

In [94]:
# Run the doi lookup on data_3; as the cell's last expression its result is displayed.
find_doi(data_3)

('2204.08775', 2)