In [16]:
# Input and output locations; every file lives in the same survey folder.
folder = 'Survey/'
source_file = f'{folder}Survey.txt'                   # text containing [n] markers
bib_file = f'{folder}Survey.bib'                      # BibTeX database
refs_file = f'{folder}SurveyRefs.txt'                 # numbered reference list
target_file = f'{folder}SurveyTarget.txt'             # output text with citations
numbered_refs_file = f'{folder}NumberedZoteroID.txt'  # output "number: bib ID" list

In [2]:
import bibtexparser
import numpy as np
import re

In [3]:
# Load the BibTeX database (utf-8-sig drops a leading BOM if present).
with open(bib_file, encoding="utf-8-sig") as bib_handle:
    bib_database = bibtexparser.load(bib_handle)

entry_count = len(bib_database.get_entry_dict())
print('Number of entries: ' + str(entry_count))

# Two lookup tables, both mapping to the BibTeX entry ID:
#   bib_url_dict   -- keyed by DOI when the entry has one, otherwise by URL
#   bib_title_dict -- keyed by the lower-cased title with braces removed
bib_url_dict = {}
bib_title_dict = {}
for bib_entry in bib_database.get_entry_list():
    entry_id = bib_entry['ID']

    # The DOI takes precedence; the URL is only used when there is no DOI.
    for locator_field in ('doi', 'url'):
        if locator_field in bib_entry.keys():
            bib_url_dict[bib_entry[locator_field].strip()] = entry_id
            break

    if 'title' in bib_entry.keys():
        unbraced_title = bib_entry['title'].replace('{', '').replace('}', '')
        bib_title_dict[unbraced_title.lower().strip()] = entry_id

print('Bib refs with url: ' + str(len(bib_url_dict)))
print('Bib refs with title: ' + str(len(bib_title_dict)))

Number of entries: 209
Bib refs with url: 178
Bib refs with title: 181


In [4]:
# Parse references
class Reference:
    """One line of the numbered reference list.

    ``parse`` extracts the reference number and then either an identifier
    (a DOI or a URL) or, when the line has neither, a lower-cased title.
    Attributes that could not be extracted stay ``None``.
    """

    def __init__(self):
        self._number = None
        self._id = None
        self._title = None

    def __repr__(self):
        return 'Reference(number={!r}, id={!r}, title={!r})'.format(
            self._number, self._id, self._title)

    def get_number(self):
        """Return the text found before the first '.' of the line, or None."""
        return self._number

    def get_id(self):
        """Return the DOI or URL found in the line, or None."""
        return self._id

    def get_title(self):
        """Return the lower-cased title, or None when an id was found instead."""
        return self._title

    def parse(self, line_str):
        """Fill in number, id and title from one line of the reference list.

        Args:
            line_str: a single line, e.g. ``"12. Doe, J.: Some title. (2020)"``.
        """
        ## Extract number: the text in front of the first '.'.
        ## str.find returns -1 (which is truthy) when there is no '.', so the
        ## index has to be compared explicitly instead of tested for truth.
        point_index = line_str.find('.')
        if point_index > 0:
            self._number = line_str[:point_index].strip()
            line_str = line_str[point_index + 1:]

        ## Extract identifier: DOI first, then a plain URL, then the title.
        identifier = self._extract_doi(line_str) or self._extract_url(line_str)
        if identifier is not None:
            self._id = identifier
            return

        title = self._extract_title(line_str)
        if title is not None:
            self._title = title

    @staticmethod
    def _extract_doi(text):
        """Return the DOI following the last 'org/' of a line mentioning 'doi'."""
        if 'doi' not in text:
            return None
        host_end = text.rfind('org/')
        if host_end < 0:
            ## 'doi' was only part of another word (e.g. "undoing").
            return None
        tokens = text[host_end + 4:].split()
        if not tokens:
            return None
        ## Cut at the first whitespace and drop sentence punctuation, so the
        ## dots inside the DOI itself are never mistaken for the end of it.
        return tokens[0].rstrip('.,;') or None

    @staticmethod
    def _extract_url(text):
        """Return the URL of the line, ending at the last comma after it."""
        start = text.find('https')
        if start < 0:
            start = text.find('http')
        if start < 0:
            return None
        ## Only commas located after the start of the URL can terminate it;
        ## without one the URL runs to the end of the line.
        comma_index = text.rfind(',', start)
        end = comma_index if comma_index >= 0 else len(text)
        return text[start:end].strip() or None

    @staticmethod
    def _extract_title(text):
        """Return the text between the first ':' and the next '.' or '('."""
        colon_index = text.find(':')
        if colon_index < 0:
            return None
        remainder = text[colon_index + 1:]
        ## Ignore delimiters that are absent (index -1) when picking the
        ## nearest one; with no delimiter the title is the whole remainder.
        delimiters = [i for i in (remainder.find('.'), remainder.find('(')) if i >= 0]
        end = min(delimiters) if delimiters else len(remainder)
        return remainder[:end].strip().lower() or None

# Turn every line of the refs file into a Reference.
# refs keeps them in file order; refs_dict indexes them by parsed number
# (a later line with the same number replaces the earlier one).
refs = []
refs_dict = {}
with open(refs_file, encoding="utf-8-sig") as refs_handle:
    for ref_line in refs_handle.readlines():
        parsed_ref = Reference()
        parsed_ref.parse(line_str=ref_line)
        refs.append(parsed_ref)
        refs_dict[parsed_ref.get_number()] = parsed_ref

print('Num gdoc refs: ' + str(len(refs_dict)))

Num gdoc refs: 136


In [5]:
# Read the whole source text into memory (utf-8-sig drops a leading BOM if present).
with open(source_file, mode="r", encoding="utf-8-sig") as source_handle:
    source_str = source_handle.read()

In [6]:
def find_bib_ref(gdoc_ref):
    """Return the BibTeX entry ID matching a parsed reference, or None.

    A reference with an id (DOI or URL) is looked up exactly in
    ``bib_url_dict``. A reference with only a title is matched against the
    keys of ``bib_title_dict``: the first key that starts with the first half
    of the title wins.

    Args:
        gdoc_ref: object exposing ``get_id()`` and ``get_title()``.

    Returns:
        The BibTeX entry ID, or None when nothing matches.
    """
    ref_id = gdoc_ref.get_id()
    title = gdoc_ref.get_title()

    if ref_id is not None:
        return bib_url_dict.get(ref_id)

    if title is not None:
        ## Only the first half is compared. For very short titles the half is
        ## empty, and an empty prefix would match every key, so fall back to
        ## the full title and give up when there is nothing to compare at all.
        prefix = title[:len(title) // 2] or title
        if not prefix:
            return None
        for key in bib_title_dict:
            if key.startswith(prefix):
                return bib_title_dict.get(key)

    return None

In [11]:
def _citation_for(match):
    """Return the LaTeX citation replacing one ``[n]`` marker.

    The marker is returned unchanged, and a message is printed, when the
    number is not in ``refs_dict`` or no bib entry can be found for it.
    """
    marker = match.group(0)
    gdoc_ref = refs_dict.get(match.group(1))
    bib_ref_id = find_bib_ref(gdoc_ref) if gdoc_ref is not None else None

    if bib_ref_id is None:
        print('Failed to catch bibref of ' + marker)
        return marker

    ## Look at the three characters in front of the separator preceding the
    ## marker ("... et al. [3]"). The bounds are clamped so that a marker at
    ## the very start of the text cannot wrap around to the end of it.
    start = match.start(0)
    pre_str = match.string[max(start - 4, 0):max(start - 1, 0)]
    change_str = '\\cite{' if pre_str == 'al.' else '\\citep{'
    return change_str + bib_ref_id + '}'


## Each occurrence is replaced on its own: the same reference may need \cite
## in one place and \citep in another, which a global str.replace of "[n]"
## cannot express. A callable replacement is inserted literally by re.sub.
target_str = re.sub(r"\[([A-Za-z0-9_]+)\]", _citation_for, source_str)

## Write target in file
with open(target_file, "w", encoding="utf-8-sig") as file:
    file.write(target_str)

Failed to catch bibref of [a]
<__main__.Reference object at 0x000002AEF8DA4430>
<__main__.Reference object at 0x000002AEF8E28D90>
<__main__.Reference object at 0x000002AEF8E28E20>
<__main__.Reference object at 0x000002AEF8E28E50>
<__main__.Reference object at 0x000002AEF8A80F10>
<__main__.Reference object at 0x000002AEF8A80100>
<__main__.Reference object at 0x000002AEF8E28D90>
<__main__.Reference object at 0x000002AEF8A80E20>
<__main__.Reference object at 0x000002AEF8E28E50>
<__main__.Reference object at 0x000002AEF8E42160>
<__main__.Reference object at 0x000002AEF8E421C0>
<__main__.Reference object at 0x000002AEF8E42220>
<__main__.Reference object at 0x000002AEF8E42280>
<__main__.Reference object at 0x000002AEF8E422E0>
<__main__.Reference object at 0x000002AEF8E420A0>
<__main__.Reference object at 0x000002AEF8E42100>
<__main__.Reference object at 0x000002AEF8E422E0>
<__main__.Reference object at 0x000002AEF8E422E0>
<__main__.Reference object at 0x000002AEF8E42400>
<__main__.Reference 

In [18]:
## Write one "number: bib ID" line per numbered reference.
## Iterate over the numeric keys that really exist instead of
## range(1, len(refs_dict)): that range leaves out the highest number when
## the keys are exactly 1..N and hits missing keys when they are not.
numeric_keys = sorted(
    (key for key in refs_dict if isinstance(key, str) and key.isdecimal()),
    key=int,
)

numbered_lines = []
for key in numeric_keys:
    bib_ref_id = find_bib_ref(refs_dict[key])
    if bib_ref_id is None:
        ## Without a match there is nothing to concatenate; report and skip.
        print('Failed to catch bibref of [' + key + ']')
        continue
    numbered_lines.append(key + ": " + bib_ref_id + '\n')

numbered_refs = "".join(numbered_lines)

with open(numbered_refs_file, "w", encoding="utf-8-sig") as file:
    file.write(numbered_refs)