# thesis references

We will fix the references by importing them from the Word document using: https://rintze.zelle.me/ref-extractor/

The resulting bib file (chap_direct_from_site.bib) is placed in the chapter folder, along with a direct copy of the text from the Word document (raw.txt).

We keep another file, "chap_to_add.bib", which contains any references the site missed. These get merged into chap.bib, so any changes should be made in one of the two original files.



In [3]:
import re
import pandas as pd
import shutil

# Root directories: the Python working area and the thesis it feeds.
target_path = r"D:\surfdrive\Documents\Uni manuscripts\20230109Thesis"
base_path = r"D:\surfdrive\Documents\Uni manuscripts\20230109Thesis\Python"

# Text files handled per chapter.
raw_text_filename = "raw.txt"
content_filename = "content.tex"

# Bibliography files: the two hand-maintained inputs, the merged result,
# the override file and the overall bibliography name.
bib_to_add_filename = "chap_to_add.bib"
bib_direct_filename = "chap_direct_from_site.bib"
bib_override_filename = "chapoverride.bib"
bib_merge_filename = "chapmerge.bib"
all_bibs = "chap.bib"

# Chapter sub-folders, in processing order.
folders = ["chapter.{}".format(number) for number in range(1, 7)]

In [4]:
def merge_missed(folder):
    """Concatenate a chapter's two source .bib files into its merged .bib file.

    Reads ``folder + bib_direct_filename`` and ``folder + bib_to_add_filename``
    and writes the first followed by the second to
    ``folder + bib_merge_filename``, overwriting any existing file.

    Args:
        folder: Directory prefix. It is concatenated directly with the file
            names, so it must already end with a path separator.

    Raises:
        OSError: If an input cannot be read or the output cannot be written.
    """
    with open(folder + bib_direct_filename, "r", encoding="utf8") as f:
        text_direct = f.read()
    with open(folder + bib_to_add_filename, "r", encoding="utf8") as f:
        text_to_add = f.read()

    # Entries are later split on a newline followed by "@"; without a line
    # break here the last entry of the first file and the first entry of the
    # second file would end up glued together on one line.
    if text_direct and not text_direct.endswith("\n"):
        text_direct += "\n"

    with open(folder + bib_merge_filename, "w", encoding="utf8") as w:
        w.write(text_direct)
        w.write(text_to_add)

def merge_multiple(paths, target, override):
    """Write the override file plus every chapter's merged .bib into ``target``.

    The override content is written first; it is where any references that
    need additional fixing go. Each chapter file is read from
    ``base_path + "\\" + path + "\\" + bib_merge_filename``.

    Args:
        paths: Chapter folder names relative to ``base_path``.
        target: Path of the combined file to (over)write.
        override: Path of the override .bib file.

    Raises:
        OSError: If an input cannot be read or the target cannot be written.
    """
    # Read every input before opening the target, so that a missing input
    # does not leave a truncated, half-written target behind.
    with open(override, "r", encoding="utf8") as f:
        chunks = [f.read()]

    for path in paths:
        folder_path = base_path + "\\" + path + "\\" + bib_merge_filename
        with open(folder_path, "r", encoding="utf8") as f:
            chunks.append(f.read())

    with open(target, "w", encoding="utf8") as w:
        for chunk in chunks:
            w.write(chunk)
            # Keep the first entry of the next file on its own line.
            if chunk and not chunk.endswith("\n"):
                w.write("\n")
    
def move_tex(source, target):
    """Back up the target content file, then replace it with the processed text.

    ``target + content_filename`` is first copied to the same name with a
    ``.bu`` suffix; afterwards ``source + raw_text_filename + ".final"`` is
    copied over it. Both arguments are used as plain string prefixes.
    """
    destination = target + content_filename
    processed = source + raw_text_filename + ".final"

    # Backup first, so the previous version survives the overwrite below.
    shutil.copy(destination, destination + ".bu")
    shutil.copy(processed, destination)

def move_bib(source, target):
    """Back up the target merged .bib, then replace it with the processed one.

    ``target + bib_merge_filename`` is first copied to the same name with a
    ``.bu`` suffix; afterwards ``source + bib_merge_filename + ".final"`` is
    copied over it. Both arguments are used as plain string prefixes.
    """
    destination = target + bib_merge_filename
    processed = source + bib_merge_filename + ".final"

    # Backup first, so the previous version survives the overwrite below.
    shutil.copy(destination, destination + ".bu")
    shutil.copy(processed, destination)

In [9]:
# Fields that clean_bib_entry keeps for each known entry type, in the order
# they are written out. Entry types not listed here are passed through as-is.
fields_for_type={
    "@article" : [
        "author",
        "title",
        "journal",
        "volume",
        "number",
        "pages",
        "year",
        "issn",
        "doi",
    ],
    "@incollection" : [
        "author",
        "booktitle",
        "title",
        "year",
        "pages",
        "address",
        "edition",
        "isbn",
    ],
}

# Per-field clean-up rules. Each rule is a (pattern, replacement) pair that
# clean_bib_entry applies with re.sub to the text after "=" on the field's
# line. Most patterns are negated character classes replaced by "", i.e. they
# strip every character that is not in the allow-list; an empty list means the
# field has no field-specific rules.
rules_for_field={
    "author" : [
        (r"[^}{\.,\t A-Za-zÀ-ÖØ-öø-ÿ\-ć’š]", r""),
    ],
    "title" : [],
    "year" : [
        (r"[^\d,\t ]", r""),
    ],
    "pages" : [
        (r"[^}{\-\d,\t ]", r""),
    ],
    "booktitle" : [],
    "address" : [
        (r"[^}{\.,\t A-Za-z\(\)]", r""),
    ],
    "edition" : [
        (r"[^\d,\t ]", r""),
    ],
    "isbn" : [
        (r"[^}{\-\d,\t X]", r""),
    ],
    "journal" : [],
    "volume" : [
        (r"[^\d,\t ]", r""),
    ],
    "number" : [
        (r"[^\d,\t ]", r""),
    ],
    "issn" : [
        (r"[^}{\-\d,\t ]", r""),
    ],
    "doi" : [
        # Replace everything up to and including "doi:" with " {", then
        # lower-case every run of upper-case letters.
        (r".*doi:", r" {"),
        ("[A-Z]+", lambda m: m.group(0).lower()),
    ],
}

# Rules applied to every kept field in addition to its own rules: replace the
# {\textless} / {\textgreater} markers with "<" and ">".
known_rules = [
    (r"{\\textless}", "<"),
    (r"{\\textgreater}", ">"),
]

# Sample @article entry, presumably kept for trying out clean_bib_entry by
# hand; nothing in the visible code reads this variable.
test = """@article{fagerlund2017spacer,
  journal = {Proceedings of the National Academy of Sciences},
  doi     = {https://doi.org/doi:10.18129/B9.bioc.pepXMLTab},
  isbn    = 1215421109,
  issn    = {0027-8424},
  number  = 26,
  pmid    = 28611213,
  title   = {Spacer capture and integration by a type I-F Cas1–Cas2-3 CRISPR adaptation complex},
  volume  = 114,
  author  = {Fagerlund, Robert D. and Wilkinson, Max E. and Klykov, Oleg and Barendregt, Arjan and Pearce, F. Grant and Kieper, Sebastian N. and Maxwell, Howard W. R. and Capolupo, Angela and Heck, Albert J. R. and Krause, Kurt L. and Bostina, Mihnea and Scheltema, Richard A. and Staals, Raymond H. J. and Fineran, Peter C.},
  pages   = {5122--5128},
  date    = 2017,
  year    = 2017
}"""

def clean_bib_entry(bib_entry):
    """Return a cleaned copy of one BibTeX entry, followed by warning lines.

    The entry is split into lines. For an entry type listed in
    ``fields_for_type`` only the listed fields are kept, in the listed order;
    each kept value is run through ``rules_for_field[field]`` followed by
    ``known_rules`` and the line is given a trailing comma if it lacks one.
    Entries of any other type are returned unchanged.

    Warning lines (prefixed "waarschuwing:") are appended after the entry for
    every changed value, for every rule that would reduce a value to nothing
    (that change is not applied) and for every expected field that is absent.

    Args:
        bib_entry: Text of a single entry, one field per line, with the
            header on the first line.

    Returns:
        The cleaned entry text plus warnings, terminated by a blank line.

    Raises:
        KeyError: If a field listed for the entry type has no entry in
            ``rules_for_field``.
    """
    split = bib_entry.split("\n")
    typ = split[0].split("{")[0]
    try:
        fields = fields_for_type[typ]
    except KeyError:
        # Unknown entry type: pass the entry through untouched.
        return "\n".join(split) + "\n\n"

    to_write = [split[0]]
    actual_fields = split[1:-1]
    to_write_after = []

    for field in fields:
        # Match on the key left of "=" rather than on any occurrence of the
        # field name, so "title" does not pick up a "booktitle" line or a
        # value that merely contains the word.
        index = next(
            (
                position
                for position, line in enumerate(actual_fields)
                if "=" in line and line.split("=", 1)[0].strip().lower() == field
            ),
            None,
        )
        if index is None:
            to_write_after.append("waarschuwing: did not find {}".format(field))
            continue

        actual_field = actual_fields.pop(index)
        # Build a new list: extending the list stored in rules_for_field
        # would grow the shared table on every call.
        rules = rules_for_field[field] + known_rules
        for pattern, replacement in rules:
            # Split on the first "=" only; values (e.g. URLs) may contain "=".
            key, _, value = actual_field.partition("=")
            new_value = re.sub(pattern, replacement, value)
            if value != new_value:
                if new_value != ',' and new_value != "":
                    to_write_after.append("waarschuwing: changed {} from {}".format(field, value))
                    actual_field = key + "=" + new_value
                else:
                    # The rule would wipe the value; keep the original text.
                    to_write_after.append("waarschuwing: fatal: changed {} from {}".format(field, value))
        if actual_field[-1] != ",":
            actual_field = actual_field + ","

        to_write.append(actual_field)

    if split != to_write:
        to_write.append("}\n")

    return "\n".join(to_write) + "\n".join(to_write_after) + "\n\n"

def clean_citation(cit):
    """Normalise a raw citation label into a compact lower-case key.

    The label is lower-cased and cut at the first comma after the four-digit
    year. The text before the year is the name part: a leading "de ", "den ",
    "van der ", "van de " or "van " is dropped, as are underscores and
    spaces. From the year onwards is the title part: a leading "an ", "a " or
    "the " directly after the year is dropped, underscores are removed, and
    only the year plus the characters up to the next space or digit are kept.

    >>> clean_citation("Georgiou2014The promise")
    'georgiou2014promise'

    Args:
        cit: Raw citation label, e.g. "Georgiou2014The promise".

    Returns:
        The name part followed by the shortened title part.

    Raises:
        IndexError: If ``cit`` does not contain a non-digit prefix followed
            by a four-digit year and at least one further character.
    """
    # Raw strings throughout: "\d" in a normal string literal is an invalid
    # escape sequence.
    pats_name = [
        (r"^de ", ""),
        (r"^den ", ""),
        (r"^van der ", ""),
        (r"^van de ", ""),
        (r"^van ", ""),
        (r"_", ""),
        (r" ", ""),
    ]
    pats_title = [
        (r"^(\d+)an ", r"\1"),
        (r"^(\d+)a ", r"\1"),
        (r"^(\d+)the ", r"\1"),
        (r"_", ""),
    ]

    s = cit.lower()
    newCit = re.findall(r"[^\d]+\d{4}[^,]+", s)[0]
    citName = re.findall(r"[^\d]+", newCit)[0]
    citTitle = re.findall(r"\d{4}[^,]+", newCit)[0]

    for pattern, replacement in pats_name:
        citName = re.sub(pattern, replacement, citName)
    for pattern, replacement in pats_title:
        citTitle = re.sub(pattern, replacement, citTitle)

    # Keep only the year and the first word that follows it.
    citTitle = re.findall(r"\d{4}[^\d ]+", citTitle)[0]
    return citName + citTitle

def clean_bib(cit):
    """Normalise a raw bibliography label into a compact lower-case key.

    Applies the same steps as ``clean_citation``: lower-case the label, cut
    it at the first comma after the four-digit year, strip a leading "de ",
    "den ", "van der ", "van de " or "van " plus underscores and spaces from
    the name part, strip a leading "an ", "a " or "the " and underscores from
    the title part, and keep only the year plus the first following word.

    Args:
        cit: Raw label, e.g. "Georgiou2014The promise".

    Returns:
        The name part followed by the shortened title part.

    Raises:
        IndexError: If ``cit`` does not contain a non-digit prefix followed
            by a four-digit year and at least one further character.
    """
    # Raw strings throughout: "\d" in a normal string literal is an invalid
    # escape sequence.
    name_rules = (
        (r"^de ", ""),
        (r"^den ", ""),
        (r"^van der ", ""),
        (r"^van de ", ""),
        (r"^van ", ""),
        (r"_", ""),
        (r" ", ""),
    )
    title_rules = (
        (r"^(\d+)an ", r"\1"),
        (r"^(\d+)a ", r"\1"),
        (r"^(\d+)the ", r"\1"),
        (r"_", ""),
    )

    lowered = cit.lower()
    core = re.findall(r"[^\d]+\d{4}[^,]+", lowered)[0]
    name = re.findall(r"[^\d]+", core)[0]
    title = re.findall(r"\d{4}[^,]+", core)[0]

    for pattern, replacement in name_rules:
        name = re.sub(pattern, replacement, name)
    for pattern, replacement in title_rules:
        title = re.sub(pattern, replacement, title)

    # Keep only the year and the first word that follows it.
    title = re.findall(r"\d{4}[^\d ]+", title)[0]
    return name + title

def clean_raw(raw):
    """Convert the raw exported text into LaTeX-ready text.

    The substitutions run in order:

    * "|||" (the separator between citation labels) becomes ", ".
    * "%" is escaped, except for "%%", which is kept as it is.
    * "_", "#" and "&" are escaped with a backslash.
    * The star and tilde characters become math-mode star / sim commands.
    * A run of one to four "!" followed by text becomes a section,
      subsection, subsubsection or paragraph heading holding the rest of the
      line; "[!" and "!]" are left alone.

    Args:
        raw: Text to convert.

    Returns:
        The converted text.
    """
    # All patterns are raw strings: "\[" and "\!" in a normal string literal
    # are invalid escape sequences.
    pats = [
        # Separator between citation labels.
        (r"\|\|\|", r", "),

        # Park "%%" under a placeholder, escape the remaining percent signs,
        # then restore "%%".
        (r"%%", r"ikwasneteencomment"),
        (r"%", r"\%"),
        (r"ikwasneteencomment", r"%%"),

        # Plain escapes.
        (r"_", r"\_"),
        (r"#", r"\#"),
        (r"&", r"\&"),

        # Special characters.
        (r"∗", r"$\\star$"),
        (r"∼", r"$\\sim$"),
        (r"~", r"$\\sim$"),

        # Headings: park "[!" and "!]" under placeholders first so their "!"
        # is not read as a heading marker; longest marker first.
        (r"\[\!", r"ikwasneteenuitroepteken"),
        (r"\!\]", r"ikwasneteenuitroeptekenmaardananders"),
        (r"\!\!\!\!(.+)", r"\n\\paragraph{\1}"),
        (r"\!\!\!(.+)", r"\n\\subsubsection{\1}"),
        (r"\!\!(.+)", r"\n\\subsection{\1}"),
        (r"\!(.+)", r"\n\\section{\1}"),
        # Restore the longer placeholder first; the shorter one is its prefix.
        (r"ikwasneteenuitroeptekenmaardananders", r"!]"),
        (r"ikwasneteenuitroepteken", r"[!"),
    ]

    for pattern, replacement in pats:
        raw = re.sub(pattern, replacement, raw)
    return raw

def process_txt(folder, txt_mapping=None):
    r"""Rewrite the citation labels in a chapter's raw text and save the result.

    Reads ``folder + raw_text_filename``, collects every label found inside
    ``\cite{...}`` (labels are separated by "|||"), replaces each label
    throughout the text with its ``clean_citation`` form, runs the text
    through ``clean_raw`` and writes it to the input path plus ".final".

    If any label contains "n.d." the label is reported ten times and the
    function returns without writing anything.

    Args:
        folder: Directory prefix; concatenated directly with the file name.
        txt_mapping: Optional dict mapping a cleaned key to the key that
            should be used instead. Defaults to no overrides.

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
        IndexError: If a label cannot be parsed by ``clean_citation``.
    """
    if txt_mapping is None:
        txt_mapping = {}

    path = folder + raw_text_filename
    writeFinal = path + ".final"

    with open(path, "r", encoding="utf8") as f:
        text = f.read()

    citeList = []
    for hit in re.findall(r"\\cite{([^\}]*)}", text):
        for it in hit.split("|||"):
            citeList.append(it)
            if "n.d." in it:
                # Repeated on purpose so the message stands out in the output.
                for _ in range(10):
                    print("bad citation:")
                    print(it)
                return

    doneList = set()
    # Longest labels first and in a fixed order: replacing a short label
    # before a longer one that contains it would change the longer label's
    # text so that it is no longer found.
    for cit in sorted(set(citeList), key=lambda label: (-len(label), label)):
        newCit = clean_citation(cit)
        if newCit in doneList:
            print("duplicate: {}".format(newCit))
        else:
            doneList.add(newCit)
        if newCit in txt_mapping:
            newCit = txt_mapping[newCit]
            print("alt: {}".format(newCit))

        text = text.replace(cit, newCit)

    with open(writeFinal, "w", encoding="utf8") as w:
        w.write(clean_raw(str(text)))

def process_bib(folder):
    """Normalise the keys and contents of the merged .bib and write a ".final" copy.

    Reads ``folder + bib_merge_filename`` and splits it into entries on a
    newline followed by "@". For each entry the key (the header text after
    "{", minus its last character) is reduced to its leading non-digit text,
    the digits that follow and the next run of characters up to a space,
    comma or digit; spaces are removed and the result is lower-cased. The
    first entry with a given key is written, with a set of characters that
    are illegal in .bib files replaced or removed and the fields cleaned by
    ``clean_bib_entry``; later entries with the same key are reported as
    duplicates and skipped.

    Args:
        folder: Directory prefix; concatenated directly with the file name.

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
        IndexError: If an entry header has no "{" or its key does not match
            the expected name-digits-word shape.
    """
    path = folder + bib_merge_filename
    with open(path, "r", encoding="utf8") as f:
        # Leading newline so an entry at the very start of the file is also
        # preceded by the newline-plus-"@" separator.
        text = "\n" + f.read()

    idList = set()
    with open(path + ".final", "w", encoding="utf8") as wL:
        for split in text.split("\n@")[1:]:
            identifier = split.split("\n")[0]
            old_key = identifier.split('{')[1][:-1]
            after = re.findall(r"[^\d]+\d+[^ ,\d]+", old_key)[0].replace(" ", "").lower()
            if after in idList:
                print("duplicate: {}".format(after))
                continue

            # Deal with illegal characters in .bib.
            to_write = (
                "@"
                + split.replace(old_key, after)
                .replace("&", "and")
                .replace("*", "")
                .replace("′", "")
                .replace("$", "")
                .replace(",,", ",")
                .replace(r"_", r"")
                .replace("{\\textbackslash}", "\\")
                + "\n"
            )
            wL.write(clean_bib_entry(to_write))
            idList.add(after)


In [14]:
def space_before_citation(text):
    r"""Put one space before each ``\cite{...}`` and move punctuation behind it.

    Any "." or "," directly before a citation (and at most one directly
    after it) is moved to just after the citation, and the surrounding
    spaces are collapsed to one space before and one after. "et al." is
    swapped for a placeholder while this happens so that its full stop is
    not moved.

    Afterwards the text is checked for citations that are still followed by
    an unexpected character or not preceded by a space; if one is found a
    message is printed.

    Args:
        text: Text to fix.

    Returns:
        The text with citation spacing normalised.
    """
    temp_text = text.replace("et al.", "USEDTOBEETAL")
    citation_regex = r'\\cite{[^}]+}'
    regex = r"([\.,]*)[ ]*(" + citation_regex + r")[ ]*([\.,])*[ ]*"
    subst = " \\2\\3\\1 "
    temp_text = re.sub(regex, subst, temp_text)
    temp_text = temp_text.replace("USEDTOBEETAL", "et al.")

    # re.search, not re.match: match only looks at the very start of the
    # text, so it could never find a badly spaced citation further on.
    badmatches = re.search(citation_regex + "[^., )]", temp_text)
    badmatches2 = re.search("[^ ]" + citation_regex, temp_text)
    if badmatches is not None or badmatches2 is not None:
        print("found at least one strange match")

    return temp_text

def fix_missed_emph(text):
    r"""Wrap the phrases "de novo", "de-novo" and "m/z" in ``\emph{...}``.

    An occurrence that directly follows ``\emph{`` is left alone.

    Args:
        text: Text to fix.

    Returns:
        The text with the phrases emphasised.
    """
    not_yet_emphasised = r"(?<!\\emph{)"
    # This lookbehind sits after the phrase, so it only inspects the
    # phrase's own last character.
    # NOTE(review): possibly meant as a lookahead for a closing brace - confirm.
    closing_guard = "(?<!})"

    for phrase in ("de novo", "de-novo", "m/z"):
        pattern = not_yet_emphasised + phrase + closing_guard
        wrapped = r"\\emph{" + phrase + "}"
        text = re.sub(pattern, wrapped, text)

    return text

def fix_panel_references_capitalization(text):
    r"""Lower-case the panel letters written directly after ``\autoref{...}``.

    A run of upper-case letters, hyphens or en dashes immediately following
    the reference is lower-cased, as is a single upper-case letter after an
    optional " and " that follows the run. The reference itself and the
    " and " are kept unchanged.

    Args:
        text: Text to fix.

    Returns:
        The text with panel letters lower-cased.
    """
    def regex_convert(m):
        # Group 3 merely wraps groups 4 and 5, so it is not emitted itself.
        reference, panels, _, joiner, last_panel = m.groups()
        return reference + panels.lower() + (joiner or "") + (last_panel or "").lower()

    regex = r"(\\autoref{[^}]+})([A-Z\-–]+)(( and )?([A-Z])?)"

    # The per-match debug prints were dropped: they wrote two lines to
    # stdout for every group of every match.
    return re.sub(regex, regex_convert, text)

def fix_panel_naming_capitalization(text):
    """Rewrite panel labels such as " A)" or "(b)" as " ~~a) ".

    A label is one letter from a to m (either case) followed by ")" and
    preceded by a space or "(". Labels directly preceded by one of the words
    excluded in the pattern (e.g. "Diversity ", "Constant ") are left alone.

    Args:
        text: Text to fix.

    Returns:
        The text with panel labels rewritten.
    """
    def as_panel_label(found):
        label = found.group(1).lower().strip()
        return " ~~" + label + " "

    pattern = r"(?<!Oxidation )(?<!constant )(?<!joining )(?<!diversity )(?<!Constant )(?<!Joining )(?<!Diversity )(?<! or)(?<! and)[ \(]([a-mA-M]\)) ?"
    return re.sub(pattern, as_panel_label, text)

def fix_common_errors(text):
    """Tidy common whitespace slips.

    Removes a space after or before a newline, a space after "{" and before
    "}", and turns each pair of spaces into one. Every fix is a single pass.

    Args:
        text: Text to fix.

    Returns:
        The tidied text.
    """
    fixes = (
        ("\n ", "\n"),  # space after enter
        (" \n", "\n"),  # space before enter
        ("{ ", "{"),    # space after {
        (" }", "}"),    # space before }
        ("  ", " "),    # double space
    )

    for wrong, right in fixes:
        text = re.sub(wrong, right, text)

    return text

def clean_txt(folder):
    """Apply the text clean-up passes to a chapter's ".final" text file in place.

    Reads ``folder + raw_text_filename + ".final"``, runs the citation
    spacing, emphasis, panel reference, panel label and whitespace fixes in
    that order, and writes the result back to the same file.

    Args:
        folder: Directory prefix; concatenated directly with the file name.

    Raises:
        OSError: If the file cannot be read or written.
    """
    path = folder + raw_text_filename + ".final"
    with open(path, "r", encoding="utf8") as f:
        text = f.read()

    text = space_before_citation(text)
    text = fix_missed_emph(text)
    text = fix_panel_references_capitalization(text)
    text = fix_panel_naming_capitalization(text)
    text = fix_common_errors(text)

    # Open for writing only once every pass has succeeded: opening earlier
    # truncates the file and loses its content if a pass raises.
    with open(path, "w", encoding="utf8") as w:
        w.write(text)

# Pipeline driver. Step order matters: each step reads files written by the
# one before it.
target = base_path + "\\" + bib_merge_filename
override = base_path + "\\" + bib_override_filename

# 1. Per chapter: merge the two source .bib files into the chapter's merged file.
for folder_name in folders:
    folder_path = base_path + "\\" + folder_name + "\\"
    merge_missed(folder_path)


# 2. Combine the override file and all chapter files, clean the combined
#    file, then copy the cleaned result into the thesis directory.
merge_multiple(folders, target, override)
process_bib(base_path + "\\")
move_bib(base_path + "\\", target_path + "\\")

# 3. Per chapter: rewrite citation labels, tidy the text and copy it into the
#    matching thesis chapter folder.
for folder_name in folders:
    folder_path = base_path + "\\" + folder_name + "\\"
    target_path2 = target_path + "\\" + folder_name + "\\"
    process_txt(folder_path)
    clean_txt(folder_path)
    move_tex(folder_path, target_path2)


duplicate: tran2016complete
duplicate: he2018protein
duplicate: roepstorff1984proposal
duplicate: shaw2020direct
duplicate: shaw2018sequencing
duplicate: georgiou2014promise
duplicate: briney2019commonality
duplicate: corti2016protective
duplicate: georgiou2014promise
duplicate: lavinder2014identification
duplicate: lee2016molecular-level
duplicate: lee2019persistent
duplicate: he2019classification
duplicate: he2017analysis
duplicate: mills2015detecting
duplicate: sharpley2019novel
duplicate: wang2019top-down
duplicate: guthals2017de
duplicate: sen2017automated
duplicate: tran2016complete
duplicate: boer2020selectivity
duplicate: fornelli2017top-down
duplicate: shaw2020direct
duplicate: peng2021mass
duplicate: kitaura2017different
duplicate: briney2019commonality
duplicate: schroeder2006similarity
duplicate: bondt2021human
duplicate: bondt2021direct
duplicate: mckinney2010data
duplicate: walt2011numpy
duplicate: virtanen2020scipy
duplicate: hunter2007matplotlib:
duplicate: schroeder201

In [15]:
# coding=utf8
# the above tag defines encoding for this document and is for Python 2.x compatibility

import re
# Scratch check of the panel-label pattern (the same pattern that
# fix_panel_naming_capitalization uses) against the contents of test.txt.
regex = r"(?<!Oxidation )(?<!constant )(?<!joining )(?<!diversity )(?<!Constant )(?<!Joining )(?<!Diversity )(?<! or)(?<! and)[ \(]([a-mA-M]\)) ?"

# Context manager so the file is closed even if reading fails.
with open("test.txt", "r", encoding="utf8") as f:
    test_str = f.read()

result = re.sub(regex, lambda m: " ~~" + m.group(1).lower().strip() + " ", test_str)

if result:
    print(result)

# Note: for Python 2.7 compatibility, use ur"" to prefix the regex and u"" to prefix the test string and substitution.

each: Variable (V), Diversity (D), Joining (J), and Constant (C),
our genes each: Variable (V), Diversity (D), Joining (J), and Constant (C), with the light 
lambda (IGL) variable (V) 2-14$\star$01 (IMGT/LIGM-DB: Z73664), IGL joining (J) 2$\star$01 (IMGT/LIGM-DB: M15641), and IGL constant (C) 2$\star$01 (IMGT/LIGM-DB: J00253) alleles. For the heavy-chain Fd portion, we determined that it was constructed from the immunoglobulin heavy (IGH) V3-9$\star$01 (IMGT/LIGM-DB: M99651), IGHJ5$\star$01 (IMGT/LIGM-DB: J00256), and IGHG1$\star$03 (IMGT/LIGM-DB: Y14737) alleles and a diversity (D)
\caption{\textbf{Study Workflow.} ~~a) Human mons. ~~b) SIgA1 waly. ~~c) Illust
\end{figure*}

\caption{
   \textbf{Quantitation with Cross-ID.}  ~~a) Dt alpha).  ~~b) Ligand-free and  ~~c) cAMPbounded structures of bovine alpha type I.  ~~d) 
   
wn in  ~~e) .  ~~d) Pie 
Structuls.
 }

\textbf{Integrative \emph{de novo} s}  ~~a) Datasis.  ~~b) Al.  ~~c) Align

reduction). ~~b) F reduction). ~~c) Fragment. 

# Manual per-chapter runs (chapters 1, 2, 3 and 6), each with its own, empty,
# mapping dict.
# NOTE(review): `move` is not defined in the visible part of this file (only
# move_tex and move_bib are) - confirm where it comes from before running.

# --- chapter 1 ---
folder_name = r"chapter.1"
folder_path = base_path + "\\" + folder_name + "\\"
target_path2 = target_path + "\\" + folder_name + "\\"

mapping_ch1 = {
}

merge_missed(folder_path)
process_txt(folder_path, mapping_ch1)
process_bib(folder_path)
move(folder_path, target_path2)

# --- chapter 2 ---
folder_name = r"chapter.2"
folder_path = base_path + "\\" + folder_name + "\\"
target_path2 = target_path + "\\" + folder_name + "\\"

mapping_ch2 = {
}

merge_missed(folder_path)
process_txt(folder_path, mapping_ch2)
process_bib(folder_path)
move(folder_path, target_path2)

# --- chapter 3 ---
folder_name = r"chapter.3"
folder_path = base_path + "\\" + folder_name + "\\"
target_path2 = target_path + "\\" + folder_name + "\\"

mapping_ch3 = {
}

merge_missed(folder_path)
process_txt(folder_path, mapping_ch3)
process_bib(folder_path)
move(folder_path, target_path2)

# --- chapter 6 ---
folder_name = r"chapter.6"
folder_path = base_path + "\\" + folder_name + "\\"
target_path2 = target_path + "\\" + folder_name + "\\"

mapping_ch6 = {
}

merge_missed(folder_path)
process_txt(folder_path, mapping_ch6)
process_bib(folder_path)
move(folder_path, target_path2)

# utility stuff

# Index every entry of the listed .bib files by its key; the stored value is
# the entry text without its leading "@".
# NOTE(review): bibtexPaths is not defined in the visible part of this file.
dic = {}

for path in bibtexPaths:
    with open(path, "r", encoding="utf8") as f:
        # Leading newline so an entry at the very start of the file is also
        # preceded by the newline-plus-"@" separator and is not dropped.
        text = "\n" + f.read()
    splits = text.split("\n@")[1:]
    for split in splits:
        identifier = split.split("\n")[0]
        key = identifier.split("{")[1][:-1]
        # Compare the key that is actually stored; the whole header line is
        # never a key of dic, so testing it could not detect a duplicate.
        if key in dic:
            print(identifier)
        dic[key] = split

# Scratch walk-through of the citation-key normalisation on one sample label.
# Regex patterns are raw strings: "\d" in a normal string literal is an
# invalid escape sequence.
cit = "Georgiou2014The promise"
s = cit.lower()
newCit = re.findall(r"[^\d]+\d+[^,]+", s)[0]
citName = re.findall(r"[^\d]+", newCit)[0]
citTitle = re.findall(r"\d+.+", newCit)[0]
pats_name = [
    r"^de ",
    r"^den ",
    r"^van ",
    r" ",
    ]
pats_title = [
    r"^(\d+)a ",
    r"^(\d+)the ",
    ]
for pat in pats_name:
    citName = re.sub(pat, "", citName)
for pat in pats_title:
    citTitle = re.sub(pat, r"\1", citTitle)
newCit = citName + citTitle

# Bare expressions: their values are only displayed by the notebook.
re.sub(r"^(\d+)the ", r"\1", citTitle)

clean_citation(cit)

# For each listed .bib file, strip spaces and doubled commas from every line
# that contains "@" and write the result next to the file as "<path>.log".
# NOTE(review): bibtexPaths is not defined in the visible part of this file.
idList = []
for path in bibtexPaths:
    with open(path, "r", encoding="utf8") as f:
        text = f.read()

    hits = re.findall(r"@.+\n", text)
    # Named `header` rather than `id`, which would shadow the builtin.
    for header in hits:
        text = text.replace(header, header.replace(" ", "").replace(",,", ","))
        print(header)

    writeLog = path + ".log"
    with open(writeLog, "w", encoding="utf8") as wL:
        wL.write(text)

# Citation label as it appears in the text -> key under which the entry is
# stored in the .bib index.
mapping = {
    "Charles A Janeway2001": "CharlesAJaneway2001",
    "Dupré2021": "Dupre2021",
    "Guthals2012": "Guthals2012Shotgun",
    "Horn2000": "Horn2000Automated",
    "Liu2014": "Liu2014De",
    "Lössl2014": "Lossl2014",
    "Marie Paule Lefranc2003": "Lefranc2003",
    "Marie-Paule Lefranc2020": "Lefranc2020",
    "Savidor2017": "Savidor2017Database-independent",
    "Schroeder Jr.2006": "SchroederJr.2006",
    "Srzentić2020": "Srzentic2020",
    "Surjit Singh2018": "Singh2018",
    "Vaibhav Singh2013": "Singh2013",
    "de Costa2010": "DeCosta2010",
    "de Haan2020": "DeHaan2020",
    "den Boer2020": "DenBoer2020",
}

# Write one entry per distinct cited label to "<path>.final": straight from
# the index when the label is a key, through `mapping` (with the key renamed
# to the label without spaces) when it is listed there, and otherwise print a
# stub line for the label.
done = []

final = path + ".final"
with open(final, 'w', encoding="utf8") as f:
    for identifier in citeList:
        if identifier in done:
            continue

        if identifier in dic:
            f.write('@')
            f.write(dic[identifier])
            f.write("\n\n")
        elif identifier in mapping:
            f.write('@')
            f.write(dic[mapping[identifier]].replace(mapping[identifier], identifier.replace(' ', '')))
            f.write("\n\n")
        else:
            print("\t\"{}\" : ,".format(identifier))

        done.append(identifier)



citeList