In [1]:
import pandas as pd
import subprocess
import re

In [2]:
# Load ground truth data
df = pd.read_json(r'json/pubmed22n0001.xml.gzParsedAllFields.json')

In [3]:
# Overview
df.head()

Unnamed: 0,Title,Author,PubMedIDs,JournalName,PageNumber,Volume,Year,Issue,ama,apa,mla,nlm
0,Formate assay in body fluids: application in m...,"[A B Makar, K E McMartin, M Palese, T R Tephly]",1,Biochemical medicine,117-26,13,1975,2,"Makar AB, McMartin KE, Palese M, Tephly TR. Fo...","Makar, A. B., McMartin, K. E., Palese, M., & T...","Makar, A B et al. “Formate assay in body fluid...","Makar AB, McMartin KE, Palese M, Tephly TR. Fo..."
1,Delineation of the intimate details of the bac...,"[K S Bose, R H Sarma]",2,Biochemical and biophysical research communica...,1173-9,66,1975,4,"Bose KS, Sarma RH. Delineation of the intimate...","Bose, K. S., & Sarma, R. H. (1975). Delineatio...","Bose, K S, and R H Sarma. “Delineation of the ...","Bose KS, Sarma RH. Delineation of the intimate..."
2,Metal substitutions incarbonic anhydrase: a ha...,"[R J Smith, R G Bryant]",3,Biochemical and biophysical research communica...,1281-6,66,1975,4,"Smith RJ, Bryant RG. Metal substitutions incar...","Smith, R. J., & Bryant, R. G. (1975). Metal su...","Smith, R J, and R G Bryant. “Metal substitutio...","Smith RJ, Bryant RG. Metal substitutions incar..."
3,Effect of chloroquine on cultured fibroblasts:...,"[U N Wiesmann, S DiDonato, N N Herschkowitz]",4,Biochemical and biophysical research communica...,1338-43,66,1975,4,"Wiesmann UN, DiDonato S, Herschkowitz NN. Effe...","Wiesmann, U. N., DiDonato, S., & Herschkowitz,...","Wiesmann, U N et al. “Effect of chloroquine on...","Wiesmann UN, DiDonato S, Herschkowitz NN. Effe..."
4,Atomic models for the polypeptide backbones of...,"[W A Hendrickson, K B Ward]",5,Biochemical and biophysical research communica...,1349-56,66,1975,4,"Hendrickson WA, Ward KB. Atomic models for the...","Hendrickson, W. A., & Ward, K. B. (1975). Atom...","Hendrickson, W A, and K B Ward. “Atomic models...","Hendrickson WA, Ward KB. Atomic models for the..."


In [4]:
import sys
sys.path.append('xnparser_sp22/src')

In [5]:
from xnparser import XNParser

In [6]:
parser = XNParser()

In [7]:
# THIS IS THE PROCESS OF GETTING THE PARSED REFERENCE DICTIONARY

def parseReference(rawRefString, quiet):
    # Run the parser and get the resulting output
    parser.parseRef(rawRefString)
    refDict = parser.getResult()
    
    if quiet is False:
        print("Parsed reference before edits in parseReference: ")
        try:
            print(refDict['issue'])
            print()
        except:
            pass
    
    # JOURNAL NAME
    try:
        # If lingering quotes from a title
        if '”' in refDict['journal']:
            refList = refDict['journal'].split('”')
            refDict['journal'] = refList[-1]
        # If lingering comma from title
        if ',' in refDict['journal']:
            refList = refDict['journal'].split(',')
            refDict['journal'] = refList[-1]
        # Remove the punctuation
        # refDict['journal'] = re.sub(r'[^\w\s]', '', refDict['journal'])
        # Remove whitespace on either side
        refDict['journal'] = refDict['journal'].strip()
        # If string ends with "vol"
        if refDict['journal'].endswith("vol"):
            refDict['journal'] = refDict['journal'][:-3]
        # Remove whitespace on either side
        refDict['journal'] = refDict['journal'].strip()
        if '.' in refDict['journal']:
            refList = refDict['journal'].split('.')
            refDict['journal'] = refList[-1].strip()
    except:
        pass

    # ISSUE
    # If there are multiple issue numbers, use the first one
    try:
        refDict['issues'] = refDict['issues'].split(',')
        refDict['issues'] = refDict['issues'][0]
    except:
        pass
    
    try:
        if refDict['issues'][-1] == ")":
            refDict['issues'] = refDict['issues'][0:-1]
    except:
        pass
    
    # VOLUME
    # If there are multiple volume number, use the first one
    try:
        refDict['volume'] = refDict['volume'].split(',')
        refDict['volume'] = refDict['volume'][0]
    except:
        pass

    # PAGES
    try:
        # If multiple pages, use the first one
        pageList = refDict['pages'].split(',')
        if len(pageList) > 1:
            refDict['pages'] = pageList[0]
        refDict['pages'] = refDict['pages'].replace("--", "-")
    except:
        pass
    try:
        
        pageList = refDict['pages'].split('-')
        counter = 0
        if "P" not in pageList[0] and "P" not in pageList[1]:
            if len(pageList[0]) == len(pageList[1]):
                for i in range(len(pageList[0])):
                    if pageList[0][i] != pageList[1][i]:
                        counter = i
                        break
                pageList[1] = pageList[1][counter:]
                refDict['pages'] = pageList[0] + "-" + pageList[1]
    except:
        pass
    


    # TITLE
    # The ground truth dataset has a period after the title
    try:
        # Find string between brackets. If there is one, that's the title
        s = refDict['title']
        result = re.findall('\[.*?\]', s)
        if result:
            refDict['title'] = result[0] + "."
        else:
            if refDict['title'][-1] != ".":
                refDict['title'] = refDict['title'] + "."
            if refDict['title'][0] == '“':
                refDict['title'] = refDict['title'][1:]
            if '”' in refDict['title']:
                refList = refDict['title'].split('”')
                if len(refList) > 1:
                    if refList[0] == "":
                        refDict['title'] = refList[1].lstrip()
                    else:
                        refDict['title'] = refList[0]
            if "[" in refDict['title']:
                refList = refDict['title'].split('[')
                refDict['title'] = refList[0].strip() + "."
        refDict['title'] = refDict['title'].strip()
    except:
        pass
    
#     #AUTHOR
#     try:

#         if "et al." in rawRefString:
#             refList = refDict['author'].split(', ')
#             refDict['author'] = [refList[1] + " " + refList[0]]
#         if "“" in refDict['author']:
#             refDict['author'] = refDict['author'].replace("“", "")
#             refDict['author'] = refDict['author'].strip()
            
            
            
#         # If there is a comma, we assume there are multiple authors
#         if ',' in refDict['author']:
#             refDict['author'] = refDict['author'].split(', ')
#             abrevList = []
#             nameList = []
#             for i in range(1, len(refDict['author']), 2):
#                 abrevList.append(refDict['author'][i])
#             for i in range(0, len(refDict['author']), 2):
#                 nameList.append(refDict['author'][i])
#             if 'IV' in nameList:
#                 nameList.remove('IV')
#             formattedAuthorList = []
#             try:
#                 nameList.remove("-")
#             except:
#                 pass
#             try:
#                 abrevList.remove("-")
#             except:
#                 pass
#             for i in range(len(abrevList)):
#                 if len(abrevList[i]) == 2:
#                     abrevList[i] = " ".join(abrevList[i])
#             for i in range(len(nameList)):
#                 try:
#                     theString = ""
#                     for j in range(len(abrevList[i])):
#                         theString = theString + abrevList[i][j]
#                     theString += " " + nameList[i]
#                 except:
#                     pass
#                 formattedAuthorList.append(theString)
#             for i in range(len(formattedAuthorList)):
#                 formattedAuthorList[i] = formattedAuthorList[i].replace(".", "")
#             refDict['author'] = formattedAuthorList
        
            
#     except:
#         pass
    
    #YEAR
    try:
        if ',' in refDict['year']:
            refList = refDict['year'].split(',')
            refList = [int(''.join(filter(str.isdigit, x))) for x in refList]
            refDict['year'] = max(refList)
    except:
        pass


    return refDict

    

In [8]:
# THIS IS THE PROCESS OF COMPARING THE PARSED REFERENCE DICTIONARY TO THE GROUND TRUTH IN THE DF "called df"

# TESTING JUST ONE ARTICLE / MLA STYLE

def testParse(paperIndex):

    # Get raw reference string from the loaded df which has both ground truth and raw ref
    rawRefString = df.loc[paperIndex, 'mla']
    
    # Get parsed reference as a dictionary
    refDict = parseReference(rawRefString, False)
    
    #groundTruthDict = {}

    # Set up test results df
    testResultsDF = pd.DataFrame(columns = ['pubMedID',
                                            'title',
                                            'author',
                                            'journalName',
                                            'pageNumber',
                                            'volume',
                                            'year',
                                            'issue'])

    titleTruth = 0
    authorTruth = 0
    journalTruth = 0
    pageTruth = 0
    volumeTruth = 0
    yearTruth = 0
    issueTruth = 0

    # TITLE
    try:
        if str(df.loc[paperIndex, 'Title']) == "TitleNotFound":
            titleTruth = 0.5
    except:
        pass
    try:
        #groundTruthDict['Title'] = df.loc[paperIndex, 'Title']
        if str(refDict['title']) == str(df.loc[paperIndex, 'Title']):
            titleTruth = 1
            
        parsed = sorted(refDict['title'])
        parsed = [x for x in parsed if x != ' ']
        parsed = [x for x in parsed if x != '-']
        parsed = [x for x in parsed if x != '.']
        parsed = [x for x in parsed if x != '[']
        parsed = [x for x in parsed if x != ']']
        #print(parsed)
        truth = sorted(df.loc[paperIndex, 'Title'])
        truth = [x for x in truth if x != ' ']
        truth = [x for x in truth if x != '-']
        truth = [x for x in truth if x != '.']
        truth = [x for x in truth if x != '[']
        truth = [x for x in truth if x != ']']
        #print(truth)
        if parsed == truth:
            titleTruth = 1
        
    except:
        pass
    
    # AUTHOR
    try:
        if str(df.loc[paperIndex, 'Author']) == "AuthorNotFound":
            authorTruth = 0.5
    except:
        pass
    try:
        if not df.loc[paperIndex, 'Author']:
            authorTruth = 0.5
    except:
        pass
    try:
        parsed = sorted(refDict['author'])
        truth = sorted("".join(df._get_value(paperIndex, "Author")))
        parsed = [x for x in parsed if x != ' ']
        parsed = [x for x in parsed if x != '-']
        parsed = [x for x in parsed if x != '.']
        parsed = [x for x in parsed if x != ',']
        truth = [x for x in truth if x != ' ']
        truth = [x for x in truth if x != '-']
        truth = [x for x in truth if x != '.']
        truth = [x for x in truth if x != ',']
        if parsed == truth:
            authorTruth = 1
            
        result = refDict['author'].split()
        result = [x.replace(".", "") for x in result]
        theString = result[1] + " " + result[2] + " " + result[0]
        parsed = sorted("".join(result))
        truth = sorted(theString)
        if parsed == truth:
            authorTruth = 1
        if theString == df._get_value(paperIndex, "Author")[0]:
            authorTruth = 1
    except:
        pass
    try:
        #groundTruthDict['Author'] = df.loc[paperIndex, 'Author']
        # Here we are comparing lists, so there's no need to cast to str
        if refDict['author'] == df.loc[paperIndex, 'Author']:
            authorTruth = 1

        try:
            if refDict['author'][0] == df.loc[paperIndex, 'Author'][0]:
                authorTruth = 1
        except:
            pass
    except:
        pass
    try:
        result = refDict['author'].split()
        result = [x.replace(".", "") for x in result]
        theString = result[1] + " " + result[0]
        if theString == df.loc[paperIndex, "Author"][0]:
            authorTruth = 1
    except:
        pass
    try:
        a = sorted(refDict['author'][0].replace(" ", ""))
        b = sorted(df.loc[paperIndex, 'Author'][0].replace(" ", ""))
        if a == b:
            authorTruth = 1
    except:
        pass
    try:
        if 'et al' in rawRefString:
            parsed = sorted(refDict['author'])
            truth = sorted(df._get_value(paperIndex, "Author")[0])
            parsed = [x for x in parsed if x != ' ']
            parsed = [x for x in parsed if x != '-']
            parsed = [x for x in parsed if x != '.']
            parsed = [x for x in parsed if x != ',']
            truth = [x for x in truth if x != ' ']
            truth = [x for x in truth if x != '-']
            truth = [x for x in truth if x != '.']
            truth = [x for x in truth if x != ',']
            if parsed == truth:
                authorTruth = 1
    except:
        pass
            

    # JOURNAL
    # One issue here is that the parsed output will be a journal name that is
    # in an abbreviated/truncated form. However, it's still clearly the same
    # journal name! This solution proposes that if we split up the ground truth
    # journal name and the parsed journal name (potentially truncated) into lists
    # where each element is a word, then if each word in the parsed name is a
    # substring of it's corresponding word in the ground truth name, we say
    # the parsed journal name is accurate. 
    try:
        if str(df.loc[paperIndex, 'JournalName']) == "JournalNotFound":
            journalTruth = 0.5
    except:
        pass
    try:
        if "." in df.loc[paperIndex, 'JournalName']:
            result = df.loc[paperIndex, 'JournalName'].split(".")
            if result[1].strip() == refDict['journal']:
                journalTruth = 1
    except:
        pass
    try:
        #groundTruthDict['JournalName'] = df.loc[paperIndex, 'JournalName']
        # First check the simple way
        if str(refDict['journal']) == str(df.loc[paperIndex, 'JournalName']):
            journalTruth = 1       
        else:    
            # Then check the truncated way
            truthList = df.loc[paperIndex, 'JournalName'].split()
            if 'and' in truthList:
                truthList.remove('and')
            if 'of' in truthList:
                truthList.remove('of')
            if 'The' in truthList:
                truthList.remove('The')
            parsedList = refDict['journal'].split()

            truncatedSituation = False
            if len(truthList) == len(parsedList):
                for i in range(len(truthList)):
                    if len(truthList[i]) != len(parsedList[i]):
                        truncatedSituation = True
                        break
            if truncatedSituation:
                #print('truncation')
                validTruncation = True
                for i in range(len(truthList)):
                    if parsedList[i].lower() not in truthList[i].lower():
                        #print("bad truncation")
                        #print("parsedList at i: ", parsedList[i])
                        #print("truthList at i: ", truthList[i])

                        validTruncation = False
                if validTruncation:
                    journalTruth = 1
        # Account for the situtation in which there is a year in the journal name
        s = str(df.loc[paperIndex, 'JournalName'])
        result = s[:s.find('(')].strip()
        if str(refDict['journal']) == result:
            journalTruth = 1
    except:
        pass
    try:
        parsed = sorted(refDict['journal'])
        truth = sorted(df._get_value(paperIndex, "JournalName"))
        parsed = [x for x in parsed if x != ' ']
        parsed = [x for x in parsed if x != '-']
        parsed = [x for x in parsed if x != '.']
        parsed = [x for x in parsed if x != ',']
        parsed = [x.lower() for x in parsed]
        parsed = sorted("".join(parsed))
        truth = [x for x in truth if x != ' ']
        truth = [x for x in truth if x != '-']
        truth = [x for x in truth if x != '.']
        truth = [x for x in truth if x != ',']
        truth = [x.lower() for x in truth]
        truth = sorted("".join(truth))
        if parsed == truth:
            journalTruth = 1
    except:
        pass

    # PAGES
    try:
        if str(df.loc[paperIndex, 'Page']) == "PageNotFound":
            pageTruth = 0.5
    except:
        pass
    try:
        #groundTruthDict['PageNumber'] = df.loc[paperIndex, 'PageNumber']
        if str(refDict['pages']) == str(df.loc[paperIndex, 'PageNumber']):
            pageTruth = 1
    except:
        pass
    
    # VOLUME
    try:
        if str(df.loc[paperIndex, 'Volume']) == "VolumeNotFound":
            volumeTruth = 0.5
    except:
        pass
    try:
        #groundTruthDict['Volume'] = df.loc[paperIndex, 'Volume']
        if str(refDict['volume']) == str(df.loc[paperIndex, 'Volume']):
            volumeTruth = 1
    except:
        pass
    
    # YEAR
    try:
        if str(df.loc[paperIndex, 'Year']) == "YearNotFound":
            yearTruth = 0.5
    except:
        pass
        
    try:
        #groundTruthDict['Year'] = df.loc[paperIndex, 'Year']
        if str(refDict['year']) == str(df.loc[paperIndex, 'Year']):
            yearTruth = 1
    except:
        pass
    
    # ISSUE
    try:
        if str(df.loc[paperIndex, 'Issue']) == "IssueNotFound":
            issueTruth = 0.5
    except:
        pass
    try:
        if str(refDict['issue'].strip()) == str(df._get_value(paperIndex, 'Issue').strip()):
            issueTruth = 1
    except:
        pass
    
    try:
        parsed = str(refDict['issue'])
        parsed = parsed.replace("(", "")
        parsed = parsed.replace(")", "")
        truth = str(df._get_value(paperIndex, 'Issue'))
        if parsed == truth:
            issueTruth = 1
    except:
        pass
    
    df2 = {'pubMedID': df.loc[paperIndex, 'PubMedIDs'],
           'title': titleTruth,'author': authorTruth,
          'journalName': journalTruth,
          'pageNumber': pageTruth,
          'volume': volumeTruth,
          'year': yearTruth,
          'issue': issueTruth}

    testResultsDF = testResultsDF.append(df2, ignore_index = True)
    
    print("Parsed reference dictionary (after edits): ")
    try:
        print(refDict['issue'])
        print()
    except:
        pass
    print("Raw referenece: ")
    print(rawRefString)
    print()

    print("Ground truth: ")
    print(df.loc[paperIndex, "Issue"])
    print()
    
    return testResultsDF


In [9]:
testParse(0)

Parsed reference before edits in parseReference: 
2 

Parsed reference dictionary (after edits): 
2 

Raw referenece: 
Makar, A B et al. “Formate assay in body fluids: application in methanol poisoning.” Biochemical medicine vol. 13,2 (1975): 117-26. doi:10.1016/0006-2944(75)90147-7

Ground truth: 
2



Unnamed: 0,pubMedID,title,author,journalName,pageNumber,volume,year,issue
0,1,1,1,1,1,1,1,1


In [10]:
for i in errorList:
    testParse(i)
    print()

NameError: name 'errorList' is not defined

In [11]:
# TESTING ALL ARTICLES IN THE JSON

def bigTestParse(style: str):
    # Set up test results df
    testResultsDF = pd.DataFrame(columns = ['pubMedID',
                                            'title',
                                            'author',
                                            'journalName',
                                            'pageNumber',
                                            'volume',
                                            'year',
                                            'issue'])
    fullLength = len(df)

    for paperIndex in range(len(df)):
        if paperIndex % 10 == 0:
            print(round((paperIndex/fullLength)*100, 2), " percent done")

        # Get raw reference string from the loaded df which has both ground truth and raw ref
        rawRefString = df._get_value(paperIndex, style)

        # Get parsed reference as a dictionary
        refDict = parseReference(rawRefString, True)

        groundTruthDict = {}

        titleTruth = 0
        authorTruth = 0
        journalTruth = 0
        pageTruth = 0
        volumeTruth = 0
        yearTruth = 0
        issueTruth = 0

        # TITLE
        try:
            if str(df._get_value(paperIndex, "Title")) == "TitleNotFound":
                titleTruth = 0.5
        except:
            pass
        try:
            #groundTruthDict['Title'] = df.loc[paperIndex, 'Title']
            if str(refDict['title']) == str(df._get_value(paperIndex, "Title")):
                titleTruth = 1

            parsed = sorted(refDict['title'])
            parsed = [x for x in parsed if x != ' ']
            parsed = [x for x in parsed if x != '-']
            parsed = [x for x in parsed if x != '.']
            parsed = [x for x in parsed if x != '[']
            parsed = [x for x in parsed if x != ']']
            #print(parsed)
            truth = sorted(df.loc[paperIndex, 'Title'])
            truth = [x for x in truth if x != ' ']
            truth = [x for x in truth if x != '-']
            truth = [x for x in truth if x != '.']
            truth = [x for x in truth if x != '[']
            truth = [x for x in truth if x != ']']
            #print(truth)
            if parsed == truth:
                titleTruth = 1
        except:
            pass

        # AUTHOR
        try:
            if str(df._get_value(paperIndex, "Author")) == "AuthorNotFound":
                authorTruth = 0.5
        except:
            pass
        try:
            if not df._get_value(paperIndex, "Author"):
                authorTruth = 0.5
        except:
            pass
        try:
            parsed = sorted(refDict['author'])
            truth = sorted("".join(df._get_value(paperIndex, "Author")))
            parsed = [x for x in parsed if x != ' ']
            parsed = [x for x in parsed if x != '-']
            parsed = [x for x in parsed if x != '.']
            parsed = [x for x in parsed if x != ',']
            truth = [x for x in truth if x != ' ']
            truth = [x for x in truth if x != '-']
            truth = [x for x in truth if x != '.']
            truth = [x for x in truth if x != ',']
            if parsed == truth:
                authorTruth = 1

            result = refDict['author'].split()
            result = [x.replace(".", "") for x in result]
            theString = result[1] + " " + result[2] + " " + result[0]
            parsed = sorted("".join(result))
            truth = sorted(theString)
            if parsed == truth:
                authorTruth = 1
            if theString == df._get_value(paperIndex, "Author")[0]:
                authorTruth = 1
        except:
            pass
        try:
            #groundTruthDict['Author'] = df.loc[paperIndex, 'Author']
            # Here we are comparing lists, so there's no need to cast to str
            if refDict['author'] == df._get_value(paperIndex, "Author"):
                authorTruth = 1

            try:
                if refDict['author'][0] == df._get_value(paperIndex, "Author")[0]:
                    authorTruth = 1
            except:
                pass
        except:
            pass
        try:
            result = refDict['author'].split()
            result = [x.replace(".", "") for x in result]
            theString = result[1] + " " + result[0]
            if theString == df._get_value(paperIndex, "Author")[0]:
                authorTruth = 1
        except:
            pass
        try:
            a = sorted(refDict['author'][0].replace(" ", ""))
            b = sorted(df._get_value(paperIndex, "Author")[0].replace(" ", ""))
            if a == b:
                authorTruth = 1
        except:
            pass
        try:
            if 'et al' in rawRefString:
                parsed = sorted(refDict['author'])
                truth = sorted(df._get_value(paperIndex, "Author")[0])
                parsed = [x for x in parsed if x != ' ']
                parsed = [x for x in parsed if x != '-']
                parsed = [x for x in parsed if x != '.']
                parsed = [x for x in parsed if x != ',']
                truth = [x for x in truth if x != ' ']
                truth = [x for x in truth if x != '-']
                truth = [x for x in truth if x != '.']
                truth = [x for x in truth if x != ',']
                if parsed == truth:
                    authorTruth = 1
        except:
            pass

        # JOURNAL
        # One issue here is that the parsed output will be a journal name that is
        # in an abbreviated/truncated form. However, it's still clearly the same
        # journal name! This solution proposes that if we split up the ground truth
        # journal name and the parsed journal name (potentially truncated) into lists
        # where each element is a word, then if each word in the parsed name is a
        # substring of it's corresponding word in the ground truth name, we say
        # the parsed journal name is accurate. 
        try:
            if str(df._get_value(paperIndex, "JournalName")) == "JournalNotFound":
                journalTruth = 0.5
        except:
            pass
        try:
            if "." in df._get_value(paperIndex, "JournalName"):
                result = df._get_value(paperIndex, "JournalName").split(".")
                if result[1].strip() == refDict['journal']:
                    journalTruth = 1
        except:
            pass
        try:
            #groundTruthDict['JournalName'] = df.loc[paperIndex, 'JournalName']
            # First check the simple way
            if str(refDict['journal']) == str(df._get_value(paperIndex, "JournalName")):
                journalTruth = 1       
            else:    
                # Then check the truncated way
                truthList = df._get_value(paperIndex, "JournalName").split()
                if 'and' in truthList:
                    truthList.remove('and')
                if 'of' in truthList:
                    truthList.remove('of')
                if 'The' in truthList:
                    truthList.remove('The')
                parsedList = refDict['journal'].split()

                truncatedSituation = False
                if len(truthList) == len(parsedList):
                    for i in range(len(truthList)):
                        if len(truthList[i]) != len(parsedList[i]):
                            truncatedSituation = True
                            break
                if truncatedSituation:
                    #print('truncation')
                    validTruncation = True
                    for i in range(len(truthList)):
                        if parsedList[i].lower() not in truthList[i].lower():
                            #print("bad truncation")
                            #print("parsedList at i: ", parsedList[i])
                            #print("truthList at i: ", truthList[i])

                            validTruncation = False
                    if validTruncation:
                        journalTruth = 1
            # Account for the situtation in which there is a year in the journal name
            s = str(df._get_value(paperIndex, "JournalName"))
            result = s[:s.find('(')].strip()
            if str(refDict['journal']) == result:
                journalTruth = 1
        except:
            pass
        try:
            parsed = sorted(refDict['journal'])
            truth = sorted(df._get_value(paperIndex, "JournalName"))
            parsed = [x for x in parsed if x != ' ']
            parsed = [x for x in parsed if x != '-']
            parsed = [x for x in parsed if x != '.']
            parsed = [x for x in parsed if x != ',']
            parsed = [x.lower() for x in parsed]
            parsed = sorted("".join(parsed))
            truth = [x for x in truth if x != ' ']
            truth = [x for x in truth if x != '-']
            truth = [x for x in truth if x != '.']
            truth = [x for x in truth if x != ',']
            truth = [x.lower() for x in truth]
            truth = sorted("".join(truth))
            if parsed == truth:
                journalTruth = 1
        except:
            pass

        # PAGES
        try:
            if str(df._get_value(paperIndex, 'Page')) == "PageNotFound":
                pageTruth = 0.5
        except:
            pass
        try:
            #groundTruthDict['PageNumber'] = df.loc[paperIndex, 'PageNumber']
            if str(refDict['pages']) == str(df.loc[paperIndex, 'PageNumber']):
                pageTruth = 1
        except:
            pass

        # VOLUME
        try:
            if str(df._get_value(paperIndex, "Volume")) == "VolumeNotFound":
                volumeTruth = 0.5
        except:
            pass
        try:
            #groundTruthDict['Volume'] = df.loc[paperIndex, 'Volume']
            if str(refDict['volume']) == str(df._get_value(paperIndex, "Volume")):
                volumeTruth = 1
        except:
            pass

        # YEAR
        try:
            if str(df._get_value(paperIndex, "Year")) == "YearNotFound":
                yearTruth = 0.5
        except:
            pass

        try:
            #groundTruthDict['Year'] = df.loc[paperIndex, 'Year']
            if str(refDict['year']) == str(df._get_value(paperIndex, "Year")):
                yearTruth = 1
        except:
            pass

        # ISSUE
        try:
            if str(df._get_value(paperIndex, "Issue")) == "IssueNotFound":
                issueTruth = 0.5
        except:
            pass
        try:
            if str(refDict['issue'].strip()) == str(df._get_value(paperIndex, 'Issue').strip()):
                issueTruth = 1
        except:
            pass

        try:
            parsed = str(refDict['issue'])
            parsed = parsed.replace("(", "")
            parsed = parsed.replace(")", "")
            truth = str(df._get_value(paperIndex, 'Issue'))
            if parsed == truth:
                issueTruth = 1
        except:
            pass

        df2 = {'pubMedID': df.loc[paperIndex, 'PubMedIDs'],
               'title': titleTruth,'author': authorTruth,
              'journalName': journalTruth,
              'pageNumber': pageTruth,
              'volume': volumeTruth,
              'year': yearTruth,
              'issue': issueTruth}

        testResultsDF = testResultsDF.append(df2, ignore_index = True)


    pd.set_option('display.float_format', lambda x: '%.5f' % x)
    av_column = testResultsDF.mean(axis=0)
    display(av_column)



In [12]:
testParse(parser.ref)

KeyError: 'Makar, A B et al. “Formate assay in body fluids: application in methanol poisoning.” Biochemical medicine vol. 13,2 (1975): 117-26. doi:10.1016/0006-2944(75)90147-7'

In [13]:
bigTestParse("mla")

0.0  percent done
0.03  percent done
0.07  percent done
0.1  percent done
0.13  percent done
0.17  percent done
0.2  percent done
0.23  percent done
0.27  percent done
0.3  percent done
0.33  percent done
0.37  percent done
0.4  percent done
0.43  percent done
0.47  percent done
0.5  percent done
0.53  percent done
0.57  percent done
0.6  percent done
0.63  percent done
0.67  percent done
0.7  percent done
0.73  percent done
0.77  percent done
0.8  percent done
0.83  percent done
0.87  percent done
0.9  percent done
0.93  percent done
0.97  percent done
1.0  percent done
1.03  percent done
1.07  percent done
1.1  percent done
1.13  percent done
1.17  percent done
1.2  percent done
1.23  percent done
1.27  percent done
1.3  percent done
1.33  percent done
1.37  percent done
1.4  percent done
1.43  percent done
1.47  percent done
1.5  percent done
1.53  percent done
1.57  percent done
1.6  percent done
1.63  percent done
1.67  percent done
1.7  percent done
1.73  percent done
1.77  perce

14.4  percent done
14.43  percent done
14.47  percent done
14.5  percent done
14.53  percent done
14.57  percent done
14.6  percent done
14.63  percent done
14.67  percent done
14.7  percent done
14.73  percent done
14.77  percent done
14.8  percent done
14.83  percent done
14.87  percent done
14.9  percent done
14.93  percent done
14.97  percent done
15.0  percent done
15.03  percent done
15.07  percent done
15.1  percent done
15.13  percent done
15.17  percent done
15.2  percent done
15.23  percent done
15.27  percent done
15.3  percent done
15.33  percent done
15.37  percent done
15.4  percent done
15.43  percent done
15.47  percent done
15.5  percent done
15.53  percent done
15.57  percent done
15.6  percent done
15.63  percent done
15.67  percent done
15.7  percent done
15.73  percent done
15.77  percent done
15.8  percent done
15.83  percent done
15.87  percent done
15.9  percent done
15.93  percent done
15.97  percent done
16.0  percent done
16.03  percent done
16.07  percent do

28.3  percent done
28.34  percent done
28.37  percent done
28.4  percent done
28.44  percent done
28.47  percent done
28.5  percent done
28.54  percent done
28.57  percent done
28.6  percent done
28.64  percent done
28.67  percent done
28.7  percent done
28.74  percent done
28.77  percent done
28.8  percent done
28.84  percent done
28.87  percent done
28.9  percent done
28.94  percent done
28.97  percent done
29.0  percent done
29.04  percent done
29.07  percent done
29.1  percent done
29.14  percent done
29.17  percent done
29.2  percent done
29.24  percent done
29.27  percent done
29.3  percent done
29.34  percent done
29.37  percent done
29.4  percent done
29.44  percent done
29.47  percent done
29.5  percent done
29.54  percent done
29.57  percent done
29.6  percent done
29.64  percent done
29.67  percent done
29.7  percent done
29.74  percent done
29.77  percent done
29.8  percent done
29.84  percent done
29.87  percent done
29.9  percent done
29.94  percent done
29.97  percent do

42.2  percent done
42.24  percent done
42.27  percent done
42.3  percent done
42.34  percent done
42.37  percent done
42.4  percent done
42.44  percent done
42.47  percent done
42.5  percent done
42.54  percent done
42.57  percent done
42.6  percent done
42.64  percent done
42.67  percent done
42.7  percent done
42.74  percent done
42.77  percent done
42.8  percent done
42.84  percent done
42.87  percent done
42.9  percent done
42.94  percent done
42.97  percent done
43.0  percent done
43.04  percent done
43.07  percent done
43.1  percent done
43.14  percent done
43.17  percent done
43.2  percent done
43.24  percent done
43.27  percent done
43.3  percent done
43.34  percent done
43.37  percent done
43.4  percent done
43.44  percent done
43.47  percent done
43.5  percent done
43.54  percent done
43.57  percent done
43.6  percent done
43.64  percent done
43.67  percent done
43.7  percent done
43.74  percent done
43.77  percent done
43.8  percent done
43.84  percent done
43.87  percent do

56.1  percent done
56.14  percent done
56.17  percent done
56.2  percent done
56.24  percent done
56.27  percent done
56.3  percent done
56.34  percent done
56.37  percent done
56.4  percent done
56.44  percent done
56.47  percent done
56.5  percent done
56.54  percent done
56.57  percent done
56.6  percent done
56.64  percent done
56.67  percent done
56.7  percent done
56.74  percent done
56.77  percent done
56.8  percent done
56.84  percent done
56.87  percent done
56.9  percent done
56.94  percent done
56.97  percent done
57.0  percent done
57.04  percent done
57.07  percent done
57.1  percent done
57.14  percent done
57.17  percent done
57.2  percent done
57.24  percent done
57.27  percent done
57.3  percent done
57.34  percent done
57.37  percent done
57.4  percent done
57.44  percent done
57.47  percent done
57.5  percent done
57.54  percent done
57.57  percent done
57.6  percent done
57.64  percent done
57.67  percent done
57.7  percent done
57.74  percent done
57.77  percent do

70.0  percent done
70.04  percent done
70.07  percent done
70.1  percent done
70.14  percent done
70.17  percent done
70.2  percent done
70.24  percent done
70.27  percent done
70.3  percent done
70.34  percent done
70.37  percent done
70.4  percent done
70.44  percent done
70.47  percent done
70.5  percent done
70.54  percent done
70.57  percent done
70.6  percent done
70.64  percent done
70.67  percent done
70.7  percent done
70.74  percent done
70.77  percent done
70.8  percent done
70.84  percent done
70.87  percent done
70.9  percent done
70.94  percent done
70.97  percent done
71.0  percent done
71.04  percent done
71.07  percent done
71.1  percent done
71.14  percent done
71.17  percent done
71.2  percent done
71.24  percent done
71.27  percent done
71.3  percent done
71.34  percent done
71.37  percent done
71.4  percent done
71.44  percent done
71.47  percent done
71.5  percent done
71.54  percent done
71.57  percent done
71.6  percent done
71.64  percent done
71.67  percent do

83.77  percent done
83.81  percent done
83.84  percent done
83.87  percent done
83.91  percent done
83.94  percent done
83.97  percent done
84.01  percent done
84.04  percent done
84.07  percent done
84.11  percent done
84.14  percent done
84.17  percent done
84.21  percent done
84.24  percent done
84.27  percent done
84.31  percent done
84.34  percent done
84.37  percent done
84.41  percent done
84.44  percent done
84.47  percent done
84.51  percent done
84.54  percent done
84.57  percent done
84.61  percent done
84.64  percent done
84.67  percent done
84.71  percent done
84.74  percent done
84.77  percent done
84.81  percent done
84.84  percent done
84.87  percent done
84.91  percent done
84.94  percent done
84.97  percent done
85.01  percent done
85.04  percent done
85.07  percent done
85.11  percent done
85.14  percent done
85.17  percent done
85.21  percent done
85.24  percent done
85.27  percent done
85.31  percent done
85.34  percent done
85.37  percent done
85.41  percent done


97.44  percent done
97.47  percent done
97.51  percent done
97.54  percent done
97.57  percent done
97.61  percent done
97.64  percent done
97.67  percent done
97.71  percent done
97.74  percent done
97.77  percent done
97.81  percent done
97.84  percent done
97.87  percent done
97.91  percent done
97.94  percent done
97.97  percent done
98.01  percent done
98.04  percent done
98.07  percent done
98.11  percent done
98.14  percent done
98.17  percent done
98.21  percent done
98.24  percent done
98.27  percent done
98.31  percent done
98.34  percent done
98.37  percent done
98.41  percent done
98.44  percent done
98.47  percent done
98.51  percent done
98.54  percent done
98.57  percent done
98.61  percent done
98.64  percent done
98.67  percent done
98.71  percent done
98.74  percent done
98.77  percent done
98.81  percent done
98.84  percent done
98.87  percent done
98.91  percent done
98.94  percent done
98.97  percent done
99.01  percent done
99.04  percent done
99.07  percent done


pubMedID      15631.14018
title             0.88953
author            0.97633
journalName       0.86046
pageNumber        0.97993
volume            0.96766
year              0.99983
issue             0.95425
dtype: float64

In [14]:
bigTestParse("apa")

0.0  percent done
0.03  percent done
0.07  percent done
0.1  percent done
0.13  percent done
0.17  percent done
0.2  percent done
0.23  percent done
0.27  percent done
0.3  percent done
0.33  percent done
0.37  percent done
0.4  percent done
0.43  percent done
0.47  percent done
0.5  percent done
0.53  percent done
0.57  percent done
0.6  percent done
0.63  percent done
0.67  percent done
0.7  percent done
0.73  percent done
0.77  percent done
0.8  percent done
0.83  percent done
0.87  percent done
0.9  percent done
0.93  percent done
0.97  percent done
1.0  percent done
1.03  percent done
1.07  percent done
1.1  percent done
1.13  percent done
1.17  percent done
1.2  percent done
1.23  percent done
1.27  percent done
1.3  percent done
1.33  percent done
1.37  percent done
1.4  percent done
1.43  percent done
1.47  percent done
1.5  percent done
1.53  percent done
1.57  percent done
1.6  percent done
1.63  percent done
1.67  percent done
1.7  percent done
1.73  percent done
1.77  perce

14.4  percent done
14.43  percent done
14.47  percent done
14.5  percent done
14.53  percent done
14.57  percent done
14.6  percent done
14.63  percent done
14.67  percent done
14.7  percent done
14.73  percent done
14.77  percent done
14.8  percent done
14.83  percent done
14.87  percent done
14.9  percent done
14.93  percent done
14.97  percent done
15.0  percent done
15.03  percent done
15.07  percent done
15.1  percent done
15.13  percent done
15.17  percent done
15.2  percent done
15.23  percent done
15.27  percent done
15.3  percent done
15.33  percent done
15.37  percent done
15.4  percent done
15.43  percent done
15.47  percent done
15.5  percent done
15.53  percent done
15.57  percent done
15.6  percent done
15.63  percent done
15.67  percent done
15.7  percent done
15.73  percent done
15.77  percent done
15.8  percent done
15.83  percent done
15.87  percent done
15.9  percent done
15.93  percent done
15.97  percent done
16.0  percent done
16.03  percent done
16.07  percent do

28.3  percent done
28.34  percent done
28.37  percent done
28.4  percent done
28.44  percent done
28.47  percent done
28.5  percent done
28.54  percent done
28.57  percent done
28.6  percent done
28.64  percent done
28.67  percent done
28.7  percent done
28.74  percent done
28.77  percent done
28.8  percent done
28.84  percent done
28.87  percent done
28.9  percent done
28.94  percent done
28.97  percent done
29.0  percent done
29.04  percent done
29.07  percent done
29.1  percent done
29.14  percent done
29.17  percent done
29.2  percent done
29.24  percent done
29.27  percent done
29.3  percent done
29.34  percent done
29.37  percent done
29.4  percent done
29.44  percent done
29.47  percent done
29.5  percent done
29.54  percent done
29.57  percent done
29.6  percent done
29.64  percent done
29.67  percent done
29.7  percent done
29.74  percent done
29.77  percent done
29.8  percent done
29.84  percent done
29.87  percent done
29.9  percent done
29.94  percent done
29.97  percent do

42.2  percent done
42.24  percent done
42.27  percent done
42.3  percent done
42.34  percent done
42.37  percent done
42.4  percent done
42.44  percent done
42.47  percent done
42.5  percent done
42.54  percent done
42.57  percent done
42.6  percent done
42.64  percent done
42.67  percent done
42.7  percent done
42.74  percent done
42.77  percent done
42.8  percent done
42.84  percent done
42.87  percent done
42.9  percent done
42.94  percent done
42.97  percent done
43.0  percent done
43.04  percent done
43.07  percent done
43.1  percent done
43.14  percent done
43.17  percent done
43.2  percent done
43.24  percent done
43.27  percent done
43.3  percent done
43.34  percent done
43.37  percent done
43.4  percent done
43.44  percent done
43.47  percent done
43.5  percent done
43.54  percent done
43.57  percent done
43.6  percent done
43.64  percent done
43.67  percent done
43.7  percent done
43.74  percent done
43.77  percent done
43.8  percent done
43.84  percent done
43.87  percent do

56.1  percent done
56.14  percent done
56.17  percent done
56.2  percent done
56.24  percent done
56.27  percent done
56.3  percent done
56.34  percent done
56.37  percent done
56.4  percent done
56.44  percent done
56.47  percent done
56.5  percent done
56.54  percent done
56.57  percent done
56.6  percent done
56.64  percent done
56.67  percent done
56.7  percent done
56.74  percent done
56.77  percent done
56.8  percent done
56.84  percent done
56.87  percent done
56.9  percent done
56.94  percent done
56.97  percent done
57.0  percent done
57.04  percent done
57.07  percent done
57.1  percent done
57.14  percent done
57.17  percent done
57.2  percent done
57.24  percent done
57.27  percent done
57.3  percent done
57.34  percent done
57.37  percent done
57.4  percent done
57.44  percent done
57.47  percent done
57.5  percent done
57.54  percent done
57.57  percent done
57.6  percent done
57.64  percent done
57.67  percent done
57.7  percent done
57.74  percent done
57.77  percent do

70.0  percent done
70.04  percent done
70.07  percent done
70.1  percent done
70.14  percent done
70.17  percent done
70.2  percent done
70.24  percent done
70.27  percent done
70.3  percent done
70.34  percent done
70.37  percent done
70.4  percent done
70.44  percent done
70.47  percent done
70.5  percent done
70.54  percent done
70.57  percent done
70.6  percent done
70.64  percent done
70.67  percent done
70.7  percent done
70.74  percent done
70.77  percent done
70.8  percent done
70.84  percent done
70.87  percent done
70.9  percent done
70.94  percent done
70.97  percent done
71.0  percent done
71.04  percent done
71.07  percent done
71.1  percent done
71.14  percent done
71.17  percent done
71.2  percent done
71.24  percent done
71.27  percent done
71.3  percent done
71.34  percent done
71.37  percent done
71.4  percent done
71.44  percent done
71.47  percent done
71.5  percent done
71.54  percent done
71.57  percent done
71.6  percent done
71.64  percent done
71.67  percent do

83.77  percent done
83.81  percent done
83.84  percent done
83.87  percent done
83.91  percent done
83.94  percent done
83.97  percent done
84.01  percent done
84.04  percent done
84.07  percent done
84.11  percent done
84.14  percent done
84.17  percent done
84.21  percent done
84.24  percent done
84.27  percent done
84.31  percent done
84.34  percent done
84.37  percent done
84.41  percent done
84.44  percent done
84.47  percent done
84.51  percent done
84.54  percent done
84.57  percent done
84.61  percent done
84.64  percent done
84.67  percent done
84.71  percent done
84.74  percent done
84.77  percent done
84.81  percent done
84.84  percent done
84.87  percent done
84.91  percent done
84.94  percent done
84.97  percent done
85.01  percent done
85.04  percent done
85.07  percent done
85.11  percent done
85.14  percent done
85.17  percent done
85.21  percent done
85.24  percent done
85.27  percent done
85.31  percent done
85.34  percent done
85.37  percent done
85.41  percent done


97.44  percent done
97.47  percent done
97.51  percent done
97.54  percent done
97.57  percent done
97.61  percent done
97.64  percent done
97.67  percent done
97.71  percent done
97.74  percent done
97.77  percent done
97.81  percent done
97.84  percent done
97.87  percent done
97.91  percent done
97.94  percent done
97.97  percent done
98.01  percent done
98.04  percent done
98.07  percent done
98.11  percent done
98.14  percent done
98.17  percent done
98.21  percent done
98.24  percent done
98.27  percent done
98.31  percent done
98.34  percent done
98.37  percent done
98.41  percent done
98.44  percent done
98.47  percent done
98.51  percent done
98.54  percent done
98.57  percent done
98.61  percent done
98.64  percent done
98.67  percent done
98.71  percent done
98.74  percent done
98.77  percent done
98.81  percent done
98.84  percent done
98.87  percent done
98.91  percent done
98.94  percent done
98.97  percent done
99.01  percent done
99.04  percent done
99.07  percent done


pubMedID      15631.14018
title             0.87476
author            0.97778
journalName       0.87039
pageNumber        0.95466
volume            0.95550
year              1.00000
issue             0.95885
dtype: float64

In [None]:
bigTestParse("ama")

0.0  percent done
0.03  percent done
0.07  percent done
0.1  percent done
0.13  percent done
0.17  percent done
0.2  percent done
0.23  percent done
0.27  percent done
0.3  percent done
0.33  percent done
0.37  percent done
0.4  percent done
0.43  percent done
0.47  percent done
0.5  percent done
0.53  percent done
0.57  percent done
0.6  percent done
0.63  percent done
0.67  percent done
0.7  percent done
0.73  percent done
0.77  percent done
0.8  percent done
0.83  percent done
0.87  percent done
0.9  percent done
0.93  percent done
0.97  percent done
1.0  percent done
1.03  percent done
1.07  percent done
1.1  percent done
1.13  percent done
1.17  percent done
1.2  percent done
1.23  percent done
1.27  percent done
1.3  percent done
1.33  percent done
1.37  percent done
1.4  percent done
1.43  percent done
1.47  percent done
1.5  percent done
1.53  percent done
1.57  percent done
1.6  percent done
1.63  percent done
1.67  percent done
1.7  percent done
1.73  percent done
1.77  perce

14.4  percent done
14.43  percent done
14.47  percent done
14.5  percent done
14.53  percent done
14.57  percent done
14.6  percent done
14.63  percent done
14.67  percent done
14.7  percent done
14.73  percent done
14.77  percent done
14.8  percent done
14.83  percent done
14.87  percent done
14.9  percent done
14.93  percent done
14.97  percent done
15.0  percent done
15.03  percent done
15.07  percent done
15.1  percent done
15.13  percent done
15.17  percent done
15.2  percent done
15.23  percent done
15.27  percent done
15.3  percent done
15.33  percent done
15.37  percent done
15.4  percent done
15.43  percent done
15.47  percent done
15.5  percent done
15.53  percent done
15.57  percent done
15.6  percent done
15.63  percent done
15.67  percent done
15.7  percent done
15.73  percent done
15.77  percent done
15.8  percent done
15.83  percent done
15.87  percent done
15.9  percent done
15.93  percent done
15.97  percent done
16.0  percent done
16.03  percent done
16.07  percent do

28.3  percent done
28.34  percent done
28.37  percent done
28.4  percent done
28.44  percent done
28.47  percent done
28.5  percent done
28.54  percent done
28.57  percent done
28.6  percent done
28.64  percent done
28.67  percent done
28.7  percent done
28.74  percent done
28.77  percent done
28.8  percent done
28.84  percent done
28.87  percent done
28.9  percent done
28.94  percent done
28.97  percent done
29.0  percent done
29.04  percent done
29.07  percent done
29.1  percent done
29.14  percent done
29.17  percent done
29.2  percent done
29.24  percent done
29.27  percent done
29.3  percent done
29.34  percent done
29.37  percent done
29.4  percent done
29.44  percent done
29.47  percent done
29.5  percent done
29.54  percent done
29.57  percent done
29.6  percent done
29.64  percent done
29.67  percent done
29.7  percent done
29.74  percent done
29.77  percent done
29.8  percent done
29.84  percent done
29.87  percent done
29.9  percent done
29.94  percent done
29.97  percent do

42.2  percent done
42.24  percent done
42.27  percent done
42.3  percent done
42.34  percent done
42.37  percent done
42.4  percent done
42.44  percent done
42.47  percent done
42.5  percent done
42.54  percent done
42.57  percent done
42.6  percent done
42.64  percent done
42.67  percent done
42.7  percent done
42.74  percent done
42.77  percent done
42.8  percent done
42.84  percent done
42.87  percent done
42.9  percent done
42.94  percent done
42.97  percent done
43.0  percent done
43.04  percent done
43.07  percent done
43.1  percent done
43.14  percent done
43.17  percent done
43.2  percent done
43.24  percent done
43.27  percent done
43.3  percent done
43.34  percent done
43.37  percent done
43.4  percent done
43.44  percent done
43.47  percent done
43.5  percent done
43.54  percent done
43.57  percent done
43.6  percent done
43.64  percent done
43.67  percent done
43.7  percent done
43.74  percent done
43.77  percent done
43.8  percent done
43.84  percent done
43.87  percent do

56.1  percent done
56.14  percent done
56.17  percent done
56.2  percent done
56.24  percent done
56.27  percent done
56.3  percent done
56.34  percent done
56.37  percent done
56.4  percent done
56.44  percent done
56.47  percent done
56.5  percent done
56.54  percent done
56.57  percent done
56.6  percent done
56.64  percent done
56.67  percent done
56.7  percent done
56.74  percent done
56.77  percent done
56.8  percent done
56.84  percent done
56.87  percent done
56.9  percent done
56.94  percent done
56.97  percent done
57.0  percent done
57.04  percent done
57.07  percent done
57.1  percent done
57.14  percent done
57.17  percent done
57.2  percent done
57.24  percent done
57.27  percent done
57.3  percent done
57.34  percent done
57.37  percent done
57.4  percent done
57.44  percent done
57.47  percent done
57.5  percent done
57.54  percent done
57.57  percent done
57.6  percent done
57.64  percent done
57.67  percent done
57.7  percent done
57.74  percent done
57.77  percent do

70.0  percent done
70.04  percent done
70.07  percent done
70.1  percent done
70.14  percent done
70.17  percent done
70.2  percent done
70.24  percent done
70.27  percent done
70.3  percent done
70.34  percent done
70.37  percent done
70.4  percent done
70.44  percent done
70.47  percent done
70.5  percent done
70.54  percent done
70.57  percent done
70.6  percent done
70.64  percent done
70.67  percent done
70.7  percent done
70.74  percent done
70.77  percent done
70.8  percent done
70.84  percent done
70.87  percent done
70.9  percent done
70.94  percent done
70.97  percent done
71.0  percent done
71.04  percent done
71.07  percent done
71.1  percent done
71.14  percent done
71.17  percent done
71.2  percent done
71.24  percent done
71.27  percent done
71.3  percent done
71.34  percent done
71.37  percent done
71.4  percent done
71.44  percent done
71.47  percent done
71.5  percent done
71.54  percent done
71.57  percent done
71.6  percent done
71.64  percent done
71.67  percent do

83.77  percent done
83.81  percent done
83.84  percent done
83.87  percent done
83.91  percent done
83.94  percent done
83.97  percent done
84.01  percent done
84.04  percent done
84.07  percent done
84.11  percent done
84.14  percent done
84.17  percent done
84.21  percent done
84.24  percent done
84.27  percent done
84.31  percent done
84.34  percent done
84.37  percent done
84.41  percent done
84.44  percent done
84.47  percent done
84.51  percent done
84.54  percent done
84.57  percent done
84.61  percent done
84.64  percent done
84.67  percent done
84.71  percent done
84.74  percent done
84.77  percent done
84.81  percent done
84.84  percent done
84.87  percent done
84.91  percent done
84.94  percent done
84.97  percent done
85.01  percent done
85.04  percent done
85.07  percent done
85.11  percent done
85.14  percent done
85.17  percent done
85.21  percent done
85.24  percent done
85.27  percent done
85.31  percent done
85.34  percent done
85.37  percent done
85.41  percent done


97.44  percent done
97.47  percent done
97.51  percent done
97.54  percent done
97.57  percent done
97.61  percent done
97.64  percent done
97.67  percent done
97.71  percent done
97.74  percent done
97.77  percent done
97.81  percent done
97.84  percent done
97.87  percent done
97.91  percent done
97.94  percent done
97.97  percent done
98.01  percent done
98.04  percent done
98.07  percent done
98.11  percent done
98.14  percent done
98.17  percent done
98.21  percent done
98.24  percent done
98.27  percent done
98.31  percent done
98.34  percent done
98.37  percent done
98.41  percent done
98.44  percent done
98.47  percent done
98.51  percent done
98.54  percent done
98.57  percent done
98.61  percent done
98.64  percent done
98.67  percent done
98.71  percent done
98.74  percent done
98.77  percent done
98.81  percent done
98.84  percent done
98.87  percent done
98.91  percent done
98.94  percent done
98.97  percent done
99.01  percent done
99.04  percent done
99.07  percent done


In [50]:
# TESTING ALL ARTICLES IN THE JSON

errorList = []
style = 'mla'

testResultsDF = pd.DataFrame(columns = ['pubMedID',
                                            'title',
                                            'author',
                                            'journalName',
                                            'pageNumber',
                                            'volume',
                                            'year',
                                            'issue'])
fullLength = len(df)

for paperIndex in range(len(df)):
    if paperIndex % 10 == 0:
        print(round((paperIndex/fullLength)*100, 2), " percent done")

    # Get raw reference string from the loaded df which has both ground truth and raw ref
    rawRefString = df.loc[paperIndex, style]

    # Get parsed reference as a dictionary
    refDict = parseReference(rawRefString, True)

    groundTruthDict = {}

    titleTruth = 0
    authorTruth = 0
    journalTruth = 0
    pageTruth = 0
    volumeTruth = 0
    yearTruth = 0
    issueTruth = 0

    # TITLE
    try:
        if str(df.loc[paperIndex, 'Title']) == "TitleNotFound":
            titleTruth = 0.5
    except:
        pass
    try:
        #groundTruthDict['Title'] = df.loc[paperIndex, 'Title']
        if str(refDict['title']) == str(df.loc[paperIndex, 'Title']):
            titleTruth = 1
            
        parsed = sorted(refDict['title'])
        parsed = [x for x in parsed if x != ' ']
        parsed = [x for x in parsed if x != '-']
        parsed = [x for x in parsed if x != '.']
        parsed = [x for x in parsed if x != '[']
        parsed = [x for x in parsed if x != ']']
        #print(parsed)
        truth = sorted(df.loc[paperIndex, 'Title'])
        truth = [x for x in truth if x != ' ']
        truth = [x for x in truth if x != '-']
        truth = [x for x in truth if x != '.']
        truth = [x for x in truth if x != '[']
        truth = [x for x in truth if x != ']']
        #print(truth)
        if parsed == truth:
            titleTruth = 1
    except:
        pass
    
    # AUTHOR
    try:
        if str(df.loc[paperIndex, 'Author']) == "AuthorNotFound":
            authorTruth = 0.5
    except:
        pass
    try:
        if not df.loc[paperIndex, 'Author']:
            authorTruth = 0.5
    except:
        pass
    try:
        parsed = sorted(refDict['author'])
        truth = sorted("".join(df._get_value(paperIndex, "Author")))
        parsed = [x for x in parsed if x != ' ']
        parsed = [x for x in parsed if x != '-']
        parsed = [x for x in parsed if x != '.']
        parsed = [x for x in parsed if x != ',']
        truth = [x for x in truth if x != ' ']
        truth = [x for x in truth if x != '-']
        truth = [x for x in truth if x != '.']
        truth = [x for x in truth if x != ',']
        if parsed == truth:
            authorTruth = 1
            
        result = refDict['author'].split()
        result = [x.replace(".", "") for x in result]
        theString = result[1] + " " + result[2] + " " + result[0]
        parsed = sorted("".join(result))
        truth = sorted(theString)
        if parsed == truth:
            authorTruth = 1
        if theString == df._get_value(paperIndex, "Author")[0]:
            authorTruth = 1
    except:
        pass
    try:
        #groundTruthDict['Author'] = df.loc[paperIndex, 'Author']
        # Here we are comparing lists, so there's no need to cast to str
        if refDict['author'] == df.loc[paperIndex, 'Author']:
            authorTruth = 1

        try:
            if refDict['author'][0] == df.loc[paperIndex, 'Author'][0]:
                authorTruth = 1
        except:
            pass
    except:
        pass
    try:
        result = refDict['author'].split()
        result = [x.replace(".", "") for x in result]
        theString = result[1] + " " + result[0]
        if theString == df.loc[paperIndex, "Author"][0]:
            authorTruth = 1
    except:
        pass
    try:
        a = sorted(refDict['author'][0].replace(" ", ""))
        b = sorted(df.loc[paperIndex, 'Author'][0].replace(" ", ""))
        if a == b:
            authorTruth = 1
    except:
        pass
    try:
        if 'et al' in rawRefString:
            parsed = sorted(refDict['author'])
            truth = sorted(df._get_value(paperIndex, "Author")[0])
            parsed = [x for x in parsed if x != ' ']
            parsed = [x for x in parsed if x != '-']
            parsed = [x for x in parsed if x != '.']
            parsed = [x for x in parsed if x != ',']
            truth = [x for x in truth if x != ' ']
            truth = [x for x in truth if x != '-']
            truth = [x for x in truth if x != '.']
            truth = [x for x in truth if x != ',']
            if parsed == truth:
                authorTruth = 1
    except:
        pass

    # JOURNAL
    # One issue here is that the parsed output will be a journal name that is
    # in an abbreviated/truncated form. However, it's still clearly the same
    # journal name! This solution proposes that if we split up the ground truth
    # journal name and the parsed journal name (potentially truncated) into lists
    # where each element is a word, then if each word in the parsed name is a
    # substring of it's corresponding word in the ground truth name, we say
    # the parsed journal name is accurate. 
    try:
        if str(df.loc[paperIndex, 'JournalName']) == "JournalNotFound":
            journalTruth = 0.5
    except:
        pass
    try:
        if "." in df.loc[paperIndex, 'JournalName']:
            result = df.loc[paperIndex, 'JournalName'].split(".")
            if result[1].strip() == refDict['journal']:
                journalTruth = 1
    except:
        pass
    try:
        #groundTruthDict['JournalName'] = df.loc[paperIndex, 'JournalName']
        # First check the simple way
        if str(refDict['journal']) == str(df.loc[paperIndex, 'JournalName']):
            journalTruth = 1       
        else:    
            # Then check the truncated way
            truthList = df.loc[paperIndex, 'JournalName'].split()
            if 'and' in truthList:
                truthList.remove('and')
            if 'of' in truthList:
                truthList.remove('of')
            if 'The' in truthList:
                truthList.remove('The')
            parsedList = refDict['journal'].split()

            truncatedSituation = False
            if len(truthList) == len(parsedList):
                for i in range(len(truthList)):
                    if len(truthList[i]) != len(parsedList[i]):
                        truncatedSituation = True
                        break
            if truncatedSituation:
                #print('truncation')
                validTruncation = True
                for i in range(len(truthList)):
                    if parsedList[i].lower() not in truthList[i].lower():
                        #print("bad truncation")
                        #print("parsedList at i: ", parsedList[i])
                        #print("truthList at i: ", truthList[i])

                        validTruncation = False
                if validTruncation:
                    journalTruth = 1
        # Account for the situtation in which there is a year in the journal name
        s = str(df.loc[paperIndex, 'JournalName'])
        result = s[:s.find('(')].strip()
        if str(refDict['journal']) == result:
            journalTruth = 1
    except:
        pass
    try:
        parsed = sorted(refDict['journal'])
        truth = sorted(df._get_value(paperIndex, "JournalName"))
        parsed = [x for x in parsed if x != ' ']
        parsed = [x for x in parsed if x != '-']
        parsed = [x for x in parsed if x != '.']
        parsed = [x for x in parsed if x != ',']
        parsed = [x.lower() for x in parsed]
        parsed = sorted("".join(parsed))
        truth = [x for x in truth if x != ' ']
        truth = [x for x in truth if x != '-']
        truth = [x for x in truth if x != '.']
        truth = [x for x in truth if x != ',']
        truth = [x.lower() for x in truth]
        truth = sorted("".join(truth))
        if parsed == truth:
            journalTruth = 1
    except:
        pass

    # PAGES
    try:
        if str(df.loc[paperIndex, 'Page']) == "PageNotFound":
            pageTruth = 0.5
    except:
        pass
    try:
        #groundTruthDict['PageNumber'] = df.loc[paperIndex, 'PageNumber']
        if str(refDict['pages']) == str(df.loc[paperIndex, 'PageNumber']):
            pageTruth = 1
    except:
        pass
    
    # VOLUME
    try:
        if str(df.loc[paperIndex, 'Volume']) == "VolumeNotFound":
            volumeTruth = 0.5
    except:
        pass
    try:
        #groundTruthDict['Volume'] = df.loc[paperIndex, 'Volume']
        if str(refDict['volume']) == str(df.loc[paperIndex, 'Volume']):
            volumeTruth = 1
    except:
        pass
    
    # YEAR
    try:
        if str(df.loc[paperIndex, 'Year']) == "YearNotFound":
            yearTruth = 0.5
    except:
        pass
        
    try:
        #groundTruthDict['Year'] = df.loc[paperIndex, 'Year']
        if str(refDict['year']) == str(df.loc[paperIndex, 'Year']):
            yearTruth = 1
    except:
        pass
    
    # ISSUE
    try:
        if str(df.loc[paperIndex, 'Issue']) == "IssueNotFound":
            issueTruth = 0.5
    except:
        pass
    try:
        if str(refDict['issue'].strip()) == str(df._get_value(paperIndex, 'Issue').strip()):
            issueTruth = 1
    except:
        pass
    
    try:
        parsed = str(refDict['issue'])
        parsed = parsed.replace("(", "")
        parsed = parsed.replace(")", "")
        truth = str(df._get_value(paperIndex, 'Issue'))
        if parsed == truth:
            issueTruth = 1
    except:
        pass
    
    df2 = {'pubMedID': df.loc[paperIndex, 'PubMedIDs'],
           'title': titleTruth,'author': authorTruth,
          'journalName': journalTruth,
          'pageNumber': pageTruth,
          'volume': volumeTruth,
          'year': yearTruth,
          'issue': issueTruth}

    testResultsDF = testResultsDF.append(df2, ignore_index = True)
    if issueTruth == 0:
        errorList.append(paperIndex)
        print("got one!")


# pd.set_option('display.float_format', lambda x: '%.5f' % x)
# av_column = testResultsDF.mean(axis=0)
# display(av_column)



0.0  percent done
0.03  percent done
0.07  percent done
0.1  percent done
0.13  percent done
0.17  percent done
0.2  percent done
0.23  percent done
0.27  percent done
0.3  percent done
0.33  percent done
0.37  percent done
0.4  percent done
0.43  percent done
0.47  percent done
0.5  percent done
0.53  percent done
0.57  percent done
0.6  percent done
0.63  percent done
0.67  percent done
0.7  percent done
0.73  percent done
0.77  percent done
0.8  percent done
0.83  percent done
0.87  percent done
0.9  percent done
0.93  percent done
0.97  percent done
1.0  percent done
1.03  percent done
1.07  percent done
1.1  percent done
1.13  percent done
1.17  percent done
1.2  percent done
1.23  percent done
1.27  percent done
1.3  percent done
1.33  percent done
1.37  percent done
1.4  percent done
1.43  percent done
1.47  percent done
1.5  percent done
1.53  percent done
1.57  percent done
1.6  percent done
1.63  percent done
1.67  percent done
1.7  percent done
1.73  percent done
1.77  perce

KeyboardInterrupt: 

In [None]:
df._get_value(14, "ama")

In [38]:
errorList

[0, 1, 2, 3, 4, 5, 6, 7, 8]

In [None]:
paperIndex

In [None]:
pd.set_option('display.float_format', lambda x: '%.5f' % x)
av_column = testResultsDF.mean(axis=0)
display(av_column)