## Metrics on everything

### Imports

In [1]:
import pandas as pd
import numpy as np
import os
import difflib
import re
import json
import spacy
import en_core_web_lg
nlp_lg = spacy.load('en_core_web_lg')
from spacy.matcher   import Matcher
from IPython.display import clear_output

### Functions

In [2]:
# Returns a list of lists of files contained within folders
# Pass the root folder as a string eg: 'folder\\'
def listAllFilePairs(rootFolder):
    
    # Start with empty list
    returnList = list()
    
    # Walk down from the source folder
    for (dirpath, dirnames, filenames) in os.walk(rootFolder):
        
        # If there are files in the folder
        if (len(filenames) > 0):
            
            # New list
            files = []
            
            # Add the list of filenames to the return list
            for fileName in filenames:
                
                # Add to the list of files
                files += [os.path.join(dirpath, fileName)]
                                       
            # Add to the return list
            returnList += [files]
                                       
    # Finally, return the list
    return returnList

In [3]:
# Returns a list of tuples contains all paths and files of a specific type in a directory and its sub directory
# Pass an extension to override .json
def listAllFiles(folder, extension = '.json'):
    files = list()
    for (dirpath, dirnames, filenames) in os.walk(folder):
        files += [(dirpath, file) for file in filenames if file.endswith(extension)]
    return files

In [182]:
def getEntitiesandRelations(annotationsFile, txtFile):

    # Open JSON file with annotations
    file                      = open(annotationsFile, encoding = "utf-8")
    annotationsDoc            = file.read()
    file.close()

    # Load the annotations
    annotations               = json.loads(annotationsDoc)

    # Read the matching .txt file
    file                      = open(txtFile, encoding = "utf-8")
    plainText                 = file.read()
    file.close()

    # tokenize the .txt file
    doc                       = nlp_lg(plainText)

    # Extract relevant information from JSON files for entities
    entities                  = extractEntities(annotations)

    # Offset the entities
    entities                  = offsetEntities(entities, plainText)
#
    # Add the start token to the dataframe
    entities['Start_Token']   = entities.adj_Start.apply(lambda x: get_token_num_for_char(doc, x))

    # Add the end token to the dataframe
    entities['End_Token']     = [get_token_num_for_end_char(doc, entities.loc[X]['Start_Token'], entities.loc[X]['adj_doctext']) for X in entities.index]

    #Create a dictionary for relationship indices
    positionDict              = dict(zip(entities.Start_Index, entities.adj_Start))

    # Filter to specific columns 
    entities                  = entities[['adj_doctext', 'Start_Token','End_Token','adj_Start', 'adj_End', 'classId']]
       
    # Add columns back in
    entities.columns          = ['Text', 'Start_Token','End_Token','Start_Index','End_Index','classId']
   
    # Get rid of the junk rows
    if (annotationsFile.split('\\')[1] == 'OldSet'):
        entities = filterOld(entities)
    else:
        entities = filterNew(entities)
  
    # Set that start token
    entities['Sentence']      = [doc[entities.loc[X]['Start_Token']].sent for X in entities.index]
  
    # Find abstract entities
    entities                  = findAbstractEntities(entities, doc)
     
    # Calculate the TFSIF
    entities['max_TFISF']     = calculateTFISF(zip(entities.Start_Token, entities.End_Token), doc) 
  
    # I think this is treated as a boolean value, based on what the value os is_sent_start is but I don't know what is_sent_start is
    entities['Sent_Start']    = [ # List comprehension
                                  1 if   doc[entities.loc[index]['Start_Token']].is_sent_start == True 
                                  else 0 
                                  for index in entities.index
                                ]

    # Add the name of the document to a document column in the entities dataframe
    entities['Document']      = txtFile.split('\\')[-1][:-4] 
    entities['AnnTxtFile']    = txtFile

    # Make a lit of columns to downcast
    cols_to_downcast          = ['Start_Token', 
                                 'End_Token', 
                                 'Start_Index',
                                 'End_Index', 
                                 'inAbstract300', 
                                 'inAbstract500', 
                                 'Sent_Start']

    # Downcast the columns
    entities[cols_to_downcast] = entities[cols_to_downcast].apply(pd.to_numeric, downcast='integer')

    # Create dataframes of annotated species and locations 
    if (annotationsFile.split('\\')[1] == 'OldSet'):
        species   = filterSpeciesOld(entities)
        locations = filterLocationsOld(entities)
    else:
        species   = filterSpeciesNew(entities)
        locations = filterLocationsNew(entities)
        
    species.drop(columns   = ['classId'])  
    locations.drop(columns = ['classId'])
    
    # Extract the relationships from the Annotations file
    relationships              = extractRelationships(annotations)
    
    #Rearrange order of entities so species always appears on left hand side of the pair 
    relationships[['entity1-type','entity1-start','entity1-end','entity2-type','entity2-start','entity2-end']] = relationships[['entity1-type','entity1-start','entity1-end','entity2-type','entity2-start','entity2-end']].mask(
        relationships['entity1-type'] == 'location', relationships[['entity2-type','entity2-start','entity2-end','entity1-type','entity1-start','entity1-end']].values)
    
    # Adjust the relationships indices
    relationships = adjustRelationshipEntityIndices(relationships, positionDict)

    # Insert names into dataframe
    relationships = insertEntityNames(relationships, plainText)

    # Insert token positions to the dataframe
    relationships = insertTokenNumbers(relationships, doc)

    # Add Ground Truth Value for predictions
    relationships['Tagged_Relationship'] = 1

    # Tidy the whole thing up
    relationships.drop(columns = ['entity1-class',
                                  'entity2-class',
                                  'entity1-type', 
                                  'entity2-type',
                                  'entity1-start',
                                  'entity1-end',
                                  'entity2-start',
                                  'entity2-end'], inplace = True)

    relationships.columns      = ['Start_Index_Species', 
                                  'End_Index_Species',
                                  'Start_Index_Location', 
                                  'End_Index_Location', 
                                  'Species', 
                                  'Location', 
                                  'Start_Token_Species',
                                  'End_Token_Species',
                                  'Start_Token_Location',
                                  'End_Token_Location',
                                  'Tagged_Relationship']

    relationships               = relationships[['Species', 
                                   'Start_Token_Species',
                                   'End_Token_Species',
                                   'Start_Index_Species', 
                                   'End_Index_Species', 
                                   'Location',
                                   'Start_Token_Location',
                                   'End_Token_Location',
                                   'Start_Index_Location', 
                                   'End_Index_Location',
                                   'Tagged_Relationship']]

    # return three dataframes
    return species, locations, relationships

In [5]:
def calculateTFISF(tokens, doc):
    results       = []
    numSentsinDoc = len(list(doc.sents))
    
    for startToken,endToken in tokens:
        maxtfisf = 0
        for tkn in range(startToken,endToken):
            if doc[tkn].is_alpha:
                tf       = len([1 for word in doc[tkn].sent if doc[tkn].text == str(word)])
                isf      = np.log(numSentsinDoc / len([1 for sent in doc.sents if doc[tkn].text in str(sent)]))
                tfisf    = tf*isf
                maxtfisf = max(maxtfisf, tfisf)
        results.append(maxtfisf)
    return results

In [6]:
def findAbstractStart(doc):
    matcher = Matcher(nlp_lg.vocab)
    pattern = [
        [{"LOWER": "abstract"}]
    ]
    matcher.add("abstract_pattern", pattern)
    matches = matcher(doc)
    
    if len(matches) >0:
        AbstractToken = matches[0][1]
    else: AbstractToken = -1
        
    return AbstractToken

In [7]:
def inAbstract(tokens, AbstractStart, doc, limit):
    if AbstractStart == -1:
        return np.zeros(len(list(tokens)))
    else:
        results = []
    
    for startToken,endToken in tokens:
        matcher = Matcher(nlp_lg.vocab)
        pattern = []
        
        for tkn in range (startToken,endToken):
            pattern.append({"TEXT": doc[tkn].text})
            
        patterns = [pattern]
        matcher.add("species_pattern", patterns)
        matches  = matcher(doc)
        idx      = [match[1] for match in matches if match[1] in range(AbstractStart,AbstractStart+limit)]
        
        if len(idx)>0:
            results.append(1)
        else: results.append(0)
            
    return results

In [8]:
def get_token_num_for_char(doc, start_idx):
    for i, token in enumerate(doc):
        if start_idx > token.idx:
            continue
        if start_idx == token.idx:
            return i
        if start_idx < token.idx:
            return i - 1

In [9]:
def get_token_num_for_end_char(doc, start_token, text):
    end_token = start_token+1
    span      = doc[start_token:end_token]
    
    while len(span.text)< len(text):
        end_token += 1
        span       = doc[start_token:end_token]
        
    return end_token

In [10]:
# Extracts entities from the parsed annotations JSON file
# Must be passed the annotations in parsed JSON format
def extractEntities(annotations):
    
    # Create data frame from Entities in JSON
    entities                  = pd.DataFrame(annotations['entities'], columns = ['classId', 'offsets'])
    
    # Define columns in the dataframe
    entities['Start_Index']   = [X[0].get('start') for X in entities['offsets']]
    entities['Text']          = [X[0].get('text') for X in entities['offsets']]
    entities['End_Index']     = [len(X[0].get('text'))+X[0].get('start') for X in entities['offsets']]
    entities                  = entities[['Text', 'Start_Index', 'End_Index','classId']]
    
    # Return the dataframe
    return entities

In [181]:
# Off set entities 
def offsetEntities(entities, plainText):

    # Offset the entities is a bit convoluted so I heavily commented it
    # TagTog outputs its txt files as html. The txt is split into <pre>'s. 
    # The annotations file's start index keeps track of the character count from the start of the <pre>, 
    # Not the start of the txt, as is the way we calculate it
    # Subsequently, each new pre resets the start index from 0. 
    # Because we work from the start of the txt file, we have to keep track of the last start/end index and work from there

    # First, we need to add the working columns to the dataframe
    entities['newStartIndex'] = 0
    entities['newEndIndex']   = 0
    entities['lastListedSI']  = 0
    entities['offsetR']       = 0
    entities['offsetL']       = 0
    entities['adj_Start']     = 0
    entities['adj_End']       = 0
    entities['adj_doctext']   = ""

    # We also need to keep track of the last items Start Index
    lastSI                    = 0 # Stored per index
    startIndex                = 0 # Calculated per index
    lastListedSI              = 0 # Required for calculations

    # And the last items End Index
    lastEI                    = 0
    endIndex                  = 0

    # This boolean lets us know if we've gone past the first <pre>
    reset                     = False

    # Iterate through the entities
    for index in entities.index:

        # Check if the start index has reset
        listedSI = entities.at[index, 'Start_Index']

        # Most of the entries for longer pdfs will be outside of the first <pre> so we start with the reset check as
        # it will speed up the process
        if reset == True:

            # If it's reset but we've reached a new <pre> we don't need to do anything fancy
            if listedSI < lastListedSI:

                # So we add the set startIndex to the last known start index (stored in lastSI)
                startIndex = lastSI + listedSI 

            # Otherwise...
            else:

                # We need to manipulate the startIndex to find the new accurate start index in the text document
                startIndex = listedSI - lastListedSI + lastSI

        # If we are still in the first <pre> we need to check for the first entry of the next <pre>
        elif listedSI < lastSI:

            # We've now entered the next <pre> so we add the listed startIndex to the last known start index (stored in lastSI)
            startIndex = lastSI + listedSI 

            # And set the reset flag as true to indicate we have gone past the first pre
            reset      = True

        # Otherwise
        else:

            # We are still in the first <pre> so we just use the accurate index
            startIndex = listedSI

        # Here we keep track of the index variables needed on the next iteration
        lastSI       = startIndex
        lastListedSI = listedSI
        endIndex     = startIndex + len(entities.at[index, 'Text']) 
        lastEI       = endIndex

        # Now the offset work begins
        # Find the offset on the right
        offsetR = startIndex - plainText[:endIndex].rfind(entities.loc[index]['Text'])

        # Set it to a valid high integer if it went off the text
        if offsetR == -1:

            entities.at[index, 'offsetR'] = 99999999

        else:

            entities.at[index, 'offsetR'] = offsetR        

        # Find the offset on the left
        offsetL = plainText[startIndex:].find(entities.loc[index]['Text'])

        # Set it to a valid high integer if it went off the text
        if offsetL == -1:

            entities.at[index, 'offsetL'] = 99999999

        else:
            entities.at[index, 'offsetL'] = offsetL

        # Check which side we need to offset on and do it
        if offsetR < offsetL:

            # Adjust it left 
            adjustedStart = startIndex - offsetR

        else: 

            # Adjust it right
            adjustedStart = startIndex + offsetL

        # Set the adjusted start
        entities.at[index, 'adj_Start']      = adjustedStart

        # Set the adjusted end
        adjustedEnd                          = endIndex - startIndex + adjustedStart
        entities.at[index, 'adj_End']        = adjustedEnd - 1

        # Set the adjusted text
        entities.at[index, 'adj_doctext']    = plainText[adjustedStart: adjustedEnd]

    # Return the data frame
    return entities

In [12]:
# Filter to only the relevant TagTog annotations in the old data
# We need to find a way to standardise the annotations both between trees but also between projects
def filterNew(entities):
    return entities[(entities.classId == 'e_1' )    #Bryophytes
                  | (entities.classId == 'e_2' )    #Nematoda
                  | (entities.classId == 'e_4' )    #Location
                  | (entities.classId == 'e_41')    #Cyanobacteria
                  | (entities.classId == 'e_49')    #Rotifers
                  | (entities.classId == 'e_75')    #Algae
                  | (entities.classId == 'e_38')    #Acari
                  | (entities.classId == 'e_55')    #Cyanobacteria
                  | (entities.classId == 'e_43')    #Mites
                  | (entities.classId == 'e_32')    #Moss
                  | (entities.classId == 'e_53')    #Rotier
                  | (entities.classId == 'e_54')    #Protist
                  | (entities.classId == 'e_57')    #Micro_Molecular
                  | (entities.classId == 'e_7' )    #Lichen
                  | (entities.classId == 'e_33')    #Nematode
                  | (entities.classId == 'e_44')    #SpingTail
                  | (entities.classId == 'e_48')    #Tardigrades
                  | (entities.classId == 'e_39')    #Collembola
                  | (entities.classId == 'e_40')    #Algae
                  | (entities.classId == 'e_51')    #Tardigrade
                  | (entities.classId == 'e_31')    #Lichen
                  | (entities.classId == 'e_50')]   #Protists   
                    

In [13]:
# Returns only entries annotated as species in the new documents
def filterSpeciesNew(entities):
    return entities[(entities.classId != 'e_4' )]

In [14]:
# Returns only entries annotated as locations in the new documents
def filterLocationsNew(entities):
    return entities[(entities.classId == 'e_4' )]

In [15]:
# Filter to only the relevant TagTog annotations in the old data
def filterOld(entities):
    return entities[(entities.classId == 'e_1' )    #Species
                  | (entities.classId == 'e_6' )    #Taxa
                  | (entities.classId == 'e_2' )]   #Locations 

In [16]:
# Returns only entries annotated as species in the old documents
def filterSpeciesOld(entities):
    return entities[(entities.classId != 'e_2' )]

In [17]:
# Returns only entries annotated as locations in the old documents
def filterLocationsOld(entities):
    return entities[(entities.classId == 'e_2' )]

In [18]:
def findAbstractEntities(entities, doc):
    
    # Get the abstract
    AbstractStart   = findAbstractStart(doc)

    # Make two lists of entities within the abstract
    entities['inAbstract300'] = inAbstract(zip(entities.Start_Token, entities.End_Token), AbstractStart, doc, 300) 
    entities['inAbstract500'] = inAbstract(zip(entities.Start_Token, entities.End_Token), AbstractStart, doc, 500) 
    
    # Return
    return entities

In [19]:
# Extract relevant information from JSON file for relationships
# Requires you to pass in the parsed annotations
def extractRelationships(annotations):

    # Create a data frame using current data
    relationships                  = pd.DataFrame(annotations['relations'], columns = ['entities'])

    # Add desire columns to the data frame
    # for entity one
    relationships['entity1-class']  = ""
    relationships['entity1-type']   = ""
    relationships['entity1-start']  = 0
    relationships['entity1-end']    = 0

    # for entity two
    relationships['entity2-class']  = ""
    relationships['entity2-type']   = ""
    relationships['entity2-start']  = 0
    relationships['entity2-end']    = 0

    # Iterate through the current data
    for index in relationships.index:

        # Extract entities
        entityOne, entityTwo                         = relationships.at[index, 'entities']

        # Extract relevant details
        # Entity one
        throwAway, entityOneClass, entityOneStartEnd = entityOne.split('|')
        entityOneStart, entityOneEnd                 = entityOneStartEnd.split(',')

        relationships.at[index, 'entity1-class']     = entityOneClass
        relationships.at[index, 'entity1-type']      = "location" if entityOneClass == 'e_4' else "species"
        relationships.at[index, 'entity1-start']     = entityOneStart
        relationships.at[index, 'entity1-end']       = entityOneEnd

        # Entity two
        throwAway, entityTwoClass, entityTwoStartEnd = entityTwo.split('|')
        entityTwoStart, entityTwoEnd                 = entityTwoStartEnd.split(',')

        relationships.at[index, 'entity2-class']     = entityTwoClass 
        relationships.at[index, 'entity2-type']      = "location" if entityTwoClass == 'e_4' else "species"
        relationships.at[index, 'entity2-start']     = entityTwoStart
        relationships.at[index, 'entity2-end']       = entityTwoEnd

    # Dopr entities column    
    relationships.drop(columns = 'entities', inplace=True)
    
    # Return
    return relationships

In [83]:
# Insert actual text of the entities into the relationships dataframe
def insertEntityNames(relationships, document):

    # Iterate through the relationships
    for index in relationships.index:
        
        # Entity One
        relationships.at[index, 'entity1'] = document[relationships.at[index, 'entity1-adjstart']: 
                                                      relationships.at[index, 'entity1-adjend'] + 1] 

        # Entity Two
        relationships.at[index, 'entity2'] = document[relationships.at[index, 'entity2-adjstart']: 
                                                      relationships.at[index, 'entity2-adjend'] + 1] 
    return relationships

In [21]:
# Adjust entity indices to match document
def adjustRelationshipEntityIndices(relationships, positionDict):

    # Create the desired columns
    relationships['entity1-adjstart'] = 0
    relationships['entity1-adjend']   = 0
    relationships['entity2-adjstart'] = 0
    relationships['entity2-adjend']   = 0

    # Iterate through relationships
    for index in relationships.index:

        # Get adjusted start for Entity One
        relationships.at[index, 'entity1-adjstart'] = positionDict.get(relationships.at[index, 'entity1-start'])

        # Calculate adjusted end for Entity Two
        entity1End    = relationships.at[index, 'entity1-end']
        entity1Start  = relationships.at[index, 'entity1-start']
        entity1AdjEnd = relationships.at[index, 'entity1-adjstart']
        relationships.at[index, 'entity1-adjend']   =  entity1End - entity1Start + entity1AdjEnd

        # Get adjusted start for Entity One
        relationships.at[index, 'entity2-adjstart'] = positionDict.get(relationships.at[index, 'entity2-start'])

        # Calculate adjusted end for Entity Two
        entity2End    = relationships.at[index, 'entity2-end']
        entity2Start  = relationships.at[index, 'entity2-start']
        entity2AdjEnd = relationships.at[index, 'entity2-adjstart']
        relationships.at[index, 'entity2-adjend']   =  entity2End - entity2Start + entity2AdjEnd
        
    # Return
    return relationships

In [22]:
# Add token numbers to the relationships tables
def insertTokenNumbers(relationships, doc):

    # Add desires columns
    relationships['Start_Token_Species'] = 0
    relationships['End_Token_Species']   = 0

    # Iterate through the relationships
    for index in relationships.index:

        # Get the starting tokens
        # Species
        tokenNum = relationships.at[index, 'entity1-adjstart']
        relationships.at[index, 'Start_Token_Species']  = get_token_num_for_char(doc, tokenNum)

        # Location
        tokenNum = relationships.at[index, 'entity2-adjstart']
        relationships.at[index, 'Start_Token_Location'] = get_token_num_for_char(doc, tokenNum)

        # Get end tokens
        # Species
        relationships.at[index, 'End_Token_Species']    = get_token_num_for_end_char(
             doc, relationships.at[index, 'Start_Token_Species'], relationships.at[index, 'entity1'])

        # Location
        relationships.at[index, 'End_Token_Location']    = get_token_num_for_end_char(
         doc, relationships.at[index, 'Start_Token_Location'], relationships.at[index, 'entity2'])
        
    # Return relationships
    return relationships

In [23]:
# Iterate through the Extracted CSV folder and find the CSV
def findExtractedCSV(txtFile, CSVFolder, fileTag):
    
    # Check if we are using oldset or new set
    # Then get the CSV name
    if txtFile.split('\\')[1] == 'OldSet':
        speciesCSVFileName = re.sub('.txt.txt', fileTag, txtFile)
    else:
        speciesCSVFileName = re.sub('.pdf.txt', fileTag, txtFile)
    speciesCSVFileName     = speciesCSVFileName.split('\\')[-1]
    
    # Iterate through the folder
    for (dirpath, dirnames, filenames) in os.walk(CSVFolder):
        for filename in filenames:
            if (filename == speciesCSVFileName):
                return True, os.path.join(dirpath, filename)
        
    # return -1 on fail
    return False, speciesCSVFileName

In [24]:
def getSpeciesMetrics(speciesMatchesDF):
    Document       = []
    Tagged         = []
    Extracted      = []
    TruePositive   = []
    FalsePositive  = []
    FalseNegative  = []
    Precision      = []
    Recall         = []
    F1             = []
    partialMatches = []

    for doc in speciesMatchesDF.Document.unique():
        
        Document.append(doc)
        
        ActualResults    = len(speciesMatchesDF[(speciesMatchesDF.Document==doc)& (speciesMatchesDF['Text'].notna())])
        
        PredictedResults = len(speciesMatchesDF[(speciesMatchesDF.Document==doc)& (speciesMatchesDF['Found_as'].notna())])
        
        Tagged.append(ActualResults)
        
        Extracted.append(PredictedResults)
        
        TP               = len(speciesMatchesDF[(speciesMatchesDF.Document==doc)& (speciesMatchesDF['Found_as'].notna()) & (speciesMatchesDF['Text'].notna())])
        
        TruePositive.append(TP)
        
        FalsePositive.append(len(speciesMatchesDF[(speciesMatchesDF.Document==doc)& speciesMatchesDF['Found_as'].notna() & speciesMatchesDF['Text'].isna()]))
        
        FalseNegative.append(len(speciesMatchesDF[(speciesMatchesDF.Document==doc)& speciesMatchesDF['Found_as'].isna() & speciesMatchesDF['Text'].notna()]))
        
        notTagged        = speciesMatchesDF[(speciesMatchesDF.Document ==doc) & speciesMatchesDF.Text.isna()]
        
        notExtracted     = speciesMatchesDF[(speciesMatchesDF.Document==doc) & speciesMatchesDF.Found_as.isna()]
        
        closeMatches     = 0
        
        for i in notTagged.index:
            
            for j in notExtracted.index:
                
                if notTagged.loc[i]['Start_Index']>=notExtracted.loc[j]['Start_Index'] and notTagged.loc[i]['End_Index']<=notExtracted.loc[j]['End_Index']:
                    closeMatches += 1
                    
        for i in notExtracted.index:
            
            for j in notTagged.index:
                
                if notExtracted.loc[i]['Start_Index']>=notTagged.loc[j]['Start_Index'] and notExtracted.loc[i]['End_Index']<=notTagged.loc[j]['End_Index']:
                    closeMatches += 1
                    
        partialMatches.append(closeMatches)

    Document.append('Total')
    
    ActualResults    = len(speciesMatchesDF[speciesMatchesDF['Text'].notna()])
    
    PredictedResults = len(speciesMatchesDF[speciesMatchesDF['Found_as'].notna()])
    
    Tagged.append(ActualResults)
    
    Extracted.append(PredictedResults)
    
    TP               =len(speciesMatchesDF[speciesMatchesDF['Found_as'].notna() & speciesMatchesDF['Text'].notna()])
    
    TruePositive.append(TP)
    
    FalsePositive.append(len(speciesMatchesDF[speciesMatchesDF['Found_as'].notna() & speciesMatchesDF['Text'].isna()]))
    
    FalseNegative.append(len(speciesMatchesDF[speciesMatchesDF['Found_as'].isna() & speciesMatchesDF['Text'].notna()]))
    
    partialMatches.append(sum(partialMatches))

    results                   = pd.DataFrame({'Document': Document, 'Tagged':Tagged, 'Extracted':Extracted, 'True Positives':TruePositive,
                                              'False Positives':FalsePositive, 'False Negatives':FalseNegative, 'Partial Matches':partialMatches,
                                             })
    
    results['Precision']      = np.round(results['True Positives']/results['Extracted'], 4)
    
    results['Recall']         = np.round(results['True Positives']/results['Tagged'], 4)
    
    results['F1']             = np.round(2* ((results['Precision']*results['Recall'])/(results['Precision']+results['Recall'])), 4)
    
    results['Adj. Precision'] = np.round((results['True Positives']+results['Partial Matches'])/results['Extracted'], 4)
    
    results['Adj. Recall']    = np.round((results['True Positives']+results['Partial Matches'])/results['Tagged'], 4)
    
    results['Adj. F1']        = np.round(2* ((results['Adj. Precision']*results['Adj. Recall'])/(results['Adj. Precision']+results['Adj. Recall'])), 4)

    return results

In [25]:
def getLocationsMetrics(locationsMatchesDF):
    
    Document       = []
    Tagged         = []
    Extracted      = []
    TruePositive   = []
    FalsePositive  = []
    FalseNegative  = []
    Precision      = []
    Recall         = []
    F1             = []
    partialMatches = []

    for doc in locationsMatchesDF.Document.unique():
        
        Document.append(doc)
        
        ActualResults    = len(locationsMatchesDF[(locationsMatchesDF.Document==doc)& (locationsMatchesDF['TaggedLocation'].notna())])
        
        PredictedResults = len(locationsMatchesDF[(locationsMatchesDF.Document==doc)& (locationsMatchesDF['ExtractedLocation'].notna())])
        
        Tagged.append(ActualResults)
        
        Extracted.append(PredictedResults)
        
        TP               = len(locationsMatchesDF[(locationsMatchesDF.Document==doc)& (locationsMatchesDF['ExtractedLocation'].notna()) & (locationsMatchesDF['TaggedLocation'].notna())])
        
        TruePositive.append(TP)
        
        FalsePositive.append(len(locationsMatchesDF[(locationsMatchesDF.Document==doc)& locationsMatchesDF['ExtractedLocation'].notna() & locationsMatchesDF['TaggedLocation'].isna()]))
        
        FalseNegative.append(len(locationsMatchesDF[(locationsMatchesDF.Document==doc)& locationsMatchesDF['ExtractedLocation'].isna() & locationsMatchesDF['TaggedLocation'].notna()]))
        
        notTagged        = locationsMatchesDF[(locationsMatchesDF.Document ==doc) & locationsMatchesDF.TaggedLocation.isna()]
        
        notExtracted     = locationsMatchesDF[(locationsMatchesDF.Document==doc) & locationsMatchesDF.ExtractedLocation.isna()]
        
        closeMatches     = 0
        
        for i in notTagged.index:
        
            for j in notExtracted.index:
        
                if notTagged.loc[i]['Start_Index']>=notExtracted.loc[j]['Start_Index'] and notTagged.loc[i]['End_Index']<=notExtracted.loc[j]['End_Index']:
                    closeMatches += 1
        
        for i in notExtracted.index:
        
            for j in notTagged.index:
        
                if notExtracted.loc[i]['Start_Index']>=notTagged.loc[j]['Start_Index'] and notExtracted.loc[i]['End_Index']<=notTagged.loc[j]['End_Index']:
                    closeMatches += 1
        
        partialMatches.append(closeMatches)

    Document.append('Total')
    
    ActualResults    = len(locationsMatchesDF[locationsMatchesDF['TaggedLocation'].notna()])
    
    PredictedResults = len(locationsMatchesDF[locationsMatchesDF['ExtractedLocation'].notna()])
    
    Tagged.append(ActualResults)
    
    Extracted.append(PredictedResults)
    
    TP               = len(locationsMatchesDF[locationsMatchesDF['ExtractedLocation'].notna() & locationsMatchesDF['TaggedLocation'].notna()])
    
    TruePositive.append(TP)
    
    FalsePositive.append(len(locationsMatchesDF[locationsMatchesDF['ExtractedLocation'].notna() & locationsMatchesDF['TaggedLocation'].isna()]))
    
    FalseNegative.append(len(locationsMatchesDF[locationsMatchesDF['ExtractedLocation'].isna() & locationsMatchesDF['TaggedLocation'].notna()]))
    
    partialMatches.append(sum(partialMatches))

    results                   = pd.DataFrame({'Document': Document, 'Tagged':Tagged, 'Extracted':Extracted, 'True Positives':TruePositive,
                                              'False Positives':FalsePositive, 'False Negatives':FalseNegative, 'Partial Matches':partialMatches,
                                             })
    
    results['Precision']      = np.round(results['True Positives']/results['Extracted'], 4)
    
    results['Recall']         = np.round(results['True Positives']/results['Tagged'], 4)
    
    results['F1']             = np.round(2* ((results['Precision']*results['Recall'])/(results['Precision']+results['Recall'])), 4)
    
    results['Adj. Precision'] = np.round((results['True Positives']+results['Partial Matches'])/results['Extracted'], 4)
    
    results['Adj. Recall']    = np.round((results['True Positives']+results['Partial Matches'])/results['Tagged'], 4)
    
    results['Adj. F1']        = np.round(2* ((results['Adj. Precision']*results['Adj. Recall'])/(results['Adj. Precision']+results['Adj. Recall'])), 4)

    return results

In [26]:
def populate_taggedlocations_df(df, gazetters, filePairs):
    places           = []

    # Gazetter separation
    nzGaz            = gazetters[0]
    nzGazAntarctica  = gazetters[1]
    scarGlobalNames  = gazetters[2]
    scarNzNames      = gazetters[3]
    geoNamesAnt      = gazetters[4]
    geoNamesNZ       = gazetters[5]
    geoNamesFiltered = gazetters[6]
    
    # Create columns on the dataframes
    data               = []
    columns            = ['id', 'Location']
    toConCat           = pd.DataFrame(data=data,columns=columns)
    df                 = pd.concat([df,toConCat], axis=1)

    # We need to do a stupid iteration here because the list comprehension doesn't grab all the rows
    # Iterate through locations
    
    i = 0
    for row in df.index:
        
        # Give them all a unique id to later be used for indexing
        df.at[row, 'id'] = i
        i += 1
        
        # Set some booleans first up
        df.at[row, 'inNZ']          = False
        df.at[row, 'inAntarctica']  = False
        df.at[row, 'exactMatch']    = False
            
        # Get location and leading four letters
        location = str(df.loc[row, 'TaggedLocation'])
        
        # Add place but drop leading words 'The'/'the' and replace leading words 'Mt.'/'mt' with 'Mount'
        if location.startswith('The '):
            df.at[row, 'Location'] = location[4:]
            
        elif location.startswith('the '):
            df.at[row, 'Location'] = location[4:]
            
        elif location.startswith('Mt.'):
            df.at[row, 'Location'] = location[4:]
        
        elif location.startswith('Mt '):
            df.at[row, 'Location'] = location[3:]

        # Check possible locations in gazetteers for exact matches
        if location in nzGazAntarctica:
            df.at[row, 'NZGazAnt']    = True
            df.at[row, 'inNZ'    ]    = True
            df.at[row, 'inAntarctica']= True
        else:
            df.at[row, 'NZGazAnt']    = False
            
        if location in nzGaz:
            df.at[row, 'NZGaz']       = True
            df.at[row, 'inNZ' ]       = True
            df.at[row, 'exactMatch']  = True
        else:
            df.at[row, 'NZGaz']       = False
            
        if location in scarNzNames:
            df.at[row, 'ScarNZ']      = True
        else:
            df.at[row, 'ScarNZ']      = False
                 
        if location in scarGlobalNames:
            df.at[row, 'ScarGlobal'  ]= True
            df.at[row, 'inAntarctica']= True
            df.at[row, 'exactMatch']  = True
        else:
            df.at[row, 'ScarGlobal']  = False

        if location in geoNamesNZ:
            df.at[row, 'GeoNamesNZ']  = True
            df.at[row, 'inNZ'      ]  = True
            df.at[row, 'inAntarctica']= True
        else:
            df.at[row, 'GeoNamesNZ']  = False
                 
        if location in geoNamesAnt:
            df.at[row, 'GeoNamesAnt'] = True
        else:
            df.at[row, 'GeoNamesAnt'] = False
                 
        if location in geoNamesFiltered:
            df.at[row, 'GeoNames']    = True
            df.at[row, 'exactMatch']  = True
        else:
            df.at[row, 'GeoNames']    = False

    # filter locations by those not found in Antarctica or New Zealand gazeteers 
    df2                = df[(df['inAntarctica'] == False) & (df['inNZ']== False)].copy()
    df2.reset_index(inplace=True, drop=True)
    
    # look through these remaining locations (including those found only in GeoNames) for close matches 
    # eg. McMurdo Dry Valley v McMurdo Dry Valley or for partial matches eg. Ross Sea Region == Ross Sea
    for row in df2.index:
        
        # This section is for close matches
        # Set to not found
        df2.at[row, 'Close_Match'] = False
        
        # Then iterate for close matches on the location as a string
        location = str(df2.loc[row, 'Location'])
            
        # In Antarctica
        matches = difflib.get_close_matches(location, nzGazAntarctica, cutoff = 0.9)
        if len(matches) > 0:
            
            # If more than zero, add best match to dataframe
            df2.at[row, 'Close_Match_NZGazAnt']   = True #matches[0]
            df2.at[row, 'Close_Match']            = True
            
        # In NZ
        matches = difflib.get_close_matches(location, nzGaz,           cutoff = 0.9)
        if len(matches) > 0:
            
            # If more than zero, add best match to dataframe
            df2.at[row, 'Close_Match_NZGaz']      = True #matches[0]
            df2.at[row, 'Close_Match']            = True
        
        # In SCARNZ
        matches = difflib.get_close_matches(location, scarNzNames,     cutoff = 0.9)
        if len(matches) > 0:
            
            # If more than zero, add best match to dataframe
            df2.at[row, 'Close_Match_ScarNZ']     = True #matches[0]
            df2.at[row, 'Close_Match']            = True
 
        # In ScarGlobal
        matches = difflib.get_close_matches(location, scarGlobalNames, cutoff = 0.9)
        if len(matches) > 0:
            
            # If more than zero, add best match to dataframe
            df2.at[row, 'Close_Match_ScarGlobal'] = True #matches[0]
            df2.at[row, 'Close_Match']            = True
    
        # In GeoNamesNZ
        matches = difflib.get_close_matches(location, geoNamesNZ,     cutoff = 0.9)
        if len(matches) > 0:
            
            # If more than zero, add best match to dataframe
            df2.at[row, 'Close_Match_GeoNamesNZ'] = True #matches[0]
            df2.at[row, 'Close_Match']            = True
            
        # In GeoNamesAntarctica
        matches = difflib.get_close_matches(location, geoNamesAnt,    cutoff = 0.9)
        if len(matches) > 0:
            
            # If more than zero, add best match to dataframe
            df2.at[row, 'Close_Match_GeoNamesAnt'] = True #matches[0] 
            df2.at[row, 'Close_Match']             = True
    
        # This section is for partial matches
        # Set the not found boolean
        df2.at[row, 'PartialMatch'] = False
        
        # Get the tokenised document for that row
        fileString = str(df2.loc[row, 'Document'])
        doc        = getTokenisedDocument(fileString.replace('.txt', ''), filePairs)
    
        # Get the location, start and end tokens
        startToken = df2.loc[row, 'Start_Token']
        endToken   = df2.loc[row, 'End_Token']
        
        # Check the partial matches
        result = getBiggestSubStringMatch(doc,  startToken, endToken, nzGazAntartica)
        if not result == 'NaN':
            print(result)
            df2.at[row, 'PartialMatch_NZGazAnt']    = True #result
            df2.at[row, 'PartialMatch']             = True

        result = getBiggestSubStringMatch(doc, startToken, endToken, nzGaz)
        if not result == 'NaN':
            print(result)
            df2.at[row, 'PartialMatch_NZGaz']       = True #result
            df2.at[row, 'PartialMatch']             = True
            
        result = getBiggestSubStringMatch(doc,  startToken, endToken, scarNzNames)
        if not result == 'NaN':
            print(result)
            df2.at[row, 'PartialMatch_ScarNZ']      = True #result
            df2.at[row, 'PartialMatch']             = True

        result = getBiggestSubStringMatch(doc,  startToken, endToken, scarGlobalNames)
        if not result == 'NaN':
            print(result)
            df2.at[row, 'PartialMatch_ScarGlobal']  = True #result
            df2.at[row, 'PartialMatch']             = True
            
        result = getBiggestSubStringMatch(doc,  startToken, endToken, geoNamesNZ)
        if not result == 'NaN':
            print(result)
            df2.at[row, 'PartialMatch_GeoNamesNZ']  = True #result
            df2.at[row, 'PartialMatch']             = True

        result = getBiggestSubStringMatch(doc,  startToken, endToken, geoNamesAnt)
        if not result == 'NaN':
            print(result)
            df2.at[row, 'PartialMatch_GeoNamesAnt'] = True #result
            df2.at[row, 'PartialMatch']             = True
            
    # merge the filtered dataframe (now with close and partial matches) with the unfiltered dataframe
    df_unified                 = df.merge(df2, how = 'left')
    
    # redo the inNZ,inAntarctica and Found columns to include close and partial matches
    df_unified.drop(columns    = ['inNZ', 'inAntarctica'], inplace = True)
    
    df_unified['inNZ']         = (df_unified.NZGaz 
                                | df_unified.NZGazAnt 
                                | df_unified.GeoNamesNZ 
                                | df_unified.Close_Match_NZGaz 
                                | df_unified.Close_Match_NZGazAnt 
                                | df_unified.Close_Match_GeoNamesNZ 
                                | df_unified.PartialMatch_NZGaz 
                                | df_unified.PartialMatch_NZGazAnt 
                                | df_unified.PartialMatch_GeoNamesNZ)
    
    df_unified['inAntarctica'] = (df_unified.ScarGlobal 
                                | df_unified.NZGazAnt 
                                | df_unified.GeoNamesAnt 
                                | df_unified.Close_Match_ScarGlobal 
                                | df_unified.Close_Match_NZGazAnt 
                                | df_unified.Close_Match_GeoNamesAnt 
                                | df_unified.PartialMatch_ScarGlobal 
                                | df_unified.PartialMatch_NZGazAnt 
                                | df_unified.PartialMatch_GeoNamesAnt)
    
    df_unified['Found']        = (df_unified.exactMatch 
                                | df_unified.Close_Match
                                | df_unified.PartialMatch)
    
    return df_unified[df_unified.Found==True]

In [27]:
# This function gets the annotated text file and tokenises it
def getTokenisedDocument(document, filePairs):

    # Find text file in filePairs
    document  = filterFilePairs(fileCount, document, filePairs)
    
    # Filter to .txt
    txtFile   = [file for file in filePair if file.endswith('.txt')][0]
    
    # Open .txt
    file      = open(txtFile, encoding = "utf-8")
    plainText = file.read()
    file.close()
    
    # tokenize the .txt file
    doc       = nlp_lg(plainText)
    
    # Return
    return doc

In [28]:
def getBiggestSubStringMatch(document, fullStart, fullEnd, gazetteer):
       
    subLength = fullEnd - fullStart - 1

    while subLength > 1:

        subStart = fullStart

        subEnd   = fullStart + subLength

        while subEnd <= fullEnd:

            subString = document[subStart:subEnd].text

            if subString in gazetteer:

                return subString

            subStart += 1

            subEnd   += 1

        subLength -= 1
            
    return 'NaN'

In [29]:
def get_token_num_for_char(doc, start_idx):
    for i, token in enumerate(doc):
        if start_idx > token.idx:
            continue
        if start_idx == token.idx:
            return i
        if start_idx < token.idx:
            return i - 1

In [30]:
def filterFilePairs(count, file, filePairs):
    
    # If no specific file is passed
    if file == None:
        
        # Check if specific count required
        if count > 0:

            # Filter down to count
            return filePairs[:count]
        else:
                
            # Return all of them    
            return filePairs
        
    # Use a specific file
    for filePair in filePairs:

        # This gets the foler of the filepairs and matches it to the filename you put it
        fileName = [file for file in filePair if file.endswith('.txt')][0]
        fileName = fileName.split('\\')[-1]
        if fileName.replace('.txt', '') == file:

            return [filePair]

    print('ERROR: File not found')
    print('File requested: specificFile')

### Script

**Part A:** Gets the results of all the extracted files and annotated files and merges them

In [183]:
# This notebook is really intensive so I've done a lot of print outs in very ordered, specific formatting 
# to make it easier for the human brain to follow what is happening

# The output will clear on success, it is really only there incase you hit errors

# Any errors you do hit should only be problems with input data

# I have also broken it into two parts to help with running it

# God I wish Python supported multi line comments

# Working with all the files takes a ball-bustingly long time, so I've built in filters:

# To work with a specific file, change this value to the fileName of the PDF 
# This should match the folder the files are sitting in
# Example: 10.1007_BF00238925.pdf
specificFile    = 'Archer2017_Article_EndolithicMicrobialDiversityIn'

# To work with all files, set this to zero
fileCount       = 0

# Load source files
print('STAGE: Find source files')
srcFolder       = 'ExtractedAnnotatedData\\'
filePairs       = listAllFilePairs(srcFolder)

# Filter to file count or file name
print('STAGE: Filter files')
filePairs       = filterFilePairs(fileCount, specificFile, filePairs)

# Create data frames to append stuff to
print('STAGE: Create metrics dataframes')
allSpecMatches           = pd.DataFrame()
allLocMatches            = pd.DataFrame()
allRelAnnotated          = pd.DataFrame()
allPossibleRelationships = pd.DataFrame()

# Iterate through selected files
print ('STAGE: Iterate files')
for filePair in filePairs:
    print('')
    
    # Split types
    annotationsFile = [file for file in filePair if file.endswith('.json')][0]
    txtFile         = [file for file in filePair if file.endswith('.txt')][0]
    fileName        = txtFile.split('\\')[-1]
    folder          = '\\'.join(txtFile.split('\\')[:-2])
    print('FILE:    ' + fileName)
    print('FOLDER:  ' + folder)
    
    # Pass files tuple to extract species, locations and relations from the annotations
    print('COMMAND: Get annotated entities and relations')
    speciesAnnotated, locationsAnnotated, relationsAnnotated = getEntitiesandRelations(annotationsFile, txtFile)
    print('RESULT:  OK')
    
    # Load the Extracted Species CSV file
    print('COMMAND: Find extracted species CSV')
    result, csv = findExtractedCSV(txtFile, 'ExtractedSpecies\\', '-Species.csv')
    
    if result == False:
        print('RESULT:  Could not find Extracted Species CSV for ' + fileName + ' as below')
        print('         ' + csv)
    else:
        print('RESULT:  Found extracted Species CSV for ' + fileName + ' as below')
        print('         ' + csv)
    
    # Parse the csv
    print('COMMAND: Parse extraction data')
    speciesExtracted              = pd.read_csv(csv,index_col=0 )
    speciesExtracted              = speciesExtracted[['Found as', 'Full name', 'Start', 'End','PositionOfFirstToken']]
    speciesExtracted.columns      = ['Found_as', 'Full_name', 'Start_Token', 'End_Token', 'Start_Index']
    speciesExtracted['End_Index'] = [len(speciesExtracted.loc[X]['Found_as'])+speciesExtracted.loc[X]['Start_Index'] for X in speciesExtracted.index]
    print('RESULT:  OK')
    
    # Match Species from annotated data to extracted data
    print('COMMAND: Merge extracted and annotated species data')
    matchesSpecies                = pd.merge(speciesExtracted, speciesAnnotated, how = 'outer')
    matchesSpecies['Match']       = matchesSpecies.Found_as == matchesSpecies.Text
    matchesSpecies['Document']    = fileName   
    print('RESULT:  OK')
    
    # Load the Extracted Locations CSV file
    print('COMMAND: Find extracted locations csv')
    result, csv = findExtractedCSV(txtFile, 'ExtractedLocations\\', '-Locations.csv')
    
    if result == False:
        print('RESULT:  Could not find extracted locations CSV for ' + fileName + ' as below')
        print('         ' + csv)
    else:
        print('RESULT:  Found extracted locations CSV for ' + fileName + ' as below')
        print('         ' + csv)
        
    # Parse the CSV
    print('COMMAND: Parse extracted data')
    locationsExtracted              = pd.read_csv(csv,index_col=0 )  
    print('RESULT:  OK')
    
    # Match locations from annotated data to extracted data
    print('COMMAND: Match extracted and annotated location data')
    matchesLocations                = pd.merge(locationsExtracted, locationsAnnotated, how='outer')
    matchesLocations.rename(columns = {'Location':'ExtractedLocation','Text':'TaggedLocation'}, inplace=True)
    matchesLocations['Document']    = fileName
    print('RESULT:  OK')
    
    # Append to over all dataframes
    print('COMMAND: Append to over all metrics dataframe')
    allSpecMatches                  = pd.concat([allSpecMatches,  matchesSpecies])
    allLocMatches                   = pd.concat([allLocMatches,   matchesLocations])
    allRelAnnotated                 = pd.concat([allRelAnnotated, relationsAnnotated])
    print('RESULT:  OK')

    # Reset indexes
    print('COMMAND: Reset indexes')
    allSpecMatches.reset_index (drop=True, inplace=True)
    allLocMatches.reset_index  (drop=True, inplace=True)
    allRelAnnotated.reset_index(drop=True, inplace=True)  
    print('RESULT:  OK')
    
    # Start identifying relationships
    print('COMMAND: Merge species and locations to possible relationships')
    possibleRelationships = pd.merge(speciesAnnotated.assign(key   = 0), 
                                     locationsAnnotated.assign(key = 0),
                                     on       = ['key','Document'], 
                                     suffixes = ('_Species', 
                                                 '_Location')).drop('key', axis = 1)

    possibleRelationships.rename(columns = {'Text_Species' :'Species', 
                                            'Text_Location':'Location',
                                            'AnnTxtFile_Species':'AnnTxtFile'}, 
                                 inplace = True) 
    print('RESULT:  OK')

    # Create a dataframe of the actual relationships in the file
    print('COMMAND: Create dataframe of actual relationships')
    oneDocRelationships                     = pd.merge(possibleRelationships, relationsAnnotated, how = 'left')
    oneDocRelationships.Tagged_Relationship.fillna(0, inplace=True)
    oneDocRelationships.Tagged_Relationship = oneDocRelationships.Tagged_Relationship.astype('int8')
    print('RESULT:  OK')
    
    # Create the out folder if it's not there
    outFolder = 'Metrics\\Relationships\\DocumentSpecific\\' + folder.split('\\')[-1]
    if not os.path.exists(outFolder):
        os.makedirs(outFolder)   
    
    # Out the single documents relationships
    print('COMMAND: Out oneDocRelationships.csv')
    oneDocRelationships.to_csv(outFolder + '\\' + fileName + '.csv')
    print('RESULT:  OK')

    # Concat bot down to all possible relationships
    print('COMMAND: Concat all relationship dataframes')
    allPossibleRelationships = pd.concat([allPossibleRelationships, oneDocRelationships])
    allPossibleRelationships.reset_index(drop=True, inplace=True)
    print('RESULT:  OK')
    
# Print to allRelationships CSVs
print('COMMAND: Out allPossibleRelationships.csv')
allPossibleRelationships.to_csv('Metrics\\Relationships\\allPossible.csv')
print('RESULT:  OK')

# Clear output on success
clear_output(wait = True)
print("Success")

Success


**Part B:** Generates the metrics from all the data

In [188]:
# This will also clear the output on success

# Get species matches
print('COMMAND: Get the species matches' )
speciesMetrics = getSpeciesMetrics(allSpecMatches)
print('RESULT:  OK')

# Get location matches
print('COMMAND: Get the location matches')
locationMetrics = getLocationsMetrics(allLocMatches)
print('RESULT:  OK')

# Format allLocMatches
asStrings = ['ExtractedLocation', 
             'Original Tokenised Text',
             'Sentence',
             'TaggedLocation', 
             'Document']

asInts    = ['Start_Token',
             'End_Token', 
             'Start_Index', 
             'End_Index']

asBool    = ['NZGazAnt',
             'NZGaz', 
             'ScarNZ', 
             'ScarGlobal', 
             'GeoNamesNZ', 
             'GeoNamesAnt',
             'GeoNames',
             'exactMatch', 
             'Close_Match_NZGazAnt', 
             'Close_Match_NZGaz',
             'Close_Match_ScarNZ', 
             'Close_Match_ScarGlobal',
             'Close_Match_GeoNamesNZ', 
             'Close_Match_GeoNamesAnt', 
             'Close_Match',
             'PartialMatch_NZGazAnt', 
             'PartialMatch_NZGaz', 
             'PartialMatch_ScarNZ',
             'PartialMatch_ScarGlobal', 
             'PartialMatch_GeoNamesNZ',
             'PartialMatch_GeoNamesAnt', 
             'PartialMatch', 
             'inNZ', 
             'inAntarctica',
             'Found']

print('COMMAND: Format allLocMatches dataframe')
for col in asStrings:
    allLocMatches[col] = allLocMatches[col].astype(str)
    
for col in asBool:
    allLocMatches[col] = allLocMatches[col].astype(bool)
print('RESULT:  OK')

# Get token numbers for spcecies matches
print('COMMAND: Get token numbers')
for row in allSpecMatches[allSpecMatches['Found_as'].isna()].index:
    allSpecMatches.at[row, 'Found_as']    = allSpecMatches.loc[row].Text
print('RESULT:  OK')

# Get unmatched locations
print('COMMAND: Get unmatched locations')
unmatchedLocations = allLocMatches[allLocMatches.ExtractedLocation == 'nan']
print("RESULT:  OK")

# Load gazetters
print('COMMAND: Load gazetters')
nzGaz            = list(np.load('NPYs\\nzgazetteer.npy',      allow_pickle=True))
print('         nzgaz            OK')
nzGazAntartica   = list(np.load('NPYs\\nzgazAntartica.npy',   allow_pickle=True))
print('         nzgazAntartica   OK')
scarGlobalNames  = list(np.load('NPYs\\SCARGlobalnames.npy',  allow_pickle=True))
print('         SCARGlobalnames  OK')
scarNzNames      = list(np.load('NPYs\\SCARNZnames.npy',      allow_pickle=True))
print('         SCARNZnames      OK')
geoNamesAnt      = list(np.load('NPYs\\GeoNamesAnt.npy',      allow_pickle=True))
print('         GeoNamesAnt      OK')
geoNamesNZ       = list(np.load('NPYs\\GeoNamesNZ.npy',       allow_pickle=True))
print('         GeoNamesNZ       OK')
geoNamesFiltered = list(np.load('NPYs\\GeoNamesFiltered.npy', allow_pickle=True))
print('         GeoNamesFiltered OK')
print('         Create list arg for function calls')
gazetters       = [nzGaz, nzGazAntartica, scarGlobalNames, scarNzNames, geoNamesAnt, geoNamesNZ, geoNamesFiltered]
print('RESULT:  OK')

# Filter unmatched locations to actualy unmatched locations
print('COMMAND: Apply unmatched locations')
unmatchedLocations = populate_taggedlocations_df(unmatchedLocations, gazetters, filePairs) 
print('RESULT:  OK')

# Apply to matched locations
print('COMMAND: Apply to as filter all matched locations dataframe')
allLocMatches.update(unmatchedLocations, overwrite=True)
print('RESULT:  OK')

# Convert columns to typeof Object
print('COMMAND: Convert columns')
for col in asBool:
    unmatchedLocations[col] = unmatchedLocations[col].astype(object)    
    allLocMatches[col]      = allLocMatches[col].astype(object)
print('RESULT:  OK')

# Create tagged location dataframe
print('COMMAND: Create tagged location dataframe from location matched')
TaggedLocations = allLocMatches[allLocMatches.TaggedLocation.notna()]
print('RESULT:  OK')

# Clear output on success
clear_output(wait = True)
print("Success")

Success


### Print Outs

In [189]:
# This data frame has all the extracted species matched to all the locations in the annotated data sets
allSpecMatches

Unnamed: 0,Found_as,Full_name,Start_Token,End_Token,Start_Index,End_Index,Text,classId,Sentence,inAbstract300,inAbstract500,max_TFISF,Sent_Start,Document,AnnTxtFile,Match
0,Hemichloris antarctica,Hemichloris antarctica,2908,2910,15785,15807,,,,,,,,Archer2017_Article_EndolithicMicrobialDiversit...,,False
1,Hemichloris antarctica,Hemichloris antarctica,2908,2910,15785,15807,,,,,,,,Archer2017_Article_EndolithicMicrobialDiversit...,,False
2,Hemichloris antarctica,Hemichloris antarctica,2908,2910,15785,15807,,,,,,,,Archer2017_Article_EndolithicMicrobialDiversit...,,False
3,Hemichloris antarctica,Hemichloris antarctica,2908,2910,15785,15807,,,,,,,,Archer2017_Article_EndolithicMicrobialDiversit...,,False
4,Hemichloris antarctica,Hemichloris antarctica,2908,2910,15785,15807,,,,,,,,Archer2017_Article_EndolithicMicrobialDiversit...,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
800,lichen,,5759,5760,32059,32064,lichen,e_6,"(Lichen, associations, were, demonstrated, in,...",1.0,1.0,4.527491,0.0,Archer2017_Article_EndolithicMicrobialDiversit...,ExtractedAnnotatedData\OldSet\Archer2017_Artic...,False
801,lichen,,5759,5760,32059,32064,lichen,e_6,"(Lichen, associations, were, demonstrated, in,...",1.0,1.0,4.527491,0.0,Archer2017_Article_EndolithicMicrobialDiversit...,ExtractedAnnotatedData\OldSet\Archer2017_Artic...,False
802,bacterial,,5813,5814,32354,32362,bacterial,e_6,"(Another, interesting, point, was, that, the, ...",1.0,1.0,1.907070,0.0,Archer2017_Article_EndolithicMicrobialDiversit...,ExtractedAnnotatedData\OldSet\Archer2017_Artic...,False
803,lichen,,5838,5839,32503,32508,lichen,e_6,"(These, findings, suggest, that, the, traditio...",1.0,1.0,2.263745,0.0,Archer2017_Article_EndolithicMicrobialDiversit...,ExtractedAnnotatedData\OldSet\Archer2017_Artic...,False


In [190]:
# This dataframe has all the extracted locations matched to all the locations in the annotated data sets
allLocMatches

Unnamed: 0,Found as,Full name,InDefinitiveList,Start,End,PositionOfFirstToken,Sentence_species,InReferenceList,ExtractedLocation,Original Tokenised Text,...,Found,TaggedLocation,classId,Sentence,inAbstract300,inAbstract500,max_TFISF,Sent_Start,Document,AnnTxtFile
0,Hemichloris antarctica,Hemichloris antarctica,True,2908.0,2910.0,15785.0,Free-living algal cells resembling Hemichloris...,True,,,...,True,McMurdo Dry Valleys,e_2,\n\n\n\n\nawkaYNS0KsDlin9nBwoxLbhE6rr0-Archer2...,1.0,1.0,3.228826,0.0,Archer2017_Article_EndolithicMicrobialDiversit...,ExtractedAnnotatedData\OldSet\Archer2017_Artic...
1,Diplosphaera,Diplosphaera,False,3949.0,3950.0,21628.0,¢ Chasmoendolithic microbial community from Mi...,False,,,...,True,Antarctica,e_2,\n\n\n\n\nawkaYNS0KsDlin9nBwoxLbhE6rr0-Archer2...,1.0,1.0,3.362358,0.0,Archer2017_Article_EndolithicMicrobialDiversit...,ExtractedAnnotatedData\OldSet\Archer2017_Artic...
2,Hemichloris antarctica,Hemichloris antarctica,True,3989.0,3991.0,21838.0,¢ Chasmoendolithic microbial community from Mi...,True,,,...,True,Berlin,e_2,Thomas S. Niederberger” - S. Craig Cary”* - Ka...,0.0,0.0,5.308268,0.0,Archer2017_Article_EndolithicMicrobialDiversit...,ExtractedAnnotatedData\OldSet\Archer2017_Artic...
3,Trebouxia,Trebouxia,True,4003.0,4004.0,21918.0,Whilst Trebouxia-like algal cells were observe...,False,,,...,True,Heidelberg,e_2,Thomas S. Niederberger” - S. Craig Cary”* - Ka...,0.0,0.0,5.308268,0.0,Archer2017_Article_EndolithicMicrobialDiversit...,ExtractedAnnotatedData\OldSet\Archer2017_Artic...
4,Lecidea,Lecidea,False,4046.0,4047.0,22161.0,Lichen-forming ascomycetes (Lecidea genus) wer...,False,,,...,True,Antarctica,e_2,Abstract Cryptic microbial communities develop...,1.0,1.0,3.362358,0.0,Archer2017_Article_EndolithicMicrobialDiversit...,ExtractedAnnotatedData\OldSet\Archer2017_Artic...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
762,,,,,,,,,,,...,True,McKelvey,e_2,Since the alphaproteobacterial OTUs included d...,0.0,0.0,2.263745,0.0,Archer2017_Article_EndolithicMicrobialDiversit...,ExtractedAnnotatedData\OldSet\Archer2017_Artic...
763,,,,,,,,,,,...,True,Miers,e_2,Since the alphaproteobacterial OTUs included d...,0.0,0.0,2.130214,0.0,Archer2017_Article_EndolithicMicrobialDiversit...,ExtractedAnnotatedData\OldSet\Archer2017_Artic...
764,,,,,,,,,,,...,True,University Valley,e_2,Lichen associations were demonstrated in Unive...,0.0,0.0,1.811760,0.0,Archer2017_Article_EndolithicMicrobialDiversit...,ExtractedAnnotatedData\OldSet\Archer2017_Artic...
765,,,,,,,,,,,...,True,Antarctica,e_2,Acknowledgments The authors wish to acknowledg...,1.0,1.0,6.724715,0.0,Archer2017_Article_EndolithicMicrobialDiversit...,ExtractedAnnotatedData\OldSet\Archer2017_Artic...


In [148]:
# This data frame has all the relationships found in the manually annotated pdfs
allRelAnnotated

Unnamed: 0,Species,Start_Token_Species,End_Token_Species,Start_Index_Species,End_Index_Species,Location,Start_Token_Location,End_Token_Location,Start_Index_Location,End_Index_Location,Tagged_Relationship
0,Hemichloris antarctic,2885,2887,15742,15763,University Valle,2819.0,2821.0,15357,15373,1
1,Scytonematacea,3362,3363,18312,18326,McKelvey Valle,3371.0,3373.0,18367,18381,1
2,Scytonematacea,3362,3363,18312,18326,Miers Valle,3382.0,3384.0,18428,18439,1
3,Scytonematacea,3362,3363,18312,18326,University Valle,3397.0,3399.0,18499,18515,1
4,Actinobacteri,3624,3625,19809,19822,Miers Valle,3640.0,3642.0,19922,19933,1
5,Bacteriodete,3626,3627,19825,19837,Miers Valle,3640.0,3642.0,19922,19933,1
6,Chloroflex,3628,3629,19843,19853,Miers Valle,3640.0,3642.0,19922,19933,1
7,Lentisphaera,3727,3728,20460,20472,University Valle,3731.0,3733.0,20486,20502,1
8,Hemichloris antarctic,3948,3950,21706,21727,University Valle,3924.0,3926.0,21576,21592,1
9,Hemichloris antarctic,3948,3950,21706,21727,McKelve,3956.0,3957.0,21751,21758,1


In [200]:
# This shows the CSV of the relationships in the last pdf to be accessed
oneDocRelationships.loc[oneDocRelationships['Tagged_Relationship'] == 1]

Unnamed: 0,Species,Start_Token_Species,End_Token_Species,Start_Index_Species,End_Index_Species,classId_Species,Sentence_Species,inAbstract300_Species,inAbstract500_Species,max_TFISF_Species,...,Start_Index_Location,End_Index_Location,classId_Location,Sentence_Location,inAbstract300_Location,inAbstract500_Location,max_TFISF_Location,Sent_Start_Location,AnnTxtFile_Location,Tagged_Relationship
4867,Hemichloris antarctica,2885,2887,15742,15763,e_1,"(Free, -, living, algal, cells, resembling, He...",0,0,4.209655,...,15357,15373,e_2,"(For, University, Valley, samples, ,, the, dif...",0,0,1.81176,0,ExtractedAnnotatedData\OldSet\Archer2017_Artic...,1
6878,Scytonemataceae,3362,3363,18312,18326,e_1,"(The, Scytonemataceae, (, Nostocales, ), were,...",0,0,4.615121,...,18367,18381,e_2,"(The, Scytonemataceae, (, Nostocales, ), were,...",0,0,4.188734,0,ExtractedAnnotatedData\OldSet\Archer2017_Artic...,1
6879,Scytonemataceae,3362,3363,18312,18326,e_1,"(The, Scytonemataceae, (, Nostocales, ), were,...",0,0,4.615121,...,18428,18439,e_2,"(The, Scytonemataceae, (, Nostocales, ), were,...",0,0,4.188734,0,ExtractedAnnotatedData\OldSet\Archer2017_Artic...,1
6880,Scytonemataceae,3362,3363,18312,18326,e_1,"(The, Scytonemataceae, (, Nostocales, ), were,...",0,0,4.615121,...,18499,18515,e_2,"(The, Scytonemataceae, (, Nostocales, ), were,...",0,0,4.188734,0,ExtractedAnnotatedData\OldSet\Archer2017_Artic...,1
6881,Scytonemataceae,3362,3363,18312,18326,e_1,"(The, Scytonemataceae, (, Nostocales, ), were,...",0,0,4.615121,...,18499,18515,e_2,"(The, Scytonemataceae, (, Nostocales, ), were,...",0,0,4.188734,0,ExtractedAnnotatedData\OldSet\Archer2017_Artic...,1
7839,Actinobacteria,3624,3625,19809,19822,e_1,"(The, Actinobacteria, ,, Bacteriodetes, and, C...",0,0,4.209655,...,19922,19933,e_2,"(The, Actinobacteria, ,, Bacteriodetes, and, C...",0,0,2.130214,0,ExtractedAnnotatedData\OldSet\Archer2017_Artic...,1
7957,Bacteriodetes,3626,3627,19825,19837,e_1,"(The, Actinobacteria, ,, Bacteriodetes, and, C...",0,0,4.615121,...,19922,19933,e_2,"(The, Actinobacteria, ,, Bacteriodetes, and, C...",0,0,2.130214,0,ExtractedAnnotatedData\OldSet\Archer2017_Artic...,1
8075,Chloroflexi,3628,3629,19843,19853,e_1,"(The, Actinobacteria, ,, Bacteriodetes, and, C...",0,0,4.615121,...,19922,19933,e_2,"(The, Actinobacteria, ,, Bacteriodetes, and, C...",0,0,2.130214,0,ExtractedAnnotatedData\OldSet\Archer2017_Artic...,1
8674,Lentisphaerae,3727,3728,20460,20472,e_1,"(Other, notable, differences, were, the, relat...",0,0,4.615121,...,20486,20502,e_2,"(Other, notable, differences, were, the, relat...",0,0,1.81176,0,ExtractedAnnotatedData\OldSet\Archer2017_Artic...,1
9623,Hemichloris antarctica,3948,3950,21706,21727,e_1,"(Chasmoendolithic, microbial, community, from,...",0,0,4.209655,...,21576,21592,e_2,"(Chasmoendolithic, microbial, community, from,...",0,0,3.62352,0,ExtractedAnnotatedData\OldSet\Archer2017_Artic...,1


In [201]:
# This shows all the possible annotated relationships
allPossibleRelationships.loc[allPossibleRelationships['Tagged_Relationship'] == 1]

Unnamed: 0,Species,Start_Token_Species,End_Token_Species,Start_Index_Species,End_Index_Species,classId_Species,Sentence_Species,inAbstract300_Species,inAbstract500_Species,max_TFISF_Species,...,Start_Index_Location,End_Index_Location,classId_Location,Sentence_Location,inAbstract300_Location,inAbstract500_Location,max_TFISF_Location,Sent_Start_Location,AnnTxtFile_Location,Tagged_Relationship
4867,Hemichloris antarctica,2885,2887,15742,15763,e_1,"(Free, -, living, algal, cells, resembling, He...",0,0,4.209655,...,15357,15373,e_2,"(For, University, Valley, samples, ,, the, dif...",0,0,1.81176,0,ExtractedAnnotatedData\OldSet\Archer2017_Artic...,1
6878,Scytonemataceae,3362,3363,18312,18326,e_1,"(The, Scytonemataceae, (, Nostocales, ), were,...",0,0,4.615121,...,18367,18381,e_2,"(The, Scytonemataceae, (, Nostocales, ), were,...",0,0,4.188734,0,ExtractedAnnotatedData\OldSet\Archer2017_Artic...,1
6879,Scytonemataceae,3362,3363,18312,18326,e_1,"(The, Scytonemataceae, (, Nostocales, ), were,...",0,0,4.615121,...,18428,18439,e_2,"(The, Scytonemataceae, (, Nostocales, ), were,...",0,0,4.188734,0,ExtractedAnnotatedData\OldSet\Archer2017_Artic...,1
6880,Scytonemataceae,3362,3363,18312,18326,e_1,"(The, Scytonemataceae, (, Nostocales, ), were,...",0,0,4.615121,...,18499,18515,e_2,"(The, Scytonemataceae, (, Nostocales, ), were,...",0,0,4.188734,0,ExtractedAnnotatedData\OldSet\Archer2017_Artic...,1
6881,Scytonemataceae,3362,3363,18312,18326,e_1,"(The, Scytonemataceae, (, Nostocales, ), were,...",0,0,4.615121,...,18499,18515,e_2,"(The, Scytonemataceae, (, Nostocales, ), were,...",0,0,4.188734,0,ExtractedAnnotatedData\OldSet\Archer2017_Artic...,1
7839,Actinobacteria,3624,3625,19809,19822,e_1,"(The, Actinobacteria, ,, Bacteriodetes, and, C...",0,0,4.209655,...,19922,19933,e_2,"(The, Actinobacteria, ,, Bacteriodetes, and, C...",0,0,2.130214,0,ExtractedAnnotatedData\OldSet\Archer2017_Artic...,1
7957,Bacteriodetes,3626,3627,19825,19837,e_1,"(The, Actinobacteria, ,, Bacteriodetes, and, C...",0,0,4.615121,...,19922,19933,e_2,"(The, Actinobacteria, ,, Bacteriodetes, and, C...",0,0,2.130214,0,ExtractedAnnotatedData\OldSet\Archer2017_Artic...,1
8075,Chloroflexi,3628,3629,19843,19853,e_1,"(The, Actinobacteria, ,, Bacteriodetes, and, C...",0,0,4.615121,...,19922,19933,e_2,"(The, Actinobacteria, ,, Bacteriodetes, and, C...",0,0,2.130214,0,ExtractedAnnotatedData\OldSet\Archer2017_Artic...,1
8674,Lentisphaerae,3727,3728,20460,20472,e_1,"(Other, notable, differences, were, the, relat...",0,0,4.615121,...,20486,20502,e_2,"(Other, notable, differences, were, the, relat...",0,0,1.81176,0,ExtractedAnnotatedData\OldSet\Archer2017_Artic...,1
9623,Hemichloris antarctica,3948,3950,21706,21727,e_1,"(Chasmoendolithic, microbial, community, from,...",0,0,4.209655,...,21576,21592,e_2,"(Chasmoendolithic, microbial, community, from,...",0,0,3.62352,0,ExtractedAnnotatedData\OldSet\Archer2017_Artic...,1


In [194]:
# This is the metrics for species
speciesMetrics

Unnamed: 0,Document,Tagged,Extracted,True Positives,False Positives,False Negatives,Partial Matches,Precision,Recall,F1,Adj. Precision,Adj. Recall,Adj. F1
0,Archer2017_Article_EndolithicMicrobialDiversit...,156,649,0,649,156,59,0.0,0.0,,0.0909,0.3782,0.1466
1,Total,156,649,0,649,156,59,0.0,0.0,,0.0909,0.3782,0.1466


In [193]:
# This is the metrics for locations
locationMetrics

Unnamed: 0,Document,Tagged,Extracted,True Positives,False Positives,False Negatives,Partial Matches,Precision,Recall,F1,Adj. Precision,Adj. Recall,Adj. F1
0,Archer2017_Article_EndolithicMicrobialDiversit...,118,649,0,649,118,22,0.0,0.0,,0.0339,0.1864,0.0574
1,Total,118,649,0,649,118,22,0.0,0.0,,0.0339,0.1864,0.0574


In [192]:
# These are all the unmatched locations
unmatchedLocations

Unnamed: 0,Found as,Full name,InDefinitiveList,Start,End,PositionOfFirstToken,Sentence_species,InReferenceList,ExtractedLocation,Original Tokenised Text,...,inAbstract300,inAbstract500,max_TFISF,Sent_Start,Document,AnnTxtFile,id,Location,inNZ,inAntarctica
0,,,,,,,,,,,...,1.0,1.0,3.228826,0.0,Archer2017_Article_EndolithicMicrobialDiversit...,ExtractedAnnotatedData\OldSet\Archer2017_Artic...,0,,True,True
1,,,,,,,,,,,...,1.0,1.0,3.362358,0.0,Archer2017_Article_EndolithicMicrobialDiversit...,ExtractedAnnotatedData\OldSet\Archer2017_Artic...,1,,True,True
2,,,,,,,,,,,...,0.0,0.0,5.308268,0.0,Archer2017_Article_EndolithicMicrobialDiversit...,ExtractedAnnotatedData\OldSet\Archer2017_Artic...,2,,True,True
3,,,,,,,,,,,...,0.0,0.0,5.308268,0.0,Archer2017_Article_EndolithicMicrobialDiversit...,ExtractedAnnotatedData\OldSet\Archer2017_Artic...,3,,True,True
4,,,,,,,,,,,...,1.0,1.0,3.362358,0.0,Archer2017_Article_EndolithicMicrobialDiversit...,ExtractedAnnotatedData\OldSet\Archer2017_Artic...,4,,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113,,,,,,,,,,,...,0.0,0.0,2.263745,0.0,Archer2017_Article_EndolithicMicrobialDiversit...,ExtractedAnnotatedData\OldSet\Archer2017_Artic...,113,,True,True
114,,,,,,,,,,,...,0.0,0.0,2.130214,0.0,Archer2017_Article_EndolithicMicrobialDiversit...,ExtractedAnnotatedData\OldSet\Archer2017_Artic...,114,,True,True
115,,,,,,,,,,,...,0.0,0.0,1.811760,0.0,Archer2017_Article_EndolithicMicrobialDiversit...,ExtractedAnnotatedData\OldSet\Archer2017_Artic...,115,,True,True
116,,,,,,,,,,,...,1.0,1.0,6.724715,0.0,Archer2017_Article_EndolithicMicrobialDiversit...,ExtractedAnnotatedData\OldSet\Archer2017_Artic...,116,,True,True


In [191]:
# These are all the found locations that matched a tagged location
TaggedLocations

Unnamed: 0,Found as,Full name,InDefinitiveList,Start,End,PositionOfFirstToken,Sentence_species,InReferenceList,ExtractedLocation,Original Tokenised Text,...,Found,TaggedLocation,classId,Sentence,inAbstract300,inAbstract500,max_TFISF,Sent_Start,Document,AnnTxtFile
0,Hemichloris antarctica,Hemichloris antarctica,True,2908.0,2910.0,15785.0,Free-living algal cells resembling Hemichloris...,True,,,...,True,McMurdo Dry Valleys,e_2,\n\n\n\n\nawkaYNS0KsDlin9nBwoxLbhE6rr0-Archer2...,1.0,1.0,3.228826,0.0,Archer2017_Article_EndolithicMicrobialDiversit...,ExtractedAnnotatedData\OldSet\Archer2017_Artic...
1,Diplosphaera,Diplosphaera,False,3949.0,3950.0,21628.0,¢ Chasmoendolithic microbial community from Mi...,False,,,...,True,Antarctica,e_2,\n\n\n\n\nawkaYNS0KsDlin9nBwoxLbhE6rr0-Archer2...,1.0,1.0,3.362358,0.0,Archer2017_Article_EndolithicMicrobialDiversit...,ExtractedAnnotatedData\OldSet\Archer2017_Artic...
2,Hemichloris antarctica,Hemichloris antarctica,True,3989.0,3991.0,21838.0,¢ Chasmoendolithic microbial community from Mi...,True,,,...,True,Berlin,e_2,Thomas S. Niederberger” - S. Craig Cary”* - Ka...,0.0,0.0,5.308268,0.0,Archer2017_Article_EndolithicMicrobialDiversit...,ExtractedAnnotatedData\OldSet\Archer2017_Artic...
3,Trebouxia,Trebouxia,True,4003.0,4004.0,21918.0,Whilst Trebouxia-like algal cells were observe...,False,,,...,True,Heidelberg,e_2,Thomas S. Niederberger” - S. Craig Cary”* - Ka...,0.0,0.0,5.308268,0.0,Archer2017_Article_EndolithicMicrobialDiversit...,ExtractedAnnotatedData\OldSet\Archer2017_Artic...
4,Lecidea,Lecidea,False,4046.0,4047.0,22161.0,Lichen-forming ascomycetes (Lecidea genus) wer...,False,,,...,True,Antarctica,e_2,Abstract Cryptic microbial communities develop...,1.0,1.0,3.362358,0.0,Archer2017_Article_EndolithicMicrobialDiversit...,ExtractedAnnotatedData\OldSet\Archer2017_Artic...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
762,,,,,,,,,,,...,True,McKelvey,e_2,Since the alphaproteobacterial OTUs included d...,0.0,0.0,2.263745,0.0,Archer2017_Article_EndolithicMicrobialDiversit...,ExtractedAnnotatedData\OldSet\Archer2017_Artic...
763,,,,,,,,,,,...,True,Miers,e_2,Since the alphaproteobacterial OTUs included d...,0.0,0.0,2.130214,0.0,Archer2017_Article_EndolithicMicrobialDiversit...,ExtractedAnnotatedData\OldSet\Archer2017_Artic...
764,,,,,,,,,,,...,True,University Valley,e_2,Lichen associations were demonstrated in Unive...,0.0,0.0,1.811760,0.0,Archer2017_Article_EndolithicMicrobialDiversit...,ExtractedAnnotatedData\OldSet\Archer2017_Artic...
765,,,,,,,,,,,...,True,Antarctica,e_2,Acknowledgments The authors wish to acknowledg...,1.0,1.0,6.724715,0.0,Archer2017_Article_EndolithicMicrobialDiversit...,ExtractedAnnotatedData\OldSet\Archer2017_Artic...


End.