## Recognising Species and Locations from Extracted Text

### Imports

In [1]:
from IPython.core.display import display, HTML, Markdown as md
display(HTML("""<style>.container { width:80% !important; } p, ul {max-width:␣
,→40em;} .rendered_html table { margin-left: 0; } .output_subarea.output_png {␣
,→display: flex; justify-content: center;}</style>"""))

import pandas as pd
import numpy  as np
import os
import spacy
import re
import difflib
from   spacy         import displacy
from   collections   import Counter
from   spacy.matcher import Matcher

# Spacy Small
import en_core_web_sm
nlp =  spacy.load('en_core_web_sm')

# Spacy Medium
import en_core_web_md
nlp_md = spacy.load('en_core_web_md')

# Spacy Large
import en_core_web_lg
nlp_lg = spacy.load('en_core_web_lg')

### Functions

In [2]:
def _first_close_match(name, gazetteer, cutoff=0.9):
    """Return the best difflib close match for `name` in `gazetteer`, or NaN when there is none."""
    candidates = difflib.get_close_matches(name, gazetteer, cutoff=cutoff)
    return candidates[0] if candidates else np.nan


def find_locations(doc):
    """
    Collect the 'LOC' entities of a spaCy document and check them against the gazetteers.

    Reads the module-level gazetteers nzgaz, nzgazAntartica, SCARNZnames,
    SCARGlobalnames, GeoNamesNZ, GeoNamesAnt and GeoNamesFiltered.

    For every entity labelled 'LOC' a row is built holding the normalised name
    (leading 'the'/'The' stripped, leading 'Mt'/'Mt.' expanded to 'Mount'), the
    original entity text, token and character offsets, and the sentence.
    Names are then looked up exactly in each gazetteer; names found in neither
    the New Zealand nor the Antarctic lists are additionally tried for close
    matches (difflib, cutoff 0.9) and for partial (sub-span) matches.

    Returns the rows for which an exact, close or partial match was found.
    """
    places = []
    for x in doc.ents:
        if x.label_ != 'LOC':
            continue

        firstText = doc[x.start].text
        lastToken = doc[x.end-1]
        endIndex  = lastToken.idx + len(lastToken)

        if firstText == 'the' or firstText == 'The':
            # Drop the article from the name and from both the token and character offsets.
            places.append([x.text[4:], x.text, x.start+1, x.end, doc[x.start+1].idx, endIndex, x.sent])
        elif firstText == 'Mt':
            # NOTE(review): Start_Token skips the 'Mt' token while Start_Index still points at it;
            # kept as in the original - confirm which of the two is intended.
            places.append(['Mount' + x.text[2:], x.text, x.start+1, x.end, doc[x.start].idx, endIndex, x.sent])
        elif firstText == 'Mt.':
            places.append(['Mount' + x.text[3:], x.text, x.start+1, x.end, doc[x.start].idx, endIndex, x.sent])
        else:
            places.append([x.text, x.text, x.start, x.end, doc[x.start].idx, endIndex, x.sent])

    df = pd.DataFrame(places, columns=['Location', 'Original Tokenised Text', 'Start_Token', 'End_Token',
                                       'Start_Index', 'End_Index', 'Sentence'])

    # Check possible locations in gazetteers for exact matches.
    # The original NZGaz / GeoNames expressions also contained `df[...] is True`
    # clauses; an identity test of a Series against True is always False, so they
    # never contributed. They are dropped here, leaving the results unchanged
    # (in particular the dropWords filter behind GeoNamesFiltered stays effective).
    df['NZGazAnt']     = [X in nzgazAntartica for X in df.Location]
    df['NZGaz']        = [X in nzgaz for X in df.Location]
    df['ScarNZ']       = [X in SCARNZnames for X in df.Location]
    df['ScarGlobal']   = [X in SCARGlobalnames for X in df.Location]
    df['GeoNamesNZ']   = [X in GeoNamesNZ for X in df.Location]
    df['GeoNamesAnt']  = [X in GeoNamesAnt for X in df.Location]
    df['GeoNames']     = [X in GeoNamesFiltered for X in df.Location]
    df['inNZ']         = df.NZGaz      | df.NZGazAnt   | df.GeoNamesNZ
    df['inAntarctica'] = df.ScarGlobal | df.NZGazAnt   | df.GeoNamesAnt
    df['exactMatch']   = df.NZGaz      | df.ScarGlobal | df.GeoNames

    # filter locations by those not found in Antarctica or New Zealand gazetteers
    df2 = df[(df['inAntarctica'] == False) & (df['inNZ'] == False)].copy()

    # look through these remaining locations (including those found only in GeoNames) for close matches
    # eg. McMurdo Dry Valley v McMurdo Dry Valleys or for partial matches eg. Ross Sea Region == Ross Sea
    # Each close-match lookup is computed once per name (the original ran it twice).
    df2['Close_Match_NZGazAnt']    = [_first_close_match(X, nzgazAntartica) for X in df2['Location']]
    df2['Close_Match_NZGaz']       = [_first_close_match(X, nzgaz) for X in df2['Location']]
    df2['Close_Match_ScarNZ']      = [_first_close_match(X, SCARNZnames) for X in df2['Location']]
    # 'Antarctic' is never close-matched against the Antarctic gazetteers.
    df2['Close_Match_ScarGlobal']  = [np.nan if X == 'Antarctic' else _first_close_match(X, SCARGlobalnames)
                                      for X in df2['Location']]
    df2['Close_Match_GeoNamesNZ']  = [_first_close_match(X, GeoNamesNZ) for X in df2['Location']]
    df2['Close_Match_GeoNamesAnt'] = [np.nan if X == 'Antarctic' else _first_close_match(X, GeoNamesAnt)
                                      for X in df2['Location']]

    # NOTE(review): Close_Match_GeoNamesNZ is not part of this flag (nor is
    # PartialMatch_GeoNamesNZ below) although both feed inNZ - confirm this is intended.
    df2['Close_Match']             = (df2.Close_Match_NZGazAnt.notna()
                                    | df2.Close_Match_NZGaz.notna()
                                    | df2.Close_Match_ScarNZ.notna()
                                    | df2.Close_Match_ScarGlobal.notna()
                                    | df2.Close_Match_GeoNamesAnt.notna())

    df2['PartialMatch_NZGazAnt']    = getBiggestSubStringMatch(doc, df2, nzgazAntartica)
    df2['PartialMatch_NZGaz']       = getBiggestSubStringMatch(doc, df2, nzgaz)
    df2['PartialMatch_ScarNZ']      = getBiggestSubStringMatch(doc, df2, SCARNZnames)
    df2['PartialMatch_ScarGlobal']  = getBiggestSubStringMatch(doc, df2, SCARGlobalnames)
    df2['PartialMatch_GeoNamesNZ']  = getBiggestSubStringMatch(doc, df2, GeoNamesNZ)
    df2['PartialMatch_GeoNamesAnt'] = getBiggestSubStringMatch(doc, df2, GeoNamesAnt)
    df2['PartialMatch']             = (df2['PartialMatch_NZGazAnt'].notna()
                                     | df2['PartialMatch_NZGaz'].notna()
                                     | df2['PartialMatch_ScarNZ'].notna()
                                     | df2['PartialMatch_ScarGlobal'].notna()
                                     | df2['PartialMatch_GeoNamesAnt'].notna())

    # merge the filtered dataframe (now with close and partial matches) with the unfiltered dataframe
    df_unified = pd.merge(df, df2, how = 'left')

    # redo the inNZ, inAntarctica and Found columns to include close and partial matches
    df_unified = df_unified.drop(columns = ['inNZ', 'inAntarctica'])

    # The close/partial columns hold a matched name or NaN (NaN for every row that
    # was not in df2), so they are turned into booleans explicitly with notna()
    # instead of relying on how `|` coerces strings and NaN.
    df_unified['inNZ']         = (df_unified.NZGaz
                                | df_unified.NZGazAnt
                                | df_unified.GeoNamesNZ
                                | df_unified.Close_Match_NZGaz.notna()
                                | df_unified.Close_Match_NZGazAnt.notna()
                                | df_unified.Close_Match_GeoNamesNZ.notna()
                                | df_unified.PartialMatch_NZGaz.notna()
                                | df_unified.PartialMatch_NZGazAnt.notna()
                                | df_unified.PartialMatch_GeoNamesNZ.notna())

    df_unified['inAntarctica'] = (df_unified.ScarGlobal
                                | df_unified.NZGazAnt
                                | df_unified.GeoNamesAnt
                                | df_unified.Close_Match_ScarGlobal.notna()
                                | df_unified.Close_Match_NZGazAnt.notna()
                                | df_unified.Close_Match_GeoNamesAnt.notna()
                                | df_unified.PartialMatch_ScarGlobal.notna()
                                | df_unified.PartialMatch_NZGazAnt.notna()
                                | df_unified.PartialMatch_GeoNamesAnt.notna())

    # Close_Match / PartialMatch are True, False or NaN after the left merge; eq(True) maps NaN to False.
    df_unified['Found']        = (df_unified.exactMatch
                                | df_unified.Close_Match.eq(True)
                                | df_unified.PartialMatch.eq(True))

    return df_unified[df_unified.Found == True]

In [3]:
def getBiggestSubStringMatch(document, df, gazetteer):
    """
    Find, for every row of `df`, the longest proper sub-span of the location that
    appears verbatim in `gazetteer`.

    document  : sliceable by token index; each slice exposes `.text`.
    df        : DataFrame with 'Start_Token' and 'End_Token' columns (end exclusive).
    gazetteer : iterable of place names.

    Candidate spans start one token shorter than the full location and shrink
    down to two tokens; single-token spans are never tried. For a given length
    the windows are tried left to right and the first hit wins.

    Returns a list aligned with df.index holding the matched text, or np.nan
    where nothing matched.
    """
    # Hash the names once so each window lookup is a set lookup instead of a scan.
    knownNames = set(gazetteer)
    matches    = []

    for row in df.index:
        fullStart = df.loc[row, 'Start_Token']
        fullEnd   = df.loc[row, 'End_Token']
        bestMatch = np.nan
        found     = False   # plain bool (the original assigned the tuple `(True,)` by accident)

        # Longest candidates first; stop before single-token windows.
        for subLength in range(fullEnd - fullStart - 1, 1, -1):
            for subStart in range(fullStart, fullEnd - subLength + 1):
                subString = document[subStart:subStart + subLength].text
                if subString in knownNames:
                    bestMatch = subString
                    found     = True
                    break
            if found:
                break

        matches.append(bestMatch)
    return matches

In [4]:
def findSpecies(doc):
    
    """
    Dictionary approach.
    
    This code runs through token by token looking to match Genus + species, 
    Genus + species + species, G. species etc based on words found in the genus 
    and species columns of the supplied list of species.
    
    Also finds examples where a known genus is followed by an unrecognised species 
    if that example also appears in an abbreviated form elsewhere. 

    Reads the module-level uniqueGenus, uniqueSpecies and uniqueSpeciesStrings.

    Returns a DataFrame with the columns 'Found as', 'Full name',
    'InDefinitiveList', 'Start', 'End', 'PositionOfFirstToken', 'Sentence' and
    'InReferenceList', sorted by 'Start' with duplicate rows removed.
    """

    foundSpeciesList   = []
    foundSpecies       = ''
    foundSpeciestokens = []
    foundSpeciesDict   = {}
    possiblesNames     = []
    possiblesTokens    = []
    # Every look-ahead below is bounded by the document length so that a genus or
    # an abbreviation sitting on the last token no longer raises IndexError.
    docLength          = len(doc)

    
    for token in doc:
        
        if str(token) in uniqueGenus:
            foundSpeciestokens.append(token)
            foundSpecies = foundSpecies + str(token)
            j            = 1
            onlyGenus    = True
            
            while (token.i + j < docLength) and (doc[token.i + j].text in uniqueSpecies):
                foundSpecies += ' ' + str(doc[token.i + j])
                foundSpeciestokens.append(doc[token.i + j])
                onlyGenus = False
                j        += 1
                
            key = ''
            
            if onlyGenus == False:
                # Remember the abbreviated form ("G. species") of this full name so
                # later abbreviated mentions can be resolved through the dictionary.
                for tkn in foundSpeciestokens[:-1]:
                    key += tkn.text[0] + '. '
                    
                key += foundSpeciestokens[-1].text
                if key not in foundSpeciesDict:
                    foundSpeciesDict[key] = [foundSpecies, True]

                foundSpeciesList.append([foundSpecies, 
                                         foundSpecies, 
                                         True, 
                                         token.i, 
                                         token.i + j, 
                                         token.idx, 
                                         token.sent])
          
            else:
                if (token.i + 1 < docLength) and doc[token.i + 1].is_alpha:
                    # Known genus followed by an unrecognised word: keep it as a
                    # candidate to be checked against abbreviated forms afterwards.
                    candidate = token.text + ' ' + doc[token.i+1].text
                    possiblesNames.append(candidate)
                    possiblesTokens.append((token, doc[token.i+1]))

                else:
                    foundSpeciesList.append([foundSpecies, 
                                             foundSpecies, 
                                             True, 
                                             token.i, 
                                             token.i + 1, 
                                             token.idx, 
                                             token.sent])

            foundSpeciestokens = []
            foundSpecies       = ''

        elif token.shape_ == 'X.':
            # NOTE(review): the original appended `token` here and then immediately
            # reset the list, so the abbreviation token was never kept. That
            # behaviour is preserved; confirm whether the reset was meant to come first.
            foundSpeciestokens = []
            foundSpecies       = foundSpecies + str(token)
            j                  = 1
            
            while ((token.i + j < docLength)
                   and ((doc[token.i+j].is_alpha and doc[token.i + j].text in uniqueSpecies)
                        or doc[token.i+j].shape_ == 'x.')):
                foundSpeciestokens.append(doc[token.i + j])
                foundSpecies += ' ' + str(doc[token.i + j])
                j            += 1
                
            if doc[token.i + j - 1].text in uniqueSpecies:
                if foundSpecies in foundSpeciesDict:
                    foundSpeciesList.append([foundSpecies, 
                                             foundSpeciesDict.get(foundSpecies)[0], 
                                             foundSpeciesDict.get(foundSpecies)[1], 
                                             token.i, 
                                             token.i + j, 
                                             token.idx, 
                                             token.sent])
                    
                else:
                    # Build pattern
                    # NOTE(review): this loop only runs when three or more tokens follow
                    # the abbreviation. It indexes a token (`tkn[0]`) and appends a list
                    # rather than a token dict to the pattern, which looks unintended -
                    # left unchanged because the intended pattern is not clear from here.
                    pattern = []
                    for tkn in foundSpeciestokens[1:-1]:
                        pattern.append([{"SHAPE"   : "xxxx"}, 
                                        {"TEXT"    : tkn[0].text[0]}]) 
                    pattern.append({"TEXT": foundSpeciestokens[-1].text})  
                    
                    # Create matcher on the vocab of the document being searched
                    # (the original always used the small model's vocab).
                    matcher = Matcher(doc.vocab)
                    matcher.add("Species_Pattern", [pattern])
                    matches = matcher(doc)
                    
                    # Deal with matches
                    if len(matches) > 0:
                        start        = matches[0][1]
                        end          = matches[0][2]
                        matched_span = doc[start:end]
                        inList       = False
                        
                        if doc[start].text in uniqueGenus:
                            inList = True

                        foundSpeciesList.append([foundSpecies, 
                                                 matched_span.text, 
                                                 inList, token.i, 
                                                 token.i + j, 
                                                 token.idx, 
                                                 token.sent])

                        foundSpeciesDict[foundSpecies] = [matched_span.text, inList]

                        if inList == False:
                            for match_id, start, end in matches:
                                matched_span = doc[start:end]
                                foundSpeciesList.append([matched_span.text, 
                                                         matched_span.text, 
                                                         inList, 
                                                         start,
                                                         end, 
                                                         doc[start].idx, 
                                                         doc[start].sent])
            foundSpecies       = ''
            foundSpeciestokens = []
                
    for tkns in possiblesTokens:
        # Generate possible patterns
        # NOTE(review): both patterns describe three consecutive tokens (a shape
        # token, then the genus letter / genus, then the following word) - confirm
        # that a two-token "G. species" / "Genus species" match was not intended.
        shortPattern = [{"SHAPE"   : "X."}, 
                        {"TEXT"    : tkns[0].text[0]},
                        {"TEXT"    : doc[tkns[0].i+1].text}]
        
        fullPattern = [{"SHAPE" : "Xxxxx"},
                       {"TEXT"  : tkns[0].text},
                       {"TEXT"  : doc[tkns[0].i+1].text}]
        
        # Create matcher
        matcher = Matcher(doc.vocab)
        matcher.add("abbreviated_Species_Pattern", [shortPattern])
        matches = matcher(doc)
        
        # Deal with matches
        if len(matches) > 0:
            # An abbreviated mention exists, so record abbreviated and full mentions alike.
            matcher.add("full_Species_Pattern", [fullPattern])   
            matches = matcher(doc)
            for match_id, start, end in matches:
                matched_span = doc[start:end]
                foundSpeciesList.append([matched_span.text, 
                                         tkns[0].text + ' '+ tkns[1].text, 
                                         False, 
                                         start, 
                                         end, 
                                         doc[start].idx, 
                                         doc[start].sent])

        else:
            # No abbreviated mention: record the genus on its own.
            foundSpeciesList.append([tkns[0].text, 
                                     tkns[0].text, 
                                     False, 
                                     tkns[0].i, 
                                     tkns[0].i + 1, 
                                     doc[tkns[0].i].idx, 
                                     doc[tkns[0].i].sent])



    FoundSpeciesDF = pd.DataFrame(foundSpeciesList,columns=['Found as', 
                                                            'Full name', 
                                                            'InDefinitiveList', 
                                                            'Start', 
                                                            'End',
                                                            'PositionOfFirstToken',
                                                            'Sentence'])
    
    FoundSpeciesDF = FoundSpeciesDF.sort_values('Start').reset_index(drop = True)
    FoundSpeciesDF['InReferenceList']  = [x in uniqueSpeciesStrings for x in FoundSpeciesDF['Full name']]
    FoundSpeciesDF = FoundSpeciesDF.drop_duplicates(keep='first')
    return FoundSpeciesDF

###  Script

In [5]:
# Change the spaCy model here
# 1 small
# 2 medium
# 3 large
NLPmodel         = 1

# Resolve the pipeline once and fail early on an unsupported setting; previously
# an unknown value left `document` undefined (or stale) inside the file loop.
nlpModels        = {1: nlp, 2: nlp_md, 3: nlp_lg}
if NLPmodel not in nlpModels:
    raise ValueError('NLPmodel must be 1 (small), 2 (medium) or 3 (large), got ' + repr(NLPmodel))
selectedNlp      = nlpModels[NLPmodel]

# Start by importing location gazetteers
print('Load gazetters')
nzgaz            = pd.read_csv("SpreadSheets/Jamies Original/gaz_names.csv")
# Entries without a land district are used as the Antarctic subset of the NZ gazetteer
nzgazAntartica   = nzgaz[nzgaz.land_district.isna()].name.unique()
nzgaz            = nzgaz.name.unique()

SCARGlobal       = pd.read_csv("SpreadSheets/Jamies Original/SCAR_CGA_PLACE_NAMES_GLOBAL_SIMPLIFIED.csv")
SCARGlobalnames  = SCARGlobal.place_name_mapping.unique()
del SCARGlobal

SCARNZ           = pd.read_csv("SpreadSheets/Jamies Original/SCAR_CGA_PLACE_NAMES_NZ_SIMPLIFIED.csv")
SCARNZnames      = SCARNZ.place_name_mapping.unique()
del SCARNZ

GeoNames         = pd.read_csv('SpreadSheets/Jamies Original/GeoNamesAnt.csv', index_col=0)
GeoNamesUnique   = GeoNames.name.unique()
GeoNamesAnt      = GeoNames[GeoNames['country code']=='AQ'].name.unique()
GeoNamesNZ       = GeoNames[GeoNames['country code']=='NZ'].name.unique()
# Names removed from the GeoNames list before exact matching
dropWords        = ['Inner', 'Harbour', 'Lake', 'Fig', 'Valleys', 'Soil', 
                    'Lakes', 'South', 'North', 'Inner', 'Mount', 'Frozen', 'Oceans',
                    'Upper', 'Contour']
GeoNamesFiltered = [place for place in GeoNamesUnique if place not in dropWords]
del GeoNames

# Save them as NPYs
print('Save gazetters as NPYs')
np.save('NPYs/nzgazetteer.npy', nzgaz)
np.save('NPYs/nzgazAntartica.npy', nzgazAntartica)
np.save('NPYs/SCARGlobalnames.npy', SCARGlobalnames)
np.save('NPYs/SCARNZnames.npy', SCARNZnames)
np.save('NPYs/GeoNamesAnt.npy', GeoNamesAnt)
np.save('NPYs/GeoNamesNZ.npy', GeoNamesNZ)
np.save('NPYs/GeoNamesFiltered.npy', GeoNamesFiltered)

# Import the species reference list
print('Load species list')
SpeciesReferenceList                  = pd.read_excel('SpreadSheets/Jamies Original/Antarctic_Species_List.xlsx')
SpeciesReferenceList['SpeciesString'] = SpeciesReferenceList[['GENUS', 'SPECIES']].agg(' '.join, axis = 1)

print('Parse species list')
uniqueGenus          = SpeciesReferenceList.GENUS.unique()
uniqueSpecies        = SpeciesReferenceList.SPECIES.unique()
uniqueSpeciesStrings = SpeciesReferenceList.SpeciesString.unique()

# Set source and destination folders
print('Set source and dest. folders')
root_src_dir          = 'ExtractedText\\'
root_dst_dir_matrix   = 'ExtractedLSMatrices\\'
root_dst_dir_species  = 'ExtractedSpecies\\'
root_dst_dir_location = 'ExtractedLocations\\'

# Walk through source folder
print('Iterate folders in source folder')
for src_dir, dirs, files in os.walk(root_src_dir):
    print('')
    dst_dir_matrix = src_dir.replace(root_src_dir, root_dst_dir_matrix, 1)

    # Check destination folder exists
    print('Check destination folder exits')
    print('Dest: ' + dst_dir_matrix)
    if not os.path.exists(dst_dir_matrix):
        os.makedirs(dst_dir_matrix)   

    dst_dir_species = src_dir.replace(root_src_dir, root_dst_dir_species, 1)
    print('Dest: ' + dst_dir_species)
    if not os.path.exists(dst_dir_species):
        os.makedirs(dst_dir_species)   
        
    dst_dir_location = src_dir.replace(root_src_dir, root_dst_dir_location, 1)
    print('Dest: ' + dst_dir_location)
    if not os.path.exists(dst_dir_location):
        os.makedirs(dst_dir_location)  
        
    # Iterate through the folder's files
    print('Iterate files')
    for file in files:
        print('')
        
        # Create file paths
        print('File: ' + file)
        src_file         = os.path.join(src_dir, file)

        # Species.csv
        dst_file_species = os.path.join(dst_dir_species, file)
        if os.path.exists(dst_file_species):
            # in case the src and dst are the same file
            if os.path.samefile(src_file, dst_file_species):
                continue
            os.remove(dst_file_species)   
        
        # Location.csv
        dst_file_location = os.path.join(dst_dir_location, file)
        if os.path.exists(dst_file_location):
            # in case the src and dst are the same file
            if os.path.samefile(src_file, dst_file_location):
                continue
            os.remove(dst_file_location)   

        # Matrix.csv
        dst_file_matrix = os.path.join(dst_dir_matrix, file)
        if os.path.exists(dst_file_matrix):
            # in case the src and dst are the same file
            if os.path.samefile(src_file, dst_file_matrix):
                continue
            os.remove(dst_file_matrix)   
            
        # Read the .txt to memory; the context manager closes the handle even if
        # reading fails, and the loop variable `file` is no longer rebound.
        print('Read text to memory')
        with open(src_file, encoding = "utf8") as text_file:
            doc = text_file.read()

        # Tokenize the document
        print('Tokenise doc')
        document = selectedNlp(doc)

        # Extract the species 
        print('Extract species')
        species          = findSpecies(document)

        # And locations with filter
        print ('Extract location')
        locations        = find_locations(document)
        locations        = locations[locations.Found == True] 

        # Merge the results: the constant key pairs every species row with every location row
        speciesXlocation = pd.merge(species.assign(key=0), 
                                    locations.assign(key=0), 
                                    on       = 'key', 
                                    suffixes = ('_species', 
                                                '_location')).drop('key', axis=1)

        # Out to CSVs
        # NOTE(review): str.replace rewrites every '.txt' in the name, so inputs such as
        # 'x.txt.ann.txt' get the suffix inserted twice; kept so existing output names do not change.
        print('Out to CSV') 
        
        # Matrix
        dst_file = dst_file_matrix.replace('.txt', '-SLMatrix.csv')
        speciesXlocation.to_csv(dst_file)
        
        # Species (previously the merged matrix was written here by mistake)
        dst_file = dst_file_species.replace('.txt', '-Species.csv')
        species.to_csv(dst_file)
        
        # Location (previously the merged matrix was written here by mistake)
        dst_file = dst_file_location.replace('.txt', '-Locations.csv')
        locations.to_csv(dst_file)
        
        

Load gazetters


  exec(code_obj, self.user_global_ns, self.user_ns)


Save gazetters as NPYs
Load species list
Parse species list
Set source and dest. folders
Iterate folders in source folder

Check destination folder exits
Dest: ExtractedLSMatrices\
Dest: ExtractedSpecies\
Dest: ExtractedLocations\
Iterate files

Check destination folder exits
Dest: ExtractedLSMatrices\OldSet
Dest: ExtractedSpecies\OldSet
Dest: ExtractedLocations\OldSet
Iterate files

File: Archer2017_Article_EndolithicMicrobialDiversityIn.txt.ann.txt
Read text to memory
Tokenise doc
Extract species
Extract location
Out to CSV

File: fmicb_10_01018.txt.ann.txt
Read text to memory
Tokenise doc
Extract species
Extract location
Out to CSV

File: Fraser2018_Article_EvidenceOfPlantAndAnimalCommun.txt.ann.txt
Read text to memory
Tokenise doc
Extract species
Extract location
Out to CSV

File: s42003_018_0260_y.txt.ann.txt
Read text to memory
Tokenise doc
Extract species
Extract location
Out to CSV

File: source.txt.ann.txt
Read text to memory
Tokenise doc
Extract species
Extract location
Out t

End.