In [1]:
# Download the paper texts (see README.md for details) and store anywhere,
# then put the absolute path here. Remember that on windows you have to 
# type the path as \\location\\folder\\file.txt for example
PAPERS_LOCATION='/home/jondea/etymo/workshop/paper_texts/'

def load_paper(paper_id):
    with open(PAPERS_LOCATION + str(paper_id) +'/fulltext.txt', 'r') as myfile:
        data = myfile.read()
    
    return data

import os
all_paper_ids = os.listdir(PAPERS_LOCATION)

In [2]:
example_paper = load_paper(all_paper_ids[1])

## Heuristic 1

In [3]:
stop_chars = ['.', '†', '‡', '¶', '•', '∗', '[', ']', '{', '}', '(', ')']
def search_with_simple_context(text, before=20, after=20):
    queries = ['University']
    i = 0
    first_letters = dict()
    for s in ['University']:
        if s[0] not in first_letters:
            first_letters[s[0]] = [s]
        else:
            first_letters[s[0]].append(s)
    
    matches_with_context=[]
    # Loop through entire text
    for c in text:
        # Throw out most based on the first character
        if c in first_letters:
            potential_matches = first_letters[c]
            match=None
            for potential_match in potential_matches:
                j=i+1
                match_found=True
                for pc in potential_match[1:]:
                    if pc != text[j]:
                        match_found=False
                        break
                    j += 1
                # Check it is end of word, for example we don't want to pick up "Label" as "Lab"                     
                if not (text[j].isspace() or text[j] in stop_chars):
                    match_found = False

                if match_found:
                    match=potential_match
                    break
            if match:
                context_end = 0
                context_start = 0
                   
                context_end = min(i+len(match)+after,len(text))
                context_start = max(i-before,0)
                
                match_with_context = text[context_start:context_end].strip()

                matches_with_context.append(match_with_context)

        i += 1

    return matches_with_context

In [4]:
search_with_simple_context(example_paper)

['al Sciences, Dalian University of Technology, Dali']

## Heuristic 2

### Section title detection

In [5]:
# The kinds of roman numerals we may see in section titles
roman_numerals = ['i', 'v', 'x']

# Does it look like a section title?
def is_a_section_title(txt, i, section_name):
    j = i + len(section_name)
    # Apart from whitespace and punctuation is it at the end of the line
    while j < len(txt):
        c = txt[j]
        j += 1
        if c == '\n':
            # Found the newline, all is good
            break
        if c.isspace() or (c in string.punctuation):
            # Space or punctuation, keep looking for newline
            continue
        else:
            return False
    
    # Apart from whitespace, digits and punctuation is it at the start of the line
    while i >= 0:
        i -= 1
        c = txt[i]
        if c == '\n':
            return True
        if c.isspace() or c.isdigit() or (c in string.punctuation) or c.lower() in roman_numerals:
            continue
        else:
            return False

    return True

In [6]:
# Test this small function
section_title1 = """
s paper.

REFERENCES
[1] A. K.
"""
section_i1=11
section_name1='references'
assert(is_a_section_title(section_title1, section_i1, section_name1))

section_title2 = """
Jean-Sébastien Gharbi, André Lapied,
Bernard Roy, Stéphane Deparis for very helpful comments.

References
M. Allais. The So-Called Allais Paradox and Rational Decisions under Uncer-
tainty. In M. Allais an
"""
section_i2=96
section_name2='references'
assert(is_a_section_title(section_title2, section_i2, section_name2))

section_title3 = """
A paragraph which discusses the word references
and has a new line.
"""
section_i3=38
section_name3='references'
assert(not is_a_section_title(section_title3, section_i3, section_name3))

section_title4 = """

6. REFERENCES 

"""
section_i4=5
section_name4='references'
assert(is_a_section_title(section_title4, section_i4, section_name4))

section_title5 = """

IV) REFERENCES 

"""
section_i5=6
section_name5='references'
assert(is_a_section_title(section_title5, section_i5, section_name5))

print("Section title detection tests passed")

NameError: name 'string' is not defined

### Algorithm

In [None]:
import string

institution_synonyms = ['University', 'Universiteit', 'Université', 'Universitas',
                        'Institute', 'Instituition',
                        'Center', 'Centre',
                        'Laboratory', 'Labs', 'Lab', 'Laboratoire',
                        'Academy', 'Research',
                        'Inc', 'Limited', 'PLC', 'LLP', 'GmbH', 'AG']

joining_words = ['of', 'the', 'and', 'at', '&', 'for', 'de', 'on', '-']
stop_chars = ['.', '†', '‡', '¶', '•', '∗', '[', ']', '{', '}', '(', ')', ',', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9']

# Words which make us stop the whole process if we find them on their own
kill_words = ['references']

# Words which make us throw out the whole institution, these should probably be filtered in the scraper
poison_words = ['Proceedings', 'Meeting', 'Annual', 'Quarterly', 'Conference', 'Press', 
                'Transactions', 'Journal', 'Award', 'pages', 'Projects', 'Fellowship']

def is_stop_word(word):
    if len(word) == 0:
        return True
    if word[0].islower() and word not in joining_words:
        return True
    return False

def is_first_letter_of_word(text, i):
    if i>0 and text[i-1] in string.ascii_letters:
        return False
    return True

def search_with_context(text, simple_context=False, before=20, after=20):
    
    institution_first_letters = dict()
    for s in institution_synonyms:
        if s[0] not in institution_first_letters:
            institution_first_letters[s[0]] = [s]
        else:
            institution_first_letters[s[0]].append(s)
            
    kill_words_first_letters = dict()
    for s in kill_words:
        if s[0] not in kill_words_first_letters:
            kill_words_first_letters[s[0]] = [s]
        else:
            kill_words_first_letters[s[0]].append(s)
    
    matches_with_context=[]

    # Loop through entire text character by character
    i = -1
    for c in text:
        i += 1            
        
        # Kill words stop the whole process
        # ---------------------------------
        kill = False
        if is_first_letter_of_word(text, i) and c.lower() in kill_words_first_letters:
            potential_matches = kill_words_first_letters[c.lower()]
            for potential_match in potential_matches:
                if text[i:i+len(potential_match)].lower() == potential_match:                    
                    if is_a_section_title(text, i, potential_match):
                        kill = True
                        break
        # Stop the whole search
        if kill:
            break
        
        # Throw out most based on the first character
        # Note this is case sensitive because we want University (for example) to be capitalised
        if is_first_letter_of_word(text, i) and c in institution_first_letters:
            potential_matches = institution_first_letters[c]
            match=None
            for potential_match in potential_matches:
                # Loop through letters starting at the second letter (we've already checked the first)
                j=i+1
                match_found=True
                for pc in potential_match[1:]:
                    if pc != text[j]:
                        match_found=False
                        break
                    j += 1
                # Check it is end of word, for example we don't want to pick up "Label" as "Lab"                     
                if not (text[j].isspace() or text[j] in stop_chars):
                    match_found = False

                if match_found:
                    match=potential_match
                    break

            if match:

                # Crawl forward
                # -------------
                context_end = i + len(match)
                crawler_i = context_end
                current_word = match
                stop = False
                while context_end < len(text) and not stop:

                    # Found a stop char, check the word we have then stop
                    if text[crawler_i] in stop_chars:
                        stop = True
                                        
                    if text[crawler_i].isspace() or stop:
                        # found a whole word
                        if is_stop_word(current_word):
                            break
                        if len(current_word) > 0 and current_word[0].isupper():
                            context_end = crawler_i
                        current_word = ''
                    else:
                        current_word = current_word + text[crawler_i]

                    crawler_i += 1

                # Crawl backwards
                # ---------------
                context_start = i
                crawler_i = context_start
                current_word = match
                stop = False
                while crawler_i >= 0 and not stop:
                    # Found a stop char, check the word we have then stop
                    if text[crawler_i] in stop_chars:
                        stop = True
                    
                    if text[crawler_i].isspace() or stop:
                        # found a whole word
                        if is_stop_word(current_word):
                            break
                        if len(current_word) > 0 and current_word[0].isupper():
                            context_start = crawler_i + 1
                        current_word = ''
                    else:
                        current_word = text[crawler_i] + current_word

                    crawler_i -= 1
                
                match_with_context = text[context_start:context_end].strip()
                
                # Check if whole string is poisoned (contains a string we know never appears in institutions)
                poisoned = False
                for poison_word in poison_words:
                    if poison_word in match_with_context:
                        poisoned = True
                        break
                
                # Remove any which are JUST institution synonyms
                trivial = False
                if match_with_context in institution_synonyms:
                    trivial = True
                
                if not poisoned and not trivial:
                    # Clean up and add
                    match_with_context = match_with_context.replace('\n', ' ')
                    matches_with_context.append(match_with_context)

    return matches_with_context

In [None]:
search_with_context(load_paper(all_paper_ids[0]))

### Tests

In [None]:
test_sets = {
    37861:['Technische Universiteit Eindhoven', 'University of Pennsylvania'],
    38150:['Dalian University of Technology'],
    39023:['King Abdullah University of Science and Technology', 'Computational Bioscience Research Center'],
    38488:['Indian Institute of Technology Madras'],
    37210:['Research Center on Fictitious Economy and Data Science','Kunming University of Science and Technology','University of Chinese Academy of Sciences'],
    37435:['Centre for Signal Processing','Hong Kong Polytechnic University','University of Technology, Sydney'],
    37835:['Université Paris-Dauphine', 'PSL Research University'],
    39572:['Universitas Brawijaya'],
    36994:[],
    34904:['Erasmus Medical Center', 'IBM Research Shanghai'],
    37331:['University of Chinese Academy of Sciences','University of Technology Sydney'],
    38300:['University of Electronic Science and Technology of China','Shanghai Institute for Biological Sciences','Chinese Academy of Sciences'],
    38732:['ParallelDots, Inc.'],
    40424:['University of Maribor', 'Adolf Drolc Healthcare Centre', 'Center for International Cooperation']
}

In [None]:
average_score = 0.0
counter = 1
for paper_id, institutions in test_sets.items():
    institutions_found = set(search_with_context(load_paper(paper_id)))
    institutions_set = set(institutions)
    true_positives = len(institutions_found.intersection(institutions_set))
    false_positives =  len(institutions_found - institutions_set)
    false_negatives =  len(institutions_set - institutions_found)
    
    # Calculate f1 score (but with two special cases)
    if institutions_found == institutions_set:
        # This covers case where the two sets are empty
        score = 1.0
    elif true_positives == 0:
        # Got nothing right
        score = 0.0
    else:
        try:
            precision = true_positives / (true_positives + false_positives)
            recall = true_positives / (true_positives + false_negatives)
            score = 2*(precision*recall)/(precision + recall)
        except:
            print("paper_id:", paper_id, "true", true_positives, "false positives:", false_positives, "false negatives:", false_negatives, "score: N/A",)
            continue
    average_score = average_score*counter/(counter+1) + score/(counter+1)
    counter += 1
    print("paper_id:", paper_id, "true", true_positives, "false positives:",
          false_positives, "false negatives:", false_negatives, "score:", score)

print()
print("Average score:", average_score)

In [None]:
# Verbose debugging
for paper_id, institutions in test_sets.items():
    print("paper_id", paper_id)
    print("True answer", institutions)
    institutions_found = search_with_context(load_paper(paper_id))
    print("I found", institutions_found)
    print("True ones", set(institutions_found).intersection(set(institutions)))
    print("I found extra", set(institutions_found) - set(institutions))
    print("I missed", set(institutions) - set(institutions_found))
    print()