In [4]:
PAPERS_LOCATION='/home/jondea/etymo/workshop/paper_texts/'
def load_paper(paper_id):
    with open(PAPERS_LOCATION + str(paper_id) +'/fulltext.txt', 'r') as myfile:
        data = myfile.read()
    
    return data


In [11]:
example_paper = load_paper(34247)

## Heuristic 1

In [23]:
joining_words = ['of', 'the', 'and', 'at', '&', 'for', 'de', 'on', '-']
stop_chars = ['.', '†', '‡', '¶', '•', '∗', '[', ']', '{', '}', '(', ')']
# Words which make us throw out the whole institution, these should probably be filtered in the scraper
poison_words = ['Proceedings', 'Meeting', 'Annual', 'Quarterly', 'Conference', 'Press', 
                'Transactions', 'Journal', 'Award', 'pages', 'Projects', 'Fellowship']

def is_ignorable_word(word):
    return (len(word) == 0 or (not word[0].isupper() and word not in joining_words))

def search_with_context1(text, simple_context=False, before=20, after=20):
    queries = ['University']
    i = 0
    first_letters = dict()
    for s in ['University']:
        if s[0] not in first_letters:
            first_letters[s[0]] = [s]
        else:
            first_letters[s[0]].append(s)
    
    matches_with_context=[]
    # Loop through entire text
    for c in text:
        # Throw out most based on the first character
        if c in first_letters:
            potential_matches = first_letters[c]
            match=None
            for potential_match in potential_matches:
                j=i+1
                match_found=True
                for pc in potential_match[1:]:
                    if pc != text[j]:
                        match_found=False
                        break
                    j += 1
                # Check it is end of word, for example we don't want to pick up "Label" as "Lab"                     
                if not (text[j].isspace() or text[j] in stop_chars):
                    match_found = False

                if match_found:
                    match=potential_match
                    break
            if match:
                context_end = 0
                context_start = 0
                   
                context_end = min(i+len(match)+after,len(text))
                context_start = max(i-before,0)
                
                match_with_context = text[context_start:context_end].strip()

                matches_with_context.append(match_with_context)

        i += 1

    return matches_with_context

In [25]:
search_with_context1(example_paper)

['telligence, Wroclaw\nUniversity of Science and Tech',
 'of Physics, Warsaw University of Technology,\nPola',
 'Political Science, University of Naples\nFederico',
 'an town. Manchester University Press.\n\n23. Kivelä',
 'Systems Laboratory, University of Melbourne, Techn',
 'y of communication. University of Illinois\n\nMedia,']

## Heuristic 2

In [21]:
institution_synonyms = ['University', 'Institute', 'Instituition', 'Faculty', 'Academy', 'Center', 'Centre', 'Laboratory', 'Research', 'Labs', 'Lab']
joining_words = ['of', 'the', 'and', 'at', '&', 'for', 'de', 'on', '-']
stop_chars = ['.', '†', '‡', '¶', '•', '∗', '[', ']', '{', '}', '(', ')']
# Words which make us throw out the whole institution, these should probably be filtered in the scraper
poison_words = ['Proceedings', 'Meeting', 'Annual', 'Quarterly', 'Conference', 'Press', 
                'Transactions', 'Journal', 'Award', 'pages', 'Projects', 'Fellowship']

def is_ignorable_word(word):
    return (len(word) == 0 or (not word[0].isupper() and word not in joining_words))

def search_with_context(text, simple_context=False, before=20, after=20):
    i = 0
    first_letters = dict()
    for s in institution_synonyms:
        if s[0] not in first_letters:
            first_letters[s[0]] = [s]
        else:
            first_letters[s[0]].append(s)
    
    matches_with_context=[]
    # Loop through entire text
    for c in text:
        # Throw out most based on the first character
        if c in first_letters:
            potential_matches = first_letters[c]
            match=None
            for potential_match in potential_matches:
                j=i+1
                match_found=True
                for pc in potential_match[1:]:
                    if pc != text[j]:
                        match_found=False
                        break
                    j += 1
                # Check it is end of word, for example we don't want to pick up "Label" as "Lab"                     
                if not (text[j].isspace() or text[j] in stop_chars):
                    match_found = False

                if match_found:
                    match=potential_match
                    break
            if match:
                context_end = 0
                context_start = 0
                if simple_context:
                    # With simple context we just take a fixed number before and after, used for debugging  
                    context_end = min(i+len(match)+after,len(text))
                    context_start = max(i-before,0)
                else:
                    # Crawl forward
                    context_end = i + len(match) +1
                    current_word = ''
                    while context_end < len(text):
                        if text[context_end] in stop_chars or text[context_end].isdigit():
                            if is_ignorable_word(current_word):
                                context_end -= len(current_word)
                            break
                        elif text[context_end].isspace():
                            # found a whole word
                            if is_ignorable_word(current_word):
                                context_end += len(current_word)
                                break
                            current_word = ''
                        else:
                            current_word = current_word + text[context_end]
                        
                        context_end += 1
                    
                    # Crawl backwards
                    context_start = i-2
                    current_word = ''
                    while context_start >= 0:
                        if text[context_start] in stop_chars or text[context_start].isdigit():
                            if is_ignorable_word(current_word):
                                context_start += len(current_word)
                            context_start += 1
                            break
                        elif text[context_start].isspace():
                            # found a whole word
                            if is_ignorable_word(current_word):
                                context_start += len(current_word)+1
                                break
                            current_word = ''
                        else:
                            current_word = text[context_start] + current_word
                        
                        context_start -= 1
                
                match_with_context = text[context_start:context_end].strip()
                
                # Check if whole string is poisoned (contains a string we know never appears in institutions)
                poisoned = False
                for poison_word in poison_words:
                    if poison_word in match_with_context:
                        poisoned = True
                        break
                
                
                if not poisoned:
                    matches_with_context.append(match_with_context)

        i += 1

    return matches_with_context

In [22]:
search_with_context(example_paper)

['Research',
 'Department of Computational Intelligence, Wroclaw\nUniversity of Science and Technology, Poland',
 '2Faculty of Physics, Warsaw University of Technology,\nPoland',
 'Faculty of Physics, Warsaw University of Technology,\nPoland',
 'Department of Political Science, University of Naples\nFederico II, Italy',
 'National Science Centre Poland, the decision no. DEC',
 'Grid\nComputing and Distributed Systems Laboratory, University of Melbourne, Technical Report',
 'University of Illinois',
 'Victoria Institute of Secondary']