In [3]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('../../..')

from allennlp.data import Token
from allennlp.data.fields import TextField
from allennlp.data.dataset_readers.conll2003 import Conll2003DatasetReader
import pandas as pd
from allennlp.common.params import Params
from wiser.lf import LabelingFunction, LinkingFunction, DictionaryMatcher

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Loads Data

In [4]:
# Repository root, relative to the directory this notebook lives in.
root_directory = '../../..'

# One reader is shared by all three splits; tags are read with the BIOUL coding scheme.
reader = Conll2003DatasetReader(coding_scheme="BIOUL")

# Paths are listed in train / dev / test order and read in that same order.
split_paths = [
    root_directory + '/data/conll/eng.train',
    root_directory + '/data/conll/eng.testa',
    root_directory + '/data/conll/eng.testb',
]
train_data, dev_data, test_data = map(reader.read, split_paths)

14041it [00:01, 13459.07it/s]
3250it [00:00, 10188.35it/s]
3453it [00:00, 14033.52it/s]


In [5]:
# Concatenate the three splits so that every later cell can process them in one pass.
# NOTE(review): if `reader.read` returns plain lists, `conll_docs` holds the very same
# instance objects as train/dev/test, so fields added through `conll_docs` are also
# visible through the individual splits — confirm against the reader's return type.
conll_docs = train_data + dev_data + test_data

Uses spaCy to annotate every token with its lemma, part-of-speech tags, dependency label, and entity type.

In [6]:
import spacy
from spacy.tokens.doc import Doc

nlp = spacy.load('en_core_web_sm')

# NOTE: this list is created here, but nothing in this cell ever appends to it.
spacy_docs = []
for doc in conll_docs:
    # Build a spaCy Doc from the existing tokens: a space follows every
    # token except the last one.
    words = [token.text for token in doc['tokens']]
    spaces = [True] * (len(words) - 1) + [False]
    spacy_doc = Doc(nlp.vocab, words=words, spaces=spaces)

    # Apply each pipeline component, in order, to the pre-tokenized Doc.
    for _, component in nlp.pipeline:
        spacy_doc = component(spacy_doc)

    # Copy the spaCy annotations onto freshly built AllenNLP tokens.
    tokens = []
    for spacy_token in spacy_doc:
        tokens.append(Token(spacy_token.text,
                            spacy_token.idx,
                            spacy_token.lemma_,
                            spacy_token.pos_,
                            spacy_token.tag_,
                            spacy_token.dep_,
                            spacy_token.ent_type_))

    # Replace the 'tokens' field, reusing the token indexers of the field it replaces.
    token_indexers = doc['tokens']._token_indexers
    doc.add_field('tokens', TextField(tokens, token_indexers))

Loads entity dictionaries derived from DBpedia and YAGO, and defines small hand-written term sets (days of the week and months of the year).

In [5]:
def load_entity_set(path):
    """Read a one-entry-per-line dictionary file into a set of strings.

    Lines whose first character is ``#`` are treated as comments and skipped.
    Surrounding whitespace is stripped from every entry. Blank lines are
    ignored, so the empty string never ends up in a dictionary (previously a
    blank line added ``''`` to the set).

    Args:
        path: Path of the text file to read.

    Returns:
        The set of non-empty, stripped entries found in the file.
    """
    entries = set()
    with open(path, 'r') as f:
        # Iterate over the file object directly instead of loading every
        # line into memory at once with readlines().
        for line in f:
            if line.startswith('#'):
                continue
            entry = line.strip()
            if entry:
                entries.add(entry)
    return entries


people = load_entity_set('entity_data/people.txt')
countries = load_entity_set('entity_data/countries.txt')
pop_places = load_entity_set('entity_data/populated_places.txt')
orgs = load_entity_set('entity_data/organisations.txt')
yago_orgs = load_entity_set('entity_data/yago_orgs.txt')
yago_companies = load_entity_set('entity_data/yago_companies.txt')
demonyms = load_entity_set('entity_data/demonyms.txt')

# Hand-written sets of capitalized words; later cells subtract them from the
# name dictionaries.
days_of_the_week = {"Sunday", "Monday", "Tuesday", "Wednesday",
                    "Thursday", "Friday", "Saturday"}

months_of_the_year = {"January", "February", "March", "April",
                      "May", "June", "July", "August", "September",
                      "October", "November", "December"}

## Labeling Functions - PER

In [6]:
# Each person name becomes a list of space-separated tokens for the matcher.
people_terms = [full_name.split(" ") for full_name in people]

lf = DictionaryMatcher(
    "DBpediaPeople",
    people_terms,
    i_label="I-PER",
)
lf.apply(conll_docs)

In [7]:
# Count how often each first token occurs among the two-token person names.
first_name_counts = {}
for full_name in people:
    parts = full_name.split(" ")
    if len(parts) != 2:
        continue
    first_name_counts[parts[0]] = first_name_counts.get(parts[0], 0) + 1

# Keep only the first names that occur more than 10 times.
first_names = {
    candidate
    for candidate, occurrences in first_name_counts.items()
    if occurrences > 10
}

# Remove candidates that also appear in the other dictionaries or in a short
# list of common capitalized words.
for excluded in (countries, pop_places, days_of_the_week, months_of_the_year,
                 demonyms, {"The", "He", "East", "West", "North", "South"}):
    first_names -= excluded

first_name_terms = [[first_name] for first_name in first_names]
lf = DictionaryMatcher("FirstNames", first_name_terms, i_label="I-PER")
lf.apply(conll_docs)

In [8]:
# Count how often each second token occurs among the two-token person names.
second_name_counts = {}
for full_name in people:
    parts = full_name.split(" ")
    if len(parts) != 2:
        continue
    second_name_counts[parts[1]] = second_name_counts.get(parts[1], 0) + 1

# Keep second names that occur more than twice and are longer than 3 characters.
second_names = {
    candidate
    for candidate, occurrences in second_name_counts.items()
    if occurrences > 2 and len(candidate) > 3
}

# Remove candidates that also appear in the other dictionaries or in a short
# list of common capitalized words.
for excluded in (countries, pop_places, days_of_the_week, months_of_the_year,
                 demonyms, {"The", "He", "East", "West", "North", "South"}):
    second_names -= excluded

second_name_terms = [[second_name] for second_name in second_names]
lf = DictionaryMatcher("SecondNames", second_name_terms, i_label="I-PER")
lf.apply(conll_docs)

In [9]:
class Said(LabelingFunction):
    """Votes 'I-PER' for a capitalized token directly next to the word "said"."""

    def apply_instance(self, instance):
        """Label the capitalized neighbour of every "said" token.

        For each token whose text is exactly "said", the preceding token is
        labelled 'I-PER' if it starts with an upper-case character; otherwise
        the following token is labelled 'I-PER' if it starts with an
        upper-case character. All other positions keep the label 'ABS'.

        Unlike a loop restricted to interior positions, this also handles
        "said" as the first or last token of the instance, checking only the
        neighbour that exists.

        Args:
            instance: An instance whose 'tokens' field yields tokens with a
                `text` attribute.

        Returns:
            A list with one label string per token.
        """
        text = [token.text for token in instance['tokens']]
        labels = ['ABS'] * len(text)

        for i, word in enumerate(text):
            if word != "said":
                continue
            # Slicing with [:1] yields '' for an empty token, and
            # ''.isupper() is False, so empty tokens cannot raise IndexError.
            if i > 0 and text[i-1][:1].isupper():
                labels[i-1] = 'I-PER'
            elif i + 1 < len(text) and text[i+1][:1].isupper():
                labels[i+1] = 'I-PER'

        return labels

lf = Said()
lf.apply(conll_docs)

## Labeling Functions - LOC

In [10]:
# Country names are lower-cased and split on spaces before matching.
# NOTE(review): match_lemmas=True presumably makes the matcher compare against
# token lemmas rather than surface text — confirm in wiser.lf.DictionaryMatcher.
country_terms = [name.lower().split(" ") for name in countries]

lf = DictionaryMatcher(
    "DBpediaCountries",
    country_terms,
    i_label="I-LOC",
    match_lemmas=True,
)
lf.apply(conll_docs)

In [11]:
# Populated-place names are lower-cased and split on spaces before matching.
pop_place_terms = [name.lower().split(" ") for name in pop_places]

lf = DictionaryMatcher(
    "DBpediaPopPlaces",
    pop_place_terms,
    i_label="I-LOC",
    match_lemmas=True,
)
lf.apply(conll_docs)

In [12]:
# Single-token country abbreviations, matched on the exact surface form.
abbrv_terms = [[abbrv] for abbrv in ("U.S.", "US", "U.S.A.", "USA", "UAE")]

lf = DictionaryMatcher("CountryAbbrvs", abbrv_terms, i_label="I-LOC")
lf.apply(conll_docs)

## Labeling Functions - ORG

In [13]:
# Each organisation name becomes a list of space-separated tokens.
org_terms = [org_name.split(" ") for org_name in orgs]

lf = DictionaryMatcher(
    "DBpediaORG",
    org_terms,
    i_label="I-ORG",
)
lf.apply(conll_docs)

In [14]:
# Each YAGO organisation name becomes a list of space-separated tokens.
yago_org_terms = [org_name.split(" ") for org_name in yago_orgs]

lf = DictionaryMatcher(
    "YAGO_ORG",
    yago_org_terms,
    i_label="I-ORG",
)
lf.apply(conll_docs)

In [15]:
# Each YAGO company name becomes a list of space-separated tokens.
yago_company_terms = [company_name.split(" ") for company_name in yago_companies]

lf = DictionaryMatcher(
    "YAGO_Companies",
    yago_company_terms,
    i_label="I-ORG",
)
lf.apply(conll_docs)

In [16]:
class OrgLastWord(LabelingFunction):
    """Votes 'I-ORG' for tokens that typically end an organisation name."""

    # Words that commonly close an organisation name. A few entries are
    # repeated in the literal; because this is a set, repeats have no effect.
    org_last = {
        'Observatory', 'Exchange', 'University', 'Co', 'Ltd', 'Airport',
        'Inc', 'Inc.', 'Corp', 'Corp.', 'School', 'Enterprise', 'Education', 'Research', 
        'Development', 'Heritage', 'Technology', 'Infrastructure', 'Networks', 
        'Chambers', 'Academy', 'Hotels', 'Fund', 'Studios', 'Agency', 
        'Bureau', 'Treasury', 'Newsroom', 'Federation', 'League', 'Club', 'Group', 'Force', 
        'Department', 'Administration', 'Campaign', 'Authority', 'Party', 
        'Organisation', 'Council', 'Community', 'Newsroom', 'Desk', 'Management', 
        'Association', 'Reserve', 'Committee', 'Power', 'Bank', 'Services', 'Copper', 
        'Managers', 'Newsdesk', 'Radio', 'Authority', 'Commerce', 'Cables', 'Group', 
        'Labour', 'Office', 'Congress', 'Corporation', 'Front', 'Court', 'Community', 
        'Software', 'Pizza', 'Cargo', 'Books', 'Casinos', 'Bar', 'College', 
        'Commando', 'Force', 'Army', 'District', 'Movies', 'Center', 
        'Entertainment', 'Channels', 'Interactive', 'Systems', 'Pictures', 
        'Talk', 'Media', 'Stations', 'Park', 'Arts', 'Office', 'Analytics', 
        'Production', 'Basketball', 'Restaurant', 'Center', 'Pictures', 
        'Brewery', 'Institute', 'Partners', 'Forum', 'Foundation', 
        'Crafts', 'Nursery', 'Publications', 'Cars', 'Society', 'Sciences'}

    def apply_instance(self, instance):
        """Return one label per token.

        Every token from index 1 onwards whose text is in `org_last` is
        labelled 'I-ORG'; its predecessor is labelled 'I-ORG' as well when
        that predecessor starts with an upper-case character. The token at
        index 0 is never tested. All other positions are 'ABS'.
        """
        tokens = instance['tokens']
        labels = ['ABS'] * len(tokens)

        for idx in range(1, len(tokens)):
            if tokens[idx].text not in self.org_last:
                continue
            if tokens[idx - 1].text[0].isupper():
                labels[idx - 1] = 'I-ORG'
            labels[idx] = 'I-ORG'

        return labels

lf = OrgLastWord()
lf.apply(conll_docs)

## Labeling Functions - MISC

In [17]:
# Hand-picked political and religious terms that supplement the demonym list.
additional_terms = {"Democrat", "Republican", "Liberal", "Conservative", "Arab",
                    "Christian", "Muslim", "Jewish", "Buddhist", "Hindu"}
temp = demonyms | additional_terms

# Terms of one or two characters are dropped; the rest are lower-cased and
# split on spaces before matching.
misc_terms = [term.lower().split(" ") for term in temp if len(term) > 2]

lf = DictionaryMatcher(
    "DBPediaMISC",
    misc_terms,
    match_lemmas=True,
    i_label="I-MISC",
)
lf.apply(conll_docs)

In [18]:
class MiscLastWord(LabelingFunction):
    """Votes 'I-MISC' for tokens that typically end an event or title name."""

    # Words that commonly close the name of an event, competition, law, etc.
    misc_last = {'Cup', 'Open', 'Championship', 'Festival', 'League', 'Tour', 'Tournament', 
                 'War', 'Revolution', 'Act', 'Treaty', 'Symposium', 'Day', 'Series', 
                 'Game', 'Central', 'Network', 'Division', 'Baseball', 'Enterprise', 'Protocol'}

    def apply_instance(self, instance):
        """Return one label per token.

        Every token from index 1 onwards whose text is in `misc_last` is
        labelled 'I-MISC' together with its predecessor. The token at index 0
        is never tested. All other positions are 'ABS'.
        """
        tokens = instance['tokens']
        labels = ['ABS'] * len(tokens)

        for idx in range(1, len(tokens)):
            if tokens[idx].text not in self.misc_last:
                continue
            # NOTE(review): the predecessor is labelled regardless of its
            # capitalization — confirm this is intended.
            labels[idx - 1] = 'I-MISC'
            labels[idx] = 'I-MISC'

        return labels

lf = MiscLastWord()
lf.apply(conll_docs)

In [19]:
class MiscAdj(LabelingFunction):
    """Labels tokens whose last hyphen-separated part is in `misc_adj`."""

    # Suffixes checked after the last hyphen of a token.
    misc_adj ={'based', 'bound', 'born', 'ruled', 'backed', 'listed'}

    def apply_instance(self, instance):
        """Return one label per token.

        A token whose last hyphen-separated part (lower-cased) is in
        `misc_adj` is labelled 'I-MISC' if the token starts with an
        upper-case character and 'O' otherwise. All other tokens are 'ABS'.
        """
        labels = []
        for token in instance['tokens']:
            word = token.text
            suffix = word.split("-")[-1].lower()
            if suffix not in self.misc_adj:
                labels.append('ABS')
            elif word[0].isupper():
                labels.append('I-MISC')
            else:
                labels.append('O')

        return labels

lf = MiscAdj()
lf.apply(conll_docs)

## Labeling Functions - O

In [20]:
class VerbOrAdv(LabelingFunction):
    """Votes 'O' for tokens whose part-of-speech tag is VERB or ADV."""

    # Part-of-speech tags that trigger an 'O' vote.
    pos = {"VERB", "ADV"}

    def apply_instance(self, instance):
        """Return one label per token: 'O' if its `pos_` is in `pos`, else 'ABS'."""
        return ['O' if token.pos_ in self.pos else 'ABS'
                for token in instance['tokens']]

lf = VerbOrAdv()
lf.apply(conll_docs)

In [21]:
class Punctuation(LabelingFunction):
    """Votes 'O' for tokens whose part-of-speech tag is PUNCT."""

    # Part-of-speech tags that trigger an 'O' vote.
    pos = {"PUNCT"}

    def apply_instance(self, instance):
        """Return one label per token: 'O' if its `pos_` is in `pos`, else 'ABS'."""
        return ['O' if token.pos_ in self.pos else 'ABS'
                for token in instance['tokens']]

lf = Punctuation()
lf.apply(conll_docs)

In [22]:
class Pronouns(LabelingFunction):
    """Votes 'O' for tokens whose part-of-speech tag is PRON."""

    # Part-of-speech tags that trigger an 'O' vote.
    pos = {"PRON"}

    def apply_instance(self, instance):
        """Return one label per token: 'O' if its `pos_` is in `pos`, else 'ABS'."""
        return ['O' if token.pos_ in self.pos else 'ABS'
                for token in instance['tokens']]

lf = Pronouns()
lf.apply(conll_docs)

In [23]:
class Numbers(LabelingFunction):
    """Votes 'O' for tokens whose part-of-speech tag is NUM."""

    # Part-of-speech tags that trigger an 'O' vote.
    pos = {"NUM"}

    def apply_instance(self, instance):
        """Return one label per token: 'O' if its `pos_` is in `pos`, else 'ABS'."""
        return ['O' if token.pos_ in self.pos else 'ABS'
                for token in instance['tokens']]

lf = Numbers()
lf.apply(conll_docs)

In [24]:
class LongLowerCase(LabelingFunction):
    """Votes 'O' for tokens longer than four characters that do not start upper-case."""

    def apply_instance(self, instance):
        """Return one label per token.

        A token is labelled 'O' when its first character is not upper-case
        and it has more than 4 characters; otherwise it is labelled 'ABS'.
        """
        labels = []
        for token in instance['tokens']:
            word = token.text
            # The first-character check comes first, as in the original order.
            if not word[0].isupper() and len(word) > 4:
                labels.append('O')
            else:
                labels.append('ABS')

        return labels

lf = LongLowerCase()
lf.apply(conll_docs)

In [25]:
class ConsecutiveLowerCase(LabelingFunction):
    """Votes 'O' for a token when it and both neighbours do not start upper-case."""

    def apply_instance(self, instance):
        """Return one label per token.

        Only interior positions (those with a neighbour on each side) are
        examined. Such a token is labelled 'O' when neither it nor either
        neighbour starts with an upper-case character. Everything else,
        including the first and last token, is 'ABS'.
        """
        text = [token.text for token in instance['tokens']]
        labels = ['ABS'] * len(text)

        for idx in range(1, len(text) - 1):
            window = text[idx - 1:idx + 2]
            # any() checks previous, current, next in that order and stops
            # at the first upper-case start.
            if not any(word[0].isupper() for word in window):
                labels[idx] = 'O'

        return labels

lf = ConsecutiveLowerCase()
lf.apply(conll_docs)

## Linking Functions

In [None]:
from wiser.lf import ElmoLinkingFunction

# Apply the ELMo-based linking function imported from wiser.lf to all documents.
# NOTE(review): .75 is presumably a similarity threshold — confirm against the
# ElmoLinkingFunction constructor in wiser.lf.
lf = ElmoLinkingFunction(.75)
lf.apply(conll_docs)

In [None]:
class CompoundPhrase(LinkingFunction):
    """Emits a link at every token whose predecessor has dependency label "compound"."""

    def apply_instance(self, instance):
        """Return one 0/1 value per token.

        Position i (i >= 1) is 1 when token i-1 has `dep_ == "compound"`;
        position 0 is always 0.
        NOTE(review): a 1 at position i presumably links token i to token
        i-1 — confirm against wiser.lf.LinkingFunction.
        """
        tokens = instance['tokens']
        return [
            int(idx > 0 and tokens[idx - 1].dep_ == "compound")
            for idx in range(len(tokens))
        ]

lf = CompoundPhrase()
lf.apply(conll_docs)

In [None]:
class ConsecutiveCapitals(LinkingFunction):
    """Emits a link at a capitalized token that follows another capitalized token."""

    def apply_instance(self, instance):
        """Return one 0/1 value per token.

        Position i (i >= 2) is 1 when token i contains at least one
        lower-case character, starts with an upper-case character, and
        token i-1 also starts with an upper-case character. Positions 0 and
        1 are always 0.
        """
        tokens = instance['tokens']
        links = [0] * len(tokens)

        # Start at index 2: the first pair is skipped because the first
        # token is almost always capitalized.
        for idx in range(2, len(tokens)):
            word = tokens[idx].text

            # Skip this token if it is all capitals, i.e. it contains no
            # lower-case character.
            if not any(char.islower() for char in word):
                continue

            if word[0].isupper() and tokens[idx - 1].text[0].isupper():
                links[idx] = 1

        return links

lf = ConsecutiveCapitals()
lf.apply(conll_docs)

## Saves Weak Supervision to Disk

In [None]:
import os
import pickle

# Create the output directory if needed, so that a fresh checkout does not
# fail with FileNotFoundError when the first file is opened.
os.makedirs('tmp', exist_ok=True)

# Write each split to its own pickle file, in train / dev / test order.
for path, split in (('tmp/train_data.p', train_data),
                    ('tmp/dev_data.p', dev_data),
                    ('tmp/test_data.p', test_data)):
    with open(path, 'wb') as f:
        pickle.dump(split, f)

End of Part 1