In [209]:
## new features to add
# 1. Is acronym -- needs preprocessing. Not working yet because the data contains regular words written entirely in capital letters
# 2. within quotes -- no words found

# 3. Word Bigrams(previous two) -- done
# 4. POS bigrams(previous two) -- done

In [220]:
from read_conll_data import *
from collections import defaultdict 
import re
import spacy

In [221]:
# Checking whether the token is an acronym
def is_acronym(token):
    """Return True if *token* is written entirely in upper case.

    A token qualifies only when it contains at least one cased character
    and every cased character is upper case.  Tokens with no letters at
    all (the empty string, numbers, punctuation) are therefore rejected;
    a plain ``token.upper() == token`` comparison would accept them.
    """
    return token.isupper()

# Checking whether the token is between quotes
def is_within_quotes(token):
    """Return True if *token* starts and ends with the same quote character.

    Both single (') and double (") quotes are recognised, but the opening
    and closing quote must match.  Tokens shorter than two characters can
    never be quoted: the empty string has no first character, and a lone
    quote mark would otherwise be compared with itself.
    """
    return len(token) >= 2 and token[0] == token[-1] and token[0] in ('"', "'")

def get_word_and_pos_bigrams(data, nlp, max_sents=10):
    """Collect the word and POS bigrams that precede each tag.

    For every token that has two predecessors in the same sentence, the
    pair of preceding (lower-cased) words and the pair of preceding POS
    tags are recorded under the tag of that token.

    Args:
        data: object whose ``iob_sents()`` returns a sliceable sequence of
            sentences; each sentence is a sequence of items where index 0
            is the word, index 1 the POS tag and the last index the label.
        nlp: currently unused; kept so existing calls keep working (it was
            intended for lemma/suffix features that are not implemented).
        max_sents: number of leading sentences to scan.  Defaults to 10,
            the limit that used to be hard-coded; pass ``None`` to scan
            every sentence.

    Returns:
        A ``(word_bigrams, pos_bigrams)`` pair of ``defaultdict(set)``
        objects mapping a label to the set of 2-tuples seen before it.
    """
    word_bigrams = defaultdict(set)
    pos_bigrams = defaultdict(set)

    for iob_sent in data.iob_sents()[:max_sents]:
        # The context window never crosses a sentence boundary.
        prev_words, prev_pos = (), ()
        for iob_word in iob_sent:
            tag = iob_word[-1]

            if len(prev_words) == 2:
                word_bigrams[tag].add(prev_words)
                pos_bigrams[tag].add(prev_pos)

            # Slide the two-item window forward by one token.
            prev_words = (prev_words + (iob_word[0].lower(),))[-2:]
            prev_pos = (prev_pos + (iob_word[1],))[-2:]

    return word_bigrams, pos_bigrams

In [248]:
def is_capital(word):
    """Return 1 if *word* begins with an ASCII capital letter (A-Z), else 0."""
    if re.search("^[A-Z]", word) is None:
        return 0
    return 1

def extract_features_and_labels(trainingfile, word_bigrams, pos_bigrams):
    """Read a column-formatted file and build per-token features and labels.

    Each non-blank line is split on whitespace: the first column is the
    token, the second the POS tag and the last the gold label.  Blank
    lines are skipped.

    Features produced for every token:
        token, pos: the first two columns, unchanged.
        is_first_word: True when the previous token was a '.' labelled 'O'
            (or this is the first token of the file) and the token
            contains at least one character other than punctuation or
            underscore.
        is_capital: result of ``is_capital(token)``.
        word_BI_<label> / pos_BI_<label>: 1 if the two preceding words
            (lower-cased) / POS tags form a bigram stored under <label> in
            *word_bigrams* / *pos_bigrams*, else 0.

    Args:
        trainingfile: path of the UTF-8 file to read.
        word_bigrams: mapping label -> collection of (word, word) tuples.
        pos_bigrams: mapping label -> collection of (pos, pos) tuples.

    Returns:
        ``(data, targets)``: a list of feature dicts and the parallel list
        of gold labels.

    Raises:
        ValueError: if a non-blank line has fewer than two columns.
    """
    # Raw string on purpose: '\W' in a plain literal is an invalid escape
    # sequence.  Compiled once because it is applied to every token.
    punctuation_only = re.compile(r'^[_\W]*?$')

    data = []
    targets = []
    with open(trainingfile, 'r', encoding='utf8') as infile:
        # Starts as True so the very first token of the file can qualify
        # as a "first word".
        is_prev_word_period = True
        prev_two_words, prev_two_pos = [], []

        for line in infile:
            components = line.rstrip('\n').split()
            if not components:
                continue

            token, pos = components[0:2]
            is_first_word = is_prev_word_period and not punctuation_only.search(token)

            word_bigram_flags = {"word_BI_" + label: 0 for label in word_bigrams}
            pos_bigram_flags = {"pos_BI_" + label: 0 for label in pos_bigrams}

            if is_first_word:
                # A new sentence starts: forget the previous context.
                prev_two_words, prev_two_pos = [], []
            elif len(prev_two_words) == 2:
                word_context = tuple(prev_two_words)
                pos_context = tuple(prev_two_pos)

                for label, seen in word_bigrams.items():
                    if word_context in seen:
                        word_bigram_flags["word_BI_" + label] = 1

                for label, seen in pos_bigrams.items():
                    if pos_context in seen:
                        pos_bigram_flags["pos_BI_" + label] = 1

                # Drop the oldest entry so the window stays at two items.
                del prev_two_words[0]
                del prev_two_pos[0]

            prev_two_words.append(token.lower())
            prev_two_pos.append(pos)

            feature_dict = {'token': token, 'pos': pos, 'is_first_word': is_first_word,
                            'is_capital': is_capital(token)}
            feature_dict.update(word_bigram_flags)
            feature_dict.update(pos_bigram_flags)

            data.append(feature_dict)
            # Gold label is in the last column.
            targets.append(components[-1])

            is_prev_word_period = token == '.' and components[-1] == 'O'

    return data, targets


# def extract_features(inputfile):
   
#     data = []
#     with open(inputfile, 'r', encoding='utf8') as infile:
#         is_prev_word_period = True # set to true so that the first word of the file is considered for
#         # being 'the first word' check.
#         for line in infile:
#             components = line.rstrip('\n').split()
#             if len(components) > 0:
                
#                 is_first_word = False
#                 if is_prev_word_period and not bool(re.search('^[_\W]*?$', components[0])):
#                     is_first_word = True
                    
#                 token, pos = components[0:2]
#                 is_word_capital = is_capital(components[0])
#                 feature_dict = {'token':token, 'pos': pos, 'is_first_word': is_first_word, 
#                                 'is_capital': is_word_capital}
#                 data.append(feature_dict)
#                 is_prev_word_period = bool(components[0] == '.') and bool(components[-1] == 'O')
#     return data

In [249]:
# Directory (relative path) that holds the CoNLL files listed below.
root_filename = "../../data/"
# Training file first, development file second; later cells index into this list.
datafiles = ["conll2003.train.conll", "conll2003.dev.conll"]
# Column names handed to read_data, one per column of the files.
column_types = ["words", "pos", "chunk", "ne"]

# read_data comes from the read_conll_data star import.
# NOTE(review): presumably returns a corpus reader exposing iob_sents(),
# which get_word_and_pos_bigrams calls on it -- confirm in read_conll_data.
data = read_data(root_filename, datafiles, column_types)

# spaCy pipeline; it is only passed to get_word_and_pos_bigrams, whose
# active code does not use it.
nlp = spacy.load("en_core_web_sm")

## Preprocessing

In [250]:
# For each label, gather the pairs of preceding words / POS tags seen in the
# corpus; these become the lookup tables for the bigram features.
word_bigrams, pos_bigrams = get_word_and_pos_bigrams(data, nlp)

In [251]:
# Build feature dicts and gold labels from the training file (datafiles[0]).
# NOTE(review): this rebinds `data` from the object returned by read_data to
# a list of feature dicts, so re-running the bigram cell afterwards fails on
# data.iob_sents() -- consider a distinct name.
data, targets = extract_features_and_labels(root_filename+datafiles[0], word_bigrams, pos_bigrams)

['B-ORG',
 'O',
 'B-MISC',
 'O',
 'O',
 'O',
 'B-MISC',
 'O',
 'O',
 'B-PER',
 'I-PER',
 'B-LOC',
 'O',
 'O',
 'B-ORG',
 'I-ORG',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-MISC',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-MISC',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-LOC',
 'O',
 'O',
 'O',
 'O',
 'B-ORG',
 'I-ORG',
 'O',
 'O',
 'O',
 'B-PER',
 'I-PER',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-LOC',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-ORG',
 'O',
 'O',
 'O',
 'B-PER',
 'I-PER',
 'I-PER',
 'I-PER',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-ORG',
 'I-ORG',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-ORG',
 'O',
 'O',
 'B-PER',
 'I-PER',
 'O',
 'O',
 'O',
 'O',
 'O'

In [216]:
# acronyms = set()
# within_quotes = set()
        
#     token = iob_word[0].lstrip(" ").rstrip(" ")
    
    # Acronym check
#     if len(token) > 1 and not bool(re.search("[^a-zA-Z]+$", token)):
#         if is_acronym(token):
#             acronyms.add(token)

    
    # Within quotes check
#     if token[0] in ["'", '"']:
#         print(token)
#     if is_within_quotes(token):
#         within_quotes.add(token)