In [1]:
import re

In [2]:
# Pattern for one ingredient line, matched from the start of the string.
# Written as a raw string so that `\.` and `\(` reach the regex engine as-is
# instead of being treated as (invalid) string escape sequences.
# Capture groups:
#   1 - leading number: digits with an optional decimal point
#   2 - optional parenthesised part, e.g. "( 1 ounce )"
#   3 - the digits inside the parentheses
#   4 - the lowercase word inside the parentheses
#   5 - the next lowercase word after the number / parentheses
#   6 - everything that remains on the line
p = re.compile(r'([0-9]*\.?[0-9]*) ?(\( ([0-9]*) ([a-z]*) \))? ([a-z]*) (.*)')

In [4]:
# Try the pattern on a sample ingredient line. `match` anchors at the start of
# the string and returns None when the pattern does not apply.
m = p.match('0.5 ( 1 ounce ) package dry ranch - style dressing mix')

In [12]:
# Print every captured group of the match. `m.groups()` starts at capture
# group 1, so the printed "Group 0" corresponds to m.group(1), and so on.
for idx, match in enumerate(m.groups()):
    print(f'Group {idx}: {match}')

Group 0: 0.5
Group 1: ( 1 ounce )
Group 2: 1
Group 3: ounce
Group 4: package
Group 5: dry ranch - style dressing mix


In [13]:
import pickle

# Load the pickled model from 'crf_big.pkl' (path relative to the working
# directory); it is used further down via `model.predict`.
# NOTE(review): unpickling can execute arbitrary code - only load this file
# when it comes from a trusted source.
with open('crf_big.pkl', 'rb') as f:
    model = pickle.load(f)


In [16]:
def word2featuresmod(sent, i):
    """Build the feature dictionary for the token at position ``i`` of ``sent``.

    Each element of ``sent`` must expose a ``text`` attribute (the token
    string) and a ``tag_`` attribute (its part-of-speech tag string).

    The result always contains features of the token itself. Features of the
    previous token are added when ``i > 0``; otherwise ``'BOS'`` is set.
    Features of the next token are added when ``i < len(sent) - 1``; otherwise
    ``'EOS'`` is set.

    Returns:
        dict mapping feature name to feature value.
    """
    current = sent[i]
    text = current.text
    tag = current.tag_

    # Features describing the token itself.
    feats = {
        'bias': 1.0,
        'word.lower()': text.lower(),      # lower-cased token
        'word[-3:]': text[-3:],            # last three characters
        'word[-2:]': text[-2:],            # last two characters
        'word.isupper()': text.isupper(),  # token is entirely upper case
        'word.istitle()': text.istitle(),  # token is title-cased
        'word.isdigit()': text.isdigit(),  # token consists of digits only
        'postag': tag,
        'postag[:2]': tag[:2],             # first two characters of the PoS tag
    }

    # Left context: either the previous token or a beginning-of-sentence flag.
    if i <= 0:
        feats['BOS'] = True
    else:
        before = sent[i - 1]
        before_text = before.text
        before_tag = before.tag_
        feats['-1:word.lower()'] = before_text.lower()
        feats['-1:word.istitle()'] = before_text.istitle()
        feats['-1:word.isupper()'] = before_text.isupper()
        feats['-1:postag'] = before_tag
        feats['-1:postag[:2]'] = before_tag[:2]

    # Right context: either the next token or an end-of-sentence flag.
    if i >= len(sent) - 1:
        feats['EOS'] = True
    else:
        after = sent[i + 1]
        after_text = after.text
        after_tag = after.tag_
        feats['+1:word.lower()'] = after_text.lower()
        feats['+1:word.istitle()'] = after_text.istitle()
        feats['+1:word.isupper()'] = after_text.isupper()
        feats['+1:postag'] = after_tag
        feats['+1:postag[:2]'] = after_tag[:2]

    return feats

def sent2featuresmod(sent):
    """Return one feature dictionary per position of ``sent``, in order.

    Each dictionary is produced by ``word2featuresmod(sent, position)``.
    """
    feature_dicts = []
    for position in range(len(sent)):
        feature_dicts.append(word2featuresmod(sent, position))
    return feature_dicts

def sent2labelsmod(sent):
    """Collect the ``NER_tags`` attribute of every item in ``sent``, in order."""
    labels = []
    for item in sent:
        labels.append(item.NER_tags)
    return labels

def sent2tokensmod(sent):
    """Return the first element of every ``(token, postag, label)`` triple in ``sent``.

    Unlike the attribute-based helpers in this notebook, each item here must
    unpack into exactly three values.
    """
    tokens = []
    for token, _postag, _label in sent:
        tokens.append(token)
    return tokens

In [21]:
import spacy

# Load the spaCy pipeline "en_core_web_sm"; it is applied to the input text
# below, and the resulting tokens supply the `.text` / `.tag_` values read by
# the feature functions.
nlp = spacy.load("en_core_web_sm")

In [27]:
# Sentence to tag.
text = "mix the mustard mayo through the cooked potatoes with the spring onions, apple and celery"

# Run the spaCy pipeline and materialise its tokens as a list.
doc = nlp(text)
tokens = list(doc)

# One feature dictionary per token.
prep = sent2featuresmod(tokens)

# The model is given a batch of sequences, so wrap the single sentence in a
# list and read its labels back from index 0.
predict = model.predict([prep])

# Show each token next to its predicted label.
for idx, token in enumerate(tokens):
    print(token, predict[0][idx])

mix O
the O
mustard O
mayo B-ING
through O
the O
cooked O
potatoes B-ING
with O
the O
spring B-ING
onions I-ING
, O
apple B-ING
and I-ING
celery I-ING
