In [1]:
from lazy_nlp_pipeline import NLP, Pattern as P, TokenPattern as TP

In [2]:
# Pipeline object shared by the example cells; each of them calls
# nlp.match_patterns(...) on it.
PROJECT_NAME = 'example_patterns'
nlp = NLP(project_name=PROJECT_NAME)

In [4]:
# Example: a pattern built from a plain sequence of token patterns.
#
# In the recorded run the first two texts each produce one '1 a' span and
# the third text produces nothing.

test_texts = ['1 a', 'Something 1 a something', 'Something Something2']

# Token '1' followed by token 'a'.
pattern = P(TP('1'), TP('a'))

matches = nlp.match_patterns([pattern], texts=test_texts)
for found in matches:
    print(found)

match_patterns: 100%|██████████████████████████████████████████████| 3/3 [00:00<00:00, 13301.18it/s]

Span('1 a')[0:3 doc=139779671233552]
Span('1 a')[10:13 doc=139779647797072]





In [5]:
# Example: using one Pattern as a sub-pattern inside another Pattern.

test_texts = [
    '1 2 a 1 2',
    'Something 1 2 a 1 2 something:::a:b',
    'Something Something2',
]

# Building block: token '1' followed by token '2'.
p1 = P(TP('1'), TP('2'))

# The same building block is placed on both sides of the token 'a'.
pattern = P(p1, TP('a'), p1)

matches = nlp.match_patterns([pattern], texts=test_texts)
for found in matches:
    print(found)

match_patterns: 100%|████████████████████████████████████████████████| 3/3 [00:00<00:00, 925.28it/s]

Span('1 2 a 1 2')[0:9 doc=139779647674704]
Span('1 2 a 1 2')[10:19 doc=139779665671440]





In [6]:
# Example: the allow_inbetween keyword of Pattern.
#
# Here it is set to a ':' token pattern. In the recorded run the pattern
# matches even when one or several ':' sit between the listed tokens
# (e.g. '1:::::0:0').

test_texts = ['1:0::0', 'Something 1:::::0:0']

# Tokens '1', '0', '0' in that order, with ':' allowed between them.
pattern = P(TP('1'), TP('0'), TP('0'), allow_inbetween=TP(':'))

matches = nlp.match_patterns([pattern], texts=test_texts)
for found in matches:
    print(found)

match_patterns: 100%|███████████████████████████████████████████████| 2/2 [00:00<00:00, 7503.23it/s]

Span('1:0::0')[0:6 doc=139779665376656]
Span('1:::::0:0')[10:19 doc=139779671096464]





In [7]:
# Example: allow_inbetween=None.
#
# The space between 'e2' and 'e4' is listed as its own token pattern,
# TP(' '). In the recorded run only 'e2 e4' matches; 'e 2 e 4', which has
# extra spaces between the tokens, does not.
# NOTE(review): presumably None means "nothing may be skipped between
# tokens" -- confirm against the Pattern documentation.

test_texts = ['e2 e4', 'e 2 e 4']

pattern = P(TP('e'), TP('2'), TP(' '), TP('e'), TP('4'), allow_inbetween=None)

matches = nlp.match_patterns([pattern], texts=test_texts)
for found in matches:
    print(found)

match_patterns: 100%|███████████████████████████████████████████████| 2/2 [00:00<00:00, 6748.68it/s]

Span('e2 e4')[0:5 doc=139779665673936]





In [8]:
# Example: matching date ranges such as 'From 2001-01-10 to 2009-01-10'.


def _digits(width):
    """Build a new numeric token pattern with min_len == max_len == width."""
    return TP(isnumeric=True, min_len=width, max_len=width)


# Year-month-day: 4 digits, '-', 2 digits, '-', 2 digits.
ymd_date = P(
    _digits(4), TP('-'), _digits(2), TP('-'), _digits(2),
    allow_inbetween=None,
)

# Day-month-year: 2 digits, '-', 2 digits, '-', 4 digits.
dmy_date = P(
    _digits(2), TP('-'), _digits(2), TP('-'), _digits(4),
    allow_inbetween=None,
)

pattern = P(
    # Sliced with [0:1]; in the recorded run the range matches both with
    # and without the leading 'From'.
    TP('from', ignore_case=True)[0:1],
    # Start date in either layout.
    ymd_date | dmy_date,
    TP('to', ignore_case=True),
    # End date in either layout.
    ymd_date | dmy_date,
)

test_texts = [
    '1999-01-10',
    'From 2001-01-10 to 2009-01-10',
    'Something 10-01-2001 to 2009-01-10 Something2',
]

matches = nlp.match_patterns([pattern], texts=test_texts)
for found in matches:
    print(found)

match_patterns: 100%|███████████████████████████████████████████████| 3/3 [00:00<00:00, 2564.28it/s]

Span('From 2001-01-10 to 2009-01-10')[0:29 doc=139779647872912]
Span('2001-01-10 to 2009-01-10')[5:29 doc=139779647872912]
Span('10-01-2001 to 2009-01-10')[10:34 doc=139779647872080]





In [9]:
# Example: matching a Russian word by its lemma.
#
# The first token pattern is given the lemma 'общедоступный'; in the
# recorded run it matches the inflected form 'общедоступная'. It is followed
# by TP(isspace=False) sliced with [1:], and the run prints one span per
# additional token taken ('интернет', '-', 'энциклопедия').

test_texts = [
    'Википедия (англ. Wikipedia) — общедоступная интернет-энциклопедия реализованная на принципах вики',
]

pattern = P(TP(lemma='общедоступный'), TP(isspace=False)[1:])

matches = nlp.match_patterns([pattern], texts=test_texts)
for found in matches:
    print(found)

match_patterns: 100%|█████████████████████████████████████████████████| 1/1 [00:00<00:00, 16.57it/s]

Span('общедоступная интернет')[30:52 doc=139779665672400]
Span('общедоступная интернет-')[30:53 doc=139779665672400]
Span('общедоступная интернет-энциклопедия')[30:65 doc=139779665672400]



