In [9]:
import apache_beam as beam

# Three comma-separated fields captured as the named groups icon, name and type.
regex = r'(?P<icon>[^\s,]+),\s*(?P<name>\w+),\s*(?P<type>\w+)'

# Sample records. The last two are deliberately malformed and yield no output.
garden_plant_lines = [
    '🍓, Strawberry, perennial',
    '🥕, Carrot, biennial ignoring trailing words',
    '🍆, Eggplant, perennial',
    '🍅, Tomato, annual',
    '🥔, Potato, perennial',
    '# 🍌, invalid, format',
    'invalid, 🍉, format',
]

with beam.Pipeline() as pipeline:
    garden_plants = pipeline | 'Garden plants' >> beam.Create(garden_plant_lines)
    # Keep only the matched text of each line; trailing words are left out.
    parsed_plants = garden_plants | 'Parse plants' >> beam.Regex.matches(regex)
    plants_matches = parsed_plants | beam.Map(print)


🍓, Strawberry, perennial
🥕, Carrot, biennial
🍆, Eggplant, perennial
🍅, Tomato, annual
🥔, Potato, perennial


In [10]:
import apache_beam as beam

# Three comma-separated fields captured as the named groups icon, name and type.
regex = r'(?P<icon>[^\s,]+),\s*(?P<name>\w+),\s*(?P<type>\w+)'

# Sample records. The last two are deliberately malformed and yield no output.
garden_plant_lines = [
    '🍓, Strawberry, perennial',
    '🥕, Carrot, biennial ignoring trailing words',
    '🍆, Eggplant, perennial',
    '🍅, Tomato, annual',
    '🥔, Potato, perennial',
    '# 🍌, invalid, format',
    'invalid, 🍉, format',
]

with beam.Pipeline() as pipeline:
    garden_plants = pipeline | 'Garden plants' >> beam.Create(garden_plant_lines)
    # Each output element is a list: the whole match first, then every group.
    parsed_plants = garden_plants | 'Parse plants' >> beam.Regex.all_matches(regex)
    plants_all_matches = parsed_plants | beam.Map(print)


['🍓, Strawberry, perennial', '🍓', 'Strawberry', 'perennial']
['🥕, Carrot, biennial', '🥕', 'Carrot', 'biennial']
['🍆, Eggplant, perennial', '🍆', 'Eggplant', 'perennial']
['🍅, Tomato, annual', '🍅', 'Tomato', 'annual']
['🥔, Potato, perennial', '🥔', 'Potato', 'perennial']


In [11]:
def regex_matches_kv(test=None):
    """Parse garden-plant lines into ``(icon, matched text)`` pairs and print them.

    Each input line is matched against a pattern whose first group is named
    ``icon``. Matching lines produce a pair keyed by that group; the two
    malformed sample lines produce nothing.

    Args:
        test: Optional callable. When given, it is called with the resulting
            PCollection before the ``with`` block exits, so a caller can attach
            checks to the same pipeline. When omitted the example only prints.
    """
    # [START regex_matches_kv]
    import apache_beam as beam

    # Matches a named group 'icon', and then two comma-separated groups.
    regex = r'(?P<icon>[^\s,]+), *(\w+), *(\w+)'
    with beam.Pipeline() as pipeline:
        plants_matches_kv = (
            pipeline
            | 'Garden plants' >> beam.Create([
                '🍓, Strawberry, perennial',
                '🥕, Carrot, biennial ignoring trailing words',
                '🍆, Eggplant, perennial',
                '🍅, Tomato, annual',
                '🥔, Potato, perennial',
                '# 🍌, invalid, format',
                'invalid, 🍉, format',
            ])
            | 'Parse plants' >> beam.Regex.matches_kv(regex, keyGroup='icon')
            | beam.Map(print))
        # [END regex_matches_kv]
        # Hand the output to the caller's hook while the pipeline is still open;
        # previously the `test` argument was accepted but silently ignored.
        if test:
            test(plants_matches_kv)

# Run the example without a `test` callable; the pipeline prints each (icon, match) pair.
regex_matches_kv()

('🍓', '🍓, Strawberry, perennial')
('🥕', '🥕, Carrot, biennial')
('🍆', '🍆, Eggplant, perennial')
('🍅', '🍅, Tomato, annual')
('🥔', '🥔, Potato, perennial')


In [12]:
# [START regex_find]
import apache_beam as beam

# A named group 'icon' followed by two more comma-separated groups.
regex = r'(?P<icon>[^\s,]+), *(\w+), *(\w+)'

# Every record starts with '# ', and some lines hold two plants.
garden_plant_lines = [
    '# 🍓, Strawberry, perennial',
    '# 🥕, Carrot, biennial ignoring trailing words',
    '# 🍆, Eggplant, perennial - 🍌, Banana, perennial',
    '# 🍅, Tomato, annual - 🍉, Watermelon, annual',
    '# 🥔, Potato, perennial',
]

with beam.Pipeline() as pipeline:
    garden_plants = pipeline | 'Garden plants' >> beam.Create(garden_plant_lines)
    # Only the first match found in each line is emitted.
    parsed_plants = garden_plants | 'Parse plants' >> beam.Regex.find(regex)
    plants_matches = parsed_plants | beam.Map(print)
# [END regex_find]


🍓, Strawberry, perennial
🥕, Carrot, biennial
🍆, Eggplant, perennial
🍅, Tomato, annual
🥔, Potato, perennial


In [13]:
# [START regex_find_all]
import apache_beam as beam

# Matches a named group 'icon', and then two comma-separated groups.
regex = r'(?P<icon>[^\s,]+), *(\w+), *(\w+)'
with beam.Pipeline() as pipeline:
    # Each input line yields one list holding every match found in that line,
    # so the lines that mention two plants print a two-element list.
    plants_find_all = (
        pipeline
        | 'Garden plants' >> beam.Create([
            '# 🍓, Strawberry, perennial',
            '# 🥕, Carrot, biennial ignoring trailing words',
            '# 🍆, Eggplant, perennial - 🍌, Banana, perennial',
            '# 🍅, Tomato, annual - 🍉, Watermelon, annual',
            '# 🥔, Potato, perennial',
        ])
        | 'Parse plants' >> beam.Regex.find_all(regex)
        | beam.Map(print))
# [END regex_find_all]


['🍓, Strawberry, perennial']
['🥕, Carrot, biennial']
['🍆, Eggplant, perennial', '🍌, Banana, perennial']
['🍅, Tomato, annual', '🍉, Watermelon, annual']
['🥔, Potato, perennial']


In [14]:
import apache_beam as beam

# A named group 'icon' followed by two more comma-separated groups.
regex = r'(?P<icon>[^\s,]+), *(\w+), *(\w+)'

# Every record starts with '# ', and some lines hold two plants.
garden_plant_lines = [
    '# 🍓, Strawberry, perennial',
    '# 🥕, Carrot, biennial ignoring trailing words',
    '# 🍆, Eggplant, perennial - 🍌, Banana, perennial',
    '# 🍅, Tomato, annual - 🍉, Watermelon, annual',
    '# 🥔, Potato, perennial',
]

with beam.Pipeline() as pipeline:
    garden_plants = pipeline | 'Garden plants' >> beam.Create(garden_plant_lines)
    # One (icon, matched text) pair is emitted per match, so a line that
    # mentions two plants contributes two pairs.
    keyed_plants = garden_plants | 'Parse plants' >> beam.Regex.find_kv(regex, keyGroup='icon')
    plants_matches_kv = keyed_plants | beam.Map(print)


('🍓', '🍓, Strawberry, perennial')
('🥕', '🥕, Carrot, biennial')
('🍆', '🍆, Eggplant, perennial')
('🍌', '🍌, Banana, perennial')
('🍅', '🍅, Tomato, annual')
('🍉', '🍉, Watermelon, annual')
('🥔', '🥔, Potato, perennial')


In [16]:
import apache_beam as beam

# Records whose fields are separated by a colon padded with spaces or tabs.
colon_separated_plants = [
    '🍓 : Strawberry : perennial',
    '🥕 : Carrot : biennial',
    '🍆\t:\tEggplant\t:\tperennial',
    '🍅 : Tomato : annual',
    '🥔 : Potato : perennial',
]

with beam.Pipeline() as pipeline:
    garden_plants = pipeline | 'Garden plants' >> beam.Create(colon_separated_plants)
    # Swap every colon, together with the whitespace around it, for one comma.
    csv_plants = garden_plants | 'To CSV' >> beam.Regex.replace_all(r'\s*:\s*', ',')
    plants_replace_all = csv_plants | beam.Map(print)


🍓,Strawberry,perennial
🥕,Carrot,biennial
🍆,Eggplant,perennial
🍅,Tomato,annual
🥔,Potato,perennial
