# Regular Expressions

In [1]:
import re

# Create patterns

In [3]:
pat = re.compile('Ola', flags=re.IGNORECASE)
pat

re.compile(r'Ola', re.IGNORECASE|re.UNICODE)

# Matching patterns

In [16]:
match = pat.match('Ola amigo') # match() returns a Match object (its span gives the start and stop index of the matched substring). It only succeeds if the pattern matches at the very beginning of the string
match

<re.Match object; span=(0, 3), match='Ola'>

In [17]:
match.span()

(0, 3)

In [18]:
pat.match(' Por favor Ola amigo') # returns none

In [19]:
pat.search('Por favor Ola amigo') # like match(), but scans the whole string and returns a Match object for the first match found at any position

<re.Match object; span=(10, 13), match='Ola'>

In [20]:
pat.findall('Ola amigo') # returns a list of all matched substrings

['Ola']

In [12]:
list(pat.finditer('Ola amigo')) # finditer() returns an iterator of Match objects; list() materialises it so the matches are displayed

[<re.Match object; span=(0, 3), match='Ola'>]

# Examples

In [21]:
text = 'This book costs $15'

# Only find the amount

In [28]:
# Raw string: \$ and \d are regex escapes, not Python string escapes, so keep the backslashes verbatim
re.findall(r'\$\d+', text)[0]

'$15'

# The backslash plague

In [33]:
directory = 'C:\\Program Files\\Microsoft'

In [34]:
re.findall(r'\\Program Files', directory)

['\\Program Files']

# r'' is a raw string: backslashes are not treated as escape characters, so the regex engine receives them unchanged

In [36]:
re.escape("C:\\") # escape all the chars except ASCII letter, num and _

'C:\\\\'

'ok'

# Character class

### match single regex

In [38]:
text = 'I have a 2 wheelers lisence and I have a Win10 free license'
# find all lisence or license

In [39]:
re.findall(r'li[a-z]en[a-z]e',text)

['lisence', 'license']

# Alternation - Match multiple regex

In [1]:
import re

In [2]:
text = "The apple doesn't fall far from tree"

In [8]:
pattern = re.compile('the|apple|tree', flags=re.IGNORECASE)

In [9]:
re.findall(pattern, text)

['The', 'apple', 'tree']

### Find "What is" and "who is"

In [44]:
text = 'What is this and who is this?'

In [45]:
pat = re.compile("(What|who) is")

In [46]:
re.findall(pat, text)

['What', 'who']

# Quantifiers

### Defines how a character, metachar and char set can be repeated

In [47]:
text = 'This is a dog and those are dogs'

In [50]:
re.findall('dog.', text)

['dog ', 'dogs']

In [53]:
re.findall('dogs*', text)

['dog', 'dogs']

In [55]:
text = 'file1.txt, file_one.txt. file.txt, file_.txt, fil.txt'

In [69]:
# Raw string so \w and \. reach the regex engine unchanged; \w* allows zero or more word characters after "file"
re.findall(r'file\w*\.txt', text)

['file1.txt', 'file_one.txt', 'file.txt', 'file_.txt']

In [70]:
# Raw string so \d and \. reach the regex engine unchanged; \d+ requires at least one digit after "file"
re.findall(r'file\d+\.txt', text)

['file1.txt']

# Find year in the given text

In [71]:
text = 'I started my class 1 in 2001 and then I passed High School in 2013. My 1st sem started in 2013 and my 8th sem ended on 2017. My 1st job at Amazon was at 2017 and then at 2019 I started my DS journey'

In [72]:
# Raw string so \d reaches the regex engine unchanged; {4} means exactly four consecutive digits
re.findall(r'\d{4}', text)

['2001', '2013', '2013', '2017', '2017', '2019']

# Write a pattern to validate phone numbers

In [73]:
text = '555-555-555, 555555555, 555 555 555'

In [79]:
# Raw string so \d and \s reach the regex engine unchanged; [\s-]? is an optional whitespace or hyphen separator
re.findall(r'\d{3}[\s-]?\d{3}[\s-]?\d{3}', text)

['555-555-555', '555555555', '555 555 555']

# Greedy vs. non-greedy (lazy) quantifiers

In [12]:
import re

In [13]:
# NOTE(review): this is not a raw string, so "\t" inside "<\title>" is a TAB
# character, while "\p" and "\h" are invalid escapes that Python keeps as a
# literal backslash. The closing tags were probably meant to be written
# "</p>", "</title>", "</html>" - confirm before changing the literal.
text = "<html><title><p>TEXT<\p><\title><\html>"

In [14]:
# Raw string with the same pattern value: "<", zero or more literal backslashes (\\*), one or more word characters, ">"
re.findall(r'<\\*\w+>', text)

['<html>', '<title>', '<p>', '<\\p>', '<\\html>']

In [17]:
re.findall('<.*?>', text)

['<html>', '<title>', '<p>', '<\\p>', '<\title>', '<\\html>']

# Find all occurrences of "and", "or" and "the" in the text

In [18]:
text = "Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum."

In [25]:
re.findall('\\b(and|or|the)\\b', text)

['the', 'and', 'the', 'the', 'and', 'the', 'the', 'the', 'and']

# \b is a metacharacter that matches a word boundary. In a regular Python string '\b' is the backspace escape sequence, so the backslash must be doubled ('\\b'); alternatively use a raw string (r'\b')

### ^ match at the beginning
### $ match at the end
### \b matches a word boundary
### \B matches any position that is not a word boundary
### \A matches the beginning of the input
### \Z matches the end of the input

## Example 1

In [48]:
text = """
name:
age: 20
roll_no:123
grade:100

name: qqq
age: 20
roll_no:123 name: sdsd
grade:100

name: xyz
age: 20
roll_no:123
grade:100
"""

# Find any line that starts with the pattern name: space

In [51]:
re.findall('^name: .*', text, flags=re.M)

['name: qqq', 'name: xyz']

# re.M (re.MULTILINE) makes ^ and $ match at the start and end of every line, not only at the start and end of the whole string

## Find all lines which do not end with a full stop

In [87]:
text = """Lorem Ipsum is simply dummy text of the printing and typesetting industry Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book.
It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged.
It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum!"""

In [88]:
# Raw string so \. reaches the regex engine unchanged; with re.M, ^ and $ anchor to each line,
# and [^\.] requires the last character of the line to be anything but a full stop
re.findall(r'^.*[^\.]$',text, flags=re.M)

['It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum!']

# Split text

In [89]:
re.split('\n', text)

["Lorem Ipsum is simply dummy text of the printing and typesetting industry Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book.",
 'It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged.',
 'It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum!']

In [95]:
# Raw string so \s reaches the regex engine unchanged; splits on every single whitespace character
re.split(r'\s', text)

['Lorem',
 'Ipsum',
 'is',
 'simply',
 'dummy',
 'text',
 'of',
 'the',
 'printing',
 'and',
 'typesetting',
 'industry',
 'Lorem',
 'Ipsum',
 'has',
 'been',
 'the',
 "industry's",
 'standard',
 'dummy',
 'text',
 'ever',
 'since',
 'the',
 '1500s,',
 'when',
 'an',
 'unknown',
 'printer',
 'took',
 'a',
 'galley',
 'of',
 'type',
 'and',
 'scrambled',
 'it',
 'to',
 'make',
 'a',
 'type',
 'specimen',
 'book.',
 'It',
 'has',
 'survived',
 'not',
 'only',
 'five',
 'centuries,',
 'but',
 'also',
 'the',
 'leap',
 'into',
 'electronic',
 'typesetting,',
 'remaining',
 'essentially',
 'unchanged.',
 'It',
 'was',
 'popularised',
 'in',
 'the',
 '1960s',
 'with',
 'the',
 'release',
 'of',
 'Letraset',
 'sheets',
 'containing',
 'Lorem',
 'Ipsum',
 'passages,',
 'and',
 'more',
 'recently',
 'with',
 'desktop',
 'publishing',
 'software',
 'like',
 'Aldus',
 'PageMaker',
 'including',
 'versions',
 'of',
 'Lorem',
 'Ipsum!']

In [97]:
# Split on every non-word character and drop the empty strings produced by adjacent separators
[token for token in re.split(r'\W', text) if token]

['Lorem',
 'Ipsum',
 'is',
 'simply',
 'dummy',
 'text',
 'of',
 'the',
 'printing',
 'and',
 'typesetting',
 'industry',
 'Lorem',
 'Ipsum',
 'has',
 'been',
 'the',
 'industry',
 's',
 'standard',
 'dummy',
 'text',
 'ever',
 'since',
 'the',
 '1500s',
 'when',
 'an',
 'unknown',
 'printer',
 'took',
 'a',
 'galley',
 'of',
 'type',
 'and',
 'scrambled',
 'it',
 'to',
 'make',
 'a',
 'type',
 'specimen',
 'book',
 'It',
 'has',
 'survived',
 'not',
 'only',
 'five',
 'centuries',
 'but',
 'also',
 'the',
 'leap',
 'into',
 'electronic',
 'typesetting',
 'remaining',
 'essentially',
 'unchanged',
 'It',
 'was',
 'popularised',
 'in',
 'the',
 '1960s',
 'with',
 'the',
 'release',
 'of',
 'Letraset',
 'sheets',
 'containing',
 'Lorem',
 'Ipsum',
 'passages',
 'and',
 'more',
 'recently',
 'with',
 'desktop',
 'publishing',
 'software',
 'like',
 'Aldus',
 'PageMaker',
 'including',
 'versions',
 'of',
 'Lorem',
 'Ipsum']

In [101]:
# Raw string so \s reaches the regex engine unchanged; maxsplit=3 stops after three splits and leaves the remainder as the last item
re.split(r'\s',text, maxsplit=3)

['Lorem',
 'Ipsum',
 'is',
 "simply dummy text of the printing and typesetting industry Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book.\nIt has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged.\nIt was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum!"]

# [^0-9] matches any single character that is not a digit from 0 to 9

# Find non alpha numeric and non white space

In [40]:
text = 'This is spartan400 and   pop'

In [49]:
# Raw string so \w and \s reach the regex engine unchanged; the negated class matches anything that is neither a word character nor whitespace
re.findall(r'[^\w\s]', text)

[]

# Removing numbers

In [1]:
import re
def remove_num(text: str) -> str:
    """Return ``text`` with every run of digits deleted.

    Only the digits themselves are removed; the whitespace around them is
    left alone, so ``"I have 32 houses"`` becomes ``"I have  houses"``
    (two spaces).
    """
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)

In [2]:
remove_num("I have 32 houses")

'I have  houses'

# Replace digits with their word counterparts

In [5]:
import inflect
q = inflect.engine()


In [34]:
def convert_num_to_string(text: str) -> str:
    """Spell out every all-digit token of ``text`` in words.

    ``text`` is split on whitespace; each token for which ``str.isdigit()``
    is true is replaced by ``q.number_to_words(token)`` (``q`` is the
    module-level ``inflect`` engine), and every other token is kept as is.
    The tokens are then re-joined with single spaces, so the original
    whitespace layout is not preserved.

    Returns:
        The rebuilt sentence as a single string.
    """
    # Build the result in one pass instead of looking each token up again
    # with list.index(), which rescanned the list for every number.
    words = [
        q.number_to_words(token) if token.isdigit() else token
        for token in text.split()
    ]
    return ' '.join(words)

In [35]:

convert_num_to_string('This is spartan 300 and 40')

'This is spartan three hundred and forty'

# Remove punctuation

In [50]:
import string
def remove_punc(text: str) -> str:
    """Return ``text`` with every character of ``string.punctuation`` removed.

    All other characters, including whitespace, are kept unchanged.
    """
    # str.translate deletes any character whose ordinal maps to None, so a
    # table with one None entry per punctuation character strips them all.
    deletions = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletions)

In [51]:
remove_punc('Hi! there')

'Hi there'