In [None]:
'''
[]	A set of characters	"[a-m]"	
\	Signals a special sequence (can also be used to escape special characters)	"\d"	
.	Any character (except newline character)	"he..o"	
^	Starts with	"^hello"	
$	Ends with	"planet$"	
*	Zero or more occurrences	"he.*o"	
+	One or more occurrences	"he.+o"	
?	Zero or one occurrences	"he.?o"	
{}	Exactly the specified number of occurrences	"he.{2}o"	
|	Either or	"falls|stays"	
()	Capture and group

Expressions &	Explanations
\w 
Matches alphanumeric characters, that is a-z, A-Z, 0-9, and underscore(_)
\W
Matches non-alphanumeric characters, that is except a-z, A-Z, 0-9 and _
\d
Matches digits, from 0-9.
\D 
Matches any non-digits.
\s
Matches whitespace characters, which also include the \t, \n, \r, and space characters.
\S
Matches non-whitespace characters.
\A
Matches the expression to its right at the absolute start of a string whether in single or multi-line mode.
\Z 
Matches the expression to its left at the absolute end of a string whether in single or multi-line mode.
\n
Matches a newline character
\t
Matches tab character
\b
Matches the word boundary (or empty string) at the start and end of a word.
\B
Matches where \b does not, that is, non-word boundary
'''

In [1]:
import re
import warnings
warnings.filterwarnings("ignore")
from functools import reduce

In [2]:
if re.search("ape", "The ape was at the apex"):
    print("Found a match")

Found a match


In [5]:
all_apes = re.findall("ape", "The ape was at the apex")
for i in all_apes:
    print(i)

ape
ape


In [6]:
the_str = "The ape was at the apex"
for i in re.finditer("ape.", the_str):
    loc_tup = i.span() # return a tuple of start and end positions
    print(loc_tup)
    print(the_str[loc_tup[0]:loc_tup[1]])

(4, 8)
ape 
(19, 23)
apex


In [8]:
animal_str = "Cat rat mat fat pat"
all_animals = re.findall("[crmfp]at", animal_str)
for i in all_animals:
    print(i)

rat
mat
fat
pat


In [9]:
some_animals = re.findall("[c-mC-M]at", animal_str)
for i in some_animals:
    print(i)

Cat
mat
fat


In [10]:
all_animals = re.findall("[^Cr]at", animal_str) # Find all animals that are not cats or rats
for i in all_animals:
    print(i)

mat
fat
pat


In [13]:
owl_food = "rat cat mat pat"
regex = re.compile("[cm]at")
owl_food = regex.sub("owl", owl_food)
print(owl_food) 

rat owl owl pat


In [17]:
rand_str = "Here is \\stuff"
# print("Find \\stuff: ", re.search("\\\\stuff", rand_str))

print("Find \\stuff: ", re.search(r"\\stuff", rand_str))

Find \stuff:  <re.Match object; span=(8, 14), match='\\stuff'>


In [2]:
rnd_str = "F.B.I. I.R.S. CIA"
print(re.findall(r".\..\..", rnd_str))

['F.B.I', 'I.R.S']


In [8]:
rand_str = '''This is a
long string
of 3 lines'''

print(rand_str)
regex = re.compile("\n")
rand_str = regex.sub("  ", rand_str)
print(rand_str)

# \b \f \r \t \v \r\n

This is a
long string
of 3 lines
This is a  long string  of 3 lines


In [11]:
rnd_str = "12345"
print("Matches :", len(re.findall("\d", rnd_str)))

Matches : 5


In [12]:
rnd_str = "12345"
if re.search("\d{5}", rnd_str):
    print("It is zip code")

It is zip code


In [15]:
rnd = "123 12345 123456 1234567"
print("Matches: ", len(re.findall("\d{5,7}", rnd)))

Matches:  3


In [13]:
phone_str = "123-555-1234, 456.555.4321, (789)555-9876"

phone_pattern = re.compile(r"(\d{3}[-\.]\d{3}[-\.]\d{4}|\(\d{3}\)\d{3}[-\.]\d{4})")
phone_numbers = phone_pattern.findall(phone_str)

for number in phone_numbers:
    print(f"Valid phone number found: {number}")

test_number = "123-555-1234"
if phone_pattern.match(test_number):
    print(f"{test_number} is a valid phone number format")

Valid phone number found: 123-555-1234
Valid phone number found: 456.555.4321
Valid phone number found: (789)555-9876
123-555-1234 is a valid phone number format


In [18]:
ph = "123-1234-123"
if re.search("\w{3}-\w{4}-\w{3}", ph):
    print("Phone number is valid")
if re.search("\w{2,20}", "0xArchit"):
    print("Name is valid")

Phone number is valid
Name is valid


In [22]:
if re.search("\w{2,20}\s\w{2,20}\s\w{2,10}\s\w{2,5}", "0xarchit AkA Archit Jain"):
    print("Match found")

Match found


In [26]:
print("Matches: ", len(re.findall("a+", "a as has bug")))

Matches:  3


In [11]:
email_list = "db@aol.com m@.com @apple.com db@.com"
print("Email Matches :", len(re.findall("[\w._%+-]{1,20}@[\w.-]{2,20}.[A-Za-z]{2,3}", email_list)))

Email Matches : 1


In [5]:
rnd = "cat cats"
regex = re.compile("[cat]+s?")
print(re.findall(regex, rnd))

['cat', 'cats']


In [9]:
rnd = "doctor doctors doctor's"
# regex = re.compile("[doctor]+\\'?+s?")
regex = re.compile("[doctor]+['s]*") # or use {0,2} in place of *
print(re.findall(regex, rnd))

['doctor', 'doctors', "doctor's"]


In [15]:
long = '''Just some
words for \r
long string'''

print(f'Matches: , {len(re.findall(r"[\w\s]+[\r]?\n", long))}\n{re.findall(r"[\w\s]+[\r]?\n", long)}')

Matches: , 1
['Just some\nwords for \r\n']


In [20]:
# Greedy and lazy matching

rnd = "<name>Life on Mars</name><name>Freaks and Greeks</name>"
# regex = re.compile(r"<name>.*</name>")
regex = re.compile(r"<name>.*?</name>")
# regex = re.compile(r"<name>.+?</name>")
print(*re.findall(regex, rnd))

<name>Life on Mars</name> <name>Freaks and Greeks</name>


In [3]:
rand_str = "ape at the apex"
regex = re.compile(r"ape")
regex2 = re.compile(r"\bape\b")
print(*re.findall(regex, rand_str))
print(*re.findall(regex2, rand_str))

ape ape
ape


In [10]:
# rnd = "match everything up to @"
# rnd = "@ get this string"
rnd = '''Ape is big
Turtle is slow
Cheetah is fast'''
# regex = re.compile(r"^.*[^@]")
# regex = re.compile(r"[^@\s].*$")
regex = re.compile(r"(?m)^.*?\s")
print(*re.findall(regex, rnd))

Ape  Turtle  Cheetah 


In [11]:
rnd = 'Phone number of someone 987-654-3210'
regex = re.compile(r"987-(.*)")
print(*re.findall(regex, rnd))

654-3210


In [13]:
rnd = '987-654-3210 987-123-4567 987-567-1234'
regex = re.compile(r"987-(.{8})")
print(*re.findall(regex, rnd))

654-3210 123-4567 567-1234


In [14]:
rnd = '987-654-3210'
regex = re.compile(r"987-(.*)-(.*)")
print(*re.findall(regex, rnd))

('654', '3210')


In [17]:
rnd_str = "The cat cat fell out the window"
regex = re.compile(r"(\b\w+)\s+\1")
print(re.findall(regex, rnd_str))

['cat']


In [None]:
# Did you find a match
 # if re.search("REGEX", my_string)
 # Get list of matches
 # print("Matches :", len(re.findall("REGEX", my_string)))
 # Get a pattern object
 # regex = re.compile("REGEX")
 # Substitute the match
 # my_string = regex.sub("substitution", my_string)
 # [ ]   : Match what is in the brackets
 # [^ ]  : Match anything not in the brackets
 # ( )   : Return surrounded submatch
 # .     : Match any 1 character or space
 # +     : Match 1 or more of what proceeds
 # ?     : Match 0 or 1
 # *     : Match 0 or More
 # *?    : Lazy match the smallest match
 # \b    : Word boundary
 # ^     : Beginning of String
 # $     : End of String
 # \n    : Newline
 # \d    : Any 1 number
 # \D    : Anything but a number
 # \w    : Same as [a-zA-Z0-9_]
 # \W    : Same as [^a-zA-Z0-9_]
 # \s    : Same as [\f\n\r\t\v]
 # \S    : Same as [^\f\n\r\t\v]
 # {5}   : Match 5 of what proceeds the curly brackets
 # {5,7} : Match values that are between 5 and 7 in length
 # ($m)  : Allow ^ on multiline string

In [20]:
rnd_str = "<a href='#'><b>The Link</b></a>"
regex = re.compile(r"<b>(.*?)</b>")
rnd_str = re.sub(regex, r"\1", rnd_str)

print(rnd_str)

<a href='#'>The Link</a>


In [27]:
rnd_str = "412-555-1212"
regex = re.compile(r"([\d]{3})-([\d]{3})-([\d]{4})")
# (412)555-1212
rnd_str = re.sub(regex, r"(\1)\2-\3", rnd_str)

print(rnd_str)

(412)555-1212


In [28]:
rand_str = "412-555-1212"
regex = re.compile(r"([\d]{3})-([\d]{3}-[\d]{4})")
rand_str = re.sub(regex, r"(\1)\2", rand_str)
print(rand_str)

(412)555-1212


In [32]:
rand_str = "https://www.youtube.com http://www.google.com"
# out:
# <a href='https://www.youtube.com'>www.youtube.com</a>
#  <a href='https://www.google.com'>www.google.com</a>

regex = re.compile(r"(https?://([\w.]+))")
rnd = re.sub(regex, r"<a href='\1'>\2</a>\n", rand_str)

print(rnd)


<a href='https://www.youtube.com'>www.youtube.com</a>
 <a href='http://www.google.com'>www.google.com</a>



In [34]:
rand_str = "one two three four"
regex = re.compile(r"\w+(?=\b)")
matches = re.findall(regex, rand_str)
print(*matches)

one two three four


In [36]:
rand_str = "1. Bread 2. Apples 3. Lettuce"
regex = re.compile(r"(?<=\d.\s)\w+")
matches = re.findall(regex, rand_str)
for i in matches:
    print(i)

Bread
Apples
Lettuce


In [37]:
rand_str = "<h1>I'm Important</h1> <h1>So am I</h1>"
regex = re.compile(r"(?<=<h1>).+?(?=</h1>)")
matches = re.findall(regex, rand_str)
for i in matches:
    print(i)

I'm Important
So am I


In [40]:
rand_str = "8 Apples $3, 1 Bread $1, 1 Cereal $4"
regex = re.compile(r"(?<!\$)\d+")
matches = re.findall(regex, rand_str)
print(len(matches))
matches = [int(i) for i in matches]
print("Total Items {}".format(reduce((lambda x, y: x + y), matches)))

3
Total Items 10


In [None]:
# [ ]   : Match what is in the brackets
 # [^ ]  : Match anything not in the brackets
 # ( )   : Return surrounded submatch
 # .     : Match any 1 character or space
 # +     : Match 1 or more of what proceeds
 # ?     : Match 0 or 1
 # *     : Match 0 or More
 # *?    : Lazy match the smallest match
 # \b    : Word boundary
 # ^     : Beginning of String
 # $     : End of String
 # \n    : Newline
 # \d    : Any 1 number
 # \D    : Anything but a number
 # \w    : Same as [a-zA-Z0-9_]
 # \W    : Same as [^a-zA-Z0-9_]
 # \s    : Same as [\f\n\r\t\v]
 # \S    : Same as [^\f\n\r\t\v]
 # {5}   : Match 5 of what proceeds the curly brackets
 # {5,7} : Match values that are between 5 and 7 in length
 # ($m)  : Allow ^ on multiline string
 # Use a back reference to substitute what is between the
 # bold tags and eliminate the bold tags
 # re.sub(r"<b>(.*?)</b>", r"\1", randStr)
 # Use a look ahead to find all characters of 1 or more
 # with a word boundary, but don't return the word
 # boundary
 # re.findall(r"\w+(?=\b)", randStr)
 # Use a look behind to find words starting with a number,
 # period and space, but only return the word that follows
 # re.findall(r"(?<=\d.\s)\w+", randStr)
 # Use a negative look behind to only return numbers without
 # a $ in front of them
 # re.findall(r"(?<!\$)\d+", randStr)

In [2]:
rnd_str = "1. Dog 2. Cat 3. Turtle"
regex = re.compile(r"\d\.\s(Dog|Cat)")
print(*re.findall(regex, rnd_str))

Dog Cat


In [7]:
rnd = "1234 12345-1234 1234 12346-333"
regex = re.compile(r"(\d{5}-\d{3,4})")
print(*re.findall(regex, rnd))

12345-1234 12346-333


In [16]:
bd = input("Enter your birthday (mm-dd-yyyy) : ")
bd_regex = re.search(r"(\d{1,2})-(\d{1,2})-(\d{1,4})", bd)
print("You were born on", bd_regex.group())
print("Birth Month", bd_regex.group(1))
print("Birth Day", bd_regex.group(2))
print("Birth Year", bd_regex.group(3))

You were born on 12-12-12
Birth Month 12
Birth Day 12
Birth Year 12


In [17]:
match = re.search(r"\d{2}", "The chicken weighed 13 lbs")
print("Match :", match.group())
print("Span :", match.span())
print("Match :", match.start())
print("Match :", match.end())

Match : 13
Span : (20, 22)
Match : 20
Match : 22


In [19]:
# named group

rand_str = "December 69 2069"
regex = r"^(?P<month>\w+)\s(?P<day>\d+)\s(?P<year>\d+)"
matches = re.search(regex, rand_str)
print("Month :", matches.group('month'))
print("Day :", matches.group('day'))
print("Year :", matches.group('year'))

Month : December
Day : 69
Year : 2069


In [23]:
rand_str = "d+b@aol.com a_1@yahoo.co.uk A-100@m-b.INTERNATIONAL"
regex = re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+")
matches = re.findall(regex, rand_str)
print(len(matches))
print(*matches)

3
d+b@aol.com a_1@yahoo.co.uk A-100@m-b.INTERNATIONAL


In [28]:
rand_str = "14125551212 4125551212 (412)5551212 412 555 1212 412-555-1212 1-412-555-1212"
regex = re.compile(r"((1?)(-| ?)(\()?(\d{3})(\)|-| |\)-|\) )?(\d{3})(-| )?(\d{4}|\d{4}))")
matches = re.findall(regex, rand_str)
print(len(matches))
for i in matches:
    print(i[0].lstrip())

6
14125551212
4125551212
(412)5551212
412 555 1212
412-555-1212
1-412-555-1212
