Regular Expressions (Regex) in Python are used for pattern matching and searching within strings. The re module in Python provides functions to work with regex. Regex allows us to specify complex search patterns for text, making it powerful for tasks like string validation, extraction, or substitution.

In [None]:
import re

# Basic Regex Patterns


In [None]:
# [abc] : a or b or c.
# [^abc] : Except a/b/c.
# [0-9] : 0 to 9.
# [a-z] : a to z.
# [A-Z] : A to Z.
# [a-zA-Z] : a to z or A to Z (a space inside the brackets would also match a space).

# ^[A] : starts with A.
# [A]$ : ends with A.

# ## Quantifiers
# []? : occurs 0 or 1 time.
# []+ : occurs 1 or more times.
# []* : occurs 0 or more times.
# []{n} : occurs exactly n times.
# []{n,} : occurs n or more times.
# []{n,m} : occurs at least n times and at most m times.

# ## Metacharacters
# \d : [0-9]
# \D : [^0-9]
# \w : [a-zA-Z0-9_]
# \W : [^a-zA-Z0-9_]

# re.search(): Finds the first match.
# re.findall(): Finds all matches.
# re.sub(): Replaces matched patterns.

# Grouping: Extracts specific parts of a match using parentheses.
# Flags: Modify regex behavior (e.g., case-insensitivity, multiline matching).

In [None]:
# .  : Matches any character except a newline.
# ^  : Matches the start of a string.
# $  : Matches the end of a string (or just before a trailing newline).
# []  : A set of characters. For example, [a-z] matches any lowercase letter.
# \d  : Matches any digit (equivalent to [0-9]).
# \w  : Matches any word character (alphanumeric + underscore).
# \s  : Matches any whitespace character (spaces, tabs, newlines).
# *  : Matches 0 or more repetitions of the preceding pattern.
# +  : Matches 1 or more repetitions of the preceding pattern.
# ?  : Matches 0 or 1 occurrence of the preceding pattern.

In [None]:
# re.search(pattern, string)
#
# Scans the string from left to right and stops at the first location where
# the pattern matches. Returns a match object, or None if nothing matches.

match = re.compile(r'\d+').search('There are 42 apples and 30 mangoes')
print(match[0])  # Output: 42 (only the first run of digits is reported)

42


In [None]:
# re.findall(pattern, string)
#
# Collects every match in the string, left to right, and returns them as a
# list of strings (the pattern used here has no capture groups).

# \b\w{4}\b picks out words that are exactly four word characters long.
matches = [hit.group() for hit in re.finditer(r'\b\w{4}\b', 'This is a test message')]
print(matches)  # Output: ['This', 'test']

['This', 'test']


In [None]:
# re.match(pattern, string)
#
# Tries the pattern only at the beginning of the string; unlike re.search()
# it does not look for a match further along.

match = re.match(pattern=r'\w+', string='Hello World')
print(match[0])  # Output: Hello (\w+ stops at the space)

Hello


In [None]:
# re.split(pattern, string)
#
# Cuts the string apart wherever the pattern matches and returns the pieces.

# \s+ treats any run of whitespace as a single separator.
result = re.compile(r'\s+').split('Split this sentence by spaces')
print(result)  # Output: ['Split', 'this', 'sentence', 'by', 'spaces']

['Split', 'this', 'sentence', 'by', 'spaces']


In [None]:
# re.sub(pattern, replacement, string)
#
# Returns a copy of the string in which every match of the pattern has been
# swapped for the replacement text.

new_string = re.sub(pattern=r'apples', repl='oranges', string='I like apples')
print(new_string)  # Output: I like oranges

I like oranges


In [None]:
# Let's extract email addresses from a string.

text = "Contact us at support@example.com or sales@domain.org."

# Pattern to match an email address:
#   [A-Za-z0-9._%+-]+   local part (before the @)
#   [A-Za-z0-9.-]+      domain labels
#   \.[A-Za-z]{2,}      a dot followed by a top-level domain of 2+ letters
# Inside a character class `|` is a literal pipe, not alternation, so the
# top-level-domain class must be [A-Za-z]; writing [A-Z|a-z] would let
# something like "user@site.co|uk" be captured as a single address.
pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'

emails = re.findall(pattern, text)
print(emails)  # Output: ['support@example.com', 'sales@domain.org']


['support@example.com', 'sales@domain.org']


In [None]:
# Validating a Phone Number Format

phone_number = "123-456-7890"

# Pattern to match a phone number format (XXX-XXX-XXXX)
pattern = r'^\d{3}-\d{3}-\d{4}$'

# Validate with re.fullmatch() rather than re.match(): `$` also matches just
# before a trailing newline, so re.match() would accept "123-456-7890\n".
# re.fullmatch() requires the pattern to cover the entire string.
if re.fullmatch(pattern, phone_number):
    print("Valid phone number")
else:
    print("Invalid phone number")

Valid phone number


In [None]:
# Grouping in Regex
# Parentheses capture sub-parts of a match so each one can be read separately.

text = "John was born on 1992-08-12."

# Three capture groups: (year)-(month)-(day)
pattern = r'(\d{4})-(\d{2})-(\d{2})'
match = re.compile(pattern).search(text)

if match is not None:
    print(f"Year: {match[1]}")   # Output: 1992
    print(f"Month: {match[2]}")  # Output: 08
    print(f"Day: {match[3]}")    # Output: 12


Year: 1992
Month: 08
Day: 12


In [None]:
# Flags in Regex
# A flag passed to the re functions changes how the pattern is applied:
#
#   re.IGNORECASE (re.I) - letters match regardless of case.
#   re.MULTILINE  (re.M) - ^ and $ also match at the start and end of each line.
#   re.DOTALL     (re.S) - . matches newline characters as well.

pattern = r'^hello'
text = """hello world
hello again"""
# With re.M, ^ anchors at the start of both lines, so both "hello"s are found.
matches = re.compile(pattern, flags=re.M).findall(text)
print(matches)  # Output: ['hello', 'hello']


['hello', 'hello']
