In [1]:
import math 

In [2]:
# Combinations: number of ways to choose 3 items out of 10 when order does not matter.
math.comb(10, 3)

120

In [3]:
# Permutations: number of ordered arrangements of 3 items drawn from 10.
math.perm(10,3)

720

In [4]:
# Least common multiple of all the given integers.
math.lcm(10,20,40)

40

In [5]:
# Factorial: 5! = 5 * 4 * 3 * 2 * 1.
math.factorial(5)

120

In [6]:
# Square root; the result is a float.
math.sqrt(21)

4.58257569495584

In [7]:
# The constant pi, stored as a float.
math.pi

3.141592653589793

In [8]:
# Round up: the smallest integer greater than or equal to 10.09.
math.ceil(10.09)

11

In [9]:
# Round down: the largest integer less than or equal to 10.99.
math.floor(10.99)

10

In [10]:
# 2 raised to the power 5; math.pow returns a float (the ** operator would give the int 32).
math.pow(2, 5)

32.0

In [11]:
# Convert 45 degrees to radians (pi / 4).
math.radians(45)

0.7853981633974483

# Regular Expressions (regex)

Regular expressions (regex) are **pattern-matching tools** that help you find and clean text data. Think of a regex as a way to describe what you are looking for in text.

## Why You Need Regex in Data Science

- **Find specific text** in messy data
- **Clean inconsistent data** formats
- **Extract information** from text automatically

In [13]:
import re

In [19]:
# ^ anchors the match at the start of the string and $ at the end, so the
# search only succeeds when the text starts with "The" and ends with "Spain".
txt = "The rain in Spain"
anchored_pattern = "^The.*Spain$"
x = re.search(anchored_pattern, txt)
print(x)

<re.Match object; span=(0, 17), match='The rain in Spain'>


In [21]:
# findall returns every non-overlapping match as a list of strings.
# Matching is case-sensitive, so the upper-case "AI" is not part of the result.
txt = "The rain in AI Spain"
lowercase_ai = "ai"
x = re.findall(lowercase_ai, txt)
print(x)

['ai', 'ai']


In [22]:
txt = "The rain in Spain"

# Raw string: "\s" in a normal string literal is an invalid escape sequence
# and triggers a warning on recent Python versions.
# search returns a Match object for the first whitespace character.
x = re.search(r"\s", txt)


In [23]:
# Display the Match object produced by the re.search call in the previous cell.
x

<re.Match object; span=(3, 4), match=' '>

In [24]:
# Match.start() is the index at which the match begins.
position = x.start()
print("The first white-space character is located in position:", position)

The first white-space character is located in position: 3


In [25]:
txt = "The rain in Spain"
# Split at every whitespace character.
# Raw string avoids the invalid "\s" escape-sequence warning.
x = re.split(r"\s", txt)
print(x)


['The', 'rain', 'in', 'Spain']


In [29]:
txt = "The rain in Spain"
# maxsplit=1 stops after the first split, leaving the rest of the string intact.
# It is passed by keyword because positional maxsplit is deprecated since
# Python 3.13; the raw string avoids the invalid "\s" escape-sequence warning.
x = re.split(r"\s", txt, maxsplit=1)
print(x)

['The', 'rain in Spain']


In [30]:
txt = "The rain in Spain"

# Replace every whitespace character with "9".
# Raw string avoids the invalid "\s" escape-sequence warning.
x = re.sub(r"\s", "9", txt)
print(x)

The9rain9in9Spain


In [31]:
txt = "The rain in Spain"
# count=2 replaces only the first two whitespace characters.
# It is passed by keyword because positional count is deprecated since
# Python 3.13; the raw string avoids the invalid "\s" escape-sequence warning.
x = re.sub(r"\s", "9", txt, count=2)
print(x)

The9rain9in Spain


In [32]:
# The r"..." prefix makes this a raw string, so the backslashes in \b and \w
# reach the regex engine unchanged.
# \bS\w+ : a word boundary, then "S", then one or more word characters.
txt = "The rain in Spain"
x = re.search(r"\bS\w+", txt)

# span() is the (start, end) index pair of the match.
start_end = x.span()
print(start_end)

(12, 17)


In [36]:
txt = "The rain in Spain"
x = re.search(r"\bS\w+", txt)

# Match.string is the text that was searched, not just the matched part.
searched_text = x.string
print(searched_text)

The rain in Spain


In [34]:
txt = "The rain in Spain"
x = re.search(r"\bS\w+", txt)

# group() returns the part of the text that matched the pattern.
matched_word = x.group()
print(matched_word)

Spain


In [38]:
# Pull every e-mail address out of a sentence.
text = "Contact us at support@example.com or sales@mydomain.org, sales123@mydomain.org"

# [\w.-]+ allows word characters, dots and hyphens on both sides of the "@";
# \.\w+ is the final ".com" / ".org" part.
# A shorter variant is r'\b\w+@\w+\.\w+'.
# Without the r prefix every backslash must be doubled: '\\b\\w+@\\w+\\.\\w+'.
email_pattern = r'\b[\w.-]+@[\w.-]+\.\w+\b'
emails = re.findall(email_pattern, text)
print(emails)

['support@example.com', 'sales@mydomain.org', 'sales123@mydomain.org']


In [41]:
# Validate if a String is a Strong Password
# The lookaheads check for an uppercase letter, a lowercase letter, a digit and
# one of the special characters @ $ #; .{8,} enforces the minimum length.
password = "My$tr0ngP@ss"
pattern = r'^(?=.*[A-Z])(?=.*[a-z])(?=.*\d)(?=.*[@$#]).{8,}$'

# fullmatch instead of match: with re.match the trailing "$" also matches just
# before a final newline, so a password ending in "\n" would be accepted.
is_strong = re.fullmatch(pattern, password) is not None
print(is_strong)

True


| Regex Part    | Meaning                                                      |
| ------------- | ------------------------------------------------------------ |
| `^`           | Start of the string                                          |
| `(?=.*[A-Z])` | Must contain **at least one uppercase** letter               |
| `(?=.*[a-z])` | Must contain **at least one lowercase** letter               |
| `(?=.*\d)`    | Must contain **at least one digit (0–9)**                    |
| `(?=.*[@$#])` | Must contain **at least one special character** from `@ $ #` |
| `.{8,}`       | Must be **at least 8 characters long**                       |
| `$`           | End of the string                                            |


In [50]:
# A pattern made of plain characters matches that exact text.
text = "data science"
pattern = r"data"  # the literal word "data"

# search returns a Match object because "data" occurs in the text.
result = re.search(pattern, text)
print(result)

<re.Match object; span=(0, 4), match='data'>


In [51]:
# \d = any digit (0-9)
pattern = r"\d"     # Finds any single digit

text = "Price: 123"

# Reuse `pattern` rather than repeating the literal, so the pattern being
# described is the one that is actually run.
result = re.findall(pattern, text)  # ['1', '2', '3']
print(result)

# The remaining shorthands are listed for reference only; they are not run here.
# \w = any letter, number, or underscore
pattern = r"\w"     # Finds word characters

# \s = any whitespace character
pattern = r"\s"     # Finds spaces

['1', '2', '3']


In [52]:
text = "Price: 123"

# Quantifier "+": one or more of the preceding item, so consecutive digits
# come back as a single match.
pattern = r"\d+"
result = re.findall(pattern, text)
print(result)  # ['123']

# Quantifier "*": zero or more of the preceding item (assigned here but not run).
pattern = r"\d*"

['123']


In [53]:
# 1. re.search stops at the first run of digits it finds.
text = "My email is john@email.com and phone is 555-1234"
match = re.search(r"\d+", text)

first_number = match.group()
print(first_number)  # "555"


555


In [54]:
# 2. Collect every run of digits in `text` (defined in the previous cell).
number_pattern = r"\d+"
all_numbers = re.findall(number_pattern, text)
print(all_numbers)  # ['555', '1234']

['555', '1234']


In [55]:
# Problem: Get prices from messy text
text = "Item costs $29.99, was $45.5056"

# Solution: Find all numbers with decimal points.
# \d+ after the dot is greedy, so every digit after the decimal point is
# captured, not just the first two.
price_pattern = r"\d+\.\d+"
prices = re.findall(price_pattern, text)

print(prices)  # ['29.99', '45.5056']

['29.99', '45.5056']


In [56]:
# Problem: Phone numbers in different formats
phones = ["(555) 123-4567", "555-123-4567", "5551234567"]

# Solution: keep only the digit characters of each number and join them back
# into one string.
clean_phones = ["".join(re.findall(r"\d", phone)) for phone in phones]

print(clean_phones)  # ['5551234567', '5551234567', '5551234567']

['5551234567', '5551234567', '5551234567']


In [58]:
# Problem: Extract emails from text
text = "Contact us at support123@company.com or sales@business.org"

# Solution: Simple email pattern -- word characters, "@", word characters,
# a literal dot, word characters. \w also matches digits, so "support123" is
# captured in full.
simple_email = r"\w+@\w+\.\w+"
emails = re.findall(simple_email, text)
print(emails)  # ['support123@company.com', 'sales@business.org']

['support123@company.com', 'sales@business.org']


In [59]:
# Problem: Find dates in format YYYY-MM-DD
text = "Events on 2023-10-15 and 2023-11-20"

# Solution: {n} repeats the preceding item exactly n times, giving
# four digits, a hyphen, two digits, a hyphen, two digits.
date_pattern = r"\d{4}-\d{2}-\d{2}"
dates = re.findall(date_pattern, text)
print(dates)  # ['2023-10-15', '2023-11-20']

['2023-10-15', '2023-11-20']
