# **Regular expression**

In [1]:
import re

In [15]:
logs = """
    July 31 07:51:48 mycomputer bad_process[12345]: ERROR Performing package upgrade
    July 31 07:52:12 mycomputer good_process[67890]: INFO Process completed
    July 31 07:53:05 mycomputer bad_process[54321]: WARNING Disk almost full
"""
regex = r"\[(\d+)\]"
# r: raw string - means keeping all backslashes in regex as part of string, no escaping needed
# \[ and \]: mark the square bracket pair [] in string log
# (): defines a capturing group to extract the matched content
# \d+: matches and extracts one or more digits

# result[0]: [12345] - include the bracket in original string
# result[1]: 12345 - only the contents inside the bracket


search = re.search(regex, logs)
print(search[1] + "\n")      # Only return the first matched content

find_all = re.findall(regex, logs)
print(find_all)             # Find all matched content

find_iter = re.finditer(regex, logs)       # Return matched content with position
for m in find_iter:
    print(f"ID: {m.group(1)}, start: {m.start()}, end: {m.end()} \n")

sub = re.sub(regex, "[XXX]", logs)        # Change the content of matched content
print(sub + "\n")

split = re.split(r"\s+", logs)           # Split the strings inside
print(split[:10], "...")

12345

['12345', '67890', '54321']
ID: 12345, start: 44, end: 51 

ID: 67890, start: 130, end: 137 

ID: 54321, start: 205, end: 212 


    July 31 07:51:48 mycomputer bad_process[XXX]: ERROR Performing package upgrade
    July 31 07:52:12 mycomputer good_process[XXX]: INFO Process completed


['', 'July', '31', '07:51:48', 'mycomputer', 'bad_process[12345]:', 'ERROR', 'Performing', 'package', 'upgrade'] ...


In [2]:
import re

text = """
user1: john_doe123@gmail.com
user2: jane-doe@company.net
user3: bad_user@@example.com
Numbers: 123 4567 890
"""

# 1. ^ and $ -> match start/end of line
print(re.findall(r"^user\d", text, re.M))  # ['user1', 'user2', 'user3']
print(re.findall(r"\w+.com$", text, re.M))

# 2. \d, \D -> digits and non-digits
print(re.findall(r"\d+", text))  # ['1', '123', '2', '3', '4567', '890']
print(re.findall(r"\D+", text))  # all non-digit parts

# 3. \w, \W -> word char and non-word char
print(re.findall(r"\w+", text))  # words and numbers
print(re.findall(r"\W+", text))  # spaces, punctuation, symbols

# 4. \s, \S -> whitespace and non-whitespace
print(re.findall(r"\s+", text))  # spaces, newlines
print(re.findall(r"\S+", text))  # everything else

# 5. . -> any character except newline
print(re.findall(r"user.", text))  # user1, user2, user3

# 6. *, +, ?, {n}, {n,}, {n,m} -> repetition
print(re.findall(r"\d{2,3}", text))  # numbers with 2 or 3 digits
print(re.findall(r".+@.+\..+", text))  # crude email match

# 7. [], [^], [a-z], [0-9] -> character class
print(re.findall(r"[a-z]+", text))  # all lowercase sequences
print(re.findall(r"[^a-z\s]+", text))  # everything except lowercase letters and spaces

# 8. (), (?:), | -> groups and OR
emails = re.findall(r"(\w+[-_]?\w+@\w+\.\w+)", text)
print(emails)  # captured emails


['user1', 'user2', 'user3']
['gmail.com', 'example.com']
['1', '123', '2', '3', '123', '4567', '890']
['\nuser', ': john_doe', '@gmail.com\nuser', ': jane-doe@company.net\nuser', ': bad_user@@example.com\nNumbers: ', ' ', ' ', '\n']
['user1', 'john_doe123', 'gmail', 'com', 'user2', 'jane', 'doe', 'company', 'net', 'user3', 'bad_user', 'example', 'com', 'Numbers', '123', '4567', '890']
['\n', ': ', '@', '.', '\n', ': ', '-', '@', '.', '\n', ': ', '@@', '.', '\n', ': ', ' ', ' ', '\n']
['\n', ' ', '\n', ' ', '\n', ' ', '\n', ' ', ' ', ' ', '\n']
['user1:', 'john_doe123@gmail.com', 'user2:', 'jane-doe@company.net', 'user3:', 'bad_user@@example.com', 'Numbers:', '123', '4567', '890']
['user1', 'user2', 'user3', 'user@']
['123', '123', '456', '890']
['user1: john_doe123@gmail.com', 'user2: jane-doe@company.net', 'user3: bad_user@@example.com']
['user', 'john', 'doe', 'gmail', 'com', 'user', 'jane', 'doe', 'company', 'net', 'user', 'bad', 'user', 'example', 'com', 'umbers']
['1:', '_', '123@

# **Exercise: Log Analyzer**

In [37]:
log_entries = [
    "User: john_doe, Email: john@example.com, Action: login, Time: 12:45pm, Zip: 10001, Phone: 212-345-9999",
    "User: alice123, Email: alice@web.org, Action: logout, Time: 9:59 AM, Zip: 11104, Phone: 888-555-1234",
    "User: bob, Email: bob.example.com, Action: error, Time: 6:60am, Zip: 90210, Phone: 123-123-12345",
    "User: clara-k, Email: clara@site.co, Action: login, Time: 6:02 am, Zip: 85258-0001, Phone: 303-123-7300",
    "User: admin, Email: admin@web.net, Action: login, Time: five o'clock, Zip: K1A0A9, Phone: +44 303 123 7300",
]

def valid_username(log_list):
    count = 0
    regex = r"^User: ([a-z]+)"
    for log in log_list:
        result = re.search(regex, log)
        if result:
            count += 1
    return count

def valid_emails(log_list):
    count = 0
    regex = r"Email: (\w+@\w+\.\w+)"
    for log in log_list:
        result = re.search(regex, log)
        if result:
            count += 1
    return count



def valid_login(log_list):
    count = 0
    regex = r"Action: login"
    for log in log_list:
        result = re.search(regex, log)
        if result:
            count += 1
    return count

def valid_time(log_list):
    count = 0
    regex = r"Time: (1[0-2]|0?[0-9]:)([0-5][0-9])\s?[aApP][mM]"
    for log in log_list:
        result = re.search(regex, log)
        if result:
            count += 1
    return count

def valid_zipcode(log_list):
    count = 0
    regex = r"Zip: \d{5}\,"
    for log in log_list:
        result = re.search(regex, log)
        if result:
            count += 1
    return count

def valid_phone(log_list):
    count = 0
    regex = r"Phone: \d{3}-\d{3}-\d{4}"
    for log in log_list:
        result = re.search(regex, log)
        if result:
            count += 1
    return count

print("Valid emails: " + str(valid_emails(log_entries)))
print("Valid usernames: " + str(valid_username(log_entries)))
print("Valid login: " + str(valid_login(log_entries)))
print("Valid time: " + str(valid_time(log_entries)))
print("Valid zipcode: " + str(valid_zipcode(log_entries)))
print("Valid phone: " + str(valid_phone(log_entries)))

Valid emails: 4
Valid usernames: 5
Valid login: 3
Valid time: 2
Valid zipcode: 3
Valid phone: 4
