### Lesson 8 of CS50 (RegEx):

In [None]:
# Import the library for regular expressions:
import re
# Sample address to test; it has a "?" where the dot before "com" would be:
email = "danielpiede@abcd?com"
# re.search looks for the pattern anywhere in the string. The r prefix makes
# the pattern a raw string, so "\." reaches the regex engine unchanged.
print("Valid" if re.search(r".+@.+\.com", email) else "Invalid")

In [None]:
# Tighten the check, since many corner cases are still not covered:

email = "daniel.piede@keio.jp"
# ^ anchors the match at the start of the string and $ at its end.
# [...] lists the characters that are allowed; [^...] would exclude them.
# The r prefix marks the pattern as a raw string.
if re.search(r"^[a-zA-Z0-9_\.]+@[a-zA-Z0-9_\.]+\.jp$", email) is None:
    print("Invalid")
else:
    print("Valid")

In [None]:
# A predefined character class shortens the pattern: \w matches word
# characters (letters, digits and the underscore).

email = "daniel.piede@keio.jp"

print("Valid" if re.search(r"^[\w.]+@\w+\.jp$", email) else "Invalid")

In [None]:
# Pass a flag so that upper- and lower-case letters are treated alike:

email = "daniel.piede@keio.jp"

found = re.search(r"^[\w.]+@\w+\.jp$", email, flags=re.IGNORECASE)
print("Valid" if found else "Invalid")

In [None]:
# Parentheses group sub-patterns, and ? makes the preceding item optional
# (zero or one repetition) -- used here for an optional subdomain:
emails = [
    "daniel.piede@cs50.keio.jp",
    "danielpiede@keio.jp",
    "danielpiede@keio.",
    "somethingelse.com",
]

for email in emails:
    is_valid = re.search(
        r"^(\w|\.)+@(\w+\.)?\w+\.jp$", email, flags=re.IGNORECASE
    )
    print("Valid" if is_valid else "Invalid")

#### As some of the regexes are super cryptic, we can use some libraries!

In [None]:
# Besides re.search there are re.match and re.fullmatch. They take the same
# (pattern, string) arguments and differ only in where the pattern may match.
sample = "hello world"

# Somewhere in the string: finds "world" at the end.
found_anywhere = re.search(r"world", sample)
print(found_anywhere)
# From the start: None, because the string does not begin with "world".
found_at_start = re.match(r"world", sample)
print(found_at_start)
# All of the string: None, because "world" is only part of the string.
found_whole = re.fullmatch(r"world", sample)
print(found_whole)

In [None]:
# Instead of validating, clean up the input (reformat it):

# Every entry needs its own separating comma: two adjacent string literals
# are silently concatenated into a single list element.
names = ["Daniel Piede", "Airi Takahashi", "Malan, David", "Malen David, Jr"]

for name in names:
    if "," in name:
        # Naive "Last, First" -> "First Last" swap; the two-way unpacking only
        # works when the name contains exactly one ", " separator.
        last, first = name.split(", ")
        name = f"{first} {last}"
    print(f"Hello, {name}")

# This is not sufficient, as the fourth name shows: "Jr" ends up being treated
# as the first name. Therefore, we need to leverage the power of regex.

In [8]:
# Using re

import re

check = input("What is the name? ").strip()
# Each pair of parentheses captures the text it matches: here the part before
# ", " and the part after it.
matches = re.search(r"^(.+), (.+)$", check)
if matches is not None:
    # Fetch the two captures by index and flip them into "first last" order.
    last = matches.group(1)
    first = matches.group(2)
    check = first + " " + last
print(f"Hello, {check}")


Hello, Daniel Piede


In [11]:
# Using re

import re

check = input("What is the name? ").strip()
# Parentheses create capture groups: group(1) is the text before ", " and
# group(2) the text after it.
matches = re.search(r"^(.+), (.+)$", check)
# Start from the raw input so the greeting below always has a value, even
# when the pattern does not match.
full_match = check
if matches:
    # group(n) returns one specific group; group(0) is the entire match.
    full_match = matches.group(0)
print(f"Hello, {full_match}")

Hello, Piede, Daniel


In [14]:
# Using re

import re

check = input("What is the name? ").strip()
# Parentheses mark the two parts of the name we want to capture.
# " *" accepts zero, one or many spaces after the comma.
# The walrus operator (:=) binds the match object and tests it in one step:
if matches := re.search(r"^(.+), *(.+)$", check):
    # Pull both captures out by index and rebuild the name as "first last".
    last, first = matches.group(1, 2)
    check = " ".join([first, last])
print(f"Hello, {check}")

Hello, Daniel Piede


In [21]:
# Give me a twitter url and give the username back:
import re

urls = ["https://twitter.com/danielpiede", "twitter.com/davidjmalan", "www.twitter.com/imaqtpie"]
# Optional scheme and optional "www." followed by the fixed host. Compiled
# once, because the pattern is the same for every URL.
pattern = re.compile(r"^(https?://)?(www\.)?twitter\.com/")
for url in urls:
    # Remove the matched prefix; what remains is printed as the username.
    username = pattern.sub("", url)
    print(username)

danielpiede
davidjmalan
imaqtpie


In [59]:
# Solve the same issue as above but with re.search()
import re

urls = [
    "https://twitter.com/danielpiede",
    "twitter.com/davidjmalan",
    "www.twitter.com/imaqtpie",
    "twitter.de/david"
]

# Alternation of the accepted top-level domains. Its parentheses form a
# capturing group of their own once interpolated into the pattern below.
top_level = "(com|org|jp|de)"

# Built once, outside the loop. (?:...) groups without capturing, and the
# username gets a named group so that reading it does not depend on how many
# numbered groups top_level contributes.
pattern = re.compile(
    rf"^(?:https?://)?(?:www\.)?twitter\.{top_level}/(?P<username>[a-z0-9_]+)$",
    re.IGNORECASE,
)
for url in urls:
    if matches := pattern.search(url):
        print(matches.group("username"))

danielpiede
davidjmalan
imaqtpie
david
