In [123]:
import pandas as pd
import numpy as np
import re

Read the text file (skipping the first row), lowercase the entries, and convert the list into a set. Leading and trailing whitespace is also removed.

In [124]:
# Load the list of top-level domains (one per line); the first row is skipped.
# keep_default_na=False stops pandas from converting literal entries such as
# "NA" into missing values -- with the default settings an "NA" line would be
# read as NaN and would never appear in the resulting set.
# NOTE(review): the path is an absolute, machine-specific location; consider
# moving it to a configurable data directory.
df = pd.read_csv(
    "/Users/mefleh/Desktop/tlds-alpha-by-domain.txt",
    header=None,
    names=["Domain"],
    skiprows=1,
    dtype="string",
    keep_default_na=False,
)
# Normalise every entry: trim surrounding whitespace, then lowercase so that
# lookups can be done against a lowercased TLD.
df["Domain"] = df["Domain"].str.strip().str.lower()
domains = set(df["Domain"])

Defining the valid email pattern using regular expressions

In [125]:
# Compiled pattern for a syntactically well-formed address, anchored at both ends.
# re.VERBOSE lets the pattern be split over two lines; the whitespace between
# them is ignored.
#   Local part: either one or more runs of letters, digits, or
#               !#$%&'*+-/=?^_`{|}~ separated by single dots,
#               or a non-empty double-quoted string.
#   Domain:     one or more dot-terminated labels (each starts and ends with a
#               letter or digit; hyphens only in between), followed by a final
#               label of at least two letters.
# NOTE(review): validateEmail in this notebook does not reference this pattern.
emailPattern = re.compile(r"""
    ^(?:[a-zA-Z0-9!#$%&'*+\-/=?^_`{|}~]+(?:\.[a-zA-Z0-9!#$%&'*+\-/=?^_`{|}~]+)*|"[^"]+")@
    (?:[a-zA-Z0-9](?:[a-zA-Z0-9-]*[a-zA-Z0-9])?\.)+[a-zA-Z]{2,}$
""", re.VERBOSE)

Defining a function to validate email addresses with specific error messages

In [126]:
def validateEmail(email, validTlds=None):
    """Check an email address against a set of rules and describe any violations.

    The character-level rules (number of "@" symbols, dot placement, leading
    "@", disallowed characters) are all evaluated first and reported together.
    Only when none of them is violated is the address split into local and
    domain parts so that the top-level domain can be checked.

    Parameters
    ----------
    email : str
        The address to check.
    validTlds : collection of str, optional
        Lowercase top-level domains to accept. When omitted, the module-level
        ``domains`` set is used.

    Returns
    -------
    str
        "potentially valid" when no rule is violated; otherwise the violation
        messages joined by ", ".
    """
    if validTlds is None:
        # Fall back to the TLD set built earlier in the notebook.
        validTlds = domains

    errors = []

    # Zero "@" and several "@" are different mistakes, so report them separately
    # instead of labelling a missing "@" as "more than one".
    atCount = email.count("@")
    if atCount == 0:
        errors.append("not valid due to missing @ symbol")
    elif atCount > 1:
        errors.append("not valid due to presence of more than one @ symbol")

    if ".." in email or ".@" in email or "@." in email:
        errors.append("not valid due to dot directly before or after @ symbol or due to consecutive dots")

    if email.startswith(".") or email.endswith("."):
        errors.append("not valid due to starting or ending with a dot")

    if email.startswith("@"):
        errors.append("not valid due to starting with @ symbol; this is a Twitter handle")

    validChars = set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!#$%&'*+-/=?^_`{|}~.@")
    invalidChars = [char for char in email if char not in validChars]
    if invalidChars:
        errors.append(f"not valid due to presence of invalid characters: {''.join(invalidChars)}")

    # Stop here on any failure: the split below relies on exactly one "@".
    if errors:
        return ', '.join(errors)

    localPart, domainPart = email.split("@")
    if not localPart or not domainPart:
        errors.append("not valid due to empty local or domain part")

    # The TLD is whatever follows the last dot of the domain, compared in lowercase.
    tld = domainPart.lower().rsplit(".", 1)[-1]
    if tld not in validTlds:
        errors.append("not valid due to incorrect top-level domain")

    if errors:
        return ', '.join(errors)
    return "potentially valid"

Testing each rule violation with different test emails

In [128]:
# One sample address per rule, followed by an address that passes every check.
testEmails = [
    "user@name@example.com",      # more than one @ symbol
    "username.@example.com",      # dot directly before the @ symbol
    ".username@example.com",      # starts with a dot
    "@example.com",               # starts with the @ symbol
    "user(name@example.com",      # contains an invalid character
    "username@",                  # empty domain part
    "username@example.abdallah",  # unknown top-level domain
    "username@example.com"        # no violation
]

# Print each address with the validator's verdict, separated by a blank line.
for email in testEmails:
    outcome = validateEmail(email)
    print(f"{email}: {outcome}\n")

user@name@example.com: not valid due to presence of more than one @ symbol

username.@example.com: not valid due to dot directly before or after @ symbol or due to consecutive dots

.username@example.com: not valid due to starting or ending with a dot

@example.com: not valid due to starting with @ symbol; this is a Twitter handle

user(name@example.com: not valid due to presence of invalid characters: (

username@: not valid due to empty local or domain part, not valid due to incorrect top-level domain

username@example.abdallah: not valid due to incorrect top-level domain

username@example.com: potentially valid



Testing email address that violates multiple rules

In [129]:
# A single address that breaks several rules at once; all violations found by
# the character-level checks are reported together.
testMultiple = "te@..st.@.example.abdallah"
multipleOutcome = validateEmail(testMultiple)
print(f"{testMultiple}: {multipleOutcome}")

te@..st.@.example.abdallah: not valid due to presence of more than one @ symbol, not valid due to dot directly before or after @ symbol or due to consecutive dots


Allow user to input email address to test the function

In [122]:
# Prompt for an address, lowercase it, and report the validator's verdict.
userEmail = input("Please enter an email address: ").lower()
result = validateEmail(userEmail)
print(f"{userEmail}: {result}\n")

Please enter an email address:  matthew.dube@maine.edu


matthew.dube@maine.edu: potentially valid

