In [45]:
import re

# Original data: phone numbers and names run together with no delimiter.
# Each fragment below holds one phone number followed by one name.
data = "".join([
    "555-1239Dr. Bernard Lander",
    "(636) 555-0113Hollingdorp, Donnatella",
    "555-6542Fitzgerald, F. Scott",
    "555 8904Rev. Martin Luther King",
    "636-555-3226Snodgrass, Theodore",
    "5553642Carlamina Scarfoni",
])

In [46]:
# 1. Extract the numbers
# The alternation tries the 3-3-4 digit form first (optional parentheses and
# an optional "-" or space between groups), then falls back to the 3-4 form.
numbers = re.compile(r'\(?\d{3}\)?[- ]?\d{3}[- ]?\d{4}|\d{3}[- ]?\d{4}').findall(data)
# Header, one indented line per number, then a blank separator line.
print("\n".join(["1. Phone Numbers:", *[f"  {number}" for number in numbers], ""]))
print()

1. Phone Numbers:
  555-1239
  (636) 555-0113
  555-6542
  555 8904
  636-555-3226
  5553642



In [47]:
# 2. Extract the names
# Splitting on the phone-number pattern leaves the text found between the
# numbers, which is where the names sit.
name_parts = re.compile(r'\(?\d{3}\)?[- ]?\d{3}[- ]?\d{4}|\d{3}[- ]?\d{4}').split(data)
# Trim each piece and drop the ones that end up empty (the data starts with
# a number, so the first piece is an empty string).
names = [trimmed for trimmed in map(str.strip, name_parts) if trimmed]
# Header, one indented line per name, then a blank separator line.
print("\n".join(["2. Names:", *[f"  {name}" for name in names], ""]))

2. Names:
  Dr. Bernard Lander
  Hollingdorp, Donnatella
  Fitzgerald, F. Scott
  Rev. Martin Luther King
  Snodgrass, Theodore
  Carlamina Scarfoni



In [48]:
# 3. Rearrange the vector so that all elements conform to the standard “firstname lastname”, preserving any titles (e.g., “Rev.”, “Dr.”, etc) or middle/second names.
def rearrange_name(name):
    """Turn "lastname, firstname" into "firstname lastname".

    Only the first comma acts as the separator; both halves are stripped of
    surrounding whitespace before being joined with a single space. A name
    that contains no comma is returned exactly as given.
    """
    last, comma, first = name.partition(',')
    if not comma:
        # No comma: already in "firstname lastname" order, leave untouched.
        return name
    return f"{first.strip()} {last.strip()}"

# Normalise every extracted name to "firstname lastname" order.
standard = list(map(rearrange_name, names))
# Header, one indented line per name, then a blank separator line.
print("\n".join(["3. Standard Names:", *[f"  {name}" for name in standard], ""]))

3. Standard Names:
  Dr. Bernard Lander
  Donnatella Hollingdorp
  F. Scott Fitzgerald
  Rev. Martin Luther King
  Theodore Snodgrass
  Carlamina Scarfoni



In [49]:
# 4. Construct a logical vector indicating whether a character has a title (i.e., Rev. and Dr.)
# A name counts as titled when it begins with one of the two known prefixes.
has_title = [
    person.startswith('Dr.') or person.startswith('Rev.')
    for person in standard
]
print("4. Has Title:")
# has_title is index-aligned with standard, so look the flag up by position.
for idx, person in enumerate(standard):
    print(f"  {person}: {has_title[idx]}")
print()

4. Has Title:
  Dr. Bernard Lander: True
  Donnatella Hollingdorp: False
  F. Scott Fitzgerald: False
  Rev. Martin Luther King: True
  Theodore Snodgrass: False
  Carlamina Scarfoni: False



In [50]:
# 5. Construct a logical vector indicating whether a character has a middle/second name.
def has_middle(name, titles=('Dr.', 'Rev.')):
    """Return True when ``name`` contains a middle/second name.

    The name is split on whitespace. If the first token is one of ``titles``
    it is not counted; the name has a middle/second name when more than two
    tokens remain.

    Args:
        name: Name in "firstname lastname" order, optionally title-prefixed.
        titles: Tokens treated as a leading title. Defaults to the two
            titles handled by this notebook.

    Returns:
        bool: True if more than two non-title tokens are present. An empty
        or whitespace-only ``name`` yields False.
    """
    parts = name.split()
    if not parts:
        # Guard: indexing parts[0] on an empty name would raise IndexError.
        return False
    # Skip the leading title, if any, so only the name tokens are counted.
    name_tokens = parts[1:] if parts[0] in titles else parts
    return len(name_tokens) > 2

# One flag per standardised name, in the same order as `standard`.
has_middle_name = list(map(has_middle, standard))
# Header, one "name: flag" line per entry, then a blank separator line.
print("\n".join([
    "5. Has Middle Name:",
    *[f"  {person}: {flag}" for person, flag in zip(standard, has_middle_name)],
    "",
]))

5. Has Middle Name:
  Dr. Bernard Lander: False
  Donnatella Hollingdorp: False
  F. Scott Fitzgerald: True
  Rev. Martin Luther King: True
  Theodore Snodgrass: False
  Carlamina Scarfoni: False



In [51]:
# 6. Consider the HTML string <title>+++BREAKING NEWS+++<title>. We would like to extract the first HTML tag (i.e., “<title>”). To do so we write the regular expression “<.+>”. Explain why this fails and correct the expression.
html = "<title>+++BREAKING NEWS+++<title>"

# Greedy ".+" runs on to the last ">" in the string, so the whole string
# comes back as a single match.
fail_html_match = re.compile(r"<.+>").findall(html)
# "[^>]+" cannot cross a ">", so each tag is matched on its own. findall
# returns every tag; the first one is element [0] of the list.
correct_html_match = re.compile(r"<[^>]+>").findall(html)

print("6. HTML Tag Extraction:")
print(f"  Fail regex result: {fail_html_match}")
print(f"  Correct regex result: {correct_html_match}")
print()

6. HTML Tag Extraction:
  Fail regex result: ['<title>+++BREAKING NEWS+++<title>']
  Correct regex result: ['<title>', '<title>']



  The given regular expression fails because `.+` is greedy: it matches from the first `<` all the way to the last `>` in the string, so everything in between (including the second tag) is returned as one match.

  Instead, we want just the title tag itself. The corrected regex matches an opening `<`, then one or more characters that are not `>`, and then the first `>` it reaches, so each tag is matched separately.

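For question 7: the failing regular expression does not capture the numbers or math symbols because the `^` at the start of the character class negates it, so the pattern matches only runs of characters that are *not* digits or one of `= + * ( )`. (The version in the code cell also lists a trailing `^` inside the class, which excludes the literal `^` as well.)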

The corrected regex instead uses a non-negated character class that lists the digits and the math symbols (`- + * / ^ = ( )`), so the whole equation is matched as a single run.

In [52]:
# 7. Consider the string “(5-3)^2=5^2-2*5*3+3^2”. We would like to extract the equation in its entirety from the string. To do so we write the regular expression “[^0-9=+*()]+”. Explain why this fails and correct the expression.
expr = "(5-3)^2=5^2-2*5*3+3^2"

# Negated class: matches only runs of characters that are NOT listed.
# NOTE(review): this pattern also lists "^" inside the class, unlike the
# pattern quoted in the question above; confirm which one is intended.
fail_expr = re.compile(r"[^\d=+*()^]+").findall(expr)
# Positive class: "-" first and "^" not first are both literal here, so every
# character of the equation is accepted.
correct_expr = re.compile(r"[-+*/^=()\d]+").findall(expr)

print("7. Equation Extraction:")
print(f"  Fail regex result: {fail_expr}")
print(f"  Correct regex result: {correct_expr}")


7. Equation Extraction:
  Fail regex result: ['-', '-']
  Correct regex result: ['(5-3)^2=5^2-2*5*3+3^2']
