# Extract and normalize contact information using Regex

In [1]:
import pandas as pd
import re

In [2]:
# Sample contacts, one (name, email, tel) row per person. Several entries are
# malformed (missing "@", stray "!", too-short number) so the cleaning steps
# further down have something to work on.
contacts = [
    ('Axel Magnusson', 'Axel@gmail.com', '070-1234567'),
    ('Evelina Eriksson', 'Evelina@gmail.com', '0734-55 66 77'),
    ('Magnus Eriksson', 'Magnus.Eriksson@gmail.com', '08 1234567'),
    ('Martin Skog', 'Martin99@gmail.com', '(070)9876543!'),
    ('Gustaf', 'Gustafgmail.com', '1234'),
    ('Klantskalle', 'Klantskalle@outlook.com!', '0761234568'),
]

# Column-wise views of the same data, kept as plain lists.
names, email, tel = (list(column) for column in zip(*contacts))

df = pd.DataFrame(contacts, columns=["name", "email", "tel"])

df

Unnamed: 0,name,email,tel
0,Axel Magnusson,Axel@gmail.com,070-1234567
1,Evelina Eriksson,Evelina@gmail.com,0734-55 66 77
2,Magnus Eriksson,Magnus.Eriksson@gmail.com,08 1234567
3,Martin Skog,Martin99@gmail.com,(070)9876543!
4,Gustaf,Gustafgmail.com,1234
5,Klantskalle,Klantskalle@outlook.com!,0761234568


In [3]:
# df.info()

In [4]:
# Render every contact as a free-text sentence; the extraction cells below
# search sentences of exactly this shape for phone numbers and emails.
for name, tel_raw, email_raw in zip(df['name'], df['tel'], df['email']):
    print(f"Call me, {name}, on my phone number {tel_raw} or email me at {email_raw}.")

Call me, Axel Magnusson, on my phone number 070-1234567 or email me at Axel@gmail.com.
Call me, Evelina Eriksson, on my phone number 0734-55 66 77 or email me at Evelina@gmail.com.
Call me, Magnus Eriksson, on my phone number 08 1234567 or email me at Magnus.Eriksson@gmail.com.
Call me, Martin Skog, on my phone number (070)9876543! or email me at Martin99@gmail.com.
Call me, Gustaf, on my phone number 1234 or email me at Gustafgmail.com.
Call me, Klantskalle, on my phone number 0761234568 or email me at Klantskalle@outlook.com!.


## Phone number validation

To validate a Swedish phone number, it must fulfil specific criteria. The criteria we set in this code are:
- There must be between 9 and 11 digits. That means that no special characters or letters should be included.
- The number must start with 07 or 08.
  
We also want to write the phone numbers in the form 07x-xxx xx xx or 08 xxx xx xx. This means that if a number is written with the +46 country code, we replace +46 with 0 using the `sub` function in the `re` (regular expression) module.

In [5]:
# Mobile prefixes recognised by the formatter: 070, 072, 073, 076 and 079.
_MOBILE_PREFIX = re.compile(r'07[02369]')

# Prefixes that are three digits long including the leading 0. Any other
# 0-prefixed number (apart from 08) is treated as having a four-digit area code.
# NOTE(review): transcribed from the Swedish numbering plan; confirm against
# the PTS list before relying on it outside this notebook.
_THREE_DIGIT_AREA_CODES = frozenset({
    '010', '011', '013', '016', '018', '019', '020', '021', '023', '026',
    '031', '033', '035', '036', '040', '042', '044', '046', '054', '060',
    '063', '090',
})

# The "08" rule has always required at least 7 digits in total, i.e. 5 digits
# after the area code; the same minimum is applied to every area code.
_MIN_SUBSCRIBER_DIGITS = 5

# Upper bound on the total digit count, taken from the notebook's stated
# criterion of at most 11 digits.
_MAX_TOTAL_DIGITS = 11


def _group_subscriber_digits(rest):
    """Split the digits after the area code into space-separated groups.

    Six digits become "xx xx xx" and eight digits "xxx xxx xx"; every other
    length uses "xxx xx" followed by whatever digits remain. Empty groups are
    dropped so the result never carries trailing or doubled spaces.
    """
    if len(rest) == 6:
        groups = (rest[:2], rest[2:4], rest[4:])
    elif len(rest) == 8:
        groups = (rest[:3], rest[3:6], rest[6:])
    else:
        groups = (rest[:3], rest[3:5], rest[5:])
    return " ".join(group for group in groups if group)


def format_swedish_number(raw):
    """Normalise a Swedish phone number to a canonical written form.

    Parameters
    ----------
    raw : str
        The number as typed. Spaces, hyphens, parentheses and any other
        non-digit characters are ignored. A leading "+46" or "0046" country
        code is converted to the national form with a leading 0.

    Returns
    -------
    str
        "07x-xxx xx xx" for ten-digit mobile numbers, or
        "<area code> <grouped subscriber digits>" for numbers with a two-,
        three- or four-digit area code. If the input cannot be formatted
        (no leading 0, too few or too many digits, or a mobile prefix with
        the wrong length), ``raw`` is returned unchanged.
    """
    digits = re.sub(r'\D', '', raw)

    # Convert the international form to the national one. An optional trunk 0
    # written after the country code, as in "+46 (0)70 ...", is dropped too.
    if digits.startswith('0046'):
        digits = '0' + digits[4:].lstrip('0')
    elif raw.lstrip().startswith('+') and digits.startswith('46'):
        digits = '0' + digits[2:].lstrip('0')

    # 1. Mobile numbers (exactly 10 digits). A mobile prefix with any other
    #    length is rejected rather than being mistaken for an area code.
    if _MOBILE_PREFIX.match(digits):
        if len(digits) == 10:
            return f"{digits[:3]}-{digits[3:6]} {digits[6:8]} {digits[8:]}"
        return raw

    # 2. Work out how long the area code is. The known three-digit codes have
    #    to be checked before the four-digit fallback, because the fallback
    #    pattern matches every 0-prefixed number with four or more digits.
    if digits.startswith('08'):
        area_length = 2
    elif digits[:3] in _THREE_DIGIT_AREA_CODES:
        area_length = 3
    elif re.match(r'0\d{3}', digits):
        area_length = 4
    else:
        return raw   # Does not look like a national number

    area = digits[:area_length]
    rest = digits[area_length:]

    # 3. Reject numbers that are implausibly short or long
    if len(rest) < _MIN_SUBSCRIBER_DIGITS or len(digits) > _MAX_TOTAL_DIGITS:
        return raw

    return f"{area} {_group_subscriber_digits(rest)}"

# Fewest digits a fragment must contain to be treated as a phone number. This
# matches the shortest number the formatter's "08" rule accepts (7 digits).
# Without this filter, any stray "0" in the sentence (in the name, the email
# or a too-short number) was printed as a phone number and the NOT VALID
# branch was reached only when the sentence contained no "0" at all.
MIN_PHONE_DIGITS = 7

# Candidate fragment: a leading 0 followed by digits, whitespace, hyphens or
# parentheses.
PHONE_CANDIDATE = re.compile(r'0[\d\s\-()]*')


def _digit_count(fragment):
    """Return the number of digits in `fragment`."""
    return len(re.sub(r'\D', '', fragment))


for i, user in df.iterrows():

    text = (
        f"Call me, {user['name']}, "
        f"on my phone number {user['tel']} "
        f"or email me at {user['email']}."
    )

    # 1. Replace the +46 country code (and an optional following 0) with 0
    clean_text = re.sub(r'\+46\s*0?', '0', text)

    # 2. Find Swedish phone number candidates and keep only those that have
    #    enough digits to be a real number
    matches = [
        fragment
        for fragment in PHONE_CANDIDATE.findall(clean_text)
        if _digit_count(fragment) >= MIN_PHONE_DIGITS
    ]

    print(f"\n--- {user['name']} ---")
    if matches:
        # Select the candidate with the most digits
        best_raw = max(matches, key=_digit_count)
        formatted = format_swedish_number(best_raw)
        print("Phone:", formatted)
    else:
        print("Phone: NOT VALID")



--- Axel Magnusson ---
Phone: 070-123 45 67

--- Evelina Eriksson ---
Phone: 073-455 66 77

--- Magnus Eriksson ---
Phone: 08 123 45 67

--- Martin Skog ---
Phone: 070-987 65 43

--- Gustaf ---
Phone: NOT VALID

--- Klantskalle ---
Phone: 076-123 45 68


## Email validation

In [7]:
import re

def format_email_address(raw):
    """Strip punctuation wrapped around an email address.

    Parameters
    ----------
    raw : str
        Candidate address, typically a whitespace-free token cut out of a
        sentence, so it may carry punctuation such as "<", ">", "!" or ".".

    Returns
    -------
    str
        The address without leading and trailing non-word characters, provided
        what remains has exactly one "@" and at least one period in the part
        after it. Otherwise ``raw`` is returned unchanged.
    """
    # 1. Remove non-word characters at both ends (e.g. "<a@b.se>" or "a@b.se!.")
    cleaned = re.sub(r'^\W+|\W+$', '', raw)

    # 2. The whole string must be local@domain.tld with a single "@".
    #    fullmatch is used because match only anchors the start, which let
    #    strings with extra "@" characters or trailing junk through.
    if re.fullmatch(r'[^\s@]+@[^\s@]+\.[^\s@]+', cleaned):
        return cleaned
    return raw


# Run email validation for each user

for i, user in df.iterrows():

    text = (
        f"Call me, {user['name']}, "
        f"on my phone number {user['tel']} "
        f"or email me at {user['email']}."
    )

    print(f"\n--- {user['name']} ---")

    # Find email candidates in the text: any whitespace-free run around an "@"
    pattern_email = r'\S+@\S+'
    matches = re.findall(pattern_email, text)

    formatted_email = format_email_address(matches[0]) if matches else None

    # format_email_address hands its input back unchanged when it is not a
    # usable address, so the result is checked before it is reported. A usable
    # address has exactly one "@" and a domain ending in ".<word characters>".
    if formatted_email and re.fullmatch(r'[^\s@]+@[^\s@]+\.\w+', formatted_email):
        print("Email:", formatted_email)
    else:
        print("Email: NOT VALID")



--- Axel Magnusson ---
Email: Axel@gmail.com

--- Evelina Eriksson ---
Email: Evelina@gmail.com

--- Magnus Eriksson ---
Email: Magnus.Eriksson@gmail.com

--- Martin Skog ---
Email: Martin99@gmail.com

--- Gustaf ---
Email: NOT VALID

--- Klantskalle ---
Email: Klantskalle@outlook.com
