In [3]:
# Import Regex
import re
import pandas as pd
import os

In [45]:
# Identify different formats of phone number (US)

# Accepted layouts, compiled once instead of on every call. Each pattern has to
# cover the ENTIRE input (is_phone_number uses fullmatch), so no ^/$ anchors.
_PHONE_PATTERNS = tuple(re.compile(p) for p in (
    r'\d{3}-\d{3}-\d{4}',            # 123-456-7890
    r'\(\d{3}\)\s\d{3}-\d{4}',       # (123) 456-7890
    r'\d{3}\.\d{3}\.\d{4}',          # 123.456.7890
    r'\+?1-\d{3}-\d{3}-\d{4}',       # +1-123-456-7890 or 1-123-456-7890
    r'\d \(\d{3}\) \d{3}-\d{4}',     # 1 (123) 456-7890 (any single leading digit)
    r'\d{10}',                       # 1234567890
    r'\d{3}\s\d{3}\s\d{4}',          # 123 456 7890
    # Ten digits, each pair optionally separated by one whitespace character.
    # Written as digit + 9 x (optional space, digit) so that no whitespace is
    # tolerated after the last digit.
    r'\d(?:\s?\d){9}',               # 1 2 3 4 5 6 7 8 9 0
    r'\(\d{3}\)\s?-\s?\d{3}-\d{4}',  # (111)-222-3333

    # Optional '+', country code 1, then 3-3-4 digits with at most one dash or
    # whitespace character between groups (dots are NOT accepted here).
    r'\+?1[-\s]?\d{3}[-\s]?\d{3}[-\s]?\d{4}',  # +1 123-456 7890
    r'\(\d{3}\)-\d{6,7}',            # (123)-4567890 or (123)-456789
    r'\(\d{2}\)-\d{4}\s\d{4}',       # (01)-2345 6789
    r'\d{3} \d{3}\.\d{4}',           # 123 456.7890
    r'\d{3}\.\d{3}-\d{4}',           # 123.456-7890
    r'\d{3} \d{3}-\d{4}',            # 123 456-7890

    # International numbers with three digits of country code;
    # groups separated by at most one dash, whitespace character, or dot.
    r'\+\d{3}[-\s.]?\d{3}[-\s.]?\d{3}[-\s.]?\d{4}',  # +123 456.789-0123
))


def is_phone_number(phone):
    """Return True if `phone`, in its entirety, is written in a supported format.

    Only the layout (digit grouping and separators) is checked. Area codes and
    country codes are not validated against any numbering plan.

    Args:
        phone: Candidate value. Anything that is not a `str` (for example None
            or a NaN coming from a DataFrame column) is reported as not a
            phone number instead of raising.

    Returns:
        bool: True when at least one pattern matches the whole string.
    """
    if not isinstance(phone, str):
        return False

    # fullmatch rather than match + '$': '$' also matches just before a
    # trailing newline, which would let '123-456-7890\n' through.
    return any(pattern.fullmatch(phone) for pattern in _PHONE_PATTERNS)

In [46]:
# Test 1
# Inputs expected to be accepted
expected_valid = [
    '(217) 456-7890',
    '1 (298) 456-5637',
    '839-456-7890',
    '240.950.3182',
    '+1-217-456-7890',
    '5159820455',
    '(669)-246-2485',
    '(04)-5791 4419',
    '(342)-2134925',

    '+959.049 439-1055',
    '+480 786 329.4983',
    '+655 115.344-5404',
    '052 207.9281',
    '021.947-9930',
    '051 870-2115',
    # NOTE(review): 13 digits without a '+' prefix; none of the patterns in
    # is_phone_number appears to accept this - confirm the intended expectation.
    '123 123 456 7899',
]

# Inputs expected to be rejected
expected_invalid = [
    '+21.13 926 7829', # invalid
    '+41 15-968.9292', # invalid
    '1234-567-890',    # invalid
    '7215.577-0061'    # 7 could be the country code but now we are not accounting for this
]

test_inputs = expected_valid + expected_invalid

# Pair every input with its expected outcome so the expectation is actually
# checked instead of living only in a comment.
test_cases = [(text, True) for text in expected_valid]
test_cases += [(text, False) for text in expected_invalid]

mismatches = []
for text, expected in test_cases:
    result = is_phone_number(text)
    print(f'{text}: {result}')
    if result != expected:
        mismatches.append((text, expected, result))

# Report (rather than raise) so the cell still runs to completion and every
# disagreement is visible at once.
print(f'{len(mismatches)} of {len(test_cases)} inputs differ from the expected result')
for text, expected, result in mismatches:
    print(f'  {text!r}: expected {expected}, got {result}')

123-456-7890: True
123456-7890: True
(123) 456-7890: True
123.456.7890: True
+1-123-456-7890: True
1-123-456-7890: True
1 (123) 456-7890: True
1234567890: True
123 456 7890: True
(111)-222-3333: True


In [20]:
# Test the algorithm using the dataset
# The CSV lives in ../data relative to the current working directory.
data_dir = os.path.join(os.getcwd(), '..', 'data')
filepath = os.path.join(data_dir, 'extracted_tel_numbers_english.csv')
df = pd.read_csv(filepath)

In [8]:
# Dimensions of the loaded frame as (rows, columns)
df.shape

(2417, 3)

In [9]:
# Preview the first rows of the loaded frame
df.head()

Unnamed: 0,unmasked_text,masked_text,phone_number
0,"Mr. Franecki, we have scheduled a learning ass...","[PREFIX_1] [LASTNAME_1], we have scheduled a l...",(612).3785804
1,The palliative care team is coordinating a hom...,The palliative care team is coordinating a hom...,09.48-67 61 36
2,"We've updated your next appointment, Tavares. ...","We've updated your next appointment, [FIRSTNAM...",+19-302 725.8274
3,Lawrence County Sports department is conductin...,[COUNTY_1] Sports department is conducting a p...,+97-259 289.7050
4,"Dear Ivory, Your appointment with Dr. Frami re...","Dear [FIRSTNAME_1], Your appointment with Dr. ...",(842)-6372460


In [10]:
# Label every extracted string: 'TEL' when it matches a known phone format,
# 'O' otherwise.
phone_numbers = df['phone_number'].tolist()

# Non-string cells (e.g. NaN for a missing value) are labelled 'O' instead of
# being handed to the regex matcher, which only accepts strings.
target = [
    'TEL' if isinstance(text, str) and is_phone_number(text) else 'O'
    for text in phone_numbers
]

count_O = target.count('O')
count_TEL = target.count('TEL')

print(f'Number of "O": {count_O}')
print(f'Number of "TEL": {count_TEL}')

Number of "O": 1767
Number of "TEL": 650


In [11]:
# Attach the per-row labels as a new 'target' column (this modifies df in place),
# then preview the first 10 rows.
df['target'] = target
df.head(10)

Unnamed: 0,unmasked_text,masked_text,phone_number,target
0,"Mr. Franecki, we have scheduled a learning ass...","[PREFIX_1] [LASTNAME_1], we have scheduled a l...",(612).3785804,O
1,The palliative care team is coordinating a hom...,The palliative care team is coordinating a hom...,09.48-67 61 36,O
2,"We've updated your next appointment, Tavares. ...","We've updated your next appointment, [FIRSTNAM...",+19-302 725.8274,O
3,Lawrence County Sports department is conductin...,[COUNTY_1] Sports department is conducting a p...,+97-259 289.7050,O
4,"Dear Ivory, Your appointment with Dr. Frami re...","Dear [FIRSTNAME_1], Your appointment with Dr. ...",(842)-6372460,TEL
5,"Hi Gerardo, here's your confirmation for your ...","Hi [FIRSTNAME_1], here's your confirmation for...",(017)-0128665,TEL
6,For property laws concerning Charley Graham at...,For property laws concerning [FIRSTNAME_1] [LA...,1066-121.1337,O
7,"Dear Mr. Hildegard Marlowe Borer, \nI trust th...",Dear [PREFIX_1] [FIRSTNAME_1] [MIDDLENAME_1] [...,+717-584.257-0232,TEL
8,"Hello Kenny, A follow-up psycho-oncology asses...","Hello [FIRSTNAME_1], A follow-up psycho-oncolo...",04056 81940,TEL
9,"Ms.'s appointment at [61.857,-105.1437] on Mar...",[PREFIX_1]'s appointment at [NEARBYGPSCOORDINA...,023 4733393,TEL


In [12]:
# Show only the extracted string next to the label assigned to it
df[['phone_number', 'target']]

Unnamed: 0,phone_number,target
0,(612).3785804,O
1,09.48-67 61 36,O
2,+19-302 725.8274,O
3,+97-259 289.7050,O
4,(842)-6372460,TEL
...,...,...
2412,+389 87-323 9633,O
2413,(638)-9797828,TEL
2414,04443-965009,O
2415,0367 937 6731,O


In [13]:
# Save the updated df to the 'data' folder
# (../data relative to the current working directory; the row index is written too)
data_dir = os.path.join(os.getcwd(), '..', 'data')
filepath = os.path.join(data_dir, 'extracted_tel_numbers_english_updated.csv')
df.to_csv(filepath)

In [14]:
# Load shrieyaa_mini_df (used for checking wrong rows) from ../data
data_dir = os.path.join(os.getcwd(), '..', 'data')
filepath = os.path.join(data_dir, 'shrieyaa_mini_df.csv')
mini_df = pd.read_csv(filepath)

# Display the first 20 rows
mini_df.head(20)

Unnamed: 0,Column1,unmasked_text,masked_text,phone_number,target,manual_target,zero if wrong,false positives,false negatives,Unnamed: 9
0,448.0,Got some exciting findings for individuals wit...,Got some exciting findings for individuals wit...,05196 04711,TEL,TEL,1,,,
1,449.0,"Name Simonis, your cognitive therapy session i...","[FIRSTNAME_1] [LASTNAME_1], your cognitive the...",1063 690.6267,TEL,TEL,1,,,
2,450.0,Medical intervention and occupational therapy ...,Medical intervention and occupational therapy ...,+40-724-062 4764,TEL,TEL,1,,,
3,451.0,Looking for a therapy session before 6 AM. Liv...,Looking for a therapy session before [TIME_1]....,03816-26306,O,O,1,,,
4,452.0,Emergency meeting announced for Accounts assoc...,Emergency meeting announced for [JOBAREA_1] as...,+94.476-548.6102,TEL,TEL,1,,,
5,453.0,I am writing in regards to the advertised inte...,I am writing in regards to the advertised inte...,(04)-5791 4419,O,O,1,,,
6,454.0,Legal notice from Anahi8 claiming breach of co...,Legal notice from [USERNAME_1] claiming breach...,003.765 1989,O,TEL,zero,,yes,
7,455.0,"To the attention of Candace Paucek, based on t...",To the attention of [FIRSTNAME_1] [LASTNAME_1]...,05268 919351,O,O,1,,,
8,456.0,Shipment of medical N4jxl1ympwgZ device delaye...,Shipment of medical [PASSWORD_1] device delaye...,(737) 0128122,O,O,1,,,
9,457.0,Time management isn't just for your Internatio...,Time management isn't just for your [JOBTITLE_...,+77-014 602.3347,TEL,TEL,1,,,
