In [1]:
import re
import pandas as pd

# Define the regular expression for email matching.
# The top-level-domain class is [A-Za-z]: inside a character class '|' is a
# literal pipe, not alternation, so it must not appear there.
email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'

# Define the function to extract emails using a finite state accepter model
def extract_emails(text):
    """Return the e-mail addresses found in *text*, in order of appearance.

    The text is scanned once, left to right, by a small state machine:

    * START     - waiting for a character that can begin a local part
                  (alphanumeric, ``_`` or ``%``).
    * LOCAL     - inside the local part; accepts alphanumerics and ``_%+-``,
                  ``.`` (moves to LOCAL_DOT) and ``@`` (moves to DOMAIN).
    * LOCAL_DOT - just consumed a ``.`` in the local part; the next character
                  must be alphanumeric.
    * DOMAIN    - after ``@``; accepts dot-separated labels made of
                  alphanumerics and ``-``.

    An address is complete when the domain contains at least one ``.`` and
    its last label is purely alphabetic with two or more characters.  The
    longest complete prefix of a candidate is emitted as soon as the
    candidate can no longer be extended (or the text ends), so a trailing
    sentence period such as ``"bob@site.org."`` is not included.

    Args:
        text: The string to scan.  Any non-string value (for example a
            missing cell represented as NaN) yields an empty list.

    Returns:
        A list of address strings; duplicates are kept.

    Note:
        The scan never backs up: characters consumed by a rejected candidate
        are not rescanned for another address.
    """
    # Non-string cells cannot contain an address.
    if not isinstance(text, str):
        return []

    START, LOCAL, LOCAL_DOT, DOMAIN = range(4)

    emails = []
    state = START
    start = 0            # index in text where the current candidate begins
    label_start = 0      # index in text where the current domain label begins
    seen_dot = False     # True once the domain contains at least one '.'
    accepted_end = None  # end index of the longest complete address so far

    def label_is_tld(end):
        # True when text[label_start:end] can close an address.
        label = text[label_start:end]
        return seen_dot and len(label) >= 2 and label.isalpha()

    for i, c in enumerate(text):
        if state == LOCAL:
            if c.isalnum() or c in '_%+-':
                continue
            if c == '.':
                state = LOCAL_DOT
                continue
            if c == '@':
                state = DOMAIN
                label_start = i + 1
                seen_dot = False
                accepted_end = None
                continue
        elif state == LOCAL_DOT:
            if c.isalnum():
                state = LOCAL
                continue
        elif state == DOMAIN:
            # A complete address must be followed by a non-word character,
            # mirroring a trailing word boundary.
            if not (c.isalnum() or c == '_') and label_is_tld(i):
                accepted_end = i
            if c.isalnum() or c == '-':
                continue
            if c == '.' and i > label_start:
                # Close a non-empty label and open the next one.
                seen_dot = True
                label_start = i + 1
                continue
            # The candidate cannot grow any further: emit what was complete.
            if accepted_end is not None:
                emails.append(text[start:accepted_end])

        # c did not extend a candidate; it may still begin a new one.
        if c.isalnum() or c in '_%':
            state = LOCAL
            start = i
        else:
            state = START

    # Flush a candidate that runs up to the end of the text.
    if state == DOMAIN:
        if label_is_tld(len(text)):
            accepted_end = len(text)
        if accepted_end is not None:
            emails.append(text[start:accepted_end])
    return emails

# Load the CSV file into a DataFrame
df = pd.read_csv('../csv/resume_data.csv')

# Empty cells are read back as NaN, which re.findall rejects with a
# TypeError; treat them as empty text so both extractors see strings only.
contents = df['contents'].fillna('')

# Apply the regular expression to the content column
df['regex_emails'] = contents.apply(lambda x: re.findall(email_pattern, x))

# Apply the finite state accepter model to the content column
df['fsa_emails'] = contents.apply(extract_emails)

# Combine the results from both approaches (sorted so the per-row order is
# reproducible; plain set iteration order of strings varies between runs)
df['emails'] = df.apply(lambda x: sorted(set(x['regex_emails'] + x['fsa_emails'])), axis=1)

# Flatten the list of emails and remove duplicates, again in a stable order
emails_list = sorted({email for sublist in df['emails'] for email in sublist})

# Create a new DataFrame with the emails
emails_df = pd.DataFrame({'emails': emails_list})
emails_df

Unnamed: 0,emails
