# A file containing all my Regex functions

In [1]:
# function to remove duplicates in string
def remove_duplicates(s):
    '''Collapse repeated entries in a comma-separated string.

    The string is split on ',', duplicate pieces are dropped and the unique
    pieces are joined back together with ',' in sorted order. No whitespace
    stripping is done, so 'a' and ' a' count as different entries.

    A falsy input (empty string, None) yields an empty string.
    '''
    if not s:
        return ''
    unique_parts = set(s.split(','))
    ordered_parts = sorted(unique_parts)
    return ','.join(ordered_parts)

In [2]:
# function to drop image/CSS attachment names (.png, .jpg, .jpeg, .css) from a list
def clean_attachment_types(s):
    '''Drop attachment names that contain an image or stylesheet extension.

    s may be a list of names or a single name (which is treated as a
    one-element list). A name is discarded when any of '.png', '.jpg',
    '.jpeg' or '.css' occurs anywhere inside it (substring test, case
    sensitive).

    Returns:
        - the list of kept names when more than one survives,
        - the single surviving name itself when exactly one survives,
        - None when nothing survives.
    '''
    blocked = ('.png', '.jpg', '.jpeg', '.css')
    candidates = s if isinstance(s, list) else [s]
    kept = [
        name for name in candidates
        if not any(ext in name for ext in blocked)
    ]

    if len(kept) > 1:
        return kept
    if kept:
        return kept[0]
    return None

In [3]:
from bs4 import BeautifulSoup, Comment
import pandas as pd

def text_from_html(body) -> str:
    '''Return the visible text of an HTML string as one space-joined string.

    body is parsed with BeautifulSoup's "html.parser". Every text node is
    collected; nodes whose parent tag is style/script/head/title/meta (or
    the document root) and HTML comments are skipped. Each remaining node
    is stripped of surrounding whitespace and the pieces are joined with a
    single space.

    A null body (None / NaN, as judged by pd.isnull) yields "".
    '''
    if pd.isnull(body):
        return ""

    def tag_visible(element):
        # Text sitting directly under these tags is never rendered to a reader.
        if element.parent.name in ["style", "script", "head", "title", "meta", "[document]"]:
            return False
        # HTML comments are text nodes too, but are not visible content.
        if isinstance(element, Comment):
            return False
        return True

    soup = BeautifulSoup(body, "html.parser")
    # find_all is the supported spelling; findAll is a deprecated alias.
    texts = soup.find_all(string=True)
    return " ".join(t.strip() for t in texts if tag_visible(t))

# import html2text
# html2text.html2text(text)

In [4]:
import re 
import string

def clean_text_re_links_brackets(text):
    '''Clean text in following ways:
    - remove text in square brackets [] and within {}
    - remove everything inside angle brackets <> (hyperlinks / tags)
    - replace line breaks (newline characters) with a single space

    Bracketed spans are matched non-greedily and only within one line,
    because the bracket removal runs before line breaks are replaced and
    "." does not match a newline.

    Returns the cleaned string.
    '''
    # Raw strings keep the backslashes for the regex engine; the non-raw
    # forms ('\[' etc.) are invalid string escapes and warn on modern Python.
    text = re.sub(r'\[.*?\]', '', text)  # text within square brackets
    text = re.sub(r'\<.*?\>', '', text)  # text inside <...> hyperlinks
    text = re.sub(r'\{.*?\}', '', text)  # text inside curly brackets
    text = text.replace('\n', ' ')       # line breaks become spaces

    return text

In [5]:
# log if email contains sort code and store it if so
def find_sort_code_in_email(text):
    '''Find whether email contains a sort code with simple RegEx function:
    - extract sort code numbers (only if it matches format of dd-dd-dd)

    Returns a tuple (found, code):
    - (True, '"dd-dd-dd"') with the first match wrapped in double quotes
    - (False, None) when there is no match or text is None
    '''
    # Same None handling as the other find_* helpers, so a missing body
    # does not raise a TypeError inside the regex search.
    if text is None:
        return False, None

    # \b is a word boundary, so the code must not be glued to other digits
    # or letters. The dashes are mandatory in this pattern.
    match = re.search(r'\b\d{2}-\d{2}-\d{2}\b', text)
    if match is None:
        return False, None
    return True, f'"{match.group()}"'
    
    # if only want to return a single True/False output:
#     return bool(srt_account_string.search(text))

In [1]:
# write function to find if the text mentions the word bank, banks or banking
# added in whitespace between bank account and bank details
def find_bank_mention_in_email(text):
    '''Report whether the text contains the whole word "bank", "banks" or "banking".

    Matching is case-insensitive and anchored on word boundaries, so a
    longer word such as "bankrupt" does not count.

    Returns True on a match, otherwise False. A None text gives False.
    '''
    if text is None:
        return False
    found = re.search(r'\b(bank|banks|banking)\b', text, re.IGNORECASE)
    return found is not None

In [2]:
# write function to find if the text mentions bank account or bank details
# added in whitespace between bank account and bank details
def find_bank_account_mention_in_email(text):
    '''Report whether the text uses a bank-detail phrase.

    Phrases recognised (case-insensitive, whole words, exactly one
    whitespace character between the two words):
    - "bank account" / "banking account"
    - "financial administration"
    - "bank", "banking" or "financial" followed by "detail(s)" or "information(s)"

    Returns True on a match, otherwise False. A None text gives False.
    '''
    if text is None:
        return False
    phrase_pattern = re.compile(r'\b(?:bank(?:ing)?\saccount|financial\sadministration|(?:bank(?:ing)?|financial)\s(?:detail|information)s?)\b', re.IGNORECASE)
    return bool(phrase_pattern.search(text))

In [3]:
# write function to find if the text mentions change or new or updated
def find_update_words_in_email(text):
    '''Report whether the text contains "change", "new" or "update".

    Matching is case-insensitive and is a plain substring search (no word
    boundaries), so "changed", "updated", "renewal" or "knew" also count.

    Returns True on a match, otherwise False. A None text gives False.
    '''
    if text is None:
        return False
    hit = re.search(r'change|new|update', text, re.IGNORECASE)
    return hit is not None

In [8]:
# write function to find if the text mentions banking vocabulary (sort code, IBAN, swift, account number/code)
def find_bank_vocabulary_mention_in_email(text):
    '''Find whether email contains certain words which might relate to bank-detail-change emails with simple RegEx function:
    - mentions sort code, IBAN, swift, account number or account code

    Matching is case-insensitive and is a substring search (no word
    boundaries). Two-word terms need exactly one whitespace character
    between the words.

    Returns True on a match, otherwise False. A None text gives False.
    '''
    # Same None handling as the other find_* helpers, so a missing body
    # does not raise a TypeError inside the regex search.
    if text is None:
        return False
    vocabulary = re.compile(r'sort\scode|IBAN|swift|account\snumber|account\scode', re.IGNORECASE)
    return vocabulary.search(text) is not None

In [9]:
def check_array_lengths_add_Nones(arrays):
    '''Pad every list in arrays with None until all are as long as the longest.

    The inner lists are extended in place, and the same outer list is
    returned for convenience. An empty arrays list is returned unchanged.
    '''
    # The target is the length of the LONGEST list; default=0 keeps an
    # empty input from raising ValueError in max().
    target_len = max((len(arr) for arr in arrays), default=0)

    for arr in arrays:
        shortfall = target_len - len(arr)
        if shortfall > 0:
            arr.extend([None] * shortfall)

    return arrays

In [10]:
def check_dict_value_lengths(dct):
    '''Print the key of every dict value that is shorter than the longest value.

    One line is printed per short value, in dict order:
    "<key> has a shorter length than the others."
    Nothing is printed when all values have the same length or the dict is
    empty. Returns None.
    '''
    if not dct:
        return

    # Compare against the longest value rather than the first one;
    # otherwise a short first value hides every mismatch.
    longest = max(len(val) for val in dct.values())

    for key, val in dct.items():
        if len(val) < longest:
            print(f"{key} has a shorter length than the others.")

In [11]:
# def extract_text_between_start_end(start_str, end_str, text):
#     '''Function to take in text, extract the text between the start and 
#     end substring, and then return the text with the inbetween text removed.'''

#     start_index = text.find(start_str) + len(start_str)
#     end_index = text.find(end_str)
#     between_text = text[start_index:end_index]
    
#     no_space_new_text = start_str.replace(" ","")
#     new_text = text.replace(start_str,'').replace(no_space_new_text,'')
#     new_text = text.replace(between_text,'')
# #     new_text = re.sub(between_text,'',text)
    
#     return between_text, new_text

In [12]:
def extract_text_between_start_end(start_str, end_str_1, end_str_2, text):
    '''Extract the text between a start marker and the nearest of two end markers.

    Parameters:
    - start_str: marker that opens the span (first occurrence in text is used)
    - end_str_1: first possible closing marker
    - end_str_2: second possible closing marker, or None to use only end_str_1
    - text: the string to search

    Returns a tuple (between_text, new_text):
    - between_text is what lies after start_str and before whichever end
      marker comes first after it
    - new_text is text with that span cut out (both markers stay in place)

    When start_str is not present, or neither end marker occurs after it,
    ("", text) is returned and the text is left untouched.
    '''
    start_pos = text.find(start_str)
    if start_pos == -1:
        # Without a start marker there is no span to extract.
        return "", text
    start_index = start_pos + len(start_str)

    end_markers = [end_str_1] if end_str_2 is None else [end_str_1, end_str_2]
    # Only look for end markers AFTER the start marker; an earlier
    # occurrence cannot close the span.
    end_positions = [
        pos
        for pos in (text.find(marker, start_index) for marker in end_markers)
        if pos != -1
    ]
    if not end_positions:
        return "", text

    end_index = min(end_positions)
    between_text = text[start_index:end_index]
    # Cut out exactly the extracted span, so identical text elsewhere in the
    # message is not removed as a side effect.
    new_text = text[:start_index] + text[end_index:]
    return between_text, new_text

In [13]:
# write function to find if the text mentions a statement
def find_statement_mention_in_email(text):
    '''Find whether email mentions statement

    Matches the whole word "statement", case-insensitively; the plural
    "statements" does not match because of the word boundaries.

    Returns True on a match, otherwise False. A None text gives False.
    '''
    # Same None handling as the other find_* helpers, so a missing body
    # does not raise a TypeError inside the regex search.
    if text is None:
        return False
    return re.search(r'\bstatement\b', text, re.IGNORECASE) is not None

In [14]:
# write function to find if the text mentions an invoice
def find_invoice_mention_in_email(text):
    '''Find whether email mentions invoice

    Matches the whole word "invoice", case-insensitively; the plural
    "invoices" does not match because of the word boundaries.

    Returns True on a match, otherwise False. A None text gives False.
    '''
    # Same None handling as the other find_* helpers, so a missing body
    # does not raise a TypeError inside the regex search.
    if text is None:
        return False
    return re.search(r'\binvoice\b', text, re.IGNORECASE) is not None

In [15]:
# write function to find if the text mentions cybercrime or fraud warnings or alerts
def find_cybercrime_fraud_mention_in_email(text):
    '''Find whether email refers to fraud or cybercrime warnings or alerts

    Matches the whole word "fraud" or "cybercrime", case-insensitively;
    longer words such as "fraudulent" do not match.

    Returns True on a match, otherwise False. A None text gives False.
    '''
    # Same None handling as the other find_* helpers, so a missing body
    # does not raise a TypeError inside the regex search.
    if text is None:
        return False
    return re.search(r'\b(fraud|cybercrime)\b', text, re.IGNORECASE) is not None

In [16]:
# write function to find if the text carries an "Attention:" or "external sender" flag
def find_attention_external_mention_in_email(text):
    '''Find whether email has an "Attention:" flag in body

    Also matches "external sender" (exactly one whitespace character
    between the words). Matching is case-insensitive and is a substring
    search.

    Returns True on a match, otherwise False. A None text gives False.
    '''
    # Same None handling as the other find_* helpers, so a missing body
    # does not raise a TypeError inside the regex search.
    if text is None:
        return False
    return re.search(r'Attention:|external\ssender', text, re.IGNORECASE) is not None

In [17]:
# write function to find if the text carries an "Alert:" warning about opening attachments
def find_alert_attach_mention_in_email(text):
    '''Find whether email has an "Alert:" + "opening" + "attachments" flag in body

    The three parts must appear in that order on a single line ("." does
    not match a newline). Matching is case-insensitive.

    Returns True on a match, otherwise False. A None text gives False.
    '''
    # Same None handling as the other find_* helpers, so a missing body
    # does not raise a TypeError inside the regex search.
    if text is None:
        return False
    alert_pattern = re.compile(r'\bAlert:.*?\bopening\b.*?\battachments\b', re.IGNORECASE)
    return alert_pattern.search(text) is not None

In [18]:
# extract domain from email address
def extract_domain_from_address(text):
    '''Extract domain from email address

    Returns the part that follows the first "@" (up to a second "@", if
    any). Returns None when text is empty/None or contains no "@".
    '''
    if not text:
        return None

    parts = text.split('@')
    # split() always returns at least one element, so test the length
    # instead of truthiness before indexing the domain part.
    if len(parts) < 2:
        return None
    return parts[1]

In [1]:
def clean_websites(text):
    '''Strip contact-style noise from text.

    Removed, in this order:
    - links starting with http://, https:// or www. (up to the next whitespace)
    - simple email addresses of the form word@word.word
    - "Tel:", "T:", "M:" or "D:" labels together with the digits, spaces
      and + - ( ) characters that follow them
    - phone-number-like digit groups

    Each removal runs on the result of the previous one. Returns the
    cleaned string.
    '''
    removal_patterns = (
        r'(?:https?://|www\.)\S*',
        r'\b\w+@\w+\.\w+\b',
        r'(?:Tel|T|M|D):[\s\d\-\+\(\)]*',
        r'(?:\+|00)?(?:\d{1,3}[\s-]?)?(?:\(\d{1,5}\)|\d{1,5})[\s-]?\d{3,4}[\s-]?\d{3,4}',
    )

    cleaned = text
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned

In [22]:
# clean greetings, endings etc from emails
def clean_fluff(text):
    '''Strip greeting and sign-off phrases from text.

    - a greeting (hi, hello, dear, good morning, ...) is removed only at the
      very start of the text
    - sign-off phrases (thanks, thank you, regards, best, ...) are removed
      wherever they occur
    Trailing commas, full stops and whitespace after a removed phrase go
    with it. Matching is case-insensitive.

    Returns the cleaned string.
    '''
    fluff = re.compile(
        r"^\s*(?:hi|hello|hey|dear|good (?:morning|afternoon|evening)|greetings|salutations|(?:warm )?(?:regards|wishes|thanks)|best(?: regards| wishes)?|kind regards|yours (?:truly|sincerely))[,.\s]*\s*|\s*(?:thanks|thank you|many thanks|regards|best regards|best|best wishes|yours (?:truly|sincerely)|(?:with )?appreciation)[,.\s]*\s*",
        re.IGNORECASE,
    )
    return fluff.sub('', text)