In [1]:
import re
import pandas as pd
import os

In [4]:
# Compiled once so that applying the check to a whole DataFrame column does not
# look the pattern up again on every call.
# - Look for variations of the word "account" (account, accnt, acct, acc, a/c, a.c., a-c)
# - Followed by any run of non-digit characters, and then a 7 or 8 digit account number
_ACCOUNT_NUMBER_PATTERN = re.compile(
    r'(?i)\b(?:account|accnt|acct|acc|a/c|a\.c\.|a-c)[^\d]*(\d{7,8})\b'
)


def is_valid_account_number(text):
    """Report whether ``text`` contains an account keyword followed by a 7-8 digit number.

    The match is case-insensitive. The keyword must start at a word boundary,
    may be followed by any number of non-digit characters (spaces, punctuation,
    words such as "number" or "no."), and the digit run must be exactly 7 or 8
    digits long and end at a word boundary (so 9-digit runs are rejected).

    NOTE(review): because the gap between keyword and digits accepts *any*
    non-digit text and "acc" is matched as a prefix, inputs such as
    "accident 1234567" also return True. The pattern is left as-is to keep
    existing results stable; tighten it if false positives matter.

    Parameters
    ----------
    text : object
        The value to scan. Anything that is not a ``str`` (for example ``None``
        or a float NaN coming from a missing CSV cell) is treated as containing
        no account number.

    Returns
    -------
    bool
        True if the pattern is found anywhere in ``text``, otherwise False.
    """
    # re.search would raise TypeError on None/NaN; a missing value simply has
    # no account number, so report False instead of aborting a column-wide apply.
    if not isinstance(text, str):
        return False

    return _ACCOUNT_NUMBER_PATTERN.search(text) is not None

# Sample sentences used to eyeball the behaviour of is_valid_account_number.
test_inputs = [
    # Keyword followed by a 7- or 8-digit number.
    "Please refer to account number 1234567 for more information.",
    "The acct.12345678 has been processed successfully.",
    "Account 1234567 has been updated.",
    "Your accnt1234567 is now active.",
    "AccNo: 12345678, check for updates.",
    "Please transfer the funds to a/c no. 7654321 immediately.",
    "The user's acc1234567 has been disabled.",
    "For security reasons, we masked acct# 98765432.",
    "Account number: 8765432 was accessed by an unauthorized source.",
    "Your bank accnt no. 1234567 has been verified.",
    # No keyword, no digits, or a digit run of the wrong length.
    "Please contact customer support.",
    "The account number is not available at the moment.",
    "Invalid accnt 12345 number provided.",
    "No account details found for the user.",
    "Account XXXXXXXXXX does not exist.",
    # Unusual spacing / punctuation around the number.
    "accnt: 1234567 ",
    "account---12345678!!!",
    "Please check your acct 12345678.",
    "Details for A/C 1234567.",
]

# Print each sample, the verdict for it, and a 50-dash divider.
for text in test_inputs:
    print(f"Input: {text}")
    has_account_number = is_valid_account_number(text)
    print(f"Contains valid account number: {has_account_number}")
    print("-" * 50)


Input: Please refer to account number 1234567 for more information.
Contains valid account number: True
--------------------------------------------------
Input: The acct.12345678 has been processed successfully.
Contains valid account number: True
--------------------------------------------------
Input: Account 1234567 has been updated.
Contains valid account number: True
--------------------------------------------------
Input: Your accnt1234567 is now active.
Contains valid account number: True
--------------------------------------------------
Input: AccNo: 12345678, check for updates.
Contains valid account number: True
--------------------------------------------------
Input: Please transfer the funds to a/c no. 7654321 immediately.
Contains valid account number: True
--------------------------------------------------
Input: The user's acc1234567 has been disabled.
Contains valid account number: True
--------------------------------------------------
Input: For security reasons,

In [10]:
# Read the input CSV from the current working directory.
input_csv_name = 'extracted_account_numbers_english.csv'
filepath = os.path.join(os.getcwd(), input_csv_name)
df = pd.read_csv(filepath)

In [15]:
# Run the regex check on every value of 'unmasked_text' and store the
# resulting flags in a new 'account_number_regex' column.
account_number_flags = df['unmasked_text'].apply(is_valid_account_number)
df['account_number_regex'] = account_number_flags

In [16]:
# Write the frame (including the regex flag column) to CSV without the index.
output_csv_regex = 'account_number_regex_shrieyaa.csv'
df.to_csv(output_csv_regex, index=False)

save_message = f"Extracted account numbers using regex saved to {output_csv_regex}"
print(save_message)


Extracted account numbers using regex saved to account_number_regex_shrieyaa.csv


In [17]:
# Colab-only step: google.colab is importable only inside a Google Colab
# runtime, so this cell fails with ImportError elsewhere.
from google.colab import files
# Hand the CSV at output_csv_regex to the Colab download helper.
files.download(output_csv_regex)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>