In [6]:
import pandas as pd
import numpy as np

personal_indicators = ['bank account', 'address', 'id', 'email', 'pin', 'password', 'search history', 'years old', 'name']

# Example DataFramee
privacy_df = pd.DataFrame([{'text': 'My name is Vignesh Balaji and my bank account details are 12345678'},
                           {'text': 'John and Smith went to Washington.'},
                           {'text': 'She just turned 30 years old yesterday.'}]
                                                                                )

def check_privacy(text):
    return any(indicator in text.lower() for indicator in personal_indicators)

def privacy_filter(df):
    return df[~df['text'].apply(check_privacy)]

def check_names(text):
    words = text.split()
    return any((i > 0 and word[0].isupper()) or
               (i < len(words) - 1 and word[0].isupper() and words[i + 1][0].isupper()) for i, word in enumerate(words))

def name_filter(df):
    return df[~df['text'].apply(check_names)]

filtered_df = privacy_filter(privacy_df)
print("Filtered for privacy concerns:\n", filtered_df)

name_filtered_df = name_filter(filtered_df)
print("Further filtered for names:\n", name_filtered_df)


Filtered for privacy concerns:
                                  text
1  John and Smith went to Washington.
Further filtered for names:
 Empty DataFrame
Columns: [text]
Index: []


In [9]:
with open('personal_info.txt', 'r', encoding='latin-1') as file:
    words = file.readlines()

words = [word.strip() for word in words]

for word in words[:20]:
    print(word)

markers_list = ['Ethnicity', 'Race', 'Nation', 'Sex']

Full Name
Email address
Home address
Data of Birth
Ethnicity / Race
Gender
National ID numbers / Social security number
Passport number
Visa permits number
Driverâs license number
Vehicle registration plate number
Disability information
Location information
What you are doing when / status
Events attended
Status
Sexual orientation
Education and employment history
Grades
Salary


In [9]:
import re

def find_personal_info_numbers(text):
    patterns = {
        'SSN': r'\b\d{3}-\d{2}-\d{4}\b',  # U.S. Social Security number
        'Credit Card': r'\b(?:\d{4}[- ]){3}\d{4}\b',  # Credit card number
        'Phone Number': r'\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b',  #phone number format
        'Generic Account Number': r'\b\d{8,12}\b',  #bank account number
        'Sort Code': r'\b\d{2}[-]\d{2}[-]\d{2}\b', #Bank Sort Code
        'PIN': r'\b \d{4} \b'
    }
    found_items = {}

    for key, pattern in patterns.items():
        matches = re.findall(pattern, text)
        if matches:
            found_items[key] = matches

    return found_items

# Example usage
example_text = "Call me at 123-456-7890, my SSN is 123-45-6789. My credit card number is 1234 5678 9012 3456."\
                "My Sort Code is 12-34-56."
found_numbers = find_personal_info_numbers(example_text)
print(found_numbers)

{'SSN': ['123-45-6789'], 'Credit Card': ['1234 5678 9012 3456'], 'Phone Number': ['123-456-7890'], 'Sort Code': ['12-34-56'], 'PIN': [' 1234 ', ' 9012 ']}
