In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import torch
from torch_geometric.data import Data
import torch.nn.functional as F
import warnings
import os
# Suppress every warning for the rest of the session (no category filter is given).
warnings.filterwarnings('ignore')
from torch_geometric.loader import NeighborLoader

# Pick the CUDA device when torch reports one is available, otherwise use the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline

In [None]:
os.chdir("..")
%pwd

In [None]:
from pprint import pprint
import json
import copy

import gensim
from gensim.models import Word2Vec
from multiprocessing import Pool
from itertools import compress
from tqdm import tqdm
import time

import multiprocessing
import random
import xxhash

In [None]:
def load_data(dataset_id):
    """Read a host log file and return its lines.

    Args:
        dataset_id: File name (including extension) located under
            ``content/data/hosts/`` relative to the current working directory.

    Returns:
        list[str]: One entry per line of the file, each keeping its trailing
        newline (if any).

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    # The context manager guarantees the handle is closed even if reading fails;
    # the previous version left the file open until garbage collection.
    with open(f"content/data/hosts/{dataset_id}") as f:
        return list(f)

In [None]:
import re

def find_privacy_leaks(log_list, verbose=True):
    """Scan log lines for substrings that look like sensitive information.

    Every regex in the category table is applied to every log entry with
    ``re.findall``; each hit is reported as
    ``"Category: <category>, Privacy Leak: <match>"``.

    The patterns are deliberately broad and overlap heavily (for example the
    same SSN-shaped regex is registered under three categories, and any short
    run of digits matches several numeric categories), so a single token is
    commonly reported more than once under different categories.

    Args:
        log_list: Iterable of log entry strings. ``None`` is treated as empty.
        verbose: When True (the default, matching the historical behaviour)
            each finding is also printed as it is discovered. Pass False to
            only collect the findings, e.g. when the caller prints the
            returned list itself.

    Returns:
        list[str]: Findings ordered by log entry, then by category order in
        the table, then by position of the match within the entry.
    """
    privacy_leaks = []

    if log_list is None:
        return privacy_leaks

    # For patterns with a capture group (Username, Password, Database
    # Connection String) re.findall yields the captured value rather than the
    # whole match.
    patterns = {
        'IP Address': r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b',
        'Email Address': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,4}\b',
        'Username': r'\buser: (\w+)\b',
        'Password': r'\bpassword=([^&\s]+)\b',
        'Social Security Number': r'\b\d{3}-\d{2}-\d{4}\b',
        'Credit Card Number': r'\b(?:\d[ -]*?){13,16}\b',
        'Phone Number': r'\b(?:\+\d{1,3}[- ]?)?\(?\d{3}\)?[- ]?\d{3}[- ]?\d{4}\b',
        'URL': r'https?://\S+',
        'Full Name': r'\b[A-Z][a-z]* [A-Z][a-z]*\b',
        'Date of Birth': r'\b\d{1,2}/\d{1,2}/\d{4}\b',
        'Driver\'s License': r'\b[A-Z0-9]{1,20}\b',
        'Passport Number': r'\b[A-Z0-9<]{6,9}\b',
        'Bank Account Number': r'\b\d{4,}\b',
        'SSN (Social Security Number)': r'\b\d{3}-\d{2}-\d{4}\b',
        'Medical Record Number': r'\b[A-Z0-9-]{5,}\b',
        'Employee ID': r'\b[A-Z0-9]{4,}\b',
        'Token/Authentication Key': r'\b[A-Za-z0-9_\-]{32,}\b',
        'Sensitive Keyword': r'\b(?:secret|confidential|private)\b',
        'Address': r'\b\d+\s+\w+\s+\w+\b',
        'City': r'\b[A-Z][a-z]+\b',
        'State': r'\b[A-Z]{2}\b',
        'ZIP Code': r'\b\d{5}(?:-\d{4})?\b',
        'Credit Card Expiry': r'\b\d{1,2}/\d{2}\b',
        'Personal Identification Number (PIN)': r'\b\d{4}\b',
        'Healthcare Provider Number': r'\b\d{6,10}\b',
        'Social Media Username': r'@\w+',
        'Vehicle Identification Number (VIN)': r'\b[A-HJ-NPR-Z0-9]{17}\b',
        'Tax Identification Number (TIN)': r'\b\d{3}-\d{2}-\d{4}\b',
        'Credit Score': r'\b\d{3}\b',
        'Biometric Data': r'\b(?:fingerprint|retina scan|facial recognition)\b',
        'Credit Card CVV': r'\b\d{3,4}\b',
        'IMEI Number': r'\b\d{15,17}\b',
        'MAC Address': r'\b(?:[0-9A-Fa-f]{2}[:-]){5}(?:[0-9A-Fa-f]{2})\b',
        'Bank Routing Number': r'\b\d{9}\b',
        'Email Subject': r'Subject: .+',
        'Employee Social Media Usage': r'\b(?:facebook|twitter|linkedin|instagram)\b',
        'Medical Condition': r'\b(?:cancer|diabetes|HIV|STD)\b',
        'Employee Performance Review': r'\b(?:excellent|poor|raise|promotion)\b',
        'Software License Key': r'\b[A-Z0-9]{5,}\b',
        'Software Error Message': r'\b(?:error|exception|crash)\b',
        'SSH Private Key': r'-----BEGIN RSA PRIVATE KEY-----',
        'SSH Public Key': r'ssh-(?:dss|rsa|ed25519|ecdsa)\s+[A-Za-z0-9+/=]+\s(?:\S+\s+)?\S+',
        'HTTP Request Headers': r'(?:GET|POST|PUT|DELETE)\s+/.+\s+HTTP/1\.[01]\r\n(?:[A-Za-z-]+:\s[^\r\n]+\r\n)+',
        'Database Connection String': r'db\.connection\.string=(.+)'
    }

    for log_entry in log_list:
        for category, pattern in patterns.items():
            # findall returns an empty list when nothing matches, so no
            # separate emptiness check is needed before iterating.
            for match in re.findall(pattern, log_entry):
                finding = f"Category: {category}, Privacy Leak: {match}"
                if verbose:
                    print(finding)
                privacy_leaks.append(finding)

    return privacy_leaks

In [None]:
# Load the raw log lines for one host, then run the leak scanner over them.
log_entries = load_data(
    "SysClient0052.systemia.com.txt"
)
leaked_info = find_privacy_leaks(
    log_entries
)

In [None]:
# Report the collected findings, one per line, or a notice when there are none.
if not leaked_info:
    print("No privacy leaks found.")
else:
    print(*leaked_info, sep="\n")