# Prep python environment

In [None]:
# Install the packages listed in requirements.txt using the interpreter that
# backs the current Jupyter kernel (sys.executable), via the IPython "!" shell escape.
import sys
!{sys.executable} -m pip install -r requirements.txt

In [None]:
import tldextract
import pandas as pd
import tldextract
from tld import get_tld
import country_converter as coco
import os
import sys
import re
from glob import glob
import sqlite3
from datetime import datetime
import unicodedata
import hashlib
import progressbar
import dateutil
import gzip

# Extract TLD from string

In [None]:
def helper_extract_iana_tld_suffix_from_email(email):
    """Return the last dot-separated label of the suffix tldextract finds in *email*.

    The input is handed to ``tldextract.extract`` unchanged; only the text
    after the final dot of the resulting ``suffix`` is returned (the whole
    suffix when it contains no dot).
    """
    suffix = tldextract.extract(email).suffix
    # rpartition gives the piece after the last ".", or the full string if there is none.
    return suffix.rpartition(".")[2]

def helper_extract_full_tld_suffix_from_email(email):
    """Return the complete ``suffix`` that ``tldextract.extract`` reports for *email*."""
    return tldextract.extract(email).suffix


    

# Extract domain from string

In [None]:
def helper_extract_domain_without_suffix_from_string(raw_input):
    """Return the ``domain`` part that ``tldextract.extract`` reports for *raw_input*.

    The suffix and any subdomain found by tldextract are not included.
    """
    return tldextract.extract(raw_input).domain

def helper_extract_domain_with_suffix_from_string(raw_input):
    """Return ``domain.suffix`` as extracted by tldextract from *raw_input*.

    Args:
        raw_input: Text passed unchanged to ``tldextract.extract``.

    Returns:
        The extracted domain and suffix joined with a single dot. When one
        of the two parts is empty, only the non-empty part is returned, so
        the result never has a leading or trailing dot. An empty string is
        returned when both parts are empty.
    """
    extracted = tldextract.extract(raw_input)
    # Skip empty parts: joining unconditionally would yield e.g. "localhost."
    # whenever tldextract reports an empty suffix.
    parts = (extracted.domain, extracted.suffix)
    return ".".join(part for part in parts if part)

# Get country info

In [None]:
# Module-level CountryConverter instance from country_converter (imported as coco).
helper_cc = coco.CountryConverter()


# pandas helper functions

In [None]:
def helper_pandas_split_email(input_series):
    """Split every string in *input_series* on '@' into separate columns.

    Uses the pandas ``.str`` accessor with ``expand=True``, so the result
    has one column per piece produced by the split.
    """
    string_methods = input_series.str
    return string_methods.split(pat='@', expand=True)
    

# Mask password

In [None]:
def mask_password(passwd: str) -> str:
    """Return *passwd* with its middle characters replaced by asterisks.

    The result always has the same length as the input.

    * 10 or more characters: the first two and last two characters are kept.
    * 3 to 9 characters: the first and last character are kept.
    * 0 to 2 characters: every character is masked, because keeping the
      first and last character would reveal the whole password.

    Args:
        passwd: The clear-text password.

    Returns:
        The masked password (an empty string for empty input).
    """
    pwlen = len(passwd)
    if pwlen <= 2:
        # Too short to keep any character without exposing the password;
        # this also avoids indexing into an empty string.
        return "*" * pwlen
    keep = 2 if pwlen >= 10 else 1
    return f"{passwd[:keep]}{'*' * (pwlen - 2 * keep)}{passwd[-keep:]}"

# NML hash

In [None]:
def normalize_email(email: str) -> str:
    """Lower-case *email* and return its Unicode NFC-normalized form."""
    lowered = email.lower()
    return unicodedata.normalize("NFC", lowered)
    

In [None]:
# Sanity check: the output of normalize_email must already be in NFC form.
# The sample holds OHM SIGN, ANGSTROM SIGN and U+00C5; the last one is written
# as \u00C5 because "\00C5" is an octal NUL escape followed by the text "C5".
unicodedata.is_normalized("NFC", normalize_email(u"\u2126 \u212B \u00C5"))


In [None]:
def nml_hash(email: str, passwd: str) -> str:
    """Return the SHA-256 hex digest of *email* immediately followed by *passwd*.

    The two values are concatenated without a separator and encoded with
    the default ``str.encode`` encoding (UTF-8) before hashing.
    """
    combined = "{}{}".format(email, passwd)
    hasher = hashlib.sha256()
    hasher.update(combined.encode("utf-8"))
    return hasher.hexdigest()

In [None]:
# Sanity check: compare nml_hash for a sample email/password pair against a
# hard-coded SHA-256 hex digest.
nml_hash("emailadres1@domein1.nl","Wacht!woord1") == "d43f9c7a98ce28989acf4b6d5831105e86dc5266570621bef8d24bd26ebec708"

# Database

In [None]:
def create_leak_db(path: str) -> None:
    """Create a new SQLite database at *path* with the leak schema.

    Two tables are created, ``entity`` and ``divd``, together with
    case-insensitive (``collate nocase``) indexes on ``entity.username``,
    ``entity.nml_hash``, ``entity.email_apex``, ``entity.url_apex`` and
    ``divd.nml_hash``.

    If *path* already exists nothing is touched: a message is written to
    stderr and the function returns.

    Args:
        path: Filesystem location of the database file to create.

    Raises:
        sqlite3.Error: If the database cannot be opened or a statement fails.
    """
    if os.path.exists(path):
        print("File {} exists, bailing out!".format(path), file=sys.stderr)
        return
    leak_conn = sqlite3.connect(path)
    try:
        lcur = leak_conn.cursor()
        lcur.execute("""
        CREATE TABLE entity (
            username TEXT,
            masked_passwd TEXT,
            nml_hash TEXT,
            email_apex TEXT,
            url TEXT,
            url_apex TEXT,
            ts_found TEXT,
            ts_leaked TEXT,
            has_name BOOL,
            has_dob BOOL,
            has_addr BOOL,
            has_phone BOOL,
            has_cc BOOL,
            has_bankacc BOOL,
            has_ssn BOOL,
            has_ip BOOL,
            extra_data TEXT
        );
    """)
        lcur.execute("create index entity_username on entity ( username collate nocase);")
        lcur.execute("create index entity_nml_hash on entity ( nml_hash collate nocase);")
        lcur.execute("create index entity_email_apex on entity ( email_apex collate nocase);")
        lcur.execute("create index entity_url_apex on entity ( url_apex collate nocase);")
        lcur.execute("""
        CREATE TABLE divd (
            nml_hash TEXT,
            case_id TEXT,
            sub_id TEXT,
            description TEXT,
            date TEXT
        );
    """)
        lcur.execute("create index divd_nml_hash on divd ( nml_hash collate nocase);")
        # Make the schema durable regardless of the connection's transaction mode.
        leak_conn.commit()
    finally:
        # Always release the file handle, even when a statement fails.
        leak_conn.close()
    

# Dates

In [None]:
def text2sqlitedate(text: str) -> str:
    """Parse a free-form date/time string and format it as ``YYYY-MM-DD HH:MM:SS``.

    Args:
        text: Date/time text in any form accepted by ``dateutil.parser.parse``.

    Returns:
        The parsed moment formatted with ``"%Y-%m-%d %H:%M:%S"``.

    Raises:
        ValueError: If dateutil cannot parse *text* (dateutil's ParserError
            is a ValueError subclass).
    """
    # Import the submodule explicitly: a bare "import dateutil" does not
    # guarantee that the "dateutil.parser" attribute is available.
    from dateutil import parser as dateutil_parser

    parsed = dateutil_parser.parse(text)
    return parsed.strftime("%Y-%m-%d %H:%M:%S")


In [None]:
# Example: convert a human-readable timestamp to the "%Y-%m-%d %H:%M:%S" form.
text2sqlitedate("12 Nov 2023 12:01am")

# Email?

In [None]:
def is_email(text: str) -> bool:
    """Return True when the whole of *text* looks like a simple email address.

    The accepted shape is ``local@label.rest``: a local part of letters,
    digits and ``_ . + -``; a first domain label of letters, digits and
    ``-``; a dot; and a remainder of letters, digits, ``.`` and ``-``.

    Args:
        text: The candidate string.

    Returns:
        True if the entire string matches the pattern, otherwise False.
    """
    # fullmatch instead of match + "$": "$" also matches just before a
    # trailing newline, which would let "user@example.com\n" through.
    pattern = r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9.-]+'
    return re.fullmatch(pattern, text) is not None

In [None]:
# Example: input without an "@", which the is_email pattern requires.
is_email("fbreedijk")

In [None]:
# Example: input with a local part, "@" and a dotted domain.
is_email("fbreedijk@schubergphilis.com")