# Import helper functions

In [None]:
# Execute the shared helper notebook inside this kernel.
# NOTE(review): later cells use `glob`, `os`, `re` and `pd` without importing
# them here — presumably this helper notebook provides them; confirm.
%run ../0.shared_notebooks/0_helper_functions.ipynb

In [None]:
# Load extra libraries
import dateutil 
import json

# Set constants

In [None]:
# Case identifier and sub-source label; together they name the output file.
CASE = "DIVD-2021-00004"
SUB = "logs.txt"

# Working directories, relative to this notebook.
IN_DIR = "../IN"           # raw input files (*.txt)
OUT_DIR = "../NORMALIZED"  # destination of the normalized TSV
ERR_DIR = "../ERR"         # only listed in the visible cells, never written to

In [None]:
# Show the contents of the input and error directories (shell escapes).
!ls $IN_DIR
!ls $ERR_DIR

## Defaults


In [None]:
# Positional defaults for the optional record fields. The parsing cell reads
# entries 2-9 by index, so the order of this list must not change.
defaults = [
    None,  # 0:  ts_found    (timestamp when the data was "found")
    None,  # 1:  ts_leaked   (timestamp when the data was stolen/leaked)
    0,     # 2:  has_name    (0/1, record has a name)
    0,     # 3:  has_dob     (0/1, record has a date of birth)
    0,     # 4:  has_addr    (0/1, record has an address)
    0,     # 5:  has_phone   (0/1, record has a phone number)
    0,     # 6:  has_cc      (0/1, record has credit card data)
    0,     # 7:  has_bankacc (0/1, record has a bank account)
    0,     # 8:  has_ssn     (0/1, record has an SSN)
    0,     # 9:  has_ip      (0/1, record has an IP address)
    0,     # 10: extra_data  (JSON object with extra data)
]

# Not referenced by any cell visible in this notebook.
ts_from_file = False

## Read in files, guess column and type

In [None]:
# Collect every .txt file in the input directory, in sorted order.
files = list(glob(f"{IN_DIR}/*.txt"))
files.sort()

In [None]:
# Display the list of input files that were found.
files

## Record format for normalized files
* username
* passwd
* url (URL that the credentials belong to)
* ts_found (Timestamp when the data was "found")
* ts_leaked (Timestamp when the data was stolen/leaked)
* has_name (0/1 if the record has a name)
* has_dob (0/1 if the record has a date of birth)
* has_addr (0/1 if the record has an address)
* has_phone (0/1 if the record has a phone number)
* has_cc (0/1 if the record has credit card data)
* has_bankacc (0/1 if the record has a bank account)
* has_ssn (0/1 if the record has an SSN)
* has_ip (0/1 if the record has an IP address)
* extra_data (json object with extra data)

In [None]:
import datetime

# Importing a package does not by itself load its submodules, so import the
# parser explicitly instead of relying on another library having done so.
import dateutil.parser

# "Found" timestamp stamped on every record of this case.
_TS_FOUND = "2021-04-09 00:00:00"

# Line patterns, compiled once instead of on every input line.
_RE_RECORD_END = re.compile(r'^\+[ \-]+\+$')
_RE_RECORD_START = re.compile(r'^\+[ \-]+([\w\s\-]+?)\-\-+\+$')
_RE_INFORMATION = re.compile(r'^\|Information: (.*)')
_RE_EMAIL = re.compile(r'Email \: (.*?) ')
_RE_PASSWORD = re.compile(r'Password \: (.*?)(\.\||\s)')
_RE_PAGE = re.compile(r'Page \: (.*)')
_RE_IP = re.compile(r'IP Address.*?([\d\.]+)')
_RE_DATE = re.compile(r'Date\: (.*?[ap]m)')
_RE_BROWSER = re.compile(r'Browser\: (.*)')

# Prefixes of the lines that carry data for the record currently being read.
_FIELD_PREFIXES = ('|Information', '|Email', '|IP Address', '|Date')
# Prefixes of lines that are recognised but deliberately ignored.
_IGNORED_PREFIXES = ('|Country', '|Region')


def _require(pattern, line, where):
    """Return the first capture group of *pattern* found in *line*.

    Raises:
        ValueError: if the pattern does not occur in the line. *where*
            (``file:lineno``) is included so the offending input can be found,
            instead of failing with an AttributeError on ``None``.
    """
    found = pattern.search(line)
    if found is None:
        raise ValueError(f"{where}: expected /{pattern.pattern}/ in {line!r}")
    return found.group(1)


def _new_record(campaign):
    """Return a fresh record dict for the record whose opening border names *campaign*."""
    # NOTE(review): the record-format notes in this notebook list `passwd` and
    # `url`; this cell emits `password` and no `url` column. Confirm which of
    # the two the consumers of the normalized file expect.
    return {
        'username'    : "",
        'password'    : "",
        "ts_found"    : _TS_FOUND,
        "ts_leaked"   : "",
        "has_name"    : defaults[2],
        "has_dob"     : defaults[3],
        "has_addr"    : defaults[4],
        "has_phone"   : defaults[5],
        "has_cc"      : defaults[6],
        "has_bankacc" : defaults[7],
        "has_ssn"     : defaults[8],
        "has_ip"      : defaults[9],
        "extra_data"  : {'campaign': campaign},
    }


def _apply_field(record, line, where):
    """Store the data carried by one ``|...`` field line into *record* (in place).

    *line* must start with one of ``_FIELD_PREFIXES``.
    """
    extra = record['extra_data']
    if line.startswith('|Information'):
        extra['information'] = _require(_RE_INFORMATION, line, where)
    elif line.startswith('|Email'):
        record['username'] = _require(_RE_EMAIL, line, where)
        record['password'] = _require(_RE_PASSWORD, line, where)
        # The page field is optional; keep it when present.
        page = _RE_PAGE.search(line)
        if page:
            extra['phishing_site'] = page.group(1)
    elif line.startswith('|IP Address'):
        extra['ip'] = _require(_RE_IP, line, where)
        record['has_ip'] = 1
    else:  # '|Date' line, optionally followed by the browser string
        leaked = dateutil.parser.parse(_require(_RE_DATE, line, where))
        record["ts_leaked"] = leaked.strftime("%Y-%m-%d %H:%M:%S")
        browser = _RE_BROWSER.search(line)
        if browser:
            extra['user-agent'] = browser.group(1)


def _parse_log_file(path):
    """Parse one log file and return its records as a list of dicts.

    A record opens with a ``+--- <campaign> ---+`` border line, is followed by
    ``|``-prefixed field lines, and closes with a plain ``+-----+`` border.
    On closing, ``extra_data`` is serialised to a JSON string.

    Raises:
        ValueError: on an unrecognised line, on a field or closing border that
            appears outside of a record, or on a field line that lacks its
            mandatory value.
    """
    source = os.path.basename(path)
    parsed = []
    record = None  # the record currently being filled, if any
    # Text mode decodes once at the I/O boundary and normalises line endings.
    with open(path, encoding='utf-8') as fh:
        for lineno, line in enumerate(fh, start=1):
            where = f"{source}:{lineno}"
            if not line.strip():
                continue  # Ignore empty lines
            if _RE_RECORD_END.search(line):
                if record is None:
                    raise ValueError(f"{where}: closing border without an open record")
                record['extra_data'] = json.dumps(record['extra_data'])
                parsed.append(record)
                # Forget the finished record so a stray second border cannot
                # re-append it with doubly encoded extra_data.
                record = None
                continue
            opening = _RE_RECORD_START.search(line)
            if opening:
                if record is not None:
                    print(f"{where}: previous record has no closing border; dropped")
                record = _new_record(opening.group(1))
                continue
            if line.startswith(_IGNORED_PREFIXES):
                continue  # Not interested
            if not line.startswith(_FIELD_PREFIXES):
                raise ValueError(f"{where}: unrecognised line {line!r}")
            if record is None:
                raise ValueError(f"{where}: field line outside of a record: {line!r}")
            _apply_field(record, line, where)
    if record is not None:
        print(f"{source}: last record has no closing border; dropped")
    return parsed


# Accumulate across ALL input files; the list must live outside the loop so
# that earlier files are not discarded and an empty file list still works.
records = []
for file in files:
    records.extend(_parse_log_file(file))
df = pd.DataFrame(records)

In [None]:
# Preview the first rows of the parsed records.
df.head()

In [None]:
# Write the records as a tab-separated file named after the case and sub-source.
out_path = f"{OUT_DIR}/{CASE}-{SUB}.tsv"
df.to_csv(out_path, sep="\t", index=False)