# Import helper functions

In [None]:
%run ../0.shared_notebooks/0_helper_functions.ipynb

# Set constants

In [None]:
# Case identification for this data set.
CASE = "DIVD-2024-00037"
SUB = "set1"

# Working directories, relative to the notebook location.
IN_DIR = "../IN"            # raw input files
OUT_DIR = "../NORMALIZED"   # normalised output files
ERR_DIR = "../ERR"          # rejected lines, one file per input file

In [None]:
# Show what is currently in the input and error directories
# (IPython shell escapes; $NAME is expanded from the Python variable).
!ls $IN_DIR
!ls $ERR_DIR

## Defaults


In [None]:
# Default values for the metadata columns appended to every record.
# The position in this list is significant: index 0 is ts_found,
# index 1 is ts_leaked, indexes 2-9 are the has_* flags and index 10
# is extra_data.
defaults = [
    None,  # ts_found   - timestamp when the data was "found"
    None,  # ts_leaked  - timestamp when the data was stolen/leaked
    0,     # has_name   - 0/1, record has a name
    0,     # has_dob    - 0/1, record has a date of birth
    0,     # has_addr   - 0/1, record has an address
    0,     # has_phone  - 0/1, record has a phone number
    0,     # has_cc     - 0/1, record has credit card data
    0,     # has_bankacc - 0/1, record has a bank account
    0,     # has_ssn    - 0/1, record has a ssn
    0,     # has_ip     - 0/1, record has an ip address
    0,     # extra_data - json object with extra data
]

# When True, ts_leaked is derived from the input file's timestamp
# instead of the default above.
ts_from_file = False

## Read in files, guess column order and file type

In [None]:
# Collect every .txt file in the input directory, in sorted order.
txt_pattern = "{}/*.txt".format(IN_DIR)
files = sorted(glob(txt_pattern))

In [None]:
# Display the list of input files that will be processed.
files

In [None]:
# Normalise every input file into a TSV with a fixed set of columns.
#
# For each file in `files`:
#   * every line is decoded as UTF-8 (undecodable bytes are replaced),
#     stripped and split on ":";
#   * URL pieces that were torn apart by that split ("https" + "//host",
#     "host" + "port/path") are glued back together;
#   * lines that contain whitespace, contain no colon, or do not end up with
#     2 or 3 fields are written to ERR_DIR/<filename> prefixed with the reason;
#   * accepted lines are padded to 3 fields and extended with the metadata
#     columns from `defaults`;
#   * the file is classified as a stealer log ("s") or a combo list ("c") and
#     only stealer logs are written to OUT_DIR as <name>.tsv.
#
# `goodlines` / `badlines` accumulate the totals over all files.
import json

# Patterns are compiled once instead of once per input line.
_NO_WHITESPACE_RE = re.compile(r'^\S+$')
# A URL that ends in a host name / IP address, i.e. a port may follow.
# (The original class was [a-zA-z], which also matched "[", "\", "]", "^",
# "_" and "`".)
_URL_HOST_RE = re.compile(r'https?\:\/\/[a-zA-Z0-9.\-]+\.([a-zA-Z]{2,4}|\d{1,3})$')
# A port number, optionally followed by a path.
_PORT_PATH_RE = re.compile(r'^\d+[a-zA-Z0-9.\-\/]*')
_URL_SCHEME_RE = re.compile(r'^(https?|android)\:')
_SCHEME_SUFFIXES = ("https", "http", "android")

# Metadata column names, in the order the values are appended to each record.
# NOTE(review): "hasbankacc" and "has ip" do not follow the has_* naming used
# elsewhere; they are kept as-is because they end up in the TSV header and
# downstream consumers may rely on them - confirm before renaming.
_META_COLUMNS = ["ts_found", "ts_leaked", "has_name", "has_dob", "has_addr",
                 "has_phone", "has_cc", "hasbankacc", "has_ssn", "has ip",
                 "extra_data"]


def _merge_split_urls(parts):
    """Re-join URL fragments that were separated by splitting a line on ':'."""
    merged = []
    for part in parts:
        if merged and (
            # "<scheme>" + "//rest"  ->  "<scheme>://rest"
            (merged[-1].endswith(_SCHEME_SUFFIXES) and part.startswith("//"))
            # "http://host" + "8080/path"  ->  "http://host:8080/path"
            or (_URL_HOST_RE.search(merged[-1]) and _PORT_PATH_RE.search(part))
        ):
            merged[-1] = "{}:{}".format(merged[-1], part)
        else:
            merged.append(part)
    return merged


goodlines = 0
badlines = 0
for file in files:
    filename = os.path.basename(file)
    # NOTE: getctime is the creation time on Windows but the last metadata
    # change time on Unix.
    timestamp = os.path.getctime(file)
    lines = []
    bad_lines = []
    urlfield = [0, 0, 0]   # number of records with a URL in field 0 / 1 / 2
    types = [0, 0]         # [records without 3 filled fields, records with 3 filled fields]

    # Values that are identical for every record of this file.
    if ts_from_file:
        ts_leaked = datetime.fromtimestamp(timestamp).strftime("%Y-%m-%d")
    else:
        ts_leaked = defaults[1]
    # json.dumps escapes quotes/backslashes in the file name so that
    # extra_data is always valid JSON.
    extra_data = '{{ "filename" : {} }}'.format(json.dumps(filename, ensure_ascii=False))

    with open(file, 'rb') as fh:
        for byteline in fh:
            line = byteline.decode('utf-8', errors="replace").strip()
            if not line:
                # Blank lines carry no data; skip them without flagging.
                continue

            # Validate the line. Every rejected line is reported, not only
            # the ones that fail the field count.
            error = None
            fields = []
            if not _NO_WHITESPACE_RE.search(line):
                error = "BAD_PATTERN"
            if ":" in line:
                fields = _merge_split_urls(line.split(":"))
            else:
                error = "NO_COLON"
            if error is None and not 2 <= len(fields) <= 3:
                error = "FIELD_COUNT"

            if error is not None:
                bad_lines.append(f"{error} - {line}\n")
                badlines += 1
                continue

            # Figure out type: three non-empty fields hints at a stealer log.
            if len(fields) > 2 and fields[0] and fields[1] and fields[2]:
                types[1] += 1
            else:
                types[0] += 1

            # Add fields to array.
            goodlines += 1
            fields.extend([""] * (3 - len(fields)))
            fields.append(defaults[0])     # ts_found
            fields.append(ts_leaked)       # ts_leaked
            fields.extend(defaults[2:10])  # has_name ... has_ip, each from its own default
            fields.append(extra_data)      # extra_data
            lines.append(fields)

            # Figure out which column holds the URL.
            if _URL_SCHEME_RE.search(fields[0]):
                urlfield[0] += 1
            if _URL_SCHEME_RE.search(fields[2]):
                urlfield[2] += 1

    if bad_lines:
        # Explicit UTF-8: rejected lines may contain U+FFFD replacement
        # characters that a non-UTF-8 locale encoding cannot represent.
        with open(f"{ERR_DIR}/{filename}", "w", encoding="utf-8") as bfh:
            bfh.writelines(bad_lines)

    # It is a stealer log if more than 25% of the accepted records have
    # three non-empty fields; otherwise it is treated as a combo list.
    typed_total = types[0] + types[1]
    if typed_total > 0 and types[1] / typed_total > 0.25:
        ftype = "s"
    else:
        ftype = "c"

    # Stealer logs come as url:username:passwd unless the URLs are
    # predominantly found in the last field.
    if ftype == "s" and urlfield[2] <= urlfield[0]:
        column_names = ["url", "username", "passwd"]
    else:
        column_names = ["username", "passwd", "url"]
    column_names.extend(_META_COLUMNS)

    df = pd.DataFrame(lines, columns=column_names)

    if ftype == "s":
        # Built outside the f-string: reusing the same quote character inside
        # an f-string replacement field is a syntax error before Python 3.12.
        out_name = filename.replace(".txt", ".tsv")
        df.to_csv(f"{OUT_DIR}/{out_name}", sep="\t", index=False,
                  columns=["username", "passwd", "url"] + _META_COLUMNS)
    else:
        # Combo lists are not exported.
        pass
    


In [None]:
"{:,} ({:.1f})% good - {:,} bad ({:.1f})%  - {:,} total".format(goodlines, (goodlines/(goodlines+badlines)*100), badlines, (badlines/(goodlines+badlines)*100), (goodlines+badlines))

In [None]:
# Show the normalised output files and the per-file error reports
# (IPython shell escapes; $NAME is expanded from the Python variable).
!ls $OUT_DIR
!ls -l $ERR_DIR