In [None]:
import pandas as pd
import numpy as np

from hashlib import sha256
import base64

import yaml
import re

from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True, use_memory_fs=False)

In [None]:
# TODO: Connect to SQL DATABASE!

# Functions

Functions used in the remainder of this notebook.

## Match Functions

Functions to match data using the signature file. They are required to (i) rematch matches from docker-analyzer and (ii) match on environment variables set in Dockerfiles.

In [None]:
# Load the signature file shared with docker-analyzer.
with open("../docker-analyzer/signatures/signatures.yaml") as yaml_fd:
  regexes = yaml.safe_load(yaml_fd)

# These are multigroup regexes where the last group matches the actual secret
multigroupRegexes = [
  "trufflehog_azure_oauth_client",
  "trufflehog_azure_oauth_tenant",
  "trufflehog_azure_old",
  "trufflehog_heroku",
  "trufflehog_digitaloceantoken",
  "trufflehog_ibmclouduserkey",
  "trufflehog_gitlab",
  "trufflehog_currencycloud",
  "trufflehog_openuv",
  "trufflehog_netlify",
  "trufflehog_coinbase",
  "trufflehog_bitfenix",
  "trufflehog_accuweather",
  "trufflehog_wepay",
  "trufflehog_tomtom",
  "trufflehog_ticketmaster",
  "trufflehog_paymongo",
  "trufflehog_paymoapp",
  "trufflehog_paydirtapp",
  "trufflehog_loginradius"
]

# compiled_regexes maps "<namespace>_<rule>" to a dict of compiled patterns:
#   'full'   - the complete signature (always present)
#   'name'   - the named 'name' group of a multigroup signature (multigroup rules only)
#   'secret' - the trailing 'secret' part of a multigroup signature (multigroup rules only)
compiled_regexes = {}
for t in regexes:
  if t == "hyperscan":
    for ns in regexes[t]:
      for r in regexes[t][ns]:
        name = '{}_{}'.format(ns, r)
        regex = str.strip(regexes[t][ns][r])
        compiled_regexes[name] = {}
        compiled_regexes[name]['full'] = re.compile(regex)
        if name in multigroupRegexes:
          # Raw string: the pattern contains backslash escapes ("\(", "\{") that are
          # invalid escape sequences in a normal string literal.
          regexparts = re.match(r"\([^\(]*\)({.+})*(?P<name>\([^\(]*\))\([^\(]*\)({[^\{]+})*(?P<secret>.*)", regex)
          if regexparts is None:
            # Fail with a readable message instead of an AttributeError on None.
            raise ValueError("Multigroup signature {} does not have the expected group layout".format(name))
          compiled_regexes[name]['name'] = re.compile(regexparts.group('name'))
          compiled_regexes[name]['secret'] = re.compile(regexparts.group('secret'))

def match(input, rule='*', retype='full', last=False):
  """Search `input` with the compiled signatures and return the hits.

  Parameters:
    input:  text to search.
    rule:   name of a single rule in `compiled_regexes`, or '*' to try all rules.
    retype: which compiled variant to use ('full', 'name' or 'secret'). A rule
            without a 'secret' variant falls back to its 'full' pattern; rules
            lacking any other requested variant are skipped.
    last:   if True, multigroup rules yield their last capture group instead of
            the whole match.

  Returns:
    dict mapping rule name -> matched text, for every rule that matched.

  Raises:
    KeyError if `rule` is neither '*' nor a known rule name.
  """
  if rule == '*':
    candidates = compiled_regexes
  else:
    candidates = {rule: compiled_regexes[rule]}

  results = {}
  for name, variants in candidates.items():
    if retype in variants:
      regex = variants[retype]
    elif retype == "secret":
      regex = variants["full"]
    else:
      continue

    try:
      m = regex.search(input)
      if m:
        # Decide per matched rule (not per requested `rule`), so that
        # rule='*' together with last=True also returns the secret group.
        if last and name in multigroupRegexes:
          results[name] = m.groups()[-1]
        else:
          results[name] = m.group(0)
    except Exception as e:
      # Best effort: report the failing input and continue with the next rule.
      print("Error matching {}: {}".format(input, e))

  return results

In [None]:
def removesuffix(string, suffix):
  """Return `string` without a trailing `suffix`.

  If `string` does not end with `suffix` (or `suffix` is empty) the string is
  returned unchanged, mirroring `str.removesuffix` from Python 3.9+.
  """
  if suffix and string.endswith(suffix):
    # Keep everything *before* the suffix.
    return string[:len(string)-len(suffix)]
  return string

def rematch(row):
  """Re-run the rule stored in row['rule'] on row['data'] and return the secret.

  row['data'] is decoded leniently; values without a usable decode() are
  converted with str(). Returns "" (and prints a notice) when the rule no
  longer matches.
  """
  rule = row['rule']
  try:
    text = row['data'].decode(errors="ignore")
  except Exception:
    text = str(row['data'])

  found = match(text, rule=rule, last=True)
  if rule not in found:
    print("Match could not be validated: {}".format(text))
    return ""
  return found[rule]

def getPrePart(row):
  """Return the part of row['data'] that precedes the trailing row['secret'].

  The data is decoded with errors="ignore", the same way rematch() decodes it
  before extracting the secret, so both operate on the same text. Values
  without a usable decode() are converted with str().
  """
  try:
    text = row['data'].decode(errors="ignore")
  except Exception:
    text = str(row['data'])
  return removesuffix(text, row['secret'])

In [None]:
from math import e, log

# Calculate the Entropy of a string
# Taken from https://gist.github.com/virtadpt/a129f94e47c113f983a1ee361f837eb8
def stringEntropy(labels, base=None):
  """Compute the Shannon entropy of the symbol distribution in `labels`.

  Parameters:
    labels: a string or any other iterable of symbols; None and float NaN
            (missing values) are accepted and yield 0.
    base:   logarithm base; defaults to e (natural logarithm).

  Returns:
    0 for missing input, for fewer than two symbols or for a single distinct
    symbol; the entropy as a float otherwise.
  """
  # `labels == np.NaN` can never be true (NaN compares unequal to everything),
  # so missing values have to be detected explicitly.
  if labels is None or (isinstance(labels, float) and np.isnan(labels)):
    return 0

  symbols = list(labels)
  n_labels = len(symbols)

  if n_labels <= 1:
    return 0

  _, counts = np.unique(symbols, return_counts=True)
  probs = counts / n_labels

  if np.count_nonzero(probs) <= 1:
    return 0

  # Compute entropy
  base = e if base is None else base
  ent = 0.
  for p in probs:
    ent -= p * log(p, base)

  return ent

## Parsing functions

Functions to parse matches, e.g., private keys.

In [None]:
import json
def tryParseJSON(row):
    """Decode row['data'] leniently and parse it as JSON; return {} on any failure."""
    try:
        text = row['data'].decode(errors="ignore")
        parsed = json.loads(text)
    except Exception:
        parsed = {}
    return parsed

# For Google Cloud matches, the "private_key" entry of the parsed JSON is the actual secret
def gcpGetSecret(row):
    """Return row['parsed']['private_key'], or "" if nothing was parsed or the key is absent."""
    parsed = row["parsed"]
    if not parsed or "private_key" not in parsed:
        return ""
    return parsed["private_key"]

In [None]:
from xml.dom import minidom
from Crypto.Util import number
from Crypto.Util.asn1 import DerSequence
from Crypto.PublicKey import RSA, DSA
from base64 import standard_b64encode, b64decode

def GetLong(nodelist):
    """Decode the base64 text content of `nodelist` into an integer.

    Parameters:
        nodelist: child nodes of an XML element, or None when the element does
                  not exist (as returned by xmlGetChildNodesIfOccurs).

    Returns:
        The big-endian integer encoded by the concatenated text nodes, or None
        if `nodelist` is None so that callers can treat the value as absent.
    """
    if nodelist is None:
        return None
    text = ''.join(node.data for node in nodelist if node.nodeType == node.TEXT_NODE)
    # Big-endian unsigned conversion, equivalent to Crypto's number.bytes_to_long.
    return int.from_bytes(b64decode(text), 'big')

def xmlGetChildNodesIfOccurs(xml, key, id):
    """Return the child nodes of the `id`-th <key> element in `xml`.

    Returns None if the lookup raises or if fewer than `id` + 1 such elements exist.
    """
    try:
        elements = xml.getElementsByTagName(key)
    except Exception:
        return None

    return elements[id].childNodes if len(elements) > id else None

# Signatures used by privKeyPEM to locate XML-encoded keys.
# NOTE: this rebinds the module-level name `regexes`, which earlier held the parsed signature file.
regexes = ["comsys_xmlrsakey", "comsys_xmldsakey", "comsys_xmleckey"]
# Compiled 'full' patterns for the rules above, in the same order.
cregexes = [compiled_regexes[r]['full'] for r in regexes]
def privKeyPEM(xmlPrivateKey):
    """Convert XML-encoded RSA/DSA keys found in `xmlPrivateKey` to PEM.

    Every match of the patterns in `cregexes` is parsed as XML and, if it holds
    an RSA or DSA key, turned into a key object.

    Parameters:
        xmlPrivateKey: text that may contain one or more XML key fragments.

    Returns:
        A list of (pem, fingerprint, corrFingerprint) tuples, where fingerprint
        is the SHA-256 of the DER-encoded key and corrFingerprint the SHA-256 of
        the DER-encoded public key. If nothing could be converted, the list
        holds the single placeholder ("", "", "").

    Fragments that fail to parse or convert are reported via print() and
    skipped. The converted keys are deliberately not printed, so that private
    key material does not end up in the notebook output.
    """
    result = []
    for c in cregexes:
        for xmlMatch in c.finditer(xmlPrivateKey):
            fragment = xmlMatch[0]
            try:
                xmlParsed = minidom.parseString(fragment)

                # Decide the key type from the root element of this fragment, so
                # that an input containing both RSA and DSA keys is not handled
                # as RSA throughout. Fall back to the whole input if the root
                # tag names neither type.
                rootTag = xmlParsed.documentElement.tagName
                keyHint = rootTag if ("RSA" in rootTag or "DSA" in rootTag) else xmlPrivateKey

                if "RSA" in keyHint:
                    modulus = GetLong(xmlGetChildNodesIfOccurs(xmlParsed, 'Modulus', 0))
                    exponent = GetLong(xmlGetChildNodesIfOccurs(xmlParsed, 'Exponent', 0))
                    if modulus is None or exponent is None:
                        continue
                    params = [modulus, exponent]

                    # RSA.construct takes its components positionally, so each
                    # optional value may only be added if all previous ones exist.
                    d = GetLong(xmlGetChildNodesIfOccurs(xmlParsed, 'D', 0))
                    if d is not None:
                        params.append(d)
                        p = GetLong(xmlGetChildNodesIfOccurs(xmlParsed, 'P', 0))
                        q = GetLong(xmlGetChildNodesIfOccurs(xmlParsed, 'Q', 0))
                        if p is not None and q is not None:
                            # P and Q are passed swapped; presumably because the
                            # sixth component is expected as p^-1 mod q while the
                            # XML stores InverseQ - TODO confirm against the
                            # RSA.construct documentation.
                            params.extend([q, p])
                            qInv = GetLong(xmlGetChildNodesIfOccurs(xmlParsed, 'InverseQ', 0))
                            if qInv is not None:
                                params.append(qInv)
                    privateKey = RSA.construct(tuple(params))

                    pem = privateKey.exportKey().decode(encoding="ascii")

                    fingerprint = sha256(privateKey.export_key(format="DER", pkcs=1)).hexdigest()
                    corrFingerprint = sha256(privateKey.public_key().export_key(format="DER")).hexdigest()

                    result.append((pem, fingerprint, corrFingerprint))
                elif "DSA" in keyHint:
                    y = GetLong(xmlGetChildNodesIfOccurs(xmlParsed, 'Y', 0))
                    g = GetLong(xmlGetChildNodesIfOccurs(xmlParsed, 'G', 0))
                    p = GetLong(xmlGetChildNodesIfOccurs(xmlParsed, 'P', 0))
                    q = GetLong(xmlGetChildNodesIfOccurs(xmlParsed, 'Q', 0))
                    if q is None or y is None or g is None or p is None:
                        continue

                    # The private component is optional.
                    xPriv = GetLong(xmlGetChildNodesIfOccurs(xmlParsed, 'X', 0))
                    if xPriv is not None:
                        params = (y, g, p, q, xPriv)
                    else:
                        params = (y, g, p, q)
                    privateKey = DSA.construct(params)

                    pem = privateKey.exportKey().decode(encoding="ascii")
                    fingerprint = sha256(privateKey.export_key(format="DER", pkcs8=False)).hexdigest()
                    corrFingerprint = sha256(privateKey.public_key().export_key(format="DER")).hexdigest()

                    result.append((pem, fingerprint, corrFingerprint))
            except Exception as e:
                # Best effort: report the problem and continue with the next fragment.
                print(e)
    if len(result) == 0:
        result.append(("", "", ""))
    return result

## Filter Functions

Functions used to flag matches according to specific filters. Flags are later used to assess the validity of matches.

### Filters for environment variables

In [None]:
def checkMultiGroupMatches(row):
    """Combine value and variable-name matches of one environment variable.

    Parameters:
        row: mapping with 'matches_val' (rule -> secret matched in the value)
             and 'matches_var' (rule -> text matched in the variable name).

    Returns:
        A list of (rule, prePart, secret) tuples. Multigroup rules are only
        kept if the variable name matched the same rule, in which case prePart
        is that name match; all other rules are kept with an empty prePart.
    """
    valMatches = row['matches_val']
    if len(valMatches) == 0:
        return []

    valMatchrules = set(valMatches.keys())
    # Rules that matched both the value and the variable name (computed once).
    confirmedRules = valMatchrules.intersection(set(row['matches_var'].keys()))

    checked = []
    for r in valMatchrules:
        secret = valMatches[r]
        if r not in multigroupRegexes:
            checked.append((r, "", secret))
        elif r in confirmedRules:
            checked.append((r, row['matches_var'][r], secret))
    return checked

# Keywords that hint at a secret.
secretKeywords = ['password', 'key', 'secret']
def valPositiveKeywordFilter(row):
    """Return "varNameKeyword_<keyword>" for the first keyword found in row['env_val'], else "".

    NOTE(review): the flag is labelled "varName" but the lookup uses the
    variable's value (env_val) - confirm which one is intended.
    """
    lowered = str.lower(row['env_val'])
    hit = next((k for k in secretKeywords if k in lowered), None)
    return "" if hit is None else "varNameKeyword_" + hit

# Positive filters applied to environment-variable matches.
valPositiveFilters = [valPositiveKeywordFilter]
def applyValPositiveFilters(row):
    """Run every positive filter on `row` and return the non-empty flags, in filter order."""
    outcomes = (f(row) for f in valPositiveFilters)
    return [flag for flag in outcomes if flag != ""]

# Get Group Information

Get the information about which secret belongs to which group, e.g., Google Cloud Platform belongs to cloud.

In [None]:
# Load the rule -> group mapping; the rule name becomes the index of df_groups.
# NOTE(review): `group` is a reserved SQL word and is back-quoted in the API-secret query
# of this notebook - confirm that it works unquoted here.
result_groups = %sql SELECT name, group FROM matchrule_group mg
df_groups = result_groups.DataFrame().set_index("name")

# Get matches from images

## API secret matches

In [None]:
%%sql result_api_matches_images <<
-- Distinct matches of all rules whose group name contains "api".
SELECT DISTINCT
    m.rule as rule,
    m.registry as registry,
    m.repository as repository,
    m.layer as layer,
    m.match_sha256 as match_sha256,
    m.file_name as file_name,
    data
FROM matches m
WHERE m.rule IN (
    SELECT name FROM matchrule_group mg WHERE `group` ILIKE '%api%'
)

In [None]:
# Post-process the API-secret matches: group occurrences per (rule, match, data),
# decode the data and derive the secret, its entropy and its hash.
# NOTE(review): if the query returned no rows, none of the derived columns
# ('rrl', 'secret', 'negativeFilter', ...) are created - confirm that later cells cope with that.
df_api_matches_images = result_api_matches_images.DataFrame()
if len(df_api_matches_images) > 0:
    # (registry, repository, layer) triple identifying where a match was found.
    df_api_matches_images['rrl'] = df_api_matches_images[["registry", "repository", "layer"]].apply(tuple, axis=1)
    # One row per distinct (rule, match_sha256, data); file names and locations are collected into lists.
    df_api_matches_images = df_api_matches_images[["rule", "match_sha256", "data", "file_name", "rrl"]].groupby(["rule", "match_sha256", "data"]).agg(list).reset_index()
    df_api_matches_images['data'] = df_api_matches_images.parallel_apply(lambda row: base64.b64decode(row["data"]), axis = 1)
    # Start every row with its own empty list of negative flags.
    df_api_matches_images["negativeFilter"] = np.empty((len(df_api_matches_images), 0)).tolist()

    trufflehog_gcp_mask = df_api_matches_images["rule"] == "trufflehog_gcp"

    # trufflehog_gcp matches are parsed as JSON; rows with an empty parse result are flagged "unparsable".
    df_api_matches_images.loc[trufflehog_gcp_mask, 'parsed'] = df_api_matches_images[trufflehog_gcp_mask].parallel_apply(tryParseJSON, axis=1)
    df_api_matches_images.loc[trufflehog_gcp_mask, 'secret'] = df_api_matches_images[trufflehog_gcp_mask].parallel_apply(gcpGetSecret, axis=1)
    df_api_matches_images.loc[trufflehog_gcp_mask, 'negativeFilter'] += df_api_matches_images[trufflehog_gcp_mask].parallel_apply(lambda row: ["unparsable"] if len(row['parsed'])==0 else [], axis=1)

    # All other rules are re-matched to extract the secret and the text preceding it.
    df_api_matches_images.loc[~trufflehog_gcp_mask, 'secret'] = df_api_matches_images[~trufflehog_gcp_mask].parallel_apply(rematch, axis = 1)
    df_api_matches_images.loc[~trufflehog_gcp_mask, 'prePart'] = df_api_matches_images[~trufflehog_gcp_mask].parallel_apply(getPrePart, axis = 1)

    df_api_matches_images['secret_entropy'] = df_api_matches_images.parallel_apply(lambda row: stringEntropy(list(row["secret"])), axis = 1)
    df_api_matches_images['secret_sha256'] = df_api_matches_images.parallel_apply(lambda row: sha256(row["secret"].encode()).hexdigest(), axis = 1)

## Private keys

In [None]:
%%sql result_pk_matches_images <<
-- Matches of rules whose name contains "private", left-joined with the parsed
-- private-key findings (fingerprints) for the same match.
-- NOTE(review): both sides expose a match_sha256 column, so SELECT * returns it twice - confirm this is handled.
SELECT * FROM (	
	SELECT DISTINCT registry, repository, layer, match_sha256, file_name, rule FROM matches m
	WHERE rule ILIKE '%private%'
) as m
LEFT OUTER JOIN (
	SELECT DISTINCT fingerprint as secret_sha256, corrFingerprint, match_sha256 FROM match_findings WHERE type ILIKE '%PRIVATE%'
) as mf
ON m.match_sha256 == mf.match_sha256

In [None]:
# Materialize the private-key matches (with fingerprints where available) as a DataFrame.
df_pk_matches_images = result_pk_matches_images.DataFrame()

In [None]:
%%sql result_xml_pk_matches_images <<
-- NOTE(review): there is no WHERE clause, so every row of `matches` is fetched and later
-- run through privKeyPEM; presumably this should be restricted to the XML private-key rules - TODO confirm.
SELECT DISTINCT registry, repository, layer, match_sha256, data, file_name, rule
FROM matches m

In [None]:
# Convert XML-encoded keys to PEM and fingerprints.
df_xml_pk_matches_images = result_xml_pk_matches_images.DataFrame()
# privKeyPEM returns a list of (pem, fingerprint, corrFingerprint) tuples per match ...
df_xml_pk_matches_images["parsed"] = df_xml_pk_matches_images.apply(lambda row: privKeyPEM(base64.b64decode(row["data"]).decode(errors="ignore")), axis=1)
# ... which is exploded to one row per key and then split into three columns.
df_xml_pk_matches_images = df_xml_pk_matches_images.explode("parsed").reset_index(drop=True)
df_xml_pk_matches_images[['parsed', 'secret_sha256', 'corrFingerprint']] = pd.DataFrame(df_xml_pk_matches_images['parsed'].tolist(), index=df_xml_pk_matches_images.index)
# NOTE(review): add_latex_variable is not defined in the visible part of this notebook - confirm where it comes from.
add_latex_variable('pkxmlnummatches', df_xml_pk_matches_images[df_xml_pk_matches_images["secret_sha256"] != ""]["secret_sha256"].nunique())

In [None]:
# Add the XML-encoded keys to the other private-key matches.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# NOTE: re-running this cell appends the XML rows again.
df_pk_matches_images = pd.concat([df_pk_matches_images, df_xml_pk_matches_images])

In [None]:
# (registry, repository, layer) triple identifying where a key was found.
df_pk_matches_images['rrl'] = df_pk_matches_images[["registry", "repository", "layer"]].parallel_apply(tuple, axis=1)
# One row per distinct key match; file names and locations are collected into lists.
df_pk_matches_images = df_pk_matches_images[["rule", "match_sha256", "file_name", "rrl", "secret_sha256", "corrFingerprint"]].groupby(["rule", "match_sha256", "secret_sha256", "corrFingerprint"]).agg(list).reset_index()

In [None]:
# Combine API-secret and private-key matches found in image files.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df_matches_images = pd.concat([df_api_matches_images, df_pk_matches_images], ignore_index=True)

In [None]:
# Mark these matches as originating from image files.
df_matches_images["origin"] = "image"

# Get matches from environment variables set in Dockerfiles

Also apply filters (see above) to flag matches.

In [None]:
%%sql result_env_val <<
-- Environment variables (name and value) per image configuration.
-- NOTE(review): the WHERE clause below has no comparison; it looks truncated - TODO confirm the intended condition.
SELECT
    registry,
    repository,
    env_var,
    env_val
FROM imageconfigs_envval ice
WHERE registry

In [None]:
# Match the signatures against environment variables set in Dockerfiles.
df_matches_val = result_env_val.DataFrame()
# NOTE(review): tmp_path_matches_val is not defined in the visible part of this notebook - confirm where it is set.
df_matches_val.to_pickle(tmp_path_matches_val)
# Matching is done once per distinct (value, name) pair and merged back afterwards.
df_matches_val_only = df_matches_val[["env_val", "env_var"]].drop_duplicates().reset_index(drop=True)

# Variable names are matched with the 'name' patterns, values with the 'secret' patterns.
df_matches_val_only['matches_var'] = df_matches_val_only.parallel_apply(lambda row: match(row['env_var'], retype="name"), axis=1)
df_matches_val_only['matches_val'] = df_matches_val_only.parallel_apply(lambda row: match(row['env_val'], retype="secret"), axis=1)
df_matches_val_only['checkedMatches'] = df_matches_val_only.parallel_apply(checkMultiGroupMatches, axis=1)
# One row per (rule, prePart, secret) tuple.
df_matches_val_only = df_matches_val_only.explode('checkedMatches')

# Drop rows without any checked match.
df_matches_val_only = df_matches_val_only[(~df_matches_val_only["checkedMatches"].isnull() & df_matches_val_only["checkedMatches"])]
df_matches_val_only[['rule', 'prePart', 'secret']] = pd.DataFrame(df_matches_val_only['checkedMatches'].tolist(), index=df_matches_val_only.index)
df_matches_val_only = df_matches_val_only.explode('rule').reset_index(drop=True)
# Matches of the rule howbadcanitgit_Gmail are discarded.
df_matches_val_only = df_matches_val_only.drop(df_matches_val_only[df_matches_val_only.rule == 'howbadcanitgit_Gmail'].index)

df_matches_val_only['secret_entropy'] = df_matches_val_only.parallel_apply(lambda row: stringEntropy(str(row['secret'])), axis=1)
df_matches_val_only['secret_sha256'] = df_matches_val_only.parallel_apply(lambda row: sha256(str(row["secret"]).encode()).hexdigest(), axis = 1)

# Right join: keep only environment variables that produced a match.
df_matches_val = pd.merge(df_matches_val, df_matches_val_only,  how='right', on=["env_val", "env_var"])
# The hash of the complete value identifies the match.
df_matches_val['match_sha256'] = df_matches_val.parallel_apply(lambda row: sha256(row["env_val"].encode()).hexdigest(), axis = 1)

In [None]:
# Per group: total number of matches and number of distinct matches in environment variables.
df_num_matches_val = df_matches_val[["match_sha256", "rule"]]
df_num_matches_val = df_num_matches_val.join(df_groups, on="rule")
df_num_matches_val = df_num_matches_val[["match_sha256", "group"]].rename(columns={"match_sha256": "num_matches"})
# Same column twice, so that it can be aggregated with both count and nunique.
df_num_matches_val["num_distinct_matches"] = df_num_matches_val["num_matches"]
df_num_matches_val = df_num_matches_val.groupby("group").agg({"num_matches": "count", "num_distinct_matches": "nunique"})

# Environment variables have no layer; "none" is used as the layer part of the location triple.
df_matches_val['layer'] = "none"
df_matches_val['rrl'] = df_matches_val[["registry", "repository", "layer"]].apply(tuple, axis=1)

In [None]:
# One row per distinct match; the locations (rrl) are collected into a list.
df_matches_val = df_matches_val[["rrl", "env_val", "env_var", "rule", "match_sha256", "secret_sha256", "secret"]].groupby(["rule", "env_val", "env_var", "match_sha256", "secret", "secret_sha256"]).agg(list).reset_index()
# Mark these matches as originating from environment variables.
df_matches_val["origin"] = "val"

# Combine dataframes with matches from files and environment variables

In [None]:
# All matches (environment variables first, then image files) with a fresh 0..n-1 index.
df_matches = pd.concat([df_matches_val, df_matches_images], ignore_index=True).reset_index(drop=True)

In [None]:
# Attach the group of each rule.
# NOTE: re-running this cell joins the group column a second time.
df_matches = df_matches.join(df_groups, on="rule")

# Apply filter to API secrets

### Parameters to generate ngrams

In [None]:
# Generate (4,7)-character ngrams
ngramMin = 4
ngramMax = 7

# Later filter out matches containing ngrams that occur 29 times more often than the mean over all ngrams
frequencyNgramsTimeFactor = 29

### Generate ngrams

In [None]:
# Fix parts in secrets
from nltk import everygrams
from itertools import chain

# Fixed (non-secret) parts that legitimately occur in many secrets.
fixPartsToExclude = ["----- BEGIN PRIVATE KEY -----", "----- END PRIVATE KEY -----", "EAACEdEose0cBA", "AIza", ".apps.googleusercontent.com", "sk_live_", "rk_live_", "sq0atp-", "sq0csp-", "access_token$production$", "amzn.mws.", "key-", "AKIA", "auth_provider_x509_cert_url", "glpat", "ghp", "gho", "ghu", "ghs", "ghr", "LTAI", "aio", "https", ".webhook.office.com/webhookb2", "IncomingWebhook"]
# Ngrams of the fixed parts, generated with the same length bounds (ngramMin, ngramMax)
# as the ngrams of the secrets so that both sets stay comparable.
fixPartsToExcludeNgrams = []
for fp in fixPartsToExclude:
    fixPartsToExcludeNgrams.extend([''.join(x) for x in everygrams(fp, min_len=ngramMin, max_len=ngramMax)])

In [None]:
# Count how often every ngram occurs in the secrets of all non-private-key matches.
df_matches_test = df_matches[~df_matches["secret"].isnull()].copy()
# Plain apply: the swifter accessor is never imported in this notebook, and
# everygrams returns a lazy iterator that is consumed once below.
df_matches_test["ngrams"] = df_matches_test.apply(lambda row: everygrams(row["secret"], min_len=ngramMin, max_len=ngramMax), axis=1)
grams = pd.Series(list(chain.from_iterable(df_matches_test[~df_matches_test["rule"].str.contains("private")]['ngrams'])), name="occurrences")
# The explicit rename keeps the column called "occurrences" on every pandas
# version (pandas 2 names the value_counts result "count").
frequency = grams.parallel_apply(''.join).value_counts().rename("occurrences").to_frame()

In [None]:
# Ignore ngrams that stem from known fixed parts.
frequencyExcluded = frequency[~frequency.index.isin(fixPartsToExcludeNgrams)]
# Ignore ngrams in which at least half of the characters are digits
# (raw string: "\d" is an invalid escape sequence in a normal string literal).
frequencyExcluded = frequencyExcluded[~((frequencyExcluded.index.str.count(r'\d')) >= (frequencyExcluded.index.str.len()/2))]
# Keep the ngrams occurring more than frequencyNgramsTimeFactor times as often as the mean.
mostFrequentNgrams = frequencyExcluded[frequencyExcluded["occurrences"] > frequencyNgramsTimeFactor*frequencyExcluded["occurrences"].mean()]

### Apply further filtering

#### Filter out matches containing specific keywords (and most frequent ngrams)

In [None]:
# Keywords that flag a secret: the most frequent ngrams.
keywordsSecret = mostFrequentNgrams.index.to_list()
# Keywords that flag the text preceding a secret.
keywordsPrePart = ["sha256", "sha512"]
# Per-rule exceptions: rule -> keywords that are ignored for that rule ("*" ignores all keywords).
exceptionsSecret = {}
exceptionsPrePart = {
    "trufflehog_heroku": ["*"]
}
def filterKeyWords(row):
    """Flag a match whose secret or prePart contains a filter keyword.

    Returns "keywordsecret_<k>" for the first keyword found in the lower-cased
    secret, otherwise "keywordPrePart_<k>" for the first keyword found in the
    lower-cased prePart, otherwise "". Keywords exempted for the row's rule
    (or all keywords, if "*" is listed) are skipped.

    NOTE(review): keywords are compared as-is against lower-cased text, so a
    keyword containing upper-case characters can never match - confirm intent.
    """
    def isExempt(keyword, exceptions):
        # True if `keyword` (or every keyword, via "*") is exempted for this row's rule.
        if row["rule"] not in exceptions:
            return False
        allowed = exceptions[row["rule"]]
        return keyword in allowed or "*" in allowed

    for keyword in keywordsSecret:
        if isExempt(keyword, exceptionsSecret):
            continue
        if keyword in str.lower(str(row["secret"])):
            return "keywordsecret_" + keyword

    for keyword in keywordsPrePart:
        if isExempt(keyword, exceptionsPrePart):
            continue
        if keyword in str.lower(str(row["prePart"])):
            return "keywordPrePart_" + keyword

    return ""


# Per rule: characters that interrupt a sequence in the sequence filter.
excludedCharsFromSequenceRule = {
    "trufflehog_gcp": ["-"]
}

#### Filter sequences

##### Parameters

In [None]:
# Characters that interrupt a sequence for every rule.
exceptionsSequence = [" "]
# Number of consecutive ascending/descending steps from which a secret is flagged.
sequenceLimitNotSame = 4
# Number of consecutive repetitions of a character from which a secret is flagged.
sequenceLimitSame = 3

##### Filter

In [None]:
def filterSequences(row):
    """Flag secrets that contain runs of equal, ascending or descending characters.

    Characters in `exceptionsSequence`, plus those configured for the row's rule
    in `excludedCharsFromSequenceRule`, interrupt a run.

    Returns:
        "sequence_same", "sequence_desc" or "sequence_asc" as soon as the
        respective counter reaches its limit (sequenceLimitSame /
        sequenceLimitNotSame), otherwise "".
    """
    string = str(row["secret"])

    # Work on a copy: extending the module-level list in place would leak the
    # rule-specific characters into the handling of all later rows.
    exclude = list(exceptionsSequence)
    exclude.extend(excludedCharsFromSequenceRule.get(row["rule"], []))
    excludeOrd = {ord(c) for c in exclude}

    last = -1
    num_sequence_asc = 0
    num_sequence_desc = 0
    num_sequence_same = 0
    for letter in string:
        cur = ord(letter)
        if cur in excludeOrd:
            # An excluded character resets all runs.
            last = -1
            num_sequence_asc = 0
            num_sequence_desc = 0
            num_sequence_same = 0
            continue

        # Each counter holds the number of consecutive steps of its kind so far.
        num_sequence_same = num_sequence_same + 1 if last == cur else 0
        num_sequence_desc = num_sequence_desc + 1 if last-1 == cur else 0
        num_sequence_asc = num_sequence_asc + 1 if last+1 == cur else 0
        last = cur

        if num_sequence_same >= sequenceLimitSame:
            return "sequence_same"
        elif num_sequence_desc >= sequenceLimitNotSame:
            return "sequence_desc"
        elif num_sequence_asc >= sequenceLimitNotSame:
            return "sequence_asc"

    return ""

### Combined filter

In [None]:
# Negative filters applied to API secrets.
filters = [filterKeyWords, filterSequences]
def filter(row):
    """Run every negative filter on `row` and return the non-empty flags, in filter order.

    NOTE: this shadows the built-in filter() within the notebook.
    """
    return [flag for flag in (f(row) for f in filters) if flag != ""]

In [None]:
def setNaToEmptyList(df, column):
    """Assign [[]] in place to every position of df[column] that is null.

    NOTE(review): the positions found by np.where are used as labels with
    .loc, which presumably assumes a default 0..n-1 index - confirm.
    """
    nullPositions = np.where(pd.isnull(df[column]))[0]
    for pos in nullPositions:
        df.loc[pos, column] = [[]]

# Make sure both columns hold a list in every row (missing values become empty lists).
# parallel_apply is used because the swifter accessor is never imported in this notebook.
df_matches["file_name"] = df_matches["file_name"].parallel_apply(lambda d: d if isinstance(d, list) else [])
df_matches["negativeFilter"] = df_matches["negativeFilter"].parallel_apply(lambda d: d if isinstance(d, list) else [])

In [None]:
pkMask = (df_matches["group"] == "private_key")
# Everything that is not a private key is run through the keyword and sequence filters.
# NOTE: "+=" appends to the existing flags, so re-running this cell adds the flags again.
df_matches.loc[~pkMask, 'negativeFilter'] += df_matches[~pkMask].parallel_apply(filter, axis = 1)
# Private keys with an empty fingerprint are flagged as unparsable.
df_matches.loc[pkMask, "negativeFilter"] += df_matches[pkMask].parallel_apply(lambda row: [] if row["secret_sha256"] != "" else ["unparsable"], axis=1)

# Filter private keys

## Filtering based on kompromat (https://github.com/SecurityFail/kompromat)

In [None]:
pkMask = (df_matches["group"] == "private_key")
df_matches_pk_knownkompromat = pd.DataFrame()

validMatches = df_matches[((pkMask) & (df_matches["negativeFilter"].map(len) == 0) & (~df_matches["secret_sha256"].isnull()))]["secret_sha256"].tolist()
n = 3000
for i in range(0, len(validMatches), n):
    curValMatchesString = '\'' + '\',\''.join(validMatches[i:i + n]) + '\''
    result_valid_matches = %sql SELECT metaId, infos, fingerprint as secret_sha256 FROM kompromat k WHERE k.fingerprint IN ({curValMatchesString})
    df_matches_pk_knownkompromat = df_matches_pk_knownkompromat.append(result_valid_matches.DataFrame()) 

df_matches_pk_knownkompromat[["metaIdSplit1", "metaIdSplit2", "metaIdSplit3", "metaIdSplit4", "metaIdSplit5"]]= df_matches_pk_knownkompromat["metaId"].str.split("/", n = 4, expand = True)
df_matches_pk_knownkompromat = df_matches_pk_knownkompromat[["metaIdSplit2", "metaIdSplit3", "secret_sha256", "infos"]]

# This is what we filter als "invalid"
kompromatFilter = ["rfc", "softwaretests", "testvectors"]

In [None]:
# Get number of secrets in kompromat
##filtered (testkeys)
df_matches_pk_knownkompromat_unique_filtered = df_matches_pk_knownkompromat[["metaIdSplit2", "metaIdSplit3", "secret_sha256"]]
# Keep only entries whose second metaId part is listed in kompromatFilter.
df_matches_pk_knownkompromat_unique_filtered = df_matches_pk_knownkompromat_unique_filtered[df_matches_pk_knownkompromat_unique_filtered["metaIdSplit2"].isin(kompromatFilter)]

# Distinct counts per second metaId part, and per (second, third) metaId part.
df_matches_pk_knownkompromat_unique_grouped1_filtered = df_matches_pk_knownkompromat_unique_filtered.groupby("metaIdSplit2").nunique()
df_matches_pk_knownkompromat_unique_grouped2_filtered = df_matches_pk_knownkompromat_unique_filtered.groupby(["metaIdSplit2", "metaIdSplit3"]).nunique().reset_index(level=0)

# For each second metaId part, keep the third part ("service") with the most distinct fingerprints.
df_matches_pk_knownkompromat_unique_grouped2_filtered = df_matches_pk_knownkompromat_unique_grouped2_filtered. \
    loc[df_matches_pk_knownkompromat_unique_grouped2_filtered.groupby(["metaIdSplit2"])["secret_sha256"].idxmax()]. \
        reset_index().set_index("metaIdSplit2").rename(columns={"metaIdSplit3": "service", "secret_sha256": "numdistinct"})

In [None]:
# Private keys that carry no negative flag so far.
pkMaskParsable = ((df_matches["group"] == "private_key") & (df_matches["negativeFilter"].map(len) == 0))

df_matches_pk_knownkompromat_unique_filtered_fingerprints = df_matches_pk_knownkompromat_unique_filtered[df_matches_pk_knownkompromat_unique_filtered["metaIdSplit2"].isin(kompromatFilter)].drop_duplicates()["secret_sha256"]
# Build the lookup once; converting the Series to a list inside the lambda repeated that work for every row.
kompromatFingerprints = set(df_matches_pk_knownkompromat_unique_filtered_fingerprints)
# Flag keys whose fingerprint is a known test key.
df_matches.loc[pkMaskParsable, "negativeFilter"] += df_matches[pkMaskParsable].parallel_apply(lambda row: ["kompromat"] if row["secret_sha256"] in kompromatFingerprints else [], axis=1)

In [None]:
# Rules whose matches are flagged as a whole.
filterRules = ["trufflehog_{}".format(x) for x in ["azure_old", "ibmclouduserkey", "digitaloceantoken", "gitlab", "currencycloud", "openuv", "netlify", "coinbase", "bitfenix", "accuweather", "wepay", "tomtom", "ticketmaster", "paymongo", "paymoapp", "paydirtapp", "facebookkey"] + ["howbadcanitgit_twitter"]]
# parallel_apply is used because the swifter accessor is never imported in this notebook.
df_matches["negativeFilter"] += df_matches.parallel_apply(lambda row: ["rule"] if row["rule"] in filterRules else [], axis=1)

# Filter all types

## Filter by filepath

### Define filepaths to be flagged

In [None]:
# Regular-expression fragments describing file paths whose matches are flagged.
# Raw strings are used because "\/", "\.", "\-" and "\_" are invalid escape
# sequences in normal string literals; the resulting patterns are unchanged.

# Paths starting with one of these directories.
prefixes = [
    r"usr\/lib", r"var\/lib", r"lib", r"var\/cache", r"usr\/local\/share", r"usr\/share\/doc",
    r"etc\/openvpn\/hidemyass", r"etc\/openvpn\/vpnbook", r"etc\/openvpn\/proxpn", r"etc\/openvpn\/expressvpn",
    r"etc\/openvpn\/purevpn", r"etc\/openvpn\/freevpn", r"etc\/openvpn\/elastictunnel", r"etc\/openvpn\/anonvpn",
    r"etc\/openvpn\/froot", r"etc\/openvpn\/cactusvpn", r"ovpn4", r"vpngate\/config",
]
# Paths starting with one of these directories and containing "test" later on.
prefixesAndTest = [r"usr\/share\/java"]
# Paths containing one of these fragments anywhere.
contains = [
    r"go\/pkg\/mod\/cache", r"hideipvpn.com_", r"\.gradle\/caches", r"node\_modules", r"\.gradle\/wrapper",
    r"\.composer\/cache", r"\.cache\/bazel", r"lib\/ruby\/gems", r"vendor\/bundle\/jruby\/.*\/gems",
    r"azure\/cli\/.*\/tests", r"php\/test\/ssh2\/tests", r"google\-cloud\-sdk\/platform", r"usr\/local\/lib",
    r"flutter\/.*\/lib", r"flutter\/.*\/cache", r"\.cargo\/registry", r"flutter\/.*\/test", r"ffead\-.*\/.*\/test",
    r"elasticsearch\/plugins", r"python.*\/.*-packages", r"julia\/packages", r"strongswan-.*\/testing",
    r"\.cache\/pip", r"\.cache\/helm", r"\.cache\/yarn", r"cache\/\.ivy2", r"go\-dockerclient\/testing",
    r"conda\/pkgs", r"opkg\-lists", r".linuxbrew", r".bundle\/cache", r"\.cache\/luarocks", r"metasploit",
    r"esp\-idf\/examples", r"\.vim\/bundle", r"\.vim\/plugged", r"android\-sdk\/platform\-tools", r"\.cpanm\/work",
    r"nixpkgs\/pkgs", r"swagger\-codegen\/modules", r"maven\/ref", r"nrfxlib\/openthread\/lib",
    r"\.cargo\/registry\/src", r"\.meteor\/package\-metadata", r"\.npm\/_cacache", r"android\/sdk\/emulator",
    r"android\-sdk\-linux", r"metadata\/md5\-cache", r"rails\/bifrost\/log", r"\.cache\/heroku\/yarn",
    r"mssql\/data", r"repo\/state\.cache", r"android\-sdk\/emulator\/qemu\/linux", r"kafka\/logs\/server\.log",
    r"\.cache\/go\-build", r"build\-helpers\/patches\/notes", r"\.platformio\/packages",
]
# Paths containing one of these fragments and "test" later on.
containsAndTest = [r"org\.eclipse\.paho", r"paho\.mqtt", r"cassandra", r"golang\.org\/x", r"contrib"]
# Paths ending with one of these strings.
suffixes = [r"RECORD", r"packageinfo", r"HEAD", r"FETCH_HEAD", r"\.yarn\-metadata\.json", r"version\.txt", r"zookeeper\.out", r"VERSION\_BUILD\.json"] # ["\.svg", "md5sums", "versions", ]
# Fragments that have to occur in the file name itself (currently none).
filenameContains = [] #["manifest"]
# Paths ending with one of these strings, optionally followed by a gz ending.
# NOTE(review): no "\." is required in front of the type, so e.g. "md" also matches a path ending in "cmd" - confirm intent.
filetypes = [
    r"md", r"mdf", r"mfa", r"maf", r"fasta", r"fastq", r"fq", r"fa", r"seed", r"ibd", r"dbf", r"db", r"sto",
    r"spdx\.json", r"tga", r"xwd", r"blm\.lm", r"ebuild", r"fdt", r"ttf", r"jar\.pack", r"sqlite", r"sqlite3",
    r"version", r"yuv", r"wt", r"miff", r"lm", r"hrl", r"cfs", r"bsp", r"avi", r"bag",
]

#### Filter

In [None]:
# Assemble one alternation from all path patterns.
# Raw strings are used because "\/" is an invalid escape sequence in a normal string literal.
# NOTE: this rebinds the module-level name `regexes` once more.
# NOTE(review): the "." in ".gz$" is unescaped and therefore matches any character - confirm intent.
regexes = []
regexes.extend([r"^(\/)*{}".format(p) for p in prefixes])
regexes.extend([r"^(\/)*{}.*test".format(p) for p in prefixesAndTest])
regexes.extend([r"{}".format(c) for c in contains])
regexes.extend([r"{}.*test".format(c) for c in containsAndTest])
regexes.extend([r"{}$".format(s) for s in suffixes])
regexes.extend([r"{}[^\/]*$".format(f) for f in filenameContains])
regexes.extend([r"{}$".format(f) for f in filetypes])
regexes.extend([r"{}.gz$".format(f) for f in filetypes])

cRegex = re.compile('|'.join(regexes))
def filterPathRegex(row):
    """Return ["filename"] if any entry of row["file_name"] matches cRegex, else []."""
    flagged = any(cRegex.search(str(path)) for path in row["file_name"])
    return ["filename"] if flagged else []

In [None]:
# Flag matches found under one of the filtered file paths.
# NOTE: "+=" appends to the existing flags, so re-running this cell adds the flag again.
df_matches['negativeFilter'] += df_matches.parallel_apply(filterPathRegex, axis = 1)

In [None]:
def cleanFilePaths(fp, cleanprefixes):
    """Strip the first prefix from `cleanprefixes` that `fp` starts with.

    At most one prefix is removed; `fp` is returned unchanged if none matches.
    (Removing every matching prefix from the running result would cut off more
    characters than the path actually shares with the later prefixes.)
    """
    for cp in cleanprefixes:
        if cp and fp.startswith(cp):
            return fp[len(cp):]
    return fp

# Build a table with one row per (private key, file path), the path split into its components.
pkMask = (df_matches["group"] == "private_key")
df_matches_pk_with_filepaths_tmp = df_matches[pkMask].explode("file_name").reset_index(drop=True)
df_matches_pk_with_filepaths_tmp = df_matches_pk_with_filepaths_tmp[~df_matches_pk_with_filepaths_tmp["file_name"].isnull()]
# Remove a leading "./" or "/" from the paths.
df_matches_pk_with_filepaths_tmp["file_name"] = df_matches_pk_with_filepaths_tmp.parallel_apply(lambda row: cleanFilePaths(row["file_name"], ['./', '/']), axis=1)
df_matches_pk_with_filepaths_tmp["iskompromat"] = df_matches_pk_with_filepaths_tmp.parallel_apply(lambda row: True if "kompromat" in row["negativeFilter"] else False, axis=1)

# Number of components of the deepest path.
levels = df_matches_pk_with_filepaths_tmp["file_name"].str.count('/').max()+1
splits = ["fileNameSplit{}".format(l) for l in range(1,levels+1)]
# One column per path component; shorter paths are padded with "".
df_matches_pk_with_filepaths_tmp[splits] = df_matches_pk_with_filepaths_tmp["file_name"].str.split("/", n = levels-1, expand = True)
df_matches_pk_with_filepaths_tmp[splits] = df_matches_pk_with_filepaths_tmp[splits].fillna("")

# One additional, always empty component behind the deepest level.
splitplus = "fileNameSplit{}".format(len(splits)+1)

df_matches_pk_with_filepaths_tmp[splitplus] = ""
# The fingerprint column is renamed so that the distinct-count columns derived from it are called numkompromat<True|False>.
df_matches_pk_with_filepaths_tmp = df_matches_pk_with_filepaths_tmp.rename(columns={"secret_sha256":"numkompromat"})
df_matches_pk_with_filepaths_tmp = df_matches_pk_with_filepaths_tmp[splits + [splitplus] + ["iskompromat", "numkompromat"]]

In [None]:
def regeneratePaths(df, subfolders=False):
    """Turn the index levels of `df` back into "/"-joined paths.

    Returns a list of (path, subfolders) tuples, one per row of `df`, with
    trailing "/" stripped from each path; an empty `df` yields [].
    """
    if len(df) == 0:
        return []

    # Everything except the two counter columns is a path component.
    components = df.reset_index().drop(["numkompromatTrue", "numkompromatFalse"], axis=1)
    joined = components.agg('/'.join, axis=1).to_list()
    return [(path.rstrip('/'), subfolders) for path in joined]

def generatePathsKompromat(df):
    """Derive directory paths that mainly contain keys flagged as kompromat.

    `df` holds the path-component columns followed by the columns
    "iskompromat" and "numkompromat". The function evaluates the deepest
    directory level, then drops the last path-component column and calls
    itself again, as long as `df` has more than 7 columns.

    Returns a list of (path, subfolders) tuples as produced by regeneratePaths.
    """
    result = []

    cols = list(df.columns)

    curdf = df
    # Distinct fingerprints per (path components, iskompromat); iskompromat is moved into the columns.
    curdf = curdf.groupby(cols[0:len(cols)-1]).nunique().unstack(level=-1, fill_value=0)
    # Flatten the column index to "numkompromatTrue" / "numkompromatFalse".
    # NOTE(review): if iskompromat takes only one value, one of these columns is presumably missing - confirm.
    curdf.columns = [f"{x}{y}" for x, y in curdf.columns.to_flat_index()]

    levels = list(curdf.index.names)
    
    # Only consider paths that have a second-to-last component.
    curdf = curdf.iloc[curdf.index.get_level_values(len(levels)-2) != ""]

    # Paths that end at the second-to-last component: flag the directory
    # (without subfolders) if it contains at least one kompromat key.
    firstocc = curdf[(curdf.index.get_level_values(len(levels)-1) == "")]
    kfirstocc = firstocc.groupby(level=levels[0:len(levels)-2]).sum()
    result.extend(regeneratePaths(kfirstocc[kfirstocc["numkompromatTrue"] > 0]))

    # Paths that continue below that level: flag the directory including its
    # subfolders if half the kompromat count exceeds the non-kompromat count.
    ongoingocc = curdf[(curdf.index.get_level_values(len(levels)-1) != "")]
    if len(ongoingocc) > 0:
        kongoingocc = ongoingocc.groupby(level=levels[0:len(levels)-2]).sum()
        paths = kongoingocc[0.5 * kongoingocc["numkompromatTrue"] > kongoingocc["numkompromatFalse"]]
        result.extend(regeneratePaths(paths, subfolders=True))

    # Drop the last path-component column and repeat one level higher.
    if len(cols) > 7:
        cols = cols[0:len(cols)-3] + cols[len(cols)-2:]
        nexdf = df[cols]
        result.extend(generatePathsKompromat(nexdf))

    return result

# (path, subfolders) tuples used to flag private keys by their file path.
filteredPaths = generatePathsKompromat(df_matches_pk_with_filepaths_tmp)


In [None]:
def checkPathsKompromat(row):
    """Return the filtered paths that cover the files in row["file_name"].

    For every file, each entry (path, subfolders) of `filteredPaths` is added
    to the result if the file is that path itself, lies directly inside it, or
    - when `subfolders` is set - lies anywhere below it. A path only counts as
    a parent if it is followed by "/" in the file name, so "etc/ssl" does not
    cover "etc/sslkey.pem".
    """
    res = []
    for fp in row["file_name"]:
        fpClean = fp.lstrip('/')
        for path, subfolders in filteredPaths:
            if fpClean == path:
                res.append(path)
                continue
            if not fpClean.startswith(path + '/'):
                continue
            # Part of the file name below the filtered path.
            rest = fpClean[len(path)+1:]
            if subfolders or '/' not in rest:
                res.append(path)
    return res

# For every private key: the filtered paths covering its files ...
df_matches.loc[pkMask, "matchingfpkompromat"] = df_matches[pkMask].parallel_apply(checkPathsKompromat, axis=1)
# ... and a "file_path" flag if there is at least one.
df_matches.loc[pkMask, "negativeFilter"] += df_matches[pkMask].parallel_apply(lambda row: ["file_path"] if len(row["matchingfpkompromat"]) else [], axis=1)

In [None]:
# Persist the flagged matches.
# NOTE: the pickle contains the extracted secrets - handle the file accordingly.
df_matches.to_pickle('df_matches.pkl')

In [None]:
# Number of distinct fingerprints per filtered path.
df_matches_filepathskompromat = df_matches[["matchingfpkompromat", "secret_sha256"]].explode("matchingfpkompromat").groupby("matchingfpkompromat").nunique()

In [None]:
# The ten filtered paths with the most distinct fingerprints.
df_most_filtered_paths = df_matches_filepathskompromat.sort_values(by="secret_sha256", ascending=False).head(10).reset_index()